Merge tag 'v0.9.0' into v0.9.0-ori

7a985548 · zhuwenwen · 45d3785c · dc1440cf · 7a985548 · 7a985548
Commit 7a985548 authored May 22, 2025 by zhuwenwen
20 changed files
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@@ -8,12 +8,12 @@ import zipfile
 # Note that we have 400 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/3792 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
 def print_top_10_largest_files(zip_file):
    """Print the top 10 largest files in the given zip file."""
-    with zipfile.ZipFile(zip_file, 'r') as z:
+    with zipfile.ZipFile(zip_file, "r") as z:
        file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
        file_sizes.sort(key=lambda x: x[1], reverse=True)
        for f, size in file_sizes[:10]:
@@ -28,14 +28,18 @@ def check_wheel_size(directory):
                wheel_path = os.path.join(root, file_name)
                wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
                if wheel_size_mb > VLLM_MAX_SIZE_MB:
-                    print(f"Not allowed: Wheel {wheel_path} is larger "
+                    print(
-                          f"({wheel_size_mb:.2f} MB) than the limit "
+                        f"Not allowed: Wheel {wheel_path} is larger "
-                          f"({VLLM_MAX_SIZE_MB} MB).")
+                        f"({wheel_size_mb:.2f} MB) than the limit "
+                        f"({VLLM_MAX_SIZE_MB} MB)."
+                    )
                    print_top_10_largest_files(wheel_path)
                    return 1
                else:
-                    print(f"Wheel {wheel_path} is within the allowed size "
+                    print(
-                          f"({wheel_size_mb:.2f} MB).")
+                        f"Wheel {wheel_path} is within the allowed size "
+                        f"({wheel_size_mb:.2f} MB)."
+                    )
    return 0
@@ -45,4 +49,4 @@ if __name__ == "__main__":
        sys.exit(1)
    directory = sys.argv[1]
    sys.exit(check_wheel_size(directory))
\ No newline at end of file
--- a/.buildkite/generate_index.py
+++ b/.buildkite/generate_index.py
@@ -22,5 +22,5 @@ with open("index.html", "w") as f:
    print(f"Generated index.html for {args.wheel}")
    # cloudfront requires escaping the '+' character
    f.write(
-        template.format(wheel=filename,
+        template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B"))
-                        wheel_html_escaped=filename.replace("+", "%2B")))
+    )
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-FP8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-FP8-compressed-tensors.yaml
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1
+model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.335
+  - name: "exact_match,flexible-extract"
+    value: 0.323
+limit: 1319
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen2.5-1.5B-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-1.5B-Instruct.yaml
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1
+model_name: "Qwen/Qwen2.5-1.5B-Instruct"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.54
+  - name: "exact_match,flexible-extract"
+    value: 0.59
+limit: 1319
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
+model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.47
+  - name: "exact_match,flexible-extract"
+    value: 0.64
+limit: 1319
+num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/models-large.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large.txt
@@ -3,3 +3,4 @@ Meta-Llama-3-70B-Instruct.yaml
 Mixtral-8x7B-Instruct-v0.1.yaml
 Qwen2-57B-A14-Instruct.yaml
 DeepSeek-V2-Lite-Chat.yaml
+Meta-Llama-3-8B-QQQ.yaml
--- a/.buildkite/lm-eval-harness/configs/models-small.txt
+++ b/.buildkite/lm-eval-harness/configs/models-small.txt
-Meta-Llama-3-8B-Instruct.yaml
+Qwen2.5-1.5B-Instruct.yaml
-Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
 Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
-Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
+Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
 Qwen1.5-MoE-W4A16-compressed-tensors.yaml
-Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
-Qwen2-1.5B-Instruct-FP8W8.yaml
-Meta-Llama-3-8B-QQQ.yaml
--- a/.buildkite/lm-eval-harness/conftest.py
+++ b/.buildkite/lm-eval-harness/conftest.py
+# SPDX-License-Identifier: Apache-2.0
+from pathlib import Path
+import pytest
+def pytest_addoption(parser):
+    parser.addoption(
+        "--config-list-file",
+        action="store",
+        help="Path to the file listing model config YAMLs (one per line)",
+    )
+    parser.addoption(
+        "--tp-size",
+        action="store",
+        default="1",
+        help="Tensor parallel size to use for evaluation",
+    )
+@pytest.fixture(scope="session")
+def config_list_file(pytestconfig, config_dir):
+    rel_path = pytestconfig.getoption("--config-list-file")
+    return config_dir / rel_path
+@pytest.fixture(scope="session")
+def tp_size(pytestconfig):
+    return pytestconfig.getoption("--tp-size")
+def pytest_generate_tests(metafunc):
+    if "config_filename" in metafunc.fixturenames:
+        rel_path = metafunc.config.getoption("--config-list-file")
+        config_list_file = Path(rel_path).resolve()
+        config_dir = config_list_file.parent
+        with open(config_list_file, encoding="utf-8") as f:
+            configs = [
+                config_dir / line.strip()
+                for line in f
+                if line.strip() and not line.startswith("#")
+            ]
+        metafunc.parametrize("config_filename", configs)
--- a/.buildkite/lm-eval-harness/run-tests.sh
+++ b/.buildkite/lm-eval-harness/run-tests.sh
-#!/bin/bash
-usage() {
-    echo``
-    echo "Runs lm eval harness on GSM8k using vllm and compares to "
-    echo "precomputed baseline (measured by HF transformers.)"
-    echo
-    echo "usage: ${0} <options>"
-    echo
-    echo "  -c    - path to the test data config (e.g. configs/small-models.txt)"
-    echo "  -t    - tensor parallel size"
-    echo
-}
-SUCCESS=0
-while getopts "c:t:" OPT; do
-  case ${OPT} in
-    c ) 
-        CONFIG="$OPTARG"
-        ;;
-    t )
-        TP_SIZE="$OPTARG"
-        ;;
-    \? )
-        usage
-        exit 1
-        ;;
-  esac
-done
-# Parse list of configs.
-IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
-for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
-do
-    LOCAL_SUCCESS=0
-    echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="
-    export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
-    export LM_EVAL_TP_SIZE=$TP_SIZE
-    pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
-    if [[ $LOCAL_SUCCESS == 0 ]]; then
-        echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
-    else
-        echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
-    fi
-    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
-done
-if [ "${SUCCESS}" -eq "0" ]; then
-    exit 0
-else
-    exit 1
-fi
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -3,67 +3,52 @@
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml
-* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
+pytest -s -v test_lm_eval_correctness.py \
-* export LM_EVAL_TP_SIZE=4 
+    --config-list-file=configs/models-small.txt \
-* pytest -s test_lm_eval_correctness.py
+    --tp-size=1
 """
-import os
-from pathlib import Path
 import lm_eval
-import numpy
+import numpy as np
-import pytest
 import yaml
 RTOL = 0.08
-TEST_DATA_FILE = os.environ.get(
-    "LM_EVAL_TEST_DATA_FILE",
-    ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
-TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
-def launch_lm_eval(eval_config):
-    trust_remote_code = eval_config.get('trust_remote_code', False)
-    model_args = f"pretrained={eval_config['model_name']}," \
-                 f"tensor_parallel_size={TP_SIZE}," \
-                 f"add_bos_token=true," \
-                 f"trust_remote_code={trust_remote_code}"
+def launch_lm_eval(eval_config, tp_size):
+    trust_remote_code = eval_config.get("trust_remote_code", False)
+    model_args = (
+        f"pretrained={eval_config['model_name']},"
+        f"tensor_parallel_size={tp_size},"
+        f"enforce_eager=true,"
+        f"add_bos_token=true,"
+        f"trust_remote_code={trust_remote_code}"
+    )
    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        tasks=[task["name"] for task in eval_config["tasks"]],
        num_fewshot=eval_config["num_fewshot"],
        limit=eval_config["limit"],
-        batch_size="auto")
+        batch_size="auto",
+    )
    return results
-def test_lm_eval_correctness():
+def test_lm_eval_correctness_param(config_filename, tp_size):
-    eval_config = yaml.safe_load(
+    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
-        Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
-    if eval_config[
-            "model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform":  #noqa: E501
-        pytest.skip("FBGEMM is currently failing on main.")
-    # Launch eval requests.
+    results = launch_lm_eval(eval_config, tp_size)
-    results = launch_lm_eval(eval_config)
-    # Confirm scores match ground truth.
    success = True
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            ground_truth = metric["value"]
            measured_value = results["results"][task["name"]][metric["name"]]
-            print(f'{task["name"]} | {metric["name"]}: '
+            print(
-                  f'ground_truth={ground_truth} | measured={measured_value}')
+                f"{task['name']} | {metric['name']}: "
-            success = success and numpy.isclose(
+                f"ground_truth={ground_truth} | measured={measured_value}"
-                ground_truth, measured_value, rtol=RTOL)
+            )
+            success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
-    # Assert at the end, print all scores even on failure for debugging.
    assert success
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -65,18 +65,18 @@ def read_markdown(file):
 def results_to_json(latency, throughput, serving):
-    return json.dumps({
+    return json.dumps(
-        'latency': latency.to_dict(),
+        {
-        'throughput': throughput.to_dict(),
+            "latency": latency.to_dict(),
-        'serving': serving.to_dict()
+            "throughput": throughput.to_dict(),
-    })
+            "serving": serving.to_dict(),
+        }
+    )
 if __name__ == "__main__":
    # collect results
    for test_file in results_folder.glob("*.json"):
        with open(test_file) as f:
            raw_result = json.loads(f.read())
@@ -120,7 +120,8 @@ if __name__ == "__main__":
            for perc in [10, 25, 50, 75, 90, 99]:
                # Multiply by 1000 to convert the time unit from s to ms
                raw_result.update(
-                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
+                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}
+                )
            raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
            # add the result to raw_result
@@ -153,26 +154,27 @@ if __name__ == "__main__":
    serving_results = pd.DataFrame.from_dict(serving_results)
    throughput_results = pd.DataFrame.from_dict(throughput_results)
-    raw_results_json = results_to_json(latency_results, throughput_results,
+    raw_results_json = results_to_json(
-                                       serving_results)
+        latency_results, throughput_results, serving_results
+    )
    # remapping the key, for visualization purpose
    if not latency_results.empty:
-        latency_results = latency_results[list(
+        latency_results = latency_results[list(latency_column_mapping.keys())].rename(
-            latency_column_mapping.keys())].rename(
+            columns=latency_column_mapping
-                columns=latency_column_mapping)
+        )
    if not serving_results.empty:
-        serving_results = serving_results[list(
+        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
-            serving_column_mapping.keys())].rename(
+            columns=serving_column_mapping
-                columns=serving_column_mapping)
+        )
    if not throughput_results.empty:
-        throughput_results = throughput_results[list(
+        throughput_results = throughput_results[
-            throughput_results_column_mapping.keys())].rename(
+            list(throughput_results_column_mapping.keys())
-                columns=throughput_results_column_mapping)
+        ].rename(columns=throughput_results_column_mapping)
-    processed_results_json = results_to_json(latency_results,
+    processed_results_json = results_to_json(
-                                             throughput_results,
+        latency_results, throughput_results, serving_results
-                                             serving_results)
+    )
    for df in [latency_results, serving_results, throughput_results]:
        if df.empty:
@@ -184,38 +186,39 @@ if __name__ == "__main__":
        # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
        # we want to turn it into "8xGPUTYPE"
        df["GPU"] = df["GPU"].apply(
-            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")
+            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
+        )
    # get markdown tables
-    latency_md_table = tabulate(latency_results,
+    latency_md_table = tabulate(
-                                headers='keys',
+        latency_results, headers="keys", tablefmt="pipe", showindex=False
-                                tablefmt='pipe',
+    )
-                                showindex=False)
+    serving_md_table = tabulate(
-    serving_md_table = tabulate(serving_results,
+        serving_results, headers="keys", tablefmt="pipe", showindex=False
-                                headers='keys',
+    )
-                                tablefmt='pipe',
+    throughput_md_table = tabulate(
-                                showindex=False)
+        throughput_results, headers="keys", tablefmt="pipe", showindex=False
-    throughput_md_table = tabulate(throughput_results,
+    )
-                                   headers='keys',
-                                   tablefmt='pipe',
-                                   showindex=False)
    # document the result
    with open(results_folder / "benchmark_results.md", "w") as f:
+        results = read_markdown(
-        results = read_markdown("../.buildkite/nightly-benchmarks/" +
+            "../.buildkite/nightly-benchmarks/"
-                                "performance-benchmarks-descriptions.md")
+            + "performance-benchmarks-descriptions.md"
+        )
        results = results.format(
            latency_tests_markdown_table=latency_md_table,
            throughput_tests_markdown_table=throughput_md_table,
            serving_tests_markdown_table=serving_md_table,
-            benchmarking_results_in_json_string=processed_results_json)
+            benchmarking_results_in_json_string=processed_results_json,
+        )
        f.write(results)
    # document benchmarking results in json
    with open(results_folder / "benchmark_results.json", "w") as f:
+        results = (
-        results = latency_results.to_dict(
+            latency_results.to_dict(orient="records")
-            orient='records') + throughput_results.to_dict(
+            + throughput_results.to_dict(orient="records")
-                orient='records') + serving_results.to_dict(orient='records')
+            + serving_results.to_dict(orient="records")
+        )
        f.write(json.dumps(results))
--- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
+++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
@@ -14,15 +14,12 @@ def main(model, cachedir):
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(
-        description="Download and save Hugging Face tokenizer")
+        description="Download and save Hugging Face tokenizer"
-    parser.add_argument("--model",
+    )
-                        type=str,
+    parser.add_argument("--model", type=str, required=True, help="Name of the model")
-                        required=True,
+    parser.add_argument(
-                        help="Name of the model")
+        "--cachedir", type=str, required=True, help="Directory to save the tokenizer"
-    parser.add_argument("--cachedir",
+    )
-                        type=str,
-                        required=True,
-                        help="Directory to save the tokenizer")
    args = parser.parse_args()
    main(args.model, args.cachedir)
--- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
@@ -11,33 +11,33 @@ from tabulate import tabulate
 def parse_arguments():
    parser = argparse.ArgumentParser(
-        description=
+        description="Parse command line arguments for summary-nightly-results script."
-        'Parse command line arguments for summary-nightly-results script.')
+    )
-    parser.add_argument('--results-folder',
+    parser.add_argument(
-                        type=str,
+        "--results-folder",
-                        required=True,
+        type=str,
-                        help='The folder where the results are stored.')
+        required=True,
-    parser.add_argument('--description',
+        help="The folder where the results are stored.",
-                        type=str,
+    )
-                        required=True,
+    parser.add_argument(
-                        help='Description of the results.')
+        "--description", type=str, required=True, help="Description of the results."
+    )
    args = parser.parse_args()
    return args
 def get_perf(df, method, model, metric):
    means = []
    for qps in [2, 4, 8, 16, "inf"]:
-        target = df['Test name'].str.contains(model)
+        target = df["Test name"].str.contains(model)
-        target = target & df['Engine'].str.contains(method)
+        target = target & df["Engine"].str.contains(method)
-        target = target & df['Test name'].str.contains("qps_" + str(qps))
+        target = target & df["Test name"].str.contains("qps_" + str(qps))
        filtered_df = df[target]
        if filtered_df.empty:
-            means.append(0.)
+            means.append(0.0)
        else:
            means.append(filtered_df[metric].values[0])
@@ -45,7 +45,6 @@ def get_perf(df, method, model, metric):
 def get_perf_w_std(df, method, model, metric):
    if metric in ["TTFT", "ITL"]:
        mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
        mean = mean.tolist()
@@ -60,7 +59,8 @@ def get_perf_w_std(df, method, model, metric):
    else:
        assert metric == "Tput"
        mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
-            df, method, model, "Output Tput (tok/s)")
+            df, method, model, "Output Tput (tok/s)"
+        )
        mean = mean.tolist()
        std = None
@@ -80,18 +80,17 @@ def main(args):
    # generate markdown table
    df = pd.DataFrame.from_dict(results)
-    md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
+    md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)
    with open(args.description) as f:
        description = f.read()
-    description = description.format(
+    description = description.format(nightly_results_benchmarking_table=md_table)
-        nightly_results_benchmarking_table=md_table)
    with open("nightly_results.md", "w") as f:
        f.write(description)
-if __name__ == '__main__':
+if __name__ == "__main__":
    args = parse_arguments()
    main(args)
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@@ -34,10 +34,8 @@ serving_column_mapping = {
 }
 if __name__ == "__main__":
    # collect results
    for test_file in results_folder.glob("*.json"):
        with open(test_file) as f:
            raw_result = json.loads(f.read())
@@ -56,17 +54,16 @@ if __name__ == "__main__":
    serving_results = pd.DataFrame.from_dict(serving_results)
    if not serving_results.empty:
-        serving_results = serving_results[list(
+        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
-            serving_column_mapping.keys())].rename(
+            columns=serving_column_mapping
-                columns=serving_column_mapping)
+        )
-    serving_md_table_with_headers = tabulate(serving_results,
+    serving_md_table_with_headers = tabulate(
-                                             headers='keys',
+        serving_results, headers="keys", tablefmt="pipe", showindex=False
-                                             tablefmt='pipe',
+    )
-                                             showindex=False)
    # remove the first line of header
-    serving_md_table_lines = serving_md_table_with_headers.split('\n')
+    serving_md_table_lines = serving_md_table_with_headers.split("\n")
-    serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])
+    serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])
    prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
@@ -76,10 +73,9 @@ if __name__ == "__main__":
        # document results with header.
        # for those who want to reproduce our benchmark.
        f.write(serving_md_table_with_headers)
-        f.write('\n')
+        f.write("\n")
    # document benchmarking results in json
    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
+        results = serving_results.to_dict(orient="records")
-        results = serving_results.to_dict(orient='records')
        f.write(json.dumps(results))
--- a/.buildkite/pyproject.toml
+++ b/.buildkite/pyproject.toml
+# This local pyproject file is part of the migration from yapf to ruff format.
+# It uses the same core rules as the main pyproject.toml file, but with the
+# following differences:
+# - ruff line length is overridden to 88
+# - deprecated typing ignores (UP006, UP035) have been removed
+[tool.ruff]
+line-length = 88
+exclude = [
+    # External file, leaving license intact
+    "examples/other/fp8/quantizer/quantize.py",
+    "vllm/vllm_flash_attn/flash_attn_interface.pyi"
+]
+[tool.ruff.lint.per-file-ignores]
+"vllm/third_party/**" = ["ALL"]
+"vllm/version.py" = ["F401"]
+"vllm/_version.py" = ["ALL"]
+[tool.ruff.lint]
+select = [
+    # pycodestyle
+    "E",
+    # Pyflakes
+    "F",
+    # pyupgrade
+    "UP",
+    # flake8-bugbear
+    "B",
+    # flake8-simplify
+    "SIM",
+    # isort
+    "I",
+    # flake8-logging-format
+    "G",
+]
+ignore = [
+    # star imports
+    "F405", "F403",
+    # lambda expression assignment
+    "E731",
+    # Loop control variable not used within loop body
+    "B007",
+    # f-string format
+    "UP032",
+    # Can remove once 3.10+ is the minimum Python version
+    "UP007",
+]
+[tool.ruff.format]
+docstring-code-format = true
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
 steps:
-  - label: "Build wheel - CUDA 12.4"
+  - label: "Build wheel - CUDA 12.8"
    agents:
      queue: cpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"
-  - label: "Build wheel - CUDA 12.1"
+  - label: "Build wheel - CUDA 12.6"
    agents:
      queue: cpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
@@ -31,7 +31,7 @@ steps:
    agents:
      queue: cpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
@@ -48,7 +48,7 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
  - label: "Build and publish TPU release image"
@@ -57,6 +57,8 @@ steps:
    agents:
      queue: tpu_queue_postmerge
    commands:
+      - "yes | docker system prune -a"
+      - "git fetch --all"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
      - "docker push vllm/vllm-tpu:nightly"
      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"

--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -3,6 +3,9 @@
 # This script runs tests inside the corresponding ROCm docker container.
 set -o pipefail
+# Export Python path
+export PYTHONPATH=".."
 # Print ROCm version
 echo "--- Confirming Clean Initial State"
 while true; do
@@ -74,38 +77,69 @@ HF_MOUNT="/root/.cache/huggingface"
 commands=$@
 echo "Commands:$commands"
+if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
+  commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
+fi
+if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
+  commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
+fi
+if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
+  commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
+fi
+if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
+  commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
+fi
 #ignore certain kernels tests
-if [[ $commands == *" kernels "* ]]; then
+if [[ $commands == *" kernels/core"* ]]; then
  commands="${commands} \
-  --ignore=kernels/test_attention_selector.py \
+  --ignore=kernels/core/test_fused_quant_layernorm.py \
-  --ignore=kernels/test_blocksparse_attention.py \
+  --ignore=kernels/core/test_permute_cols.py"
-  --ignore=kernels/test_causal_conv1d.py \
+fi
-  --ignore=kernels/test_cutlass.py \
-  --ignore=kernels/test_encoder_decoder_attn.py \
+if [[ $commands == *" kernels/attention"* ]]; then
-  --ignore=kernels/test_flash_attn.py \
+  commands="${commands} \
-  --ignore=kernels/test_flashinfer.py \
+  --ignore=kernels/attention/stest_attention_selector.py \
-  --ignore=kernels/test_int8_quant.py \
+  --ignore=kernels/attention/test_blocksparse_attention.py \
-  --ignore=kernels/test_machete_gemm.py \
+  --ignore=kernels/attention/test_encoder_decoder_attn.py \
-  --ignore=kernels/test_mamba_ssm.py \
+  --ignore=kernels/attention/test_attention_selector.py \
-  --ignore=kernels/test_marlin_gemm.py \
+  --ignore=kernels/attention/test_flash_attn.py \
-  --ignore=kernels/test_moe.py \
+  --ignore=kernels/attention/test_flashinfer.py \
-  --ignore=kernels/test_prefix_prefill.py \
+  --ignore=kernels/attention/test_prefix_prefill.py \
-  --ignore=kernels/test_rand.py \
+  --ignore=kernels/attention/test_cascade_flash_attn.py \
-  --ignore=kernels/test_sampler.py \
+  --ignore=kernels/attention/test_mha_attn.py \
-  --ignore=kernels/test_cascade_flash_attn.py \
+  --ignore=kernels/attention/test_lightning_attn.py \
-  --ignore=kernels/test_mamba_mixer2.py \
+  --ignore=kernels/attention/test_attention.py"
-  --ignore=kernels/test_aqlm.py \
+fi
-  --ignore=kernels/test_machete_mm.py \
-  --ignore=kernels/test_mha_attn.py \
+if [[ $commands == *" kernels/quantization"* ]]; then
-  --ignore=kernels/test_block_fp8.py \
+  commands="${commands} \
-  --ignore=kernels/test_cutlass_moe.py \
+  --ignore=kernels/quantization/test_int8_quant.py \
-  --ignore=kernels/test_mamba_ssm_ssd.py \
+  --ignore=kernels/quantization/test_aqlm.py \
-  --ignore=kernels/test_attention.py \
+  --ignore=kernels/quantization/test_machete_mm.py \
-  --ignore=kernels/test_block_int8.py \
+  --ignore=kernels/quantization/test_block_fp8.py \
-  --ignore=kernels/test_fused_quant_layernorm.py \
+  --ignore=kernels/quantization/test_block_int8.py \
-  --ignore=kernels/test_int8_kernel.py \
+  --ignore=kernels/quantization/test_marlin_gemm.py \
-  --ignore=kernels/test_triton_moe_ptpc_fp8.py \
+  --ignore=kernels/quantization/test_cutlass_scaled_mm.py \
-  --ignore=kernels/test_permute_cols.py"
+  --ignore=kernels/quantization/test_int8_kernel.py"
+fi
+if [[ $commands == *" kernels/mamba"* ]]; then
+  commands="${commands} \
+  --ignore=kernels/mamba/test_mamba_mixer2.py \
+  --ignore=kernels/mamba/test_causal_conv1d.py \
+  --ignore=kernels/mamba/test_mamba_ssm_ssd.py"
+fi
+if [[ $commands == *" kernels/moe"* ]]; then
+  commands="${commands} \
+  --ignore=kernels/moe/test_moe.py \
+  --ignore=kernels/moe/test_cutlass_moe.py \
+  --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
 fi
 #ignore certain Entrypoints/openai tests
@@ -147,6 +181,8 @@ fi
 PARALLEL_JOB_COUNT=8
+MYPYTHONPATH=".."
 # check if the command contains the shard flag; if so, run all shards in parallel because the host has 8 GPUs.
 if [[ $commands == *"--shard-id="* ]]; then
  # assign job count as the number of shards used   
@@ -167,6 +203,7 @@ if [[ $commands == *"--shard-id="* ]]; then
        -e AWS_SECRET_ACCESS_KEY \
        -v "${HF_CACHE}:${HF_MOUNT}" \
        -e "HF_HOME=${HF_MOUNT}" \
+        -e "PYTHONPATH=${MYPYTHONPATH}" \
        --name "${container_name}_${GPU}" \
        "${image_name}" \
        /bin/bash -c "${commands_gpu}" \
@@ -197,6 +234,7 @@ else
          -e AWS_SECRET_ACCESS_KEY \
          -v "${HF_CACHE}:${HF_MOUNT}" \
          -e "HF_HOME=${HF_MOUNT}" \
+          -e "PYTHONPATH=${MYPYTHONPATH}" \
          --name "${container_name}" \
          "${image_name}" \
          /bin/bash -c "${commands}"

--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@@ -32,9 +32,12 @@ function cpu_tests() {
    set -e
    pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
    pip install sentence-transformers datamodel_code_generator
-    pytest -v -s tests/models/embedding/language/test_cls_models.py::test_classification_models[float-jason9693/Qwen2.5-1.5B-apeach]
+    pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
-    pytest -v -s tests/models/embedding/language/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]
+    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
-    pytest -v -s tests/models/encoder_decoder/language -m cpu_model"
+    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
+    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
+    pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
+    pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
 }
 # All CPU tests are expected to finish in less than 40 mins.

--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
 #!/bin/bash
-set -xue
+set -xu
 # Build the docker image.
 docker build -f docker/Dockerfile.tpu -t vllm-tpu .
@@ -24,31 +24,80 @@ docker run --privileged --net host --shm-size=16G -it \
    && export VLLM_XLA_CHECK_RECOMPILATION=1 \
    && echo HARDWARE \
    && tpu-info \
-    && echo TEST_0 \
+    && { \
-    && pytest -v -s /workspace/vllm/tests/v1/tpu/test_perf.py \
+        echo TEST_0: Running test_perf.py; \
-    && echo TEST_1 \
+        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \
-    && pytest -v -s /workspace/vllm/tests/tpu/test_compilation.py \
+        echo TEST_0_EXIT_CODE: \$?; \
-    && echo TEST_2 \
+    } & \
-    && pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
+    { \
-    && echo TEST_3 \
+        echo TEST_1: Running test_compilation.py; \
-    && pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
+        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \
-    && echo TEST_4 \
+        echo TEST_1_EXIT_CODE: \$?; \
-    && pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
+    } & \
-    && echo TEST_5 \
+    { \
-    && python3 /workspace/vllm/examples/offline_inference/tpu.py \
+        echo TEST_2: Running test_basic.py; \
-    && echo TEST_6 \
+        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \
-    && pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py \
+        echo TEST_2_EXIT_CODE: \$?; \
-    && echo TEST_7 \
+    } & \
-    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py \
+    { \
-    && echo TEST_8 \
+        echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
-    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py \
+        python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
-    && echo TEST_9 \
+        echo TEST_3_EXIT_CODE: \$?; \
-    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py \
+    } & \
-    && echo TEST_10 \
+    { \
-    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py \
+        echo TEST_4: Running test_quantization_accuracy.py; \
-    && echo TEST_11 \
+        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \
-    && pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py" \
+        echo TEST_4_EXIT_CODE: \$?; \
+    } & \
+    { \
+        echo TEST_5: Running examples/offline_inference/tpu.py; \
+        python3 /workspace/vllm/examples/offline_inference/tpu.py; \
+        echo TEST_5_EXIT_CODE: \$?; \
+    } & \
+    { \
+        echo TEST_6: Running test_tpu_model_runner.py; \
+        python3 -m pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \
+        echo TEST_6_EXIT_CODE: \$?; \
+    } & \
+    { \
+        echo TEST_7: Running test_sampler.py; \
+        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \
+        echo TEST_7_EXIT_CODE: \$?; \
+    } & \
+    { \
+        echo TEST_8: Running test_topk_topp_sampler.py; \
+        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \
+        echo TEST_8_EXIT_CODE: \$?; \
+    } & \
+    { \
+        echo TEST_9: Running test_multimodal.py; \
+        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \
+        echo TEST_9_EXIT_CODE: \$?; \
+    } & \
+    { \
+        echo TEST_10: Running test_pallas.py; \
+        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \
+        echo TEST_10_EXIT_CODE: \$?; \
+    } & \
+    { \
+        echo TEST_11: Running test_struct_output_generate.py; \
+        python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \
+        echo TEST_11_EXIT_CODE: \$?; \
+    } & \
+    { \
+        echo TEST_12: Running test_moe_pallas.py; \
+        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \
+        echo TEST_12_EXIT_CODE: \$?; \
+    } & \
+    # Disable the TPU LoRA tests until the feature is activated. Do not end these
+    # comment lines with a backslash: it would splice the wait below into the comment.
+    # { echo TEST_13: Running the TPU LoRA tests;
+    #   python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/;
+    #   echo TEST_13_EXIT_CODE: \$?;
+    # } &
+    wait \
+    && echo 'All tests have attempted to run. Check logs for individual test statuses and exit codes.' \
+"
 # TODO: This test fails because it uses RANDOM_SEED sampling
 # && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@@ -50,11 +50,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 if [[ $normal_wheel == *"cu118"* ]]; then
    # if $normal_wheel matches cu118, do not upload the index.html
    echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu121"* ]]; then
+elif [[ $normal_wheel == *"cu126"* ]]; then
-    # if $normal_wheel matches cu121, do not upload the index.html
+    # if $normal_wheel matches cu126, do not upload the index.html
-    echo "Skipping index files for cu121 wheels"
+    echo "Skipping index files for cu126 wheels"
 else
-    # only upload index.html for cu124 wheels (default wheels)
+    # only upload index.html for cu128 wheels (default wheels)
    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
 fi
@@ -66,12 +66,13 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
 if [[ $normal_wheel == *"cu118"* ]]; then
    # if $normal_wheel matches cu118, do not upload the index.html
    echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu121"* ]]; then
+elif [[ $normal_wheel == *"cu126"* ]]; then
-    # if $normal_wheel matches cu121, do not upload the index.html
+    # if $normal_wheel matches cu126, do not upload the index.html
-    echo "Skipping index files for cu121 wheels"
+    echo "Skipping index files for cu126 wheels"
 else
-    # only upload index.html for cu124 wheels (default wheels)
+    # only upload index.html for cu128 wheels (default wheels)
    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
 fi
 aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
\ No newline at end of file
+aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"