Merge tag 'v0.14.0' into v0.14.0-dev

7e63ef82 · zhuwenwen · 8cbcac5d · b17039bc · 7e63ef82 · 7e63ef82
Commit 7e63ef82 authored Jan 21, 2026 by zhuwenwen
20 changed files
--- a/requirements/rocm-test.txt
+++ b/requirements/rocm-test.txt
@@ -58,7 +58,7 @@ schemathesis==3.39.15
    # OpenAI schema test
 # Evaluation and benchmarking
-lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
+lm-eval[api]>=0.4.9.2
 jiwer==4.0.0
 # Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test
@@ -74,17 +74,21 @@ torchgeo==0.7.0
 # MTEB Benchmark Test
 mteb==2.1.2
-# Data processing
-xgrammar @ git+https://github.com/divakar-amd/xgrammar@3272f7c520564858056a60480d5afdf69ae79c84
-# Test async scheduling
 # Utilities
 num2words==0.5.14
    # via lm-eval
 pqdm==0.2.0
    # via lm-eval
+# Required for fastsafetensors test
+fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@d6f998a03432b2452f8de2bb5cefb5af9795d459
 # Required for suffix decoding test
 arctic-inference == 0.1.1
 # Required for Nemotron test
 open-clip-torch==2.32.0
+# Required for isaac Multi-Modal generation test
+perceptron==0.1.4
+# Required for the multi-modal models test
+timm==1.0.17
+# Required for plugins test
+albumentations==1.4.6
\ No newline at end of file
--- a/requirements/rocm.txt
+++ b/requirements/rocm.txt
@@ -15,7 +15,7 @@ setuptools-scm>=8
 runai-model-streamer[s3,gcs]==0.15.3
 # conch-triton-kernels==1.2.1
 timm>=1.0.17
-fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@d6f998a03432b2452f8de2bb5cefb5af9795d459
+grpcio-tools>=1.76.0
 numa
 pytrie
@@ -23,10 +23,10 @@ setuptools_scm>=8
 cmake==3.29
 quart
 fastrlock==0.8.3
-cupy==12.3.0
+# cupy==12.3.0
-torch >= 2.7.1
+torch == 2.9.0
-triton == 3.1
+triton == 3.3
 flash_attn == 2.6.1
 flash_mla == 1.0.0
 lightop == 0.6.0

--- a/requirements/test.in
+++ b/requirements/test.in
@@ -9,6 +9,7 @@ pytest-timeout
 pytest-cov
 # testing utils
+albumentations # required for Nemotron Parse in test_common.py
 backoff # required for phi4mm test
 blobfile # required for kimi-vl test
 einops # required for MPT, qwen-vl
@@ -19,23 +20,22 @@ vocos # required for minicpmo_26 test
 peft>=0.15.0 # required for phi-4-mm test
 pqdm
 ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests
-sentence-transformers # required for embedding tests
+sentence-transformers>=5.2.0 # required for embedding tests
 soundfile # required for audio tests
 jiwer # required for audio tests
 tblib # for pickling test exceptions
-timm >=1.0.17 # required for internvl and gemma3n-mm test
+timm==1.0.17 # required for internvl and gemma3n-mm test
-torch==2.9.0
+torch==2.9.1
-torchaudio==2.9.0
+torchaudio==2.9.1
-torchvision==0.24.0
+torchvision==0.24.1
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
-mistral_common[image,audio] >= 1.8.5 # required for voxtral test
+mistral_common[image,audio] >= 1.8.8 # required for voxtral test
 num2words # required for smolvlm test
-open_clip_torch==2.32.0 # Required for nemotron_vl test
+open_clip_torch==2.32.0 # Required for nemotron_vl test, Nemotron Parse in test_common.py
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
-# TODO: Use lm-eval[api]==0.4.10 once released
+lm-eval[api]>=0.4.9.2 # required for model evaluation test
-lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
 mteb[bm25s]>=2, <3 # required for mteb test
 transformers==4.57.3
 tokenizers==0.22.0
@@ -57,3 +57,5 @@ pydantic>=2.12 # 2.11 leads to error on python 3.13
 decord==0.6.0
 terratorch @ git+https://github.com/IBM/terratorch.git@1.1.rc3 # required for PrithviMAE test
 gpt-oss >= 0.0.7; python_version > '3.11'
+perceptron # required for isaac test
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -27,7 +27,9 @@ aiosignal==1.4.0
 albucore==0.0.16
    # via terratorch
 albumentations==1.4.6
-    # via terratorch
+    # via
+    #   -r requirements/test.in
+    #   terratorch
 alembic==1.16.4
    # via mlflow
 annotated-types==0.7.0
@@ -135,6 +137,7 @@ cloudpickle==3.1.1
    # via mlflow-skinny
 colorama==0.4.6
    # via
+    #   perceptron
    #   sacrebleu
    #   schemathesis
    #   tqdm-multiprocess
@@ -294,7 +297,7 @@ graphql-relay==3.2.0
    # via graphene
 greenlet==3.2.3
    # via sqlalchemy
-grpcio==1.71.0
+grpcio==1.76.0
    # via ray
 gunicorn==23.0.0
    # via mlflow
@@ -302,6 +305,8 @@ h11==0.14.0
    # via
    #   httpcore
    #   uvicorn
+h2==4.3.0
+    # via httpx
 h5py==3.13.0
    # via terratorch
 harfile==0.3.0
@@ -310,6 +315,8 @@ hf-xet==1.1.7
    # via huggingface-hub
 hiredis==3.0.0
    # via tensorizer
+hpack==4.1.0
+    # via h2
 html2text==2025.4.15
    # via gpt-oss
 httpcore==1.0.6
@@ -317,6 +324,7 @@ httpcore==1.0.6
 httpx==0.27.2
    # via
    #   -r requirements/test.in
+    #   perceptron
    #   schemathesis
 huggingface-hub==0.34.3
    # via
@@ -338,6 +346,8 @@ hydra-core==1.3.2
    # via
    #   lightly
    #   lightning
+hyperframe==6.1.0
+    # via h2
 hypothesis==6.131.0
    # via
    #   hypothesis-graphql
@@ -441,7 +451,7 @@ lightning-utilities==0.14.3
    #   torchmetrics
 llvmlite==0.44.0
    # via numba
-lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
+lm-eval==0.4.9.2
    # via -r requirements/test.in
 lxml==5.3.0
    # via
@@ -474,7 +484,7 @@ mbstrdecoder==1.1.3
    #   typepy
 mdurl==0.1.2
    # via markdown-it-py
-mistral-common==1.8.5
+mistral-common==1.8.8
    # via -r requirements/test.in
 mlflow==2.22.0
    # via terratorch
@@ -549,6 +559,7 @@ numpy==1.26.4
    #   pandas
    #   patsy
    #   peft
+    #   perceptron
    #   pycocotools
    #   pyogrio
    #   rasterio
@@ -702,6 +713,8 @@ peft==0.16.0
    # via
    #   -r requirements/test.in
    #   lm-eval
+perceptron==0.1.4
+    # via -r requirements/test.in
 pillow==10.4.0
    # via
    #   genai-perf
@@ -709,9 +722,9 @@ pillow==10.4.0
    #   lightly-utils
    #   matplotlib
    #   mistral-common
+    #   perceptron
    #   scikit-image
    #   segmentation-models-pytorch
-    #   sentence-transformers
    #   torchgeo
    #   torchvision
 platformdirs==4.3.6
@@ -745,7 +758,7 @@ propcache==0.2.0
    #   yarl
 proto-plus==1.26.1
    # via google-api-core
-protobuf==5.28.3
+protobuf==6.33.2
    # via
    #   google-api-core
    #   googleapis-common-protos
@@ -952,6 +965,7 @@ rich==13.9.4
    #   genai-perf
    #   lightning
    #   mteb
+    #   perceptron
    #   typer
 rioxarray==0.19.0
    # via terratorch
@@ -1010,7 +1024,7 @@ segmentation-models-pytorch==0.4.0
    # via
    #   terratorch
    #   torchgeo
-sentence-transformers==3.2.1
+sentence-transformers==5.2.0
    # via
    #   -r requirements/test.in
    #   mteb
@@ -1024,7 +1038,9 @@ shapely==2.1.1
    #   geopandas
    #   torchgeo
 shellingham==1.5.4
-    # via typer
+    # via
+    #   perceptron
+    #   typer
 six==1.16.0
    # via
    #   junit-xml
@@ -1123,7 +1139,7 @@ tomli==2.2.1
    # via schemathesis
 tomli-w==1.2.0
    # via schemathesis
-torch==2.9.0+cu129
+torch==2.9.1+cu129
    # via
    #   -r requirements/test.in
    #   accelerate
@@ -1152,7 +1168,7 @@ torch==2.9.0+cu129
    #   torchvision
    #   vector-quantize-pytorch
    #   vocos
-torchaudio==2.9.0+cu129
+torchaudio==2.9.1+cu129
    # via
    #   -r requirements/test.in
    #   encodec
@@ -1165,7 +1181,7 @@ torchmetrics==1.7.4
    #   pytorch-lightning
    #   terratorch
    #   torchgeo
-torchvision==0.24.0+cu129
+torchvision==0.24.1+cu129
    # via
    #   -r requirements/test.in
    #   lightly
@@ -1206,7 +1222,7 @@ transformers==4.57.3
    #   transformers-stream-generator
 transformers-stream-generator==0.0.5
    # via -r requirements/test.in
-triton==3.5.0
+triton==3.5.1
    # via torch
 tritonclient==2.51.0
    # via
@@ -1218,7 +1234,9 @@ typepy==1.3.2
    #   pytablewriter
    #   tabledata
 typer==0.15.2
-    # via fastsafetensors
+    # via
+    #   fastsafetensors
+    #   perceptron
 types-python-dateutil==2.9.0.20241206
    # via arrow
 typeshed-client==2.8.2
@@ -1231,6 +1249,7 @@ typing-extensions==4.15.0
    #   chz
    #   fastapi
    #   graphene
+    #   grpcio
    #   huggingface-hub
    #   librosa
    #   lightning
@@ -1246,6 +1265,7 @@ typing-extensions==4.15.0
    #   pydantic-core
    #   pydantic-extra-types
    #   pytorch-lightning
+    #   sentence-transformers
    #   sqlalchemy
    #   torch
    #   torchgeo

--- a/setup.py
+++ b/setup.py
@@ -18,6 +18,8 @@ import torch
 from packaging.version import Version, parse
 from setuptools import Extension, setup
 from setuptools.command.build_ext import build_ext
+from setuptools.command.build_py import build_py
+from setuptools.command.develop import develop
 # from setuptools_scm import get_version
 from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME
@@ -62,15 +64,15 @@ elif not (sys.platform.startswith("linux") or sys.platform.startswith("darwin"))
        sys.platform,
    )
    VLLM_TARGET_DEVICE = "empty"
-elif (
+elif sys.platform.startswith("linux") and os.getenv("VLLM_TARGET_DEVICE") is None:
-    sys.platform.startswith("linux")
+    if torch.version.hip is not None:
-    and torch.version.cuda is None
+        VLLM_TARGET_DEVICE = "rocm"
-    and os.getenv("VLLM_TARGET_DEVICE") is None
+        logger.info("Auto-detected ROCm")
-    and torch.version.hip is None
+    elif torch.version.cuda is not None:
-):
+        VLLM_TARGET_DEVICE = "cuda"
-    # if cuda or hip is not available and VLLM_TARGET_DEVICE is not set,
+        logger.info("Auto-detected CUDA")
-    # fallback to cpu
+    else:
-    VLLM_TARGET_DEVICE = "cpu"
+        VLLM_TARGET_DEVICE = "cpu"
 def is_sccache_available() -> bool:
@@ -91,6 +93,81 @@ def is_freethreaded():
    return bool(sysconfig.get_config_var("Py_GIL_DISABLED"))
+def compile_grpc_protos():
+    """Compile gRPC protobuf definitions during build.
+    This generates *_pb2.py, *_pb2_grpc.py, and *_pb2.pyi files from
+    the vllm_engine.proto definition.
+    """
+    try:
+        from grpc_tools import protoc
+    except ImportError:
+        logger.warning(
+            "grpcio-tools not installed, skipping gRPC proto compilation. "
+            "gRPC server functionality will not be available."
+        )
+        return False
+    proto_file = ROOT_DIR / "vllm" / "grpc" / "vllm_engine.proto"
+    if not proto_file.exists():
+        logger.warning("Proto file not found at %s, skipping compilation", proto_file)
+        return False
+    logger.info("Compiling gRPC protobuf: %s", proto_file)
+    result = protoc.main(
+        [
+            "grpc_tools.protoc",
+            f"--proto_path={ROOT_DIR}",
+            f"--python_out={ROOT_DIR}",
+            f"--grpc_python_out={ROOT_DIR}",
+            f"--pyi_out={ROOT_DIR}",
+            str(proto_file),
+        ]
+    )
+    if result != 0:
+        logger.error("protoc failed with exit code %s", result)
+        return False
+    # Add SPDX headers and mypy ignore to generated files
+    spdx_header = (
+        "# SPDX-License-Identifier: Apache-2.0\n"
+        "# SPDX-FileCopyrightText: Copyright contributors to the vLLM project\n"
+        "# mypy: ignore-errors\n"
+    )
+    grpc_dir = ROOT_DIR / "vllm" / "grpc"
+    for generated_file in [
+        grpc_dir / "vllm_engine_pb2.py",
+        grpc_dir / "vllm_engine_pb2_grpc.py",
+        grpc_dir / "vllm_engine_pb2.pyi",
+    ]:
+        if generated_file.exists():
+            content = generated_file.read_text()
+            if not content.startswith("# SPDX-License-Identifier"):
+                generated_file.write_text(spdx_header + content)
+    logger.info("gRPC protobuf compilation successful")
+    return True
+class BuildPyAndGenerateGrpc(build_py):
+    """Build Python modules and generate gRPC stubs from proto files."""
+    def run(self):
+        compile_grpc_protos()
+        super().run()
+class DevelopAndGenerateGrpc(develop):
+    """Develop mode that also generates gRPC stubs from proto files."""
+    def run(self):
+        compile_grpc_protos()
+        super().run()
 class CMakeExtension(Extension):
    def __init__(self, name: str, cmake_lists_dir: str = ".", **kwa) -> None:
        super().__init__(name, sources=[], py_limited_api=not is_freethreaded(), **kwa)
@@ -120,20 +197,26 @@ class cmake_build_ext(build_ext):
                num_jobs = os.cpu_count()
        nvcc_threads = None
-        if _is_cuda() and get_nvcc_cuda_version() >= Version("11.2"):
+        if _is_cuda() and CUDA_HOME is not None:
-            # `nvcc_threads` is either the value of the NVCC_THREADS
+            try:
-            # environment variable (if defined) or 1.
+                nvcc_version = get_nvcc_cuda_version()
-            # when it is set, we reduce `num_jobs` to avoid
+                if nvcc_version >= Version("11.2"):
-            # overloading the system.
+                    # `nvcc_threads` is either the value of the NVCC_THREADS
-            nvcc_threads = envs.NVCC_THREADS
+                    # environment variable (if defined) or 1.
-            if nvcc_threads is not None:
+                    # when it is set, we reduce `num_jobs` to avoid
-                nvcc_threads = int(nvcc_threads)
+                    # overloading the system.
-                logger.info(
+                    nvcc_threads = envs.NVCC_THREADS
-                    "Using NVCC_THREADS=%d as the number of nvcc threads.", nvcc_threads
+                    if nvcc_threads is not None:
-                )
+                        nvcc_threads = int(nvcc_threads)
-            else:
+                        logger.info(
-                nvcc_threads = 1
+                            "Using NVCC_THREADS=%d as the number of nvcc threads.",
-            num_jobs = max(1, num_jobs // nvcc_threads)
+                            nvcc_threads,
+                        )
+                    else:
+                        nvcc_threads = 1
+                    num_jobs = max(1, num_jobs // nvcc_threads)
+            except Exception as e:
+                logger.warning("Failed to get NVCC version: %s", e)
        return num_jobs, nvcc_threads
@@ -211,9 +294,9 @@ class cmake_build_ext(build_ext):
            # Default build tool to whatever cmake picks.
            build_tool = []
        # Make sure we use the nvcc from CUDA_HOME
-        if _is_cuda():
+        if _is_cuda() and CUDA_HOME is not None:
            cmake_args += [f"-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc"]
-        elif _is_hip():
+        elif _is_hip() and ROCM_HOME is not None:
            cmake_args += [f"-DROCM_PATH={ROCM_HOME}"]
        other_cmake_args = os.environ.get("CMAKE_ARGS")
@@ -351,6 +434,89 @@ class precompiled_wheel_utils:
            wheels = json.loads(resp.read().decode("utf-8"))
        return wheels, repo_url
+    @staticmethod
+    def is_rocm_system() -> bool:
+        """Detect ROCm without relying on torch (for build environment)."""
+        if os.getenv("ROCM_PATH"):
+            return True
+        if os.path.isdir("/opt/rocm"):
+            return True
+        if which("rocminfo") is not None:
+            return True
+        try:
+            import torch
+            return torch.version.hip is not None
+        except ImportError:
+            return False
+    @staticmethod
+    def find_local_rocm_wheel() -> str | None:
+        """Search for a local vllm wheel in common locations."""
+        import glob
+        for pattern in ["/vllm-workspace/dist/vllm-*.whl", "./dist/vllm-*.whl"]:
+            wheels = glob.glob(pattern)
+            if wheels:
+                return sorted(wheels)[-1]
+        return None
+    @staticmethod
+    def fetch_wheel_from_pypi_index(index_url: str, package: str = "vllm") -> str:
+        """Fetch the latest wheel URL from a PyPI-style simple index."""
+        import platform
+        from html.parser import HTMLParser
+        from urllib.parse import urljoin
+        from urllib.request import urlopen
+        arch = platform.machine()
+        class WheelLinkParser(HTMLParser):
+            def __init__(self):
+                super().__init__()
+                self.wheels = []
+            def handle_starttag(self, tag, attrs):
+                if tag == "a":
+                    for name, value in attrs:
+                        if name == "href" and value.endswith(".whl"):
+                            self.wheels.append(value)
+        simple_url = f"{index_url.rstrip('/')}/{package}/"
+        print(f"Fetching wheel list from {simple_url}")
+        with urlopen(simple_url) as resp:
+            html = resp.read().decode("utf-8")
+        parser = WheelLinkParser()
+        parser.feed(html)
+        for wheel in reversed(parser.wheels):
+            if arch in wheel:
+                if wheel.startswith("http"):
+                    return wheel
+                return urljoin(simple_url, wheel)
+        raise ValueError(f"No compatible wheel found for {arch} at {simple_url}")
+    @staticmethod
+    def determine_wheel_url_rocm() -> tuple[str, str | None]:
+        """Determine the precompiled wheel for ROCm."""
+        # Search for local wheel first
+        local_wheel = precompiled_wheel_utils.find_local_rocm_wheel()
+        if local_wheel is not None:
+            print(f"Found local ROCm wheel: {local_wheel}")
+            return local_wheel, None
+        # Fall back to AMD's PyPI index
+        index_url = os.getenv(
+            "VLLM_ROCM_WHEEL_INDEX", "https://pypi.amd.com/vllm-rocm/simple"
+        )
+        print(f"Fetching ROCm precompiled wheel from {index_url}")
+        wheel_url = precompiled_wheel_utils.fetch_wheel_from_pypi_index(index_url)
+        download_filename = wheel_url.split("/")[-1].split("#")[0]
+        print(f"Using ROCm precompiled wheel: {wheel_url}")
+        return wheel_url, download_filename
    @staticmethod
    def determine_wheel_url() -> tuple[str, str | None]:
        """
@@ -371,6 +537,11 @@ class precompiled_wheel_utils:
            print(f"Using user-specified precompiled wheel location: {wheel_location}")
            return wheel_location, None
        else:
+            # ROCm: use local wheel or AMD's PyPI index
+            # TODO: When we have ROCm nightly wheels, we can update this logic.
+            if precompiled_wheel_utils.is_rocm_system():
+                return precompiled_wheel_utils.determine_wheel_url_rocm()
            import platform
            arch = platform.machine()
@@ -477,6 +648,8 @@ class precompiled_wheel_utils:
                    "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
                    "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
                    "vllm/cumem_allocator.abi3.so",
+                    # ROCm-specific libraries
+                    "vllm/_rocm_C.abi3.so",
                ]
                flash_attn_regex = re.compile(
@@ -614,6 +787,8 @@ def get_rocm_version():
    # Get the Rocm version from the ROCM_HOME/bin/librocm-core.so
    # see https://github.com/ROCm/rocm-core/blob/d11f5c20d500f729c393680a01fa902ebf92094b/rocm_version.cpp#L21
    try:
+        if ROCM_HOME is None:
+            return None
        librocm_core_file = Path(ROCM_HOME) / "lib" / "librocm-core.so"
        if not librocm_core_file.is_file():
            return None
@@ -690,9 +865,9 @@ def get_version_add(sha: Optional[str] = None) -> str:
    new_version_content = f"""
 try:
-    __version__ = "0.13.0"
+    __version__ = "0.14.0"
-    __version_tuple__ = (0, 13, 0)
+    __version_tuple__ = (0, 14, 0)
-    __hcu_version__ = f'0.13.0+{version}' 
+    __hcu_version__ = f'0.14.0+{version}' 
    from vllm.version import __version__, __version_tuple__, __hcu_version__
 except Exception as e:
@@ -863,7 +1038,9 @@ if _is_cuda() or _is_hip():
 if _is_cuda():
    ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
-    if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"):
+    if envs.VLLM_USE_PRECOMPILED or (
+        CUDA_HOME and get_nvcc_cuda_version() >= Version("12.3")
+    ):
        # FA3 requires CUDA 12.3 or later
        ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
        # Optional since this doesn't get built (produce an .so file) when
@@ -882,9 +1059,10 @@ if skip_vllm_build:
            "py.typed",
            "model_executor/layers/fused_moe/configs/*.json",
            "model_executor/layers/quantization/utils/configs/*.json",
-            "perf/*.py",
+            "entrypoints/serve/instrumentator/static/*.js",
-            "attention/backends/configs/*.json",
+            "entrypoints/serve/instrumentator/static/*.css",
            "model_executor/layers/quantization/configs/awq/*.json",
+            "attention/backends/configs/*.json",
            "_C.abi3.so",
            "_moe_C.abi3.so",
        ]
@@ -895,7 +1073,8 @@ else:
            "py.typed",
            "model_executor/layers/fused_moe/configs/*.json",
            "model_executor/layers/quantization/utils/configs/*.json",
-            "perf/*.py",
+            "entrypoints/serve/instrumentator/static/*.js",
+            "entrypoints/serve/instrumentator/static/*.css",
            "attention/backends/configs/*.json",
            "model_executor/layers/quantization/configs/awq/*.json",
        ]
@@ -915,12 +1094,17 @@ if _no_device() or skip_vllm_build:
    ext_modules = []
 if not ext_modules:
-    cmdclass = {}
+    cmdclass = {
+        "build_py": BuildPyAndGenerateGrpc,
+        "develop": DevelopAndGenerateGrpc,
+    }
 else:
    cmdclass = {
        "build_ext": precompiled_build_ext
        if envs.VLLM_USE_PRECOMPILED
-        else cmake_build_ext
+        else cmake_build_ext,
+        "build_py": BuildPyAndGenerateGrpc,
+        "develop": DevelopAndGenerateGrpc,
    }
 setup(
@@ -929,12 +1113,13 @@ setup(
    ext_modules=ext_modules,
    install_requires=get_requirements(),
    extras_require={
-        "bench": ["pandas", "matplotlib", "seaborn", "datasets"],
+        "bench": ["pandas", "matplotlib", "seaborn", "datasets", "scipy"],
        "tensorizer": ["tensorizer==2.10.1"],
        "fastsafetensors": ["fastsafetensors >= 0.1.10"],
        "runai": ["runai-model-streamer[s3,gcs] >= 0.15.3"],
        "audio": [
            "librosa",
+            "scipy",
            "soundfile",
            "mistral_common[audio]",
        ],  # Required for audio processing

--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -72,7 +72,6 @@ def _fix_prompt_embed_outputs(
 @pytest.mark.parametrize("model_executor", ["uni", "mp"])
 @pytest.mark.parametrize("enable_prompt_embeds", [True, False])
 def test_models(
-    monkeypatch: pytest.MonkeyPatch,
    hf_runner,
    model: str,
    backend: str,
@@ -82,82 +81,80 @@ def test_models(
    model_executor: str,
    enable_prompt_embeds: bool,
 ) -> None:
+    # 5042 tokens for gemma2
+    # gemma2 has alternating sliding window size of 4096
+    # we need a prompt with more than 4096 tokens to test the sliding window
+    prompt = (
+        "The following numbers of the sequence "
+        + ", ".join(str(i) for i in range(1024))
+        + " are:"
+    )
+    example_prompts = [prompt]
+    with hf_runner(model) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+        if enable_prompt_embeds:
+            with torch.no_grad():
+                prompt_embeds = hf_model.get_prompt_embeddings(example_prompts)
-    with monkeypatch.context() as m:
+    if not current_platform.is_rocm():
-        m.setenv("VLLM_ATTENTION_BACKEND", backend)
+        with VllmRunner(
+            model,
-        # 5042 tokens for gemma2
+            max_model_len=8192,
-        # gemma2 has alternating sliding window size of 4096
+            enforce_eager=enforce_eager,
-        # we need a prompt with more than 4096 tokens to test the sliding window
+            enable_prompt_embeds=enable_prompt_embeds,
-        prompt = (
+            gpu_memory_utilization=0.7,
-            "The following numbers of the sequence "
+            async_scheduling=async_scheduling,
-            + ", ".join(str(i) for i in range(1024))
+            distributed_executor_backend=model_executor,
-            + " are:"
+            attention_config={"backend": backend},
-        )
+        ) as vllm_model:
-        example_prompts = [prompt]
-        with hf_runner(model) as hf_model:
-            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
            if enable_prompt_embeds:
-                with torch.no_grad():
+                vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens)
-                    prompt_embeds = hf_model.get_prompt_embeddings(example_prompts)
+                vllm_outputs = _fix_prompt_embed_outputs(
+                    vllm_outputs, hf_model, example_prompts
-        if not current_platform.is_rocm():
+                )
-            with VllmRunner(
+            else:
-                model,
+                vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-                max_model_len=8192,
+    else:
-                enforce_eager=enforce_eager,
+        with VllmRunner(
-                enable_prompt_embeds=enable_prompt_embeds,
+            model,
-                gpu_memory_utilization=0.7,
+            max_model_len=8192,
-                async_scheduling=async_scheduling,
+            enforce_eager=enforce_eager,
-                distributed_executor_backend=model_executor,
+            enable_prompt_embeds=enable_prompt_embeds,
-            ) as vllm_model:
+            gpu_memory_utilization=0.7,
-                if enable_prompt_embeds:
+            async_scheduling=async_scheduling,
-                    vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens)
+            distributed_executor_backend=model_executor,
-                    vllm_outputs = _fix_prompt_embed_outputs(
+            attention_config={"backend": backend},
-                        vllm_outputs, hf_model, example_prompts
+            block_size=64,
-                    )
+        ) as vllm_model:
-                else:
+            if enable_prompt_embeds:
-                    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+                vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens)
-        else:
+                vllm_outputs = _fix_prompt_embed_outputs(
-            with VllmRunner(
+                    vllm_outputs, hf_model, example_prompts
-                model,
+                )
-                max_model_len=8192,
+            else:
-                enforce_eager=enforce_eager,
+                vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-                enable_prompt_embeds=enable_prompt_embeds,
-                gpu_memory_utilization=0.7,
-                async_scheduling=async_scheduling,
-                distributed_executor_backend=model_executor,
-                block_size=64,
-            ) as vllm_model:
-                if enable_prompt_embeds:
-                    vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens)
-                    vllm_outputs = _fix_prompt_embed_outputs(
-                        vllm_outputs, hf_model, example_prompts
-                    )
-                else:
-                    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-        check_outputs_equal(
+    check_outputs_equal(
-            outputs_0_lst=hf_outputs,
+        outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
+        outputs_1_lst=vllm_outputs,
-            name_0="hf",
+        name_0="hf",
-            name_1="vllm",
+        name_1="vllm",
-        )
+    )
 # @multi_gpu_test(num_gpus=2)
 # @pytest.mark.parametrize(
 #     "model, distributed_executor_backend, attention_backend, test_suite, extra_env",
 #     [
-#         (os.path.join(models_path_prefix, "facebook/opt-125m"), "ray", "", "L4", {}),
+#         ("facebook/opt-125m", "ray", "", "L4", {}),
-#         (os.path.join(models_path_prefix, "facebook/opt-125m"), "mp", "", "L4", {}),
+#         ("facebook/opt-125m", "mp", "", "L4", {}),
-#         (os.path.join(models_path_prefix, "facebook/opt-125m"), "ray", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
+#         ("facebook/opt-125m", "ray", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
-#         (os.path.join(models_path_prefix, "facebook/opt-125m"), "mp", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
+#         ("facebook/opt-125m", "mp", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
-#         (os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), "ray", "", "L4", {}),
+#         ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4", {}),
-#         (os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), "mp", "", "L4", {}),
+#         ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4", {}),
-#         (os.path.join(models_path_prefix, "facebook/opt-125m"), "ray", "", "A100", {}),
+#         ("facebook/opt-125m", "ray", "", "A100", {}),
-#         (os.path.join(models_path_prefix, "facebook/opt-125m"), "mp", "", "A100", {}),
+#         ("facebook/opt-125m", "mp", "", "A100", {}),
 #     ],
 # )
 # @pytest.mark.parametrize("enable_prompt_embeds", [True, False])
@@ -186,12 +183,6 @@ def test_models(
 #         ):  # noqa
 #             pytest.skip("enable_prompt_embeds does not work with ray compiled dag.")
-#         if attention_backend:
-#             monkeypatch_context.setenv(
-#                 "VLLM_ATTENTION_BACKEND",
-#                 attention_backend,
-#             )
 #         for k, v in extra_env.items():
 #             monkeypatch_context.setenv(k, v)
@@ -203,6 +194,7 @@ def test_models(
 #         # if we run HF first, the cuda initialization will be done and it
 #         # will hurt multiprocessing backend with fork method
 #         # (the default method).
+#         attention_config = {"backend": attention_backend} if attention_backend else None
 #         with vllm_runner(
 #             model,
 #             dtype=dtype,
@@ -210,6 +202,7 @@ def test_models(
 #             distributed_executor_backend=distributed_executor_backend,
 #             enable_prompt_embeds=enable_prompt_embeds,
 #             gpu_memory_utilization=0.7,
+#             attention_config=attention_config,
 #         ) as vllm_model:
 #             if enable_prompt_embeds:
 #                 with hf_runner(model, dtype=dtype) as hf_model:
@@ -225,90 +218,12 @@ def test_models(
 #                 with hf_runner(model, dtype=dtype) as hf_model:
 #                     hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-# @multi_gpu_test(num_gpus=2)
+#     check_outputs_equal(
-# @pytest.mark.parametrize(
+#         outputs_0_lst=hf_outputs,
-#     "model, distributed_executor_backend, attention_backend, "
+#         outputs_1_lst=vllm_outputs,
-#     "test_suite, extra_env", [
+#         name_0="hf",
-#         ("distilbert/distilgpt2", "ray", "", "L4", {}),
+#         name_1="vllm",
-#         ("distilbert/distilgpt2", "mp", "", "L4", {}),
+#     )
-#         ("distilbert/distilgpt2", "ray", "", "L4", {
-#             "VLLM_SLEEP_WHEN_IDLE": "1"
-#         }),
-#         ("distilbert/distilgpt2", "mp", "", "L4", {
-#             "VLLM_SLEEP_WHEN_IDLE": "1"
-#         }),
-#         ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4", {}),
-#         ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4", {}),
-#         ("distilbert/distilgpt2", "ray", "", "A100", {}),
-#         ("distilbert/distilgpt2", "mp", "", "A100", {}),
-#     ])
-# @pytest.mark.parametrize("enable_prompt_embeds", [True, False])
-# def test_models_distributed(
-#     monkeypatch: pytest.MonkeyPatch,
-#     hf_runner,
-#     vllm_runner,
-#     example_prompts,
-#     model: str,
-#     distributed_executor_backend: str,
-#     attention_backend: str,
-#     test_suite: str,
-#     extra_env: dict[str, str],
-#     enable_prompt_embeds: bool,
-# ) -> None:
-#     if test_suite != TARGET_TEST_SUITE:
-#         pytest.skip(f"Skip test for {test_suite}")
-#     with monkeypatch.context() as monkeypatch_context:
-#         if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
-#             if enable_prompt_embeds:
-#                 pytest.skip(
-#                     "enable_prompt_embeds does not work with ray compiled dag."
-#                 )
-#             monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
-#             monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
-#         if attention_backend:
-#             monkeypatch_context.setenv(
-#                 "VLLM_ATTENTION_BACKEND",
-#                 attention_backend,
-#             )
-#         for k, v in extra_env.items():
-#             monkeypatch_context.setenv(k, v)
-#         dtype = "half"
-#         max_tokens = 5
-#         # NOTE: take care of the order. run vLLM first, and then run HF.
-#         # vLLM needs a fresh new process without cuda initialization.
-#         # if we run HF first, the cuda initialization will be done and it
-#         # will hurt multiprocessing backend with fork method
-#         # (the default method).
-#         with vllm_runner(
-#                 model,
-#                 dtype=dtype,
-#                 tensor_parallel_size=2,
-#                 distributed_executor_backend=distributed_executor_backend,
-#                 enable_prompt_embeds=enable_prompt_embeds,
-#                 gpu_memory_utilization=0.7,
-#         ) as vllm_model:
-#             if enable_prompt_embeds:
-#                 with hf_runner(model, dtype=dtype) as hf_model:
-#                     with torch.no_grad():
-#                         prompt_embeds = hf_model.get_prompt_embeddings(
-#                             example_prompts)
-#                     vllm_outputs = vllm_model.generate_greedy(
-#                         prompt_embeds, max_tokens)
-#                     vllm_outputs = _fix_prompt_embed_outputs(
-#                         vllm_outputs, hf_model, example_prompts)
-#                     hf_outputs = hf_model.generate_greedy(
-#                         example_prompts, max_tokens)
-#             else:
-#                 vllm_outputs = vllm_model.generate_greedy(
-#                     example_prompts, max_tokens)
-#                 with hf_runner(model, dtype=dtype) as hf_model:
-#                     hf_outputs = hf_model.generate_greedy(
-#                         example_prompts, max_tokens)
 def test_failed_model_execution(vllm_runner, monkeypatch) -> None:

--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
@@ -248,7 +248,6 @@ def test_deep_sleep_async():
 @requires_fp8
 def test_deep_sleep_fp8_kvcache():
-    GiB_bytes = 1 << 30
    model = "Qwen/Qwen2-0.5B"
    used_bytes_baseline = current_platform.get_current_memory_usage()

--- a/tests/tpu/__init__.py
+++ b/tests/tpu/__init__.py
--- a/tests/benchmarks/test_param_sweep.py
+++ b/tests/benchmarks/test_param_sweep.py
--- a/tests/benchmarks/sweep/test_serve_sla.py
+++ b/tests/benchmarks/sweep/test_serve_sla.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import json
+from collections.abc import Callable
+from pathlib import Path
+from unittest.mock import patch
+from vllm.benchmarks.sweep.param_sweep import ParameterSweepItem
+from vllm.benchmarks.sweep.serve_sla import _get_sla_run_path, solve_sla
+from vllm.benchmarks.sweep.server import ServerProcess
+from vllm.benchmarks.sweep.sla_sweep import (
+    SLACriterionBase,
+    SLALessThan,
+    SLALessThanOrEqualTo,
+    SLASweepItem,
+)
+def _set_return_value(
+    var2metric: Callable[[ParameterSweepItem], list[dict[str, float]]],
+):
+    """
+    Build a patcher that replaces ``serve_sla.run_sla`` with a stand-in.
+    The stand-in computes its result as ``var2metric(bench_comb)``, i.e. the
+    caller decides how the benchmark combination (which carries the SLA
+    variable) maps onto the metric checked by the SLA criterion. The same
+    data is also written as JSON to the path returned by
+    ``_get_sla_run_path(iter_path, run_number=None)``.
+    """
+    # The signature mirrors how the stand-in is invoked; only ``bench_comb``
+    # and ``iter_path`` are actually read.
+    def fake_run_sla(
+        server: ServerProcess | None,
+        bench_cmd: list[str],
+        *,
+        serve_comb: ParameterSweepItem,
+        bench_comb: ParameterSweepItem,
+        iter_path: Path,
+        num_runs: int,
+        dry_run: bool,
+    ):
+        metrics = var2metric(bench_comb)
+        out_file = _get_sla_run_path(iter_path, run_number=None)
+        out_file.parent.mkdir(parents=True, exist_ok=True)
+        serialized = json.dumps(metrics, indent=4)
+        with out_file.open("w") as handle:
+            handle.write(serialized)
+        return metrics
+    return patch(
+        "vllm.benchmarks.sweep.serve_sla.run_sla",
+        side_effect=fake_run_sla,
+    )
+def _var2metric_linear():
+    """Return a metric function whose throughput equals the request rate."""
+    def throughput_for(bench_comb):
+        rate = float(bench_comb["request_rate"])
+        return [{"request_throughput": rate}]
+    return throughput_for
+def _var2metric_concave(elbow_point: float):
+    """Return a piecewise-linear metric function that bends at ``elbow_point``.
+    The slope is 0.5 for request rates below the elbow and 1.5 from the elbow
+    onwards; both pieces pass through ``(elbow_point, elbow_point)``.
+    """
+    # NOTE(review): the slope increases past the elbow, which is the textbook
+    # definition of a convex function; the name presumably describes the
+    # drawn shape instead - confirm before relying on the terminology.
+    def throughput_for(bench_comb):
+        rate = float(bench_comb["request_rate"])
+        slope = 0.5 if rate < elbow_point else 1.5
+        throughput = slope * (rate - elbow_point) + elbow_point
+        return [{"request_throughput": throughput}]
+    return throughput_for
+def _var2metric_convex(elbow_point: float):
+    """Return a piecewise-linear metric function that bends at ``elbow_point``.
+    The slope is 1.5 for request rates below the elbow and 0.5 from the elbow
+    onwards; both pieces pass through ``(elbow_point, elbow_point)``.
+    """
+    # NOTE(review): the slope decreases past the elbow, which is the textbook
+    # definition of a concave function; the name presumably describes the
+    # drawn shape instead - confirm before relying on the terminology.
+    def throughput_for(bench_comb):
+        rate = float(bench_comb["request_rate"])
+        slope = 1.5 if rate < elbow_point else 0.5
+        throughput = slope * (rate - elbow_point) + elbow_point
+        return [{"request_throughput": throughput}]
+    return throughput_for
+def _var2metric_quadratic(y_intercept: float):
+    """Return a metric function with throughput ``y_intercept + 0.1 * rate**2``."""
+    def throughput_for(bench_comb):
+        rate = float(bench_comb["request_rate"])
+        return [{"request_throughput": y_intercept + 0.1 * rate**2}]
+    return throughput_for
+def _var2metric_sqrt(y_intercept: float):
+    """Return a metric function with throughput ``y_intercept + 10 * rate**0.5``."""
+    def throughput_for(bench_comb):
+        rate = float(bench_comb["request_rate"])
+        return [{"request_throughput": y_intercept + 10 * rate**0.5}]
+    return throughput_for
+def _run_solve_sla(
+    var2metric: Callable[[ParameterSweepItem], list[dict[str, float]]],
+    criterion: SLACriterionBase,
+    base_path: Path,
+    min_value: int = 1,
+    max_value: int = 100,
+):
+    """
+    Call ``solve_sla`` with ``run_sla`` replaced by a ``var2metric`` stand-in.
+    ``criterion`` is applied to the ``request_throughput`` metric, the swept
+    SLA variable is ``request_rate``, and ``min_value`` / ``max_value`` are
+    passed as ``sla_min_value`` / ``sla_max_value``. The result of
+    ``solve_sla`` is asserted to be non-``None`` and returned unchanged.
+    """
+    sla_comb = SLASweepItem({"request_throughput": criterion})
+    with _set_return_value(var2metric):
+        outcome = solve_sla(
+            server=None,
+            bench_cmd=[],
+            serve_comb=ParameterSweepItem(),
+            bench_comb=ParameterSweepItem(),
+            sla_comb=sla_comb,
+            base_path=base_path,
+            num_runs=1,
+            dry_run=False,
+            sla_variable="request_rate",
+            sla_min_value=min_value,
+            sla_max_value=max_value,
+        )
+    assert outcome is not None
+    return outcome
+def test_solve_linear_sla_le(tmp_path):
+    """Linear metric with a ``<= 32`` criterion: largest passing value is 32."""
+    _, history = _run_solve_sla(
+        _var2metric_linear(),
+        SLALessThanOrEqualTo(target=32),
+        tmp_path,
+    )
+    assert history.get_max_passing() == 32
+    # A margin <= 0 is treated here as "criterion met" for that value.
+    expected_passed = {
+        100: False,
+        1: True,
+        32: True,
+        33: False,
+    }
+    observed_passed = {val: margin <= 0 for val, margin in history.items()}
+    assert observed_passed == expected_passed
+def test_solve_linear_sla_lt(tmp_path):
+    """Linear metric with a strict ``< 32`` criterion: largest passing value is 31."""
+    _, history = _run_solve_sla(
+        _var2metric_linear(),
+        SLALessThan(target=32),
+        tmp_path,
+    )
+    assert history.get_max_passing() == 31
+    # A margin <= 0 is treated here as "criterion met" for that value.
+    expected_passed = {
+        100: False,
+        1: True,
+        31: True,
+        32: False,
+    }
+    observed_passed = {val: margin <= 0 for val, margin in history.items()}
+    assert observed_passed == expected_passed
+def test_solve_linear_sla_oob(tmp_path):
+    """Target 32 lies below the searched range [64, 100], so no value passes.
+    Both ``get_max_passing`` and ``get_min_failing`` are expected to report
+    the lower bound (64) in that situation.
+    """
+    _, history = _run_solve_sla(
+        _var2metric_linear(),
+        SLALessThanOrEqualTo(target=32),
+        tmp_path,
+        min_value=64,
+    )
+    assert history.get_max_passing() == 64
+    assert history.get_min_failing() == 64
+    # A margin <= 0 is treated here as "criterion met" for that value.
+    expected_passed = {
+        100: False,
+        64: False,
+    }
+    observed_passed = {val: margin <= 0 for val, margin in history.items()}
+    assert observed_passed == expected_passed
+def test_solve_concave_sla_le(tmp_path):
+    """``_var2metric_concave`` (elbow 32), ``<= 24``: largest passing value is 16."""
+    _, history = _run_solve_sla(
+        _var2metric_concave(elbow_point=32),
+        SLALessThanOrEqualTo(target=24),
+        tmp_path,
+    )
+    assert history.get_max_passing() == 16
+    # A margin <= 0 is treated here as "criterion met" for that value.
+    expected_passed = {
+        100: False,
+        1: True,
+        7: True,
+        13: True,
+        15: True,
+        16: True,
+        17: False,
+    }
+    observed_passed = {val: margin <= 0 for val, margin in history.items()}
+    assert observed_passed == expected_passed
+def test_solve_convex_sla_le(tmp_path):
+    """``_var2metric_convex`` (elbow 32), ``<= 24``: largest passing value is 26."""
+    _, history = _run_solve_sla(
+        _var2metric_convex(elbow_point=32),
+        SLALessThanOrEqualTo(target=24),
+        tmp_path,
+    )
+    assert history.get_max_passing() == 26
+    # A margin <= 0 is treated here as "criterion met" for that value.
+    expected_passed = {
+        100: False,
+        1: True,
+        48: False,
+        30: False,
+        24: True,
+        26: True,
+        27: False,
+    }
+    observed_passed = {val: margin <= 0 for val, margin in history.items()}
+    assert observed_passed == expected_passed
+def test_solve_quadratic_sla_le(tmp_path):
+    """Quadratic metric (intercept 10), ``<= 50``: largest passing value is 20."""
+    _, history = _run_solve_sla(
+        _var2metric_quadratic(y_intercept=10),
+        SLALessThanOrEqualTo(target=50),
+        tmp_path,
+    )
+    assert history.get_max_passing() == 20
+    # A margin <= 0 is treated here as "criterion met" for that value.
+    expected_passed = {
+        100: False,
+        1: True,
+        4: True,
+        20: True,
+        21: False,
+    }
+    observed_passed = {val: margin <= 0 for val, margin in history.items()}
+    assert observed_passed == expected_passed
+def test_solve_sqrt_sla_le(tmp_path):
+    """Square-root metric (intercept 10), ``<= 100``: largest passing value is 81."""
+    _, history = _run_solve_sla(
+        _var2metric_sqrt(y_intercept=10),
+        SLALessThanOrEqualTo(target=100),
+        tmp_path,
+    )
+    assert history.get_max_passing() == 81
+    # A margin <= 0 is treated here as "criterion met" for that value.
+    expected_passed = {
+        100: False,
+        1: True,
+        89: False,
+        81: True,
+        82: False,
+    }
+    observed_passed = {val: margin <= 0 for val, margin in history.items()}
+    assert observed_passed == expected_passed
+def test_solve_reuse_history(tmp_path):
+    """Two consecutive solves sharing ``tmp_path`` share their history.
+    The second run (range [21, 40], target 30) is expected to report the
+    values recorded by the first run (range [1, 20], target 10) alongside
+    its own.
+    """
+    # First run: range [1, 20] with target 10.
+    _, history = _run_solve_sla(
+        _var2metric_linear(),
+        SLALessThanOrEqualTo(target=10),
+        tmp_path,
+        min_value=1,
+        max_value=20,
+    )
+    assert history.get_max_passing() == 10
+    first_expected = {
+        20: False,
+        1: True,
+        10: True,
+        11: False,
+    }
+    first_observed = {val: margin <= 0 for val, margin in history.items()}
+    assert first_observed == first_expected
+    # Second run: same base path, range [21, 40] with target 30.
+    _, history = _run_solve_sla(
+        _var2metric_linear(),
+        SLALessThanOrEqualTo(target=30),
+        tmp_path,
+        min_value=21,
+        max_value=40,
+    )
+    assert history.get_max_passing() == 30
+    second_expected = {
+        # Items from the past run
+        # (the margins are different because the target changed)
+        20: True,
+        1: True,
+        10: True,
+        11: True,
+        # Items from this run
+        40: False,
+        30: True,
+        31: False,
+    }
+    second_observed = {val: margin <= 0 for val, margin in history.items()}
+    assert second_observed == second_expected
--- a/tests/benchmarks/test_bench_startup.py
+++ b/tests/benchmarks/test_bench_startup.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import subprocess
+import pytest
+@pytest.mark.benchmark
+def test_bench_startup():
+    """Run ``vllm bench startup`` as a subprocess and require exit code 0."""
+    completed = subprocess.run(
+        ["vllm", "bench", "startup"],
+        capture_output=True,
+        text=True,
+    )
+    # Echo both captured streams before asserting on the exit code.
+    print(completed.stdout)
+    print(completed.stderr)
+    assert completed.returncode == 0, f"Benchmark failed: {completed.stderr}"
--- a/tests/benchmarks/test_serve_cli.py
+++ b/tests/benchmarks/test_serve_cli.py
@@ -20,21 +20,18 @@ def server():
 @pytest.mark.benchmark
 def test_bench_serve(server):
+    # Test default model detection and input/output len
    command = [
        "vllm",
        "bench",
        "serve",
-        "--model",
-        MODEL_NAME,
        "--host",
        server.host,
        "--port",
        str(server.port),
-        "--dataset-name",
+        "--input-len",
-        "random",
-        "--random-input-len",
        "32",
-        "--random-output-len",
+        "--output-len",
        "4",
        "--num-prompts",
        "5",

--- a/tests/compile/distributed/test_async_tp.py
+++ b/tests/compile/distributed/test_async_tp.py
@@ -15,6 +15,7 @@ from vllm.config import (
    ModelConfig,
    PassConfig,
    VllmConfig,
+    set_current_vllm_config,
 )
 from vllm.distributed import (
    tensor_model_parallel_all_gather,
@@ -26,6 +27,7 @@ from vllm.distributed.parallel_state import (
 )
 from vllm.platforms import current_platform
 from vllm.utils.system_utils import update_environment_variables
+from vllm.utils.torch_utils import set_random_seed
 from ...models.registry import HF_EXAMPLE_MODELS
 from ...utils import (
@@ -301,7 +303,7 @@ def async_tp_pass_on_test_model(
    dtype: torch.dtype,
    dynamic: bool,
 ):
-    current_platform.seed_everything(0)
+    set_random_seed(0)
    device = torch.device(f"cuda:{local_rank}")
    torch.cuda.set_device(device)
@@ -339,38 +341,42 @@ def async_tp_pass_on_test_model(
    )
    async_tp_pass = AsyncTPPass(vllm_config)
-    backend = TestBackend(async_tp_pass)
-    assert (
+    # Set the global vllm_config for TestBackend which calls
-        async_tp_pass.compilation_config.splitting_ops
+    # get_current_vllm_config()
-        == vllm_config.compilation_config.splitting_ops
+    with set_current_vllm_config(vllm_config):
-    )
+        backend = TestBackend(async_tp_pass)
-    assert (
-        async_tp_pass.compilation_config.use_inductor_graph_partition
-        == vllm_config.compilation_config.use_inductor_graph_partition
-    )
-    model = test_model_cls(hidden_size, dtype)  # Pass dtype to model constructor
+        assert (
+            async_tp_pass.compilation_config.splitting_ops
+            == vllm_config.compilation_config.splitting_ops
+        )
+        assert (
+            async_tp_pass.compilation_config.use_inductor_graph_partition
+            == vllm_config.compilation_config.use_inductor_graph_partition
+        )
-    hidden_states = torch.randn(
+        model = test_model_cls(hidden_size, dtype)  # Pass dtype to model constructor
-        (batch_size * seq_len, hidden_size), dtype=dtype, requires_grad=False
-    )
+        hidden_states = torch.randn(
+            (batch_size * seq_len, hidden_size), dtype=dtype, requires_grad=False
+        )
-    if dynamic:
+        if dynamic:
-        torch._dynamo.mark_dynamic(hidden_states, 0)
+            torch._dynamo.mark_dynamic(hidden_states, 0)
-    compiled_model = torch.compile(model, backend=backend)
+        compiled_model = torch.compile(model, backend=backend)
-    compiled_model(hidden_states)
+        compiled_model(hidden_states)
-    assert async_tp_pass.matched_count == 1
+        assert async_tp_pass.matched_count == 1
-    # In pre-nodes, all gather or reduce scatter should exist,
+        # In pre-nodes, all gather or reduce scatter should exist,
-    # fused_matmul_reduce_scatter or fused_all_gather_matmul should not
+        # fused_matmul_reduce_scatter or fused_all_gather_matmul should not
-    backend.check_before_ops(model.ops_in_model_before(), fully_replaced=False)
+        backend.check_before_ops(model.ops_in_model_before(), fully_replaced=False)
-    # In post-nodes, fused_matmul_reduce_scatter or \
+        # In post-nodes, fused_matmul_reduce_scatter or \
-    # fused_all_gather_matmul should exist
+        # fused_all_gather_matmul should exist
-    backend.check_after_ops(model.ops_in_model_after())
+        backend.check_after_ops(model.ops_in_model_after())
 @create_new_process_for_each_test()

--- a/tests/compile/distributed/test_fusion_all_reduce.py
+++ b/tests/compile/distributed/test_fusion_all_reduce.py
@@ -32,6 +32,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
 )
 from vllm.platforms import current_platform
 from vllm.utils.system_utils import update_environment_variables
+from vllm.utils.torch_utils import set_random_seed
 from ...utils import has_module_attribute, multi_gpu_test
 from ..backend import TestBackend
@@ -263,7 +264,7 @@ def all_reduce_fusion_pass_on_test_model(
    enable_rms_norm_custom_op,
    enable_quant_fp8_custom_op,
 ):
-    current_platform.seed_everything(0)
+    set_random_seed(0)
    device = torch.device(f"cuda:{local_rank}")
    torch.cuda.set_device(device)

--- a/tests/compile/distributed/test_fusions_e2e.py
+++ b/tests/compile/distributed/test_fusions_e2e.py
@@ -208,7 +208,8 @@ def test_attn_quant(
    # To capture subprocess logs, we need to know whether spawn or fork is used.
    # Force spawn as it is more general.
    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
+    model_kwargs["attention_config"] = {"backend": backend.name}
    compilation_config = CompilationConfig(
        # Testing properties
@@ -297,7 +298,8 @@ def test_tp2_attn_quant_allreduce_rmsnorm(
    # To capture subprocess logs, we need to know whether spawn or fork is used.
    # Force spawn as it is more general.
    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
+    model_kwargs["attention_config"] = {"backend": backend.name}
    compilation_config = CompilationConfig(
        # Testing properties
@@ -409,7 +411,8 @@ def test_tp2_attn_quant_async_tp(
    # To capture subprocess logs, we need to know whether spawn or fork is used.
    # Force spawn as it is more general.
    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
+    model_kwargs["attention_config"] = {"backend": backend.name}
    compilation_config = CompilationConfig(
        # Testing properties
@@ -554,7 +557,8 @@ def test_rms_group_quant(
    # To capture subprocess logs, we need to know whether spawn or fork is used.
    # Force spawn as it is more general.
    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
+    model_kwargs["attention_config"] = {"backend": backend.name}
    compilation_config = CompilationConfig(
        # Testing properties
@@ -564,7 +568,9 @@ def test_rms_group_quant(
        splitting_ops=splitting_ops,
        # Common
        mode=CompilationMode.VLLM_COMPILE,
-        pass_config=PassConfig(eliminate_noops=True, fuse_norm_quant=True),
+        pass_config=PassConfig(
+            fuse_norm_quant=True, fuse_act_quant=True, eliminate_noops=True
+        ),
        # Inductor caches custom passes by default as well via uuid
        inductor_compile_config={"force_disable_caches": True},
    )

--- a/tests/compile/distributed/test_sequence_parallelism.py
+++ b/tests/compile/distributed/test_sequence_parallelism.py
@@ -31,6 +31,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp
 from vllm.platforms import current_platform
 from vllm.utils.system_utils import update_environment_variables
+from vllm.utils.torch_utils import set_random_seed
 from ...utils import multi_gpu_test
 from ..backend import TestBackend
@@ -232,7 +233,7 @@ def sequence_parallelism_pass_on_test_model(
    fuse_norm_quant: bool,
    dynamic: bool,
 ):
-    current_platform.seed_everything(0)
+    set_random_seed(0)
    device = torch.device(f"cuda:{local_rank}")
    torch.cuda.set_device(device)

--- a/tests/compile/fullgraph/test_basic_correctness.py
+++ b/tests/compile/fullgraph/test_basic_correctness.py
@@ -6,10 +6,13 @@ import pytest
 import os
 from vllm.config import CompilationMode
+from vllm.platforms import current_platform
 from vllm.utils.torch_utils import cuda_device_count_stateless
 from ...utils import compare_all_settings, models_path_prefix
+ATTN_BACKEND = "FLASH_ATTN" if not current_platform.is_rocm() else "ROCM_ATTN"
 @dataclasses.dataclass
 class TestSetting:
@@ -32,7 +35,7 @@ class TestSetting:
            model_args=["--max-model-len", "2048"],
            pp_size=2,
            tp_size=2,
-            attn_backend="FLASH_ATTN",
+            attn_backend=ATTN_BACKEND,
            method="generate",
        ),
        # llama model with quantization
@@ -41,7 +44,7 @@ class TestSetting:
            model_args=["--quantization", "gptq", "--max-model-len", "2048"],
            pp_size=1,
            tp_size=1,
-            attn_backend="FLASH_ATTN",
+            attn_backend=ATTN_BACKEND,
            method="generate",
        ),
        # MoE model
@@ -50,7 +53,7 @@ class TestSetting:
            model_args=["--max-model-len", "2048"],
            pp_size=1,
            tp_size=2,
-            attn_backend="FLASH_ATTN",
+            attn_backend=ATTN_BACKEND,
            method="generate",
        ),
        # embedding model
@@ -66,18 +69,23 @@ class TestSetting:
            ],
            pp_size=1,
            tp_size=1,
-            attn_backend="FLASH_ATTN",
+            attn_backend=ATTN_BACKEND,
            method="encode",
        ),
-        # # TODO
+        pytest.param(
-        # TestSetting(
+            TestSetting(
-        #     model="BAAI/bge-base-en-v1.5",
+                model="BAAI/bge-base-en-v1.5",
-        #     model_args=["--runner", "pooling"],
+                model_args=["--runner", "pooling"],
-        #     pp_size=1,
+                pp_size=1,
-        #     tp_size=1,
+                tp_size=1,
-        #     attn_backend="FLASH_ATTN",
+                attn_backend="FLASH_ATTN",
-        #     method="encode",
+                method="encode",
-        # ),
+            ),
+            marks=pytest.mark.skipif(
+                current_platform.is_rocm(),
+                reason="Encoder self-attention is not implemented for ROCm",
+            ),
+        ),
        # vision language model
        # See https://github.com/vllm-project/vllm/issues/26716.
        # TestSetting(
@@ -91,7 +99,6 @@ class TestSetting:
    ],
 )
 def test_compile_correctness(
-    monkeypatch: pytest.MonkeyPatch,
    test_setting: TestSetting,
 ):
    # this test is run under multiple suits, with different GPUs.
@@ -109,49 +116,48 @@ def test_compile_correctness(
            f"{cuda_device_count_stateless()}"
        )
-    with monkeypatch.context() as m:
+    final_args = [
-        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
+        *model_args,
-        final_args = [
+        "-pp",
-            *model_args,
+        str(pp_size),
-            "-pp",
+        "-tp",
-            str(pp_size),
+        str(tp_size),
-            "-tp",
+        "-cc.cudagraph_mode=none",
-            str(tp_size),
+        f"--attention-backend={attn_backend}",
-            "-cc.cudagraph_mode=none",
+    ]
-        ]
-        all_args: list[list[str]] = []
-        all_envs: list[dict[str, str] | None] = []
-        for comp_mode in [
+    all_args: list[list[str]] = []
-            CompilationMode.STOCK_TORCH_COMPILE,
+    all_envs: list[dict[str, str] | None] = []
-            CompilationMode.DYNAMO_TRACE_ONCE,
-            CompilationMode.VLLM_COMPILE,
-        ]:
-            for mode in [CompilationMode.NONE, comp_mode]:
-                all_args.append(
-                    final_args + [f"-cc.mode={mode.name}", "-cc.backend=inductor"]
-                )
-            # inductor will change the output, so we only compare if the output
+    for comp_mode in [
-            # is close, not exactly the same.
+        CompilationMode.STOCK_TORCH_COMPILE,
-            compare_all_settings(
+        CompilationMode.DYNAMO_TRACE_ONCE,
-                model,
+        CompilationMode.VLLM_COMPILE,
-                all_args,
+    ]:
-                all_envs,
+        for mode in [CompilationMode.NONE, comp_mode]:
-                method=method if method != "generate" else "generate_close",
+            all_args.append(
+                final_args + [f"-cc.mode={mode.name}", "-cc.backend=inductor"]
            )
-            all_envs.clear()
-            all_args.clear()
-        for mode in [
+        # inductor will change the output, so we only compare if the output
-            CompilationMode.NONE,
+        # is close, not exactly the same.
-            CompilationMode.STOCK_TORCH_COMPILE,
+        compare_all_settings(
-            CompilationMode.DYNAMO_TRACE_ONCE,
+            model,
-            CompilationMode.VLLM_COMPILE,
+            all_args,
-        ]:
+            all_envs,
-            all_args.append(final_args + [f"-cc.mode={mode.name}", "-cc.backend=eager"])
+            method=method if method != "generate" else "generate_close",
-            all_envs.append({})
+        )
-            all_envs.append({})
+        all_envs.clear()
+        all_args.clear()
+    for mode in [
+        CompilationMode.NONE,
+        CompilationMode.STOCK_TORCH_COMPILE,
+        CompilationMode.DYNAMO_TRACE_ONCE,
+        CompilationMode.VLLM_COMPILE,
+    ]:
+        all_args.append(final_args + [f"-cc.mode={mode.name}", "-cc.backend=eager"])
+        all_envs.append({})
+        all_envs.append({})
-        compare_all_settings(model, all_args * 3, all_envs, method=method)
+    compare_all_settings(model, all_args * 3, all_envs, method=method)
\ No newline at end of file
--- a/tests/compile/fullgraph/test_full_cudagraph.py
+++ b/tests/compile/fullgraph/test_full_cudagraph.py
@@ -12,6 +12,7 @@ from vllm import LLM, SamplingParams
 from vllm.config import CompilationConfig
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import is_torch_equal_or_newer
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 @contextlib.contextmanager
@@ -70,11 +71,14 @@ def llm_pair(request):
        elif backend_config.specific_gpu_arch == (10, 0):
            pytest.skip("Only Blackwell GPUs support Cutlass MLA")
+    # FlashInfer is not supported on ROCm
+    if backend_config == AttentionBackendEnum.FLASHINFER and current_platform.is_rocm():
+        pytest.skip("FlashInfer is not supported on ROCm")
    env_vars = {
        # Force native sampler to avoid potential nondeterminism in FlashInfer
        # when per-request generators are not used in V1.
        "VLLM_USE_FLASHINFER_SAMPLER": "0",
-        **backend_config.env_vars,
    }
    with temporary_environ(env_vars):
        full = LLM(
@@ -170,16 +174,10 @@ class TestFullCUDAGraph:
 @pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
 def test_full_cudagraph_with_invalid_backend():
-    with (
+    # Flex_Attention is not supported with full cuda graph
-        temporary_environ(
+    with pytest.raises(RuntimeError):
-            {
-                "VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION",
-                # Flex_Attention is not supported with full cuda graph
-            }
-        ),
-        pytest.raises(RuntimeError),
-    ):
        LLM(
            model="Qwen/Qwen2-1.5B-Instruct",
            compilation_config=CompilationConfig(cudagraph_mode="FULL"),
+            attention_config={"backend": "FLEX_ATTENTION"},
        )
--- a/tests/compile/fullgraph/test_full_graph.py
+++ b/tests/compile/fullgraph/test_full_graph.py
@@ -10,10 +10,10 @@ import torch
 from tests.quantization.utils import is_quant_method_supported
 from vllm import LLM, SamplingParams
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import is_torch_equal_or_newer
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 from ...utils import create_new_process_for_each_test
@@ -62,7 +62,10 @@ def models_list(*, all: bool = True, keywords: list[str] | None = None):
            TEST_MODELS.append(
                (
                    "alexm-nm/tinyllama-24-marlin24-4bit-g128",
-                    {"quantization": "gptq_marlin_24"},
+                    {
+                        "quantization": "gptq_marlin_24",
+                        "allow_deprecated_quantization": True,
+                    },
                )
            )
@@ -156,6 +159,20 @@ def test_full_graph(
        )
        for model_info in models_list(all=False)
        if is_torch_equal_or_newer("2.9.0.dev")
+    ]
+    + [
+        # Test get_raw_stream patch with compile_sizes
+        # This tests that TorchInductor autotune works correctly with get_raw_stream
+        # patch in torch 2.9 and without patch in torch 2.10+
+        (
+            CompilationConfig(
+                mode=CompilationMode.VLLM_COMPILE,
+                compile_sizes=[1, 2],  # Triggers autotune which uses get_raw_stream
+                cudagraph_mode=CUDAGraphMode.NONE,
+            ),
+            "facebook/opt-125m",
+            {},
+        ),
    ],
 )
 # only test some of the models
@@ -197,20 +214,19 @@ def test_custom_compile_config(
    ],
 )
 def test_fp8_kv_scale_compile(
-    monkeypatch: pytest.MonkeyPatch,
    compilation_mode: int,
    model: str,
    backend: AttentionBackendEnum | None,
 ):
-    if backend:
-        monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
    model_kwargs = {
        "quantization": "fp8",
        "kv_cache_dtype": "fp8_e4m3",
        "calculate_kv_scales": True,
        "max_model_len": 512,
    }
+    if backend:
+        model_kwargs["attention_config"] = {"backend": backend.name}
    run_model(compilation_mode, model, **model_kwargs)

--- a/tests/compile/fullgraph/test_multimodal_compile.py
+++ b/tests/compile/fullgraph/test_multimodal_compile.py
@@ -71,3 +71,40 @@ def test_qwen2_5_vl_no_vit_compilation(vllm_runner, monkeypatch):
        ) as _,
    ):
        pass
+# Forked execution is needed to work around
+# https://github.com/vllm-project/vllm/issues/21073
+# This test also requires CUDA and 8 GPUs.
+@pytest.mark.forked
+@pytest.mark.skip(reason="Skipping due to CI resource constraints")
+def test_mllama4_vit_compilation(vllm_runner, monkeypatch):
+    """Smoke-test compilation of the Mllama4 vision submodules.
+    The intent is to verify that the 2 vision submodules (Llama4VisionEncoder,
+    Llama4VisionPixelShuffleMLP) are properly tagged for compilation, which
+    would show up as num_models_seen increasing to 3.
+    However, because this test uses TP=8, compilation_counter does not work
+    properly, so for now the test expects num_models_seen=0 and only checks
+    that the run succeeds.
+    """
+    # Disable multiprocessing so that the counter is in the same process
+    # NOTE(review): this setenv runs before monkeypatch.context() is entered
+    # below, so it does not appear to be scoped by that context - confirm
+    # this is intended.
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+    with (
+        monkeypatch.context(),
+        # TODO: Since we require TP=8, this messes with the compilation
+        # counter. We should fix this in the future, but leave for now
+        # to make sure that compilation runs (no crash) with llama vision encoder
+        compilation_counter.expect(num_models_seen=0),
+        vllm_runner(
+            "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+            max_model_len=512,
+            gpu_memory_utilization=0.8,
+            tensor_parallel_size=8,
+            compilation_config={
+                "mode": CompilationMode.VLLM_COMPILE,
+                "compile_mm_encoder": True,
+            },
+        ),
+    ):
+        # Constructing the runner is the whole test; nothing else is executed.
+        pass