Merge tag 'v0.8.2' into v0.8.2-dev

469e903b · zhuwenwen · 389ebcf7 · 25f560a6 · 469e903b · 469e903b
Commit 469e903b authored Mar 28, 2025 by zhuwenwen
20 changed files
--- a/setup.py
+++ b/setup.py
@@ -2,6 +2,7 @@

 import ctypes
 import importlib.util
+import json
 import logging
 import os
 import re
@@ -9,13 +10,11 @@ import subprocess
 import sys
 from pathlib import Path
 from shutil import which
-from typing import Dict, List

 import torch
 from packaging.version import Version, parse
 from setuptools import Extension, setup
 from setuptools.command.build_ext import build_ext
-from setuptools_scm import get_version
 from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME

 from typing import Optional, Union
@@ -35,7 +34,7 @@ def load_module_from_path(module_name, path):
    return module


-ROOT_DIR = os.path.dirname(__file__)
+ROOT_DIR = Path(__file__).parent
 logger = logging.getLogger(__name__)

 # cannot import envs directly because it depends on vllm,
@@ -62,7 +61,7 @@ elif (sys.platform.startswith("linux") and torch.version.cuda is None
    # fallback to cpu
    VLLM_TARGET_DEVICE = "cpu"

-MAIN_CUDA_VERSION = "12.1"
+MAIN_CUDA_VERSION = "12.4"


 def is_sccache_available() -> bool:
@@ -77,6 +76,18 @@ def is_ninja_available() -> bool:
    return which("ninja") is not None


+def is_url_available(url: str) -> bool:
+    from urllib.request import urlopen
+
+    status = None
+    try:
+        with urlopen(url) as f:
+            status = f.status
+    except Exception:
+        return False
+    return status == 200
+
+
 class CMakeExtension(Extension):

    def __init__(self, name: str, cmake_lists_dir: str = '.', **kwa) -> None:
@@ -86,7 +97,7 @@ class CMakeExtension(Extension):

 class cmake_build_ext(build_ext):
    # A dict of extension directories that have been configured.
-    did_config: Dict[str, bool] = {}
+    did_config: dict[str, bool] = {}

    #
    # Determine number of compilation jobs and optionally nvcc compile threads.
@@ -278,16 +289,43 @@ class repackage_wheel(build_ext):
    """Extracts libraries and other files from an existing wheel."""

    def get_base_commit_in_main_branch(self) -> str:
-        import subprocess
+        # Force to use the nightly wheel. This is mainly used for CI testing.
+        if envs.VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL:
+            return "nightly"

        try:
+            # Get the latest commit hash of the upstream main branch.
+            resp_json = subprocess.check_output([
+                "curl", "-s",
+                "https://api.github.com/repos/vllm-project/vllm/commits/main"
+            ]).decode("utf-8")
+            upstream_main_commit = json.loads(resp_json)["sha"]
+
+            # Check if the upstream_main_commit exists in the local repo
+            try:
+                subprocess.check_output(
+                    ["git", "cat-file", "-e", f"{upstream_main_commit}"])
+            except subprocess.CalledProcessError:
+                # If not present, fetch it from the remote repository.
+                # Note that this does not update any local branches,
+                # but ensures that this commit ref and its history are
+                # available in our local repo.
+                subprocess.check_call([
+                    "git", "fetch", "https://github.com/vllm-project/vllm",
+                    "main"
+                ])
+
+            # Then get the commit hash of the current branch that is the same as
+            # the upstream main commit.
            current_branch = subprocess.check_output(
                ["git", "branch", "--show-current"]).decode("utf-8").strip()

-            base_commit = subprocess.check_output(
-                ["git", "merge-base", "main",
-                 current_branch]).decode("utf-8").strip()
+            base_commit = subprocess.check_output([
+                "git", "merge-base", f"{upstream_main_commit}", current_branch
+            ]).decode("utf-8").strip()
            return base_commit
+        except ValueError as err:
+            raise ValueError(err) from None
        except Exception as err:
            logger.warning(
                "Failed to get the base commit in the main branch. "
@@ -303,6 +341,10 @@ class repackage_wheel(build_ext):
        if wheel_location is None:
            base_commit = self.get_base_commit_in_main_branch()
            wheel_location = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+            # Fallback to nightly wheel if latest commit wheel is unavailable,
+            # in this rare case, the nightly release CI hasn't finished on main.
+            if not is_url_available(wheel_location):
+                wheel_location = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"

        import zipfile

@@ -336,6 +378,7 @@ class repackage_wheel(build_ext):
            files_to_copy = [
                "vllm/_C.abi3.so",
                "vllm/_moe_C.abi3.so",
+                "vllm/_flashmla_C.abi3.so",
                "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
                "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
                "vllm/vllm_flash_attn/flash_attn_interface.py",
@@ -413,10 +456,6 @@ def _is_cpu() -> bool:
    return VLLM_TARGET_DEVICE == "cpu"


-def _is_openvino() -> bool:
-    return VLLM_TARGET_DEVICE == "openvino"
-
-
 def _is_xpu() -> bool:
    return VLLM_TARGET_DEVICE == "xpu"

@@ -486,10 +525,6 @@ def get_nvcc_cuda_version() -> Version:
    return nvcc_cuda_version


-def get_path(*filepath) -> str:
-    return os.path.join(ROOT_DIR, *filepath)
-
-
 def get_sha(root: Union[str, Path]) -> str:
    try:
        return subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=root).decode('ascii').strip()
@@ -525,9 +560,9 @@ def get_version_add(sha: Optional[str] = None) -> str:
    
    new_version_content = f"""
 try:
-    __version__ = "0.7.3"
-    __version_tuple__ = (0, 7, 3)
-    __hcu_version__ = f'0.7.3+{version}' 
+    __version__ = "0.8.2"
+    __version_tuple__ = (0, 8, 2)
+    __hcu_version__ = f'0.8.2+{version}' 
    
    from vllm.version import __version__, __version_tuple__, __hcu_version__
 except Exception as e:
@@ -571,6 +606,7 @@ def get_gaudi_sw_version():

 def get_vllm_version() -> str:
    if not _is_hip():
+        from setuptools_scm import get_version
        version = get_version(write_to="vllm/_version.py")
        sep = "+" if "+" not in version else "."  # dev versions might contain +

@@ -605,8 +641,6 @@ def get_vllm_version() -> str:
        if gaudi_sw_version != MAIN_CUDA_VERSION:
            gaudi_sw_version = gaudi_sw_version.replace(".", "")[:3]
            version += f"{sep}gaudi{gaudi_sw_version}"
-    elif _is_openvino():
-        version += f"{sep}openvino"
    elif _is_tpu():
        version += f"{sep}tpu"
    elif _is_cpu():
@@ -620,11 +654,12 @@ def get_vllm_version() -> str:
    return version


-def get_requirements() -> List[str]:
+def get_requirements() -> list[str]:
    """Get Python package dependencies from requirements.txt."""
+    requirements_dir = ROOT_DIR / "requirements"

-    def _read_requirements(filename: str) -> List[str]:
-        with open(get_path(filename)) as f:
+    def _read_requirements(filename: str) -> list[str]:
+        with open(requirements_dir / filename) as f:
            requirements = f.read().strip().split("\n")
        resolved_requirements = []
        for line in requirements:
@@ -637,37 +672,34 @@ def get_requirements() -> List[str]:
        return resolved_requirements

    if _no_device():
-        requirements = _read_requirements("requirements-common.txt")
+        requirements = _read_requirements("common.txt")
    elif _is_cuda():
-        requirements = _read_requirements("requirements-cuda.txt")
+        requirements = _read_requirements("cuda.txt")
        cuda_major, cuda_minor = torch.version.cuda.split(".")
        modified_requirements = []
        for req in requirements:
-            if ("vllm-flash-attn" in req
-                    and not (cuda_major == "12" and cuda_minor == "1")):
-                # vllm-flash-attn is built only for CUDA 12.1.
+            if ("vllm-flash-attn" in req and cuda_major != "12"):
+                # vllm-flash-attn is built only for CUDA 12.x.
                # Skip for other versions.
                continue
            modified_requirements.append(req)
        requirements = modified_requirements
    elif _is_hip():
-        requirements = _read_requirements("requirements-rocm.txt")
+        requirements = _read_requirements("rocm.txt")
    elif _is_neuron():
-        requirements = _read_requirements("requirements-neuron.txt")
+        requirements = _read_requirements("neuron.txt")
    elif _is_hpu():
-        requirements = _read_requirements("requirements-hpu.txt")
-    elif _is_openvino():
-        requirements = _read_requirements("requirements-openvino.txt")
+        requirements = _read_requirements("hpu.txt")
    elif _is_tpu():
-        requirements = _read_requirements("requirements-tpu.txt")
+        requirements = _read_requirements("tpu.txt")
    elif _is_cpu():
-        requirements = _read_requirements("requirements-cpu.txt")
+        requirements = _read_requirements("cpu.txt")
    elif _is_xpu():
-        requirements = _read_requirements("requirements-xpu.txt")
+        requirements = _read_requirements("xpu.txt")
    else:
        raise ValueError(
            "Unsupported platform, please use CUDA, ROCm, Neuron, HPU, "
-            "OpenVINO, or CPU.")
+            "or CPU.")
    return requirements


@@ -685,6 +717,11 @@ if _is_cuda():
        # FA3 requires CUDA 12.0 or later
        ext_modules.append(
            CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
+    if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"):
+        # Optional since this doesn't get built (produce an .so file) when
+        # not targeting a hopper system
+        ext_modules.append(
+            CMakeExtension(name="vllm._flashmla_C", optional=True))
    ext_modules.append(CMakeExtension(name="vllm.cumem_allocator"))

 if _build_custom_ops():
@@ -719,6 +756,7 @@ setup(
    install_requires=get_requirements(),
    extras_require={
        "tensorizer": ["tensorizer>=2.9.0"],
+        "fastsafetensors": ["fastsafetensors >= 0.1.10"],
        "runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"],
        "audio": ["librosa", "soundfile"],  # Required for audio processing
        "video": ["decord"]  # Required for video processing

--- a/tests/async_engine/api_server_async_engine.py
+++ b/tests/async_engine/api_server_async_engine.py
 # SPDX-License-Identifier: Apache-2.0
 """vllm.entrypoints.api_server with some extra logging for testing."""
-from typing import Any, Dict, Iterable
+from collections.abc import Iterable
+from typing import Any

 import uvicorn
 from fastapi.responses import JSONResponse, Response
@@ -24,7 +25,7 @@ class AsyncLLMEngineWithStats(AsyncLLMEngine):
        self._num_aborts += len(ids)
        await super()._engine_abort(ids)

-    def testing_stats(self) -> Dict[str, Any]:
+    def testing_stats(self) -> dict[str, Any]:
        return {"num_aborted_requests": self._num_aborts}



--- a/tests/async_engine/conftest.py
+++ b/tests/async_engine/conftest.py
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+
+
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    Since this module is V0 only, set VLLM_USE_V1=0 for
+    all tests in the module.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
--- a/tests/async_engine/test_api_server.py
+++ b/tests/async_engine/test_api_server.py
 # SPDX-License-Identifier: Apache-2.0

+import os
 import subprocess
 import sys
 import time
@@ -46,7 +47,10 @@ def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
        distributed_executor_backend,
    ]

-    uvicorn_process = subprocess.Popen(commands)
+    # API Server Test Requires V0.
+    my_env = os.environ.copy()
+    my_env["VLLM_USE_V1"] = "0"
+    uvicorn_process = subprocess.Popen(commands, env=my_env)
    yield
    uvicorn_process.terminate()


--- a/tests/async_engine/test_async_llm_engine.py
+++ b/tests/async_engine/test_async_llm_engine.py
@@ -6,7 +6,7 @@ import uuid
 from asyncio import CancelledError
 from copy import copy
 from dataclasses import dataclass
-from typing import List, Optional
+from typing import Optional

 import pytest
 import pytest_asyncio
@@ -152,6 +152,10 @@ def uid() -> str:

 @pytest_asyncio.fixture(scope="module")
 async def async_engine():
+    # We cannot use monkeypatch since this is a module
+    # scoped fixture and monkeypatch is function scoped.
+    previous_value = os.getenv("VLLM_USE_V1", None)
+    os.environ["VLLM_USE_V1"] = "0"
    engine = await asyncio.get_event_loop().run_in_executor(executor=None,
                                                            func=start_engine)
    try:
@@ -162,6 +166,11 @@ async def async_engine():
        await asyncio.sleep(0.1)
        cleanup_dist_env_and_memory()

+        if previous_value:
+            os.environ["VLLM_USE_V1"] = previous_value
+        else:
+            del os.environ["VLLM_USE_V1"]
+

 @pytest.fixture()
 def should_do_global_cleanup_after_test(request) -> bool:
@@ -255,7 +264,7 @@ async def test_output_kinds(async_engine, stop):
        params.output_kind = RequestOutputKind.DELTA

        prompt_tokens = None
-        output_tokens: List[int] = []
+        output_tokens: list[int] = []
        output_text = ""
        output_count = 0
        final_output = None

--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -9,7 +9,6 @@ import weakref
 import pytest

 from vllm import LLM
-from vllm.config import LoadFormat
 from vllm.platforms import current_platform

 from ..conftest import VllmRunner
@@ -36,7 +35,7 @@ def v1(run_with_both_engines):

 def test_vllm_gc_ed():
    """Verify vllm instance is GC'ed when it is deleted"""
-    llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2"), load_format=LoadFormat.RUNAI_STREAMER)
+    llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2"))
    weak_llm = weakref.ref(llm)
    del llm
    # If there's any circular reference to vllm, this fails
@@ -45,12 +44,12 @@ def test_vllm_gc_ed():


 @pytest.mark.parametrize("model", MODELS)
-# @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
 @pytest.mark.parametrize("backend", ["FLASH_ATTN"])
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [5])
-@pytest.mark.parametrize("enforce_eager", [False, True])
+@pytest.mark.parametrize("enforce_eager", [False])
 def test_models(
+    monkeypatch: pytest.MonkeyPatch,
    hf_runner,
    model: str,
    backend: str,
@@ -63,51 +62,54 @@ def test_models(
        pytest.skip("Flashinfer does not support ROCm/HIP.")

    if backend in ("XFORMERS",
-                   "FLASHINFER") and model == "google/gemma-2-2b-it":
+                   "FLASHINFER") and model == os.path.join(models_path_prefix, "google/gemma-2-2b-it"):
        pytest.skip(
            f"{backend} does not support gemma2 with full context length.")

-    os.environ["VLLM_ATTENTION_BACKEND"] = backend
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_ATTENTION_BACKEND", backend)

-    # 5042 tokens for gemma2
-    # gemma2 has alternating sliding window size of 4096
-    # we need a prompt with more than 4096 tokens to test the sliding window
-    prompt = "The following numbers of the sequence " + ", ".join(
-        str(i) for i in range(1024)) + " are:"
-    example_prompts = [prompt]
+        # 5042 tokens for gemma2
+        # gemma2 has alternating sliding window size of 4096
+        # we need a prompt with more than 4096 tokens to test the sliding window
+        prompt = "The following numbers of the sequence " + ", ".join(
+            str(i) for i in range(1024)) + " are:"
+        example_prompts = [prompt]

-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+        with hf_runner(model, dtype=dtype) as hf_model:
+            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

-    with VllmRunner(model,
-                    max_model_len=8192,
-                    dtype=dtype,
-                    enforce_eager=enforce_eager,
-                    gpu_memory_utilization=0.7) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        with VllmRunner(model,
+                        max_model_len=8192,
+                        dtype=dtype,
+                        enforce_eager=enforce_eager,
+                        gpu_memory_utilization=0.7) as vllm_model:
+            vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                      max_tokens)

-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
+        check_outputs_equal(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )


 # @multi_gpu_test(num_gpus=2)
 # @pytest.mark.parametrize(
 #     "model, distributed_executor_backend, attention_backend, "
 #     "test_suite", [
-#         (os.path.join(models_path_prefix, "distilbert/distilgpt2"), "ray", "", "L4"),
-#         (os.path.join(models_path_prefix, "distilbert/distilgpt2"), "mp", "", "L4"),
-#         (os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"), "ray", "", "L4"),
-#         (os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"), "mp", "", "L4"),
-#         (os.path.join(models_path_prefix, "distilbert/distilgpt2"), "ray", "", "A100"),
-#         (os.path.join(models_path_prefix, "distilbert/distilgpt2"), "mp", "", "A100"),
-#         (os.path.join(models_path_prefix, "distilbert/distilgpt2"), "mp", "FLASHINFER", "A100"),
-#         (os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "ray", "FLASHINFER", "A100"),
+#         ("distilbert/distilgpt2", "ray", "", "L4"),
+#         ("distilbert/distilgpt2", "mp", "", "L4"),
+#         ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
+#         ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
+#         ("distilbert/distilgpt2", "ray", "", "A100"),
+#         ("distilbert/distilgpt2", "mp", "", "A100"),
+#         ("distilbert/distilgpt2", "mp", "FLASHINFER", "A100"),
+#         ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
 #     ])
 # def test_models_distributed(
+#     monkeypatch: pytest.MonkeyPatch,
 #     hf_runner,
 #     vllm_runner,
 #     example_prompts,
@@ -120,35 +122,41 @@ def test_models(
 #     if test_suite != TARGET_TEST_SUITE:
 #         pytest.skip(f"Skip test for {test_suite}")

-#     if model == os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct") and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
-#         # test ray adag
-#         os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
-#         os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
-
-#     if attention_backend:
-#         os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
-
-#     dtype = "half"
-#     max_tokens = 5
-
-#     # NOTE: take care of the order. run vLLM first, and then run HF.
-#     # vLLM needs a fresh new process without cuda initialization.
-#     # if we run HF first, the cuda initialization will be done and it
-#     # will hurt multiprocessing backend with fork method (the default method).
-#     with vllm_runner(model,
-#                      dtype=dtype,
-#                      tensor_parallel_size=2,
-#                      distributed_executor_backend=distributed_executor_backend
-#                      ) as vllm_model:
-#         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-
-#     with hf_runner(model, dtype=dtype) as hf_model:
-#         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
-#     check_outputs_equal(
-#         outputs_0_lst=hf_outputs,
-#         outputs_1_lst=vllm_outputs,
-#         name_0="hf",
-#         name_1="vllm",
-#     )
-
+#     with monkeypatch.context() as monkeypatch_context:
+#         if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
+#             # test Ray Compiled Graph
+#             monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
+#             monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
+
+#         if attention_backend:
+#             monkeypatch_context.setenv(
+#                 "VLLM_ATTENTION_BACKEND",
+#                 attention_backend,
+#             )
+
+#         dtype = "half"
+#         max_tokens = 5
+
+#         # NOTE: take care of the order. run vLLM first, and then run HF.
+#         # vLLM needs a fresh new process without cuda initialization.
+#         # if we run HF first, the cuda initialization will be done and it
+#         # will hurt multiprocessing backend with fork method
+#         # (the default method).
+#         with vllm_runner(
+#                 model,
+#                 dtype=dtype,
+#                 tensor_parallel_size=2,
+#                 distributed_executor_backend=distributed_executor_backend,
+#         ) as vllm_model:
+#             vllm_outputs = vllm_model.generate_greedy(example_prompts,
+#                                                       max_tokens)
+
+#         with hf_runner(model, dtype=dtype) as hf_model:
+#             hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+
+#         check_outputs_equal(
+#             outputs_0_lst=hf_outputs,
+#             outputs_1_lst=vllm_outputs,
+#             name_0="hf",
+#             name_1="vllm",
+#         )
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -7,24 +7,41 @@ prefill requests are chunked.

 Run `pytest tests/models/test_chunked_prefill.py`.
 """
-import os
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING

 import pytest

-from tests.kernels.utils import override_backend_env_variable
 from vllm.platforms import current_platform
+from vllm.utils import STR_BACKEND_ENV_VAR

 from ..models.utils import check_logprobs_close, check_outputs_equal
 from ..utils import multi_gpu_test
 import os
 from ..utils import models_path_prefix

+if TYPE_CHECKING:
+    from .conftest import HfRunner, VllmRunner
+
 MODELS = [
    os.path.join(models_path_prefix, "facebook/opt-125m"),
    os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
 ]


+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch: pytest.MonkeyPatch):
+    """
+    Since this module is V0 only, set VLLM_USE_V1=0 for
+    all tests in the file.
+    """
+    with monkeypatch.context() as m:
+        m.setenv('VLLM_USE_V1', '0')
+        yield
+
+
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [32])
@@ -35,8 +52,8 @@ MODELS = [
 @pytest.mark.parametrize("tensor_parallel_size", [1])
 @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
 def test_models(
-    hf_runner,
-    vllm_runner,
+    hf_runner: HfRunner,
+    vllm_runner: VllmRunner,
    example_prompts,
    model: str,
    dtype: str,
@@ -45,37 +62,39 @@ def test_models(
    enforce_eager: bool,
    tensor_parallel_size: int,
    attention_backend: str,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    """
    Checks exact match decode between huggingface model and vllm runner with
    chunked prefill.
    """
-    override_backend_env_variable(monkeypatch, attention_backend)
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, attention_backend)

-    max_num_seqs = chunked_prefill_token_size
-    max_num_batched_tokens = chunked_prefill_token_size
+        max_num_seqs = chunked_prefill_token_size
+        max_num_batched_tokens = chunked_prefill_token_size

-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+        with hf_runner(model, dtype=dtype) as hf_model:
+            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

-    with vllm_runner(
-            model,
-            dtype=dtype,
-            max_num_batched_tokens=max_num_batched_tokens,
-            enable_chunked_prefill=True,
-            tensor_parallel_size=tensor_parallel_size,
-            enforce_eager=enforce_eager,
-            max_num_seqs=max_num_seqs,
-    ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                max_num_batched_tokens=max_num_batched_tokens,
+                enable_chunked_prefill=True,
+                tensor_parallel_size=tensor_parallel_size,
+                enforce_eager=enforce_eager,
+                max_num_seqs=max_num_seqs,
+        ) as vllm_model:
+            vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                      max_tokens)

-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
+        check_outputs_equal(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )


 @multi_gpu_test(num_gpus=2)
@@ -83,57 +102,61 @@ def test_models(
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
 def test_models_distributed(
-    hf_runner,
-    vllm_runner,
+    hf_runner: HfRunner,
+    vllm_runner: VllmRunner,
    example_prompts,
    model: str,
    distributed_executor_backend: str,
    attention_backend: str,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
-    override_backend_env_variable(monkeypatch, attention_backend)
-
-    if (model == "meta-llama/Llama-3.2-1B-Instruct"
-            and distributed_executor_backend == "ray"):
-        # test ray adag
-        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
-        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
-
-    dtype = "half"
-    max_tokens = 5
-    chunked_prefill_token_size = 16
-
-    # Add a chunked prefill config.
-    max_num_seqs = min(chunked_prefill_token_size, 256)
-    assert chunked_prefill_token_size != -1
-    enable_chunked_prefill = True
-    max_num_batched_tokens = chunked_prefill_token_size
-
-    # NOTE: take care of the order. run vLLM first, and then run HF.
-    # vLLM needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            tensor_parallel_size=2,
-            max_num_seqs=max_num_seqs,
-            enable_chunked_prefill=enable_chunked_prefill,
-            max_num_batched_tokens=max_num_batched_tokens,
-            distributed_executor_backend=distributed_executor_backend,
-    ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
+        if (model == "meta-llama/Llama-3.2-1B-Instruct"
+                and distributed_executor_backend == "ray"):
+            # test Ray Compiled Graph
+            m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
+            m.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
+
+        dtype = "half"
+        max_tokens = 5
+        chunked_prefill_token_size = 16
+
+        # Add a chunked prefill config.
+        max_num_seqs = min(chunked_prefill_token_size, 256)
+        assert chunked_prefill_token_size != -1
+        enable_chunked_prefill = True
+        max_num_batched_tokens = chunked_prefill_token_size
+
+        # NOTE: take care of the order. run vLLM first, and then run HF.
+        # vLLM needs a fresh new process without cuda initialization.
+        # if we run HF first, the cuda initialization will be done and it
+        # will hurt multiprocessing backend with
+        # fork method (the default method).

-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                tensor_parallel_size=2,
+                max_num_seqs=max_num_seqs,
+                enable_chunked_prefill=enable_chunked_prefill,
+                max_num_batched_tokens=max_num_batched_tokens,
+                distributed_executor_backend=distributed_executor_backend,
+        ) as vllm_model:
+            vllm_outputs = vllm_model.generate_greedy(
+                example_prompts,
+                max_tokens,
+            )
+
+        with hf_runner(model, dtype=dtype) as hf_model:
+            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+
+        check_outputs_equal(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )


 # @pytest.mark.parametrize(
@@ -151,7 +174,7 @@ def test_models_distributed(
 # # the async postprocessor
 # @pytest.mark.parametrize("disable_async_output_proc", [True])
 # def test_models_with_fp8_kv_cache(
-#     vllm_runner,
+#     vllm_runner: VllmRunner,
 #     example_prompts,
 #     kv_cache_dtype: str,
 #     model: str,
@@ -211,7 +234,7 @@ def test_models_distributed(
 @pytest.mark.parametrize("tensor_parallel_size", [1])
 @pytest.mark.parametrize("dtype", ["half"])
 def test_with_prefix_caching(
-    vllm_runner,
+    vllm_runner: VllmRunner,
    max_tokens: int,
    enforce_eager: bool,
    chunk_size: int,
@@ -247,8 +270,10 @@ def test_with_prefix_caching(
        ) as vllm_model:
            outputs[enable] = []
            for prompt in full_prompts:
-                outputs[enable] += vllm_model.generate_greedy([prompt],
-                                                              max_tokens)
+                outputs[enable] += vllm_model.generate_greedy(
+                    [prompt],
+                    max_tokens,
+                )

    check_outputs_equal(
        outputs_0_lst=outputs[False],
@@ -259,7 +284,7 @@ def test_with_prefix_caching(


 @pytest.mark.parametrize("model", ["facebook/opt-125m"])
-@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("dtype", ["bfloat16", "half"])
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
 @pytest.mark.parametrize("enforce_eager", [False])
@@ -267,8 +292,8 @@ def test_with_prefix_caching(
 @pytest.mark.cpu_model
 @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
 def test_models_cpu(
-    hf_runner,
-    vllm_runner,
+    hf_runner: HfRunner,
+    vllm_runner: VllmRunner,
    example_prompts,
    model: str,
    dtype: str,
@@ -276,7 +301,7 @@ def test_models_cpu(
    chunked_prefill_token_size: int,
    enforce_eager: bool,
    attention_backend: str,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    test_models(
        hf_runner,
@@ -296,11 +321,11 @@ def test_models_cpu(
 @pytest.mark.parametrize("max_tokens", [16])
 @pytest.mark.parametrize("enforce_eager", [False])
 @pytest.mark.parametrize("chunk_size", [30, 32])
-@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("dtype", ["bfloat16", "half"])
 @pytest.mark.cpu_model
 @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
 def test_with_prefix_caching_cpu(
-    vllm_runner,
+    vllm_runner: VllmRunner,
    max_tokens: int,
    enforce_eager: bool,
    chunk_size: int,

--- a/tests/basic_correctness/test_cpu_offload.py
+++ b/tests/basic_correctness/test_cpu_offload.py
 # SPDX-License-Identifier: Apache-2.0

 import os
+import pytest
+
 from ..utils import compare_two_settings, models_path_prefix


+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    monkeypatch.setenv('VLLM_USE_V1', '0')
+
+
 def test_cpu_offload():
    compare_two_settings(os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), [],
                         ["--cpu-offload-gb", "1"])
--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
@@ -4,15 +4,13 @@ import pytest
 import torch

 from vllm import LLM, SamplingParams
-from vllm.config import LoadFormat
 from vllm.device_allocator.cumem import CuMemAllocator
 from vllm.utils import GiB_bytes

-from ..conftest import MODEL_WEIGHTS_S3_BUCKET
-from ..utils import fork_new_process_for_each_test
+from ..utils import create_new_process_for_each_test


-@fork_new_process_for_each_test
+@create_new_process_for_each_test()
 def test_python_error():
    """
    Test if Python error occurs when there's low-level
@@ -38,7 +36,7 @@ def test_python_error():
        allocator.wake_up()


-@fork_new_process_for_each_test
+@create_new_process_for_each_test()
 def test_basic_cumem():
    # some tensors from default memory pool
    shape = (1024, 1024)
@@ -71,7 +69,7 @@ def test_basic_cumem():
    assert torch.allclose(output, torch.ones_like(output) * 3)


-@fork_new_process_for_each_test
+@create_new_process_for_each_test()
 def test_cumem_with_cudagraph():
    allocator = CuMemAllocator.get_instance()
    with allocator.use_memory_pool():
@@ -116,43 +114,47 @@ def test_cumem_with_cudagraph():
    assert torch.allclose(y, x + 1)


-@fork_new_process_for_each_test
+@create_new_process_for_each_test()
 @pytest.mark.parametrize(
    "model, use_v1",
    [
        # sleep mode with safetensors
-        (f"{MODEL_WEIGHTS_S3_BUCKET}/meta-llama/Llama-3.2-1B", True),
+        ("meta-llama/Llama-3.2-1B", True),
        # sleep mode with pytorch checkpoint
        ("facebook/opt-125m", False),
    ])
-def test_end_to_end(model: str, use_v1: bool):
-    import os
-    os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"
-    free, total = torch.cuda.mem_get_info()
-    used_bytes_baseline = total - free  # in case other process is running
-    load_format = LoadFormat.AUTO
-    if "Llama" in model:
-        load_format = LoadFormat.RUNAI_STREAMER
-    llm = LLM(model, load_format=load_format, enable_sleep_mode=True)
-    prompt = "How are you?"
-    sampling_params = SamplingParams(temperature=0, max_tokens=10)
-    output = llm.generate(prompt, sampling_params)
-
-    # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
-    # which is difficult to measure in the test. therefore, we only
-    # test sleep level 1 here.
-    llm.sleep(level=1)
-
-    free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
-    used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
-    # now the memory usage is mostly cudagraph memory pool,
-    # and it should be less than the model weights (1B model, 2GiB weights)
-    assert used_bytes < 2 * GiB_bytes
-
-    llm.wake_up()
-    output2 = llm.generate(prompt, sampling_params)
-
-    # cmp output
-    assert output[0].outputs[0].text == output2[0].outputs[0].text
-
-    del os.environ["VLLM_USE_V1"]
+def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
+        free, total = torch.cuda.mem_get_info()
+        used_bytes_baseline = total - free  # in case other process is running
+        llm = LLM(model, enable_sleep_mode=True)
+        prompt = "How are you?"
+        sampling_params = SamplingParams(temperature=0, max_tokens=10)
+        output = llm.generate(prompt, sampling_params)
+
+        # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
+        # which is difficult to measure in the test. therefore, we only
+        # test sleep level 1 here.
+        llm.sleep(level=1)
+
+        free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
+        used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
+        # now the memory usage is mostly cudagraph memory pool,
+        # and it should be less than the model weights (1B model, 2GiB weights)
+
+        # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
+        # is captured but cannot be releasesd from PyTorch due to a known bug,
+        # therefore high memory usage after `llm.sleep` is called is expected.
+        # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
+        # in V1.
+        if use_v1:
+            assert used_bytes < 7 * GiB_bytes
+        else:
+            assert used_bytes < 2 * GiB_bytes
+
+        llm.wake_up()
+        output2 = llm.generate(prompt, sampling_params)
+
+        # cmp output
+        assert output[0].outputs[0].text == output2[0].outputs[0].text
--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -24,6 +24,15 @@ MODELS = [
 ]


+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    We should enable this for V1, but VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT,
+    so use VLLM_USE_V1=0 for all tests in the file.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
+
+
 @pytest.fixture(scope="module", autouse=True)
 def check_settings():
    assert ENABLE_ARTIFICIAL_PREEMPT is True, (

--- a/tests/build_cython.py
+++ b/tests/build_cython.py
+# SPDX-License-Identifier: Apache-2.0
+import Cython.Compiler.Options
+from Cython.Build import cythonize
+from setuptools import setup
+
+Cython.Compiler.Options.annotate = True
+
+infiles = []
+
+infiles += [
+    "vllm/engine/llm_engine.py",
+    "vllm/transformers_utils/detokenizer.py",
+    "vllm/engine/output_processor/single_step.py",
+    "vllm/outputs.py",
+    "vllm/engine/output_processor/stop_checker.py",
+]
+
+infiles += [
+    "vllm/core/scheduler.py",
+    "vllm/sequence.py",
+    "vllm/core/block_manager.py",
+]
+
+infiles += [
+    "vllm/model_executor/layers/sampler.py",
+    "vllm/sampling_params.py",
+    "vllm/utils.py",
+]
+
+setup(ext_modules=cythonize(infiles,
+                            annotate=False,
+                            force=True,
+                            compiler_directives={
+                                'language_level': "3",
+                                'infer_types': True
+                            }))
+
+# example usage: python3 build_cython.py build_ext --inplace
--- a/tests/compile/backend.py
+++ b/tests/compile/backend.py
@@ -6,6 +6,7 @@ from typing import Callable, Union
 from torch import fx

 from vllm.compilation.inductor_pass import InductorPass
+from vllm.config import get_current_vllm_config


 class TestBackend:
@@ -13,21 +14,27 @@ class TestBackend:
    This class provides a simple Inductor backend that can be used for testing.
    It takes a list of custom passes and runs them after Inductor's passes.
    It also saves the graph before and after the custom passes for inspection.
+
+    Inductor config can be modified directly by editing the inductor_config
+    property. This can be helpful for adding passes like the
+    'pre_grad_custom_pass' and the 'post_grad_custom_pre_pass'.
+    Inductor config is default-initialized from VllmConfig.CompilationConfig.
    """

    def __init__(self, *passes: Union[InductorPass, Callable[[fx.Graph],
                                                             None]]):
        self.custom_passes = list(passes)
-        from torch._inductor import config
-        self.current_config = config.shallow_copy_dict()
-        self.current_config['force_disable_caches'] = True
-        self.current_config['post_grad_custom_post_pass'] = self.post_pass
+        compile_config = get_current_vllm_config().compilation_config
+        self.inductor_config = compile_config.inductor_compile_config
+        self.inductor_config['force_disable_caches'] = True
+        self.inductor_config['post_grad_custom_post_pass'] = self.post_pass

    def __call__(self, graph: fx.GraphModule, example_inputs):
+        self.graph_pre_compile = deepcopy(graph)
        from torch._inductor.compile_fx import compile_fx
        return compile_fx(graph,
                          example_inputs,
-                          config_patches=self.current_config)
+                          config_patches=self.inductor_config)

    def post_pass(self, graph: fx.Graph):
        self.graph_pre_pass = deepcopy(graph)

--- a/tests/compile/conftest.py
+++ b/tests/compile/conftest.py
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+
+
+# TEST V1: this should be removed. Right now V1 overrides
+# all the torch compile logic. We should re-enable this
+# as we add torch compile support back to V1.
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    Since this module is V0 only, set VLLM_USE_V1=0 for
+    all tests in the module.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
@@ -8,7 +8,7 @@ if the config `tractable_init` is set to True. Otherwise, the weights are
 initialized randomly with a fixed seed.
 """
 from dataclasses import dataclass
-from typing import Any, List, Optional, Tuple
+from typing import Any, Optional

 import torch
 from torch import nn
@@ -56,7 +56,7 @@ class LlamaConfig:
    random_seed: int = 0

    def compute_hash(self) -> str:
-        factors: List[Any] = []
+        factors: list[Any] = []
        for k, v in self.__dict__.items():
            if k == "random_seed":
                continue
@@ -174,7 +174,7 @@ class LlamaDecoderLayer(nn.Module):
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        residual: Optional[torch.Tensor],
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        For tractable computation:
        - if residual is None, the outputs are:

--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
 # SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations

 import dataclasses
-from typing import Dict, List, Optional

 import pytest
 import os
@@ -15,7 +15,7 @@ from ..utils import compare_all_settings, models_path_prefix
 @dataclasses.dataclass
 class TestSetting:
    model: str
-    model_args: List[str]
+    model_args: list[str]
    pp_size: int
    tp_size: int
    attn_backend: str
@@ -23,75 +23,76 @@ class TestSetting:
    fullgraph: bool


-# representative settings for testing
-test_settings = [
-    # basic llama model
-    TestSetting(
-        model=os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
-        model_args=[],
-        pp_size=2,
-        tp_size=2,
-        attn_backend="FLASHINFER",
-        method="generate",
-        fullgraph=True,
-    ),
-    # llama model with quantization
-    TestSetting(
-        model=os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"),
-        model_args=["--quantization", "gptq"],
-        pp_size=1,
-        tp_size=1,
-        attn_backend="FLASH_ATTN",
-        method="generate",
-        fullgraph=True,
-    ),
-    # MoE model
-    TestSetting(
-        model=os.path.join(models_path_prefix, "ibm/PowerMoE-3b"),
-        model_args=[],
-        pp_size=1,
-        tp_size=2,
-        attn_backend="FLASH_ATTN",
-        method="generate",
-        fullgraph=True,
-    ),
-    # embedding model
-    TestSetting(
-        model=os.path.join(models_path_prefix, "BAAI/bge-multilingual-gemma2"),
-        model_args=["--task", "embed"],
-        pp_size=1,
-        tp_size=1,
-        attn_backend="FLASH_ATTN",
-        method="encode",
-        fullgraph=True,
-    ),
-    # encoder-based embedding model (BERT)
-    TestSetting(
-        model=os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5"),
-        model_args=["--task", "embed"],
-        pp_size=1,
-        tp_size=1,
-        attn_backend="XFORMERS",
-        method="encode",
-        fullgraph=True,
-    ),
-    # vision language model
-    TestSetting(
-        model=os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct"),
-        model_args=["--trust-remote-code", "--max-model-len", "2048"],
-        pp_size=2,
-        tp_size=1,
-        attn_backend="FLASH_ATTN",
-        method="generate_with_image",
-        fullgraph=False,
-    ),
-]
-
-
 # we cannot afford testing the full Catesian product
 # of all models and all levels
-@pytest.mark.parametrize("test_setting", test_settings)
-def test_compile_correctness(test_setting: TestSetting):
+@pytest.mark.parametrize(
+    "test_setting",
+    [
+        # basic llama model
+        TestSetting(
+            model="meta-llama/Llama-3.2-1B-Instruct",
+            model_args=[],
+            pp_size=2,
+            tp_size=2,
+            attn_backend="FLASHINFER",
+            method="generate",
+            fullgraph=True,
+        ),
+        # llama model with quantization
+        TestSetting(
+            model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
+            model_args=["--quantization", "gptq"],
+            pp_size=1,
+            tp_size=1,
+            attn_backend="FLASH_ATTN",
+            method="generate",
+            fullgraph=True,
+        ),
+        # MoE model
+        TestSetting(
+            model="ibm/PowerMoE-3b",
+            model_args=[],
+            pp_size=1,
+            tp_size=2,
+            attn_backend="FLASH_ATTN",
+            method="generate",
+            fullgraph=True,
+        ),
+        # embedding model
+        TestSetting(
+            model="BAAI/bge-multilingual-gemma2",
+            model_args=["--task", "embed", "--dtype", "bfloat16"],
+            pp_size=1,
+            tp_size=1,
+            attn_backend="FLASH_ATTN",
+            method="encode",
+            fullgraph=True,
+        ),
+        # encoder-based embedding model (BERT)
+        TestSetting(
+            model="BAAI/bge-base-en-v1.5",
+            model_args=["--task", "embed"],
+            pp_size=1,
+            tp_size=1,
+            attn_backend="XFORMERS",
+            method="encode",
+            fullgraph=True,
+        ),
+        # vision language model
+        TestSetting(
+            model="microsoft/Phi-3.5-vision-instruct",
+            model_args=["--trust-remote-code", "--max-model-len", "2048"],
+            pp_size=2,
+            tp_size=1,
+            attn_backend="FLASH_ATTN",
+            method="generate_with_image",
+            fullgraph=False,
+        ),
+    ])
+def test_compile_correctness(
+    monkeypatch: pytest.MonkeyPatch,
+    test_setting: TestSetting,
+):
    # this test is run under multiple suits, with different GPUs.
    # make sure we only run the test with correct CUDA devices.
    # don't use "<", as it will duplicate the tests.
@@ -104,41 +105,45 @@ def test_compile_correctness(test_setting: TestSetting):
    fullgraph = test_setting.fullgraph
    if cuda_device_count_stateless() != pp_size * tp_size:
        pytest.skip("Not correct CUDA devices for the test.")
-    import os
-    os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend
-    final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \
-                ["-tp", str(tp_size)]

-    all_args: List[List[str]] = []
-    all_envs: List[Optional[Dict[str, str]]] = []
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
+        final_args = [
+            "--enforce-eager", *model_args, "-pp",
+            str(pp_size), "-tp",
+            str(tp_size)
+        ]
+
+        all_args: list[list[str]] = []
+        all_envs: list[dict[str, str] | None] = []

-    for level in [
-            CompilationLevel.NO_COMPILATION,
-            CompilationLevel.PIECEWISE,
-    ]:
-        all_args.append(final_args + [f"-O{level}"])
-        all_envs.append({})
+        for level in [
+                CompilationLevel.NO_COMPILATION,
+                CompilationLevel.PIECEWISE,
+        ]:
+            all_args.append(final_args + [f"-O{level}"])
+            all_envs.append({})

-    # inductor will change the output, so we only compare if the output
-    # is close, not exactly the same.
-    compare_all_settings(
-        model,
-        all_args,
-        all_envs,
-        method=method if method != "generate" else "generate_close")
-    all_envs.clear()
-    all_args.clear()
+        # inductor will change the output, so we only compare if the output
+        # is close, not exactly the same.
+        compare_all_settings(
+            model,
+            all_args,
+            all_envs,
+            method=method if method != "generate" else "generate_close")
+        all_envs.clear()
+        all_args.clear()

-    for level in [
-            CompilationLevel.NO_COMPILATION,
-            CompilationLevel.DYNAMO_AS_IS,
-            CompilationLevel.DYNAMO_ONCE,
-    ]:
-        all_args.append(final_args + [f"-O{level}"])
-        all_envs.append({})
-        if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
-            # "DYNAMO_ONCE" will always use fullgraph
-            all_envs[-1][
-                "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0"  # type: ignore
+        for level in [
+                CompilationLevel.NO_COMPILATION,
+                CompilationLevel.DYNAMO_AS_IS,
+                CompilationLevel.DYNAMO_ONCE,
+        ]:
+            all_args.append(final_args + [f"-O{level}"])
+            all_envs.append({})
+            if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
+                # "DYNAMO_ONCE" will always use fullgraph
+                all_envs[-1][
+                    "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0"  # type: ignore

-    compare_all_settings(model, all_args * 3, all_envs, method=method)
+        compare_all_settings(model, all_args * 3, all_envs, method=method)
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
 # SPDX-License-Identifier: Apache-2.0

+from __future__ import annotations
+
+from typing import Any
+
 import pytest
+import torch

+from tests.quantization.utils import is_quant_method_supported
+from vllm import LLM, SamplingParams
 from vllm.config import CompilationLevel
+from vllm.platforms import current_platform
+
+from ..utils import create_new_process_for_each_test
+
+
+@pytest.fixture(params=None, name="model_info")
+def models_list_fixture(request):
+    TEST_MODELS: list[tuple[str, dict[str, Any]]] = [
+        ("facebook/opt-125m", {}),
+        ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
+            "dtype": torch.float16,
+            "quantization": "compressed-tensors"
+        }),
+        ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
+            "dtype": torch.float16,
+            "quantization": "compressed-tensors"
+        }),
+        ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
+            "quantization": "compressed-tensors"
+        }),
+        ("meta-llama/Llama-3.2-1B-Instruct", {}),
+    ]
+
+    if is_quant_method_supported("aqlm"):
+        TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
+            "quantization": "aqlm"
+        }))
+
+    # TODO: figure out why this fails.
+    if False and is_quant_method_supported("gguf"):  # noqa: SIM223
+        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
+            "quantization": "gguf"
+        }))
+
+    if is_quant_method_supported("gptq"):
+        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
+            "quantization": "gptq"
+        }))

-from ..utils import fork_new_process_for_each_test
-from .utils import TEST_MODELS, check_full_graph_support
+    if is_quant_method_supported("gptq_marlin"):
+        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
+            "quantization": "gptq_marlin"
+        }))
+
+    if is_quant_method_supported("gptq_marlin_24"):
+        TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
+            "quantization": "gptq_marlin_24"
+        }))
+
+    if is_quant_method_supported("marlin"):
+        TEST_MODELS.append(
+            ("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
+                "quantization": "marlin"
+            }))
+
+    if not current_platform.is_rocm() and is_quant_method_supported("awq"):
+        TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
+            "quantization": "AWQ"
+        }))
+
+    return TEST_MODELS


-@pytest.mark.parametrize("model_info", TEST_MODELS)
 @pytest.mark.parametrize(
    "optimization_level",
-    [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE])
-@fork_new_process_for_each_test
-def test_full_graph(model_info, optimization_level):
-    model = model_info[0]
-    model_kwargs = model_info[1]
-    check_full_graph_support(model,
-                             model_kwargs,
-                             optimization_level,
-                             tp_size=1)
+    [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE],
+)
+@pytest.mark.parametrize("model_info", "", indirect=True)
+@create_new_process_for_each_test()
+def test_full_graph(
+    monkeypatch: pytest.MonkeyPatch,
+    model_info: tuple[str, dict[str, Any]],
+    optimization_level: int,
+):
+    model, model_kwargs = model_info
+
+    with monkeypatch.context() as m:
+        # make sure these models can be captured in full graph mode
+        m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1")
+        print(f"MODEL={model}")
+
+        prompts = [
+            "Hello, my name is",
+            "The president of the United States is",
+            "The capital of France is",
+            "The future of AI is",
+        ]
+        sampling_params = SamplingParams(temperature=0)
+        llm = LLM(
+            model=model,
+            enforce_eager=True,
+            tensor_parallel_size=1,
+            disable_custom_all_reduce=True,
+            compilation_config=optimization_level,
+            **model_kwargs,
+        )
+        outputs = llm.generate(prompts, sampling_params)
+
+        # Print the outputs.
+        for output in outputs:
+            prompt = output.prompt
+            generated_text = output.outputs[0].text
+            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
--- a/tests/compile/test_functionalization.py
+++ b/tests/compile/test_functionalization.py
@@ -10,7 +10,7 @@ from vllm.compilation.fix_functionalization import FixFunctionalizationPass
 from vllm.compilation.fusion import (FUSED_OPS, FusionPass, QuantKey,
                                     kFp8DynamicTokenSym, kFp8StaticTensorSym)
 from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func
-from vllm.compilation.reshapes import RedundantReshapesPass
+from vllm.compilation.noop_elimination import NoOpEliminationPass
 from vllm.config import CompilationConfig

 from .backend import TestBackend
@@ -52,11 +52,11 @@ def test_fix_functionalization(model: str, quant_key: QuantKey,
    torch.set_default_device("cuda")

    config = CompilationConfig.PassConfig(enable_fusion=do_fusion,
-                                          enable_reshape=True)
-    reshape_pass = RedundantReshapesPass(config)
+                                          enable_noop=True)
+    noop_pass = NoOpEliminationPass(config)
    fusion_pass = FusionPass.instance(config)

-    passes = [reshape_pass, fusion_pass] if do_fusion else [reshape_pass]
+    passes = [noop_pass, fusion_pass] if do_fusion else [noop_pass]
    func_pass = FixFunctionalizationPass(config)
    backend_func = TestBackend(*passes, func_pass)
    backend_no_func = TestBackend(*passes)

--- a/tests/compile/test_fusion.py
+++ b/tests/compile/test_fusion.py
@@ -5,23 +5,25 @@ import torch
 from compressed_tensors.quantization import FP8_DTYPE

 import vllm.envs as envs
+import vllm.plugins
 from vllm.compilation.fusion import (FUSED_OPS, QUANT_OPS, FusedRMSQuantKey,
                                     FusionPass, QuantKey)
 from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe
-from vllm.compilation.reshapes import RedundantReshapesPass
-from vllm.config import CompilationConfig
+from vllm.compilation.noop_elimination import NoOpEliminationPass
+from vllm.config import CompilationConfig, CompilationLevel, VllmConfig
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    apply_fp8_linear)
+    CUTLASS_FP8_SUPPORTED, Fp8LinearOp, maybe_create_device_identity)

 from .backend import TestBackend


 class TestModel(torch.nn.Module):

-    def __init__(self, hidden_size: int, eps: float, static: bool, *args,
-                 **kwargs):
+    def __init__(self, hidden_size: int, eps: float, static: bool,
+                 cutlass_fp8_enabled: bool, *args, **kwargs):
        super().__init__(*args, **kwargs)
+        self.cutlass_fp8_enabled = cutlass_fp8_enabled
        self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)]
        self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(2)]
        if static:
@@ -32,24 +34,20 @@ class TestModel(torch.nn.Module):
            torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t()
            for _ in range(2)
        ]
+        self.fp8_linear = Fp8LinearOp(
+            cutlass_fp8_supported=cutlass_fp8_enabled,
+            use_per_token_if_dynamic=True)

    def forward(self, x):
        resid = torch.sqrt(x)
        y = self.norm[0](x)

-        x2 = apply_fp8_linear(y,
-                              self.w[0],
-                              self.wscale[0],
-                              self.scale[0],
-                              use_per_token_if_dynamic=True)
+        x2 = self.fp8_linear.apply(y, self.w[0], self.wscale[0], self.scale[0])
        # make sure resid is used for replacement to work
        y2, resid = self.norm[1](x2, resid)

-        x3 = apply_fp8_linear(y2,
-                              self.w[1],
-                              self.wscale[1],
-                              self.scale[1],
-                              use_per_token_if_dynamic=True)
+        x3 = self.fp8_linear.apply(y2, self.w[1], self.wscale[1],
+                                   self.scale[1])
        y3, resid = self.norm[2](x3, resid)  # use resid here
        return y3

@@ -59,60 +57,67 @@ class TestModel(torch.nn.Module):
 @pytest.mark.parametrize("num_tokens", [7, 256, 533, 2048, 2049])
 @pytest.mark.parametrize("eps", [1e-5, 1e-6])
 @pytest.mark.parametrize("static", [True, False])
+@pytest.mark.parametrize("cutlass_fp8_enabled",
+                         [True, False] if CUTLASS_FP8_SUPPORTED else [False])
 @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE != "cuda",
                    reason="Only test on CUDA")
-def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static):
+def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
+                              cutlass_fp8_enabled):
    torch.set_default_device("cuda")
    torch.set_default_dtype(dtype)
    torch.manual_seed(1)
+    maybe_create_device_identity()  # needed for certain non-cutlass fp8 paths

-    # Reshape pass is needed for the fusion pass to work
-    config = CompilationConfig.PassConfig(enable_fusion=True,
-                                          enable_reshape=True)
-    reshape_pass = RedundantReshapesPass(config)
-    fusion_pass = FusionPass.instance(config)
-
-    backend = TestBackend(reshape_pass, fusion_pass)
-    model = TestModel(hidden_size, eps, static)
-
-    # First dimension dynamic
-    x = torch.rand(num_tokens, hidden_size)
-    torch._dynamo.mark_dynamic(x, 0)
-
-    result = model(x)
-
-    model2 = torch.compile(model, backend=backend)
-    result2 = model2(x)
-
-    # Higher tol for dynamic, even higher for bfloat16
-    if static:
-        ATOL, RTOL = (1e-3, 1e-3)
-    elif dtype == torch.float16:
-        ATOL, RTOL = (2e-3, 2e-3)
-    else:
-        ATOL, RTOL = (1e-2, 1e-2)
-
-    torch.testing.assert_close(result, result2, atol=ATOL, rtol=RTOL)
-
-    # Check substitution worked
-    pre_nodes = backend.graph_pre_pass.nodes
-    post_nodes = backend.graph_post_pass.nodes
-
-    # static is per-tensor, dynamic is per-token
-    key = QuantKey(dtype=FP8_DTYPE,
-                   static=static,
-                   per_tensor=static,
-                   symmetric=True)
-    rms_quant = FUSED_OPS[FusedRMSQuantKey(key, False)]
-    add_rms_quant = FUSED_OPS[FusedRMSQuantKey(key, True)]
-    fp8_quant = QUANT_OPS[key]
-
-    # In pre-nodes, fp8 quant should be present and fused kernels should not
-    assert find_auto_fn_maybe(pre_nodes, rms_quant) is None
-    assert find_auto_fn_maybe(pre_nodes, add_rms_quant) is None
-    find_auto_fn(pre_nodes, fp8_quant)
-
-    # In post-nodes, fused kernels should be present and fp8 quant should not
-    find_auto_fn(post_nodes, rms_quant)
-    find_auto_fn(post_nodes, add_rms_quant)
-    assert find_auto_fn_maybe(post_nodes, fp8_quant) is None
+    vllm_config = VllmConfig(compilation_config=CompilationConfig(
+        level=CompilationLevel.PIECEWISE, custom_ops=["+rms_norm"]))
+    with vllm.config.set_current_vllm_config(vllm_config):
+        # Reshape pass is needed for the fusion pass to work
+        config = CompilationConfig.PassConfig(enable_fusion=True,
+                                              enable_noop=True)
+        noop_pass = NoOpEliminationPass(config)
+        fusion_pass = FusionPass.instance(config)
+
+        backend = TestBackend(noop_pass, fusion_pass)
+        model = TestModel(hidden_size, eps, static, cutlass_fp8_enabled)
+
+        # First dimension dynamic
+        x = torch.rand(num_tokens, hidden_size)
+        torch._dynamo.mark_dynamic(x, 0)
+
+        result = model(x)
+
+        model2 = torch.compile(model, backend=backend)
+        result2 = model2(x)
+
+        # Higher tol for dynamic, even higher for bfloat16
+        if static:
+            ATOL, RTOL = (1e-3, 1e-3)
+        elif dtype == torch.float16:
+            ATOL, RTOL = (2e-3, 2e-3)
+        else:
+            ATOL, RTOL = (1e-2, 1e-2)
+
+        torch.testing.assert_close(result, result2, atol=ATOL, rtol=RTOL)
+
+        # Check substitution worked
+        pre_nodes = backend.graph_pre_pass.nodes
+        post_nodes = backend.graph_post_pass.nodes
+
+        # static is per-tensor, dynamic is per-token
+        key = QuantKey(dtype=FP8_DTYPE,
+                       static=static,
+                       per_tensor=static,
+                       symmetric=True)
+        rms_quant = FUSED_OPS[FusedRMSQuantKey(key, False)]
+        add_rms_quant = FUSED_OPS[FusedRMSQuantKey(key, True)]
+        fp8_quant = QUANT_OPS[key]
+
+        # In pre-nodes, fp8 quant should be there and fused kernels should not
+        assert find_auto_fn_maybe(pre_nodes, rms_quant) is None
+        assert find_auto_fn_maybe(pre_nodes, add_rms_quant) is None
+        find_auto_fn(pre_nodes, fp8_quant)
+
+        # In post-nodes, fused kernels should be there and fp8 quant should not
+        find_auto_fn(post_nodes, rms_quant)
+        find_auto_fn(post_nodes, add_rms_quant)
+        assert find_auto_fn_maybe(post_nodes, fp8_quant) is None
--- a/tests/compile/test_pass_manager.py
+++ b/tests/compile/test_pass_manager.py
 # SPDX-License-Identifier: Apache-2.0
-
-import pickle
+import copy

 import pytest
 import torch
-from torch._inductor.codecache import BypassFxGraphCache

-from vllm.compilation.config import CompilationConfig
-from vllm.compilation.inductor_pass import (CallableInductorPass,
-                                            as_inductor_pass)
+from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
 from vllm.compilation.pass_manager import PostGradPassManager
+from vllm.config import CompilationConfig


+# dummy custom pass that doesn't inherit
 def simple_callable(graph: torch.fx.Graph):
    pass


-@as_inductor_pass(files=(__file__, ))
-def callable_decorated(graph: torch.fx.Graph):
-    pass
+# Should fail to add directly to the pass manager
+def test_bad_callable():
+    config = CompilationConfig().pass_config
+
+    pass_manager = PostGradPassManager()
+    pass_manager.configure(config)
+
+    with pytest.raises(AssertionError):
+        pass_manager.add(simple_callable)  # noqa, type wrong on purpose
+
+
+# Pass that inherits from InductorPass
+class ProperPass(InductorPass):
+
+    def __call__(self, graph: torch.fx.graph.Graph) -> None:
+        pass


 @pytest.mark.parametrize(
-    "works, callable",
-    [(False, simple_callable), (True, callable_decorated),
-     (True, CallableInductorPass(simple_callable, "simple_callable"))])
-def test_pass_manager(works: bool, callable):
+    "callable",
+    [
+        ProperPass(),
+        # Can also wrap callables in CallableInductorPass for compliance
+        CallableInductorPass(simple_callable),
+        CallableInductorPass(simple_callable,
+                             InductorPass.hash_source(__file__))
+    ],
+)
+def test_pass_manager_uuid(callable):
    config = CompilationConfig().pass_config
-    pass_manager = PostGradPassManager([callable])
-    pass_manager.configure(config)  # Adds default passes
-
-    if works:
-        pickle.dumps(pass_manager)
-    else:
-        with pytest.raises(BypassFxGraphCache):
-            pickle.dumps(pass_manager)
+
+    pass_manager = PostGradPassManager()
+    pass_manager.configure(config)
+
+    # Check that UUID is different if the same pass is added 2x
+    pass_manager.add(callable)
+    uuid1 = pass_manager.uuid()
+    pass_manager.add(callable)
+    uuid2 = pass_manager.uuid()
+    assert uuid1 != uuid2
+
+    # UUID should be the same as the original one,
+    # as we constructed in the same way.
+    pass_manager2 = PostGradPassManager()
+    pass_manager2.configure(config)
+    pass_manager2.add(callable)
+    assert uuid1 == pass_manager2.uuid()
+
+    # UUID should be different due to config change
+    config2 = copy.deepcopy(config)
+    config2.enable_fusion = not config2.enable_fusion
+    pass_manager3 = PostGradPassManager()
+    pass_manager3.configure(config2)
+    pass_manager3.add(callable)
+    assert uuid1 != pass_manager3.uuid()
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -5,8 +5,8 @@ import os
 import tempfile
 from collections import UserList
 from enum import Enum
-from typing import (Any, Callable, Dict, List, Optional, Tuple, Type,
-                    TypedDict, TypeVar, Union)
+
+from typing import Any, Callable, Optional, TypedDict, TypeVar, Union
 import pytest
 import pytest_html

@@ -17,8 +17,8 @@ import torch.nn as nn
 import torch.nn.functional as F
 from huggingface_hub import snapshot_download
 from PIL import Image
-from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
-                          BatchFeature)
+from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
+                          BatchEncoding, BatchFeature)
 from transformers.models.auto.auto_factory import _BaseAutoModelClass

 from tests.models.utils import (TokensTextLogprobs,
@@ -26,7 +26,7 @@ from tests.models.utils import (TokensTextLogprobs,
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
-from vllm.config import LoadFormat, TaskOption, TokenizerPoolConfig
+from vllm.config import TaskOption, TokenizerPoolConfig, _get_and_verify_dtype
 from vllm.connections import global_http_connection
 from vllm.distributed import (cleanup_dist_env_and_memory,
                              init_distributed_environment,
@@ -37,8 +37,8 @@ from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams
-from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
-                        identity, is_list_of)
+
+from vllm.utils import cuda_device_count_stateless, is_list_of
 from .utils import models_path_prefix

 logger = init_logger(__name__)
@@ -50,78 +50,14 @@ _SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")

 _M = TypeVar("_M")

-MODELS_ON_S3 = [
-    "distilbert/distilgpt2",
-    "meta-llama/Llama-2-7b-hf",
-    "meta-llama/Meta-Llama-3-8B",
-    "meta-llama/Llama-3.2-1B",
-    "meta-llama/Llama-3.2-1B-Instruct",
-    "openai-community/gpt2",
-    "ArthurZ/Ilama-3.2-1B",
-    "llava-hf/llava-1.5-7b-hf",
-    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-    "ai21labs/Jamba-tiny-random",
-    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
-    "nm-testing/Phi-3-mini-128k-instruct-FP8",
-    "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
-    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
-    "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
-    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
-    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
-    "AMead10/Llama-3.2-1B-Instruct-AWQ",
-    "shuyuej/Llama-3.2-1B-Instruct-GPTQ",
-    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head",
-    "ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024",
-    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
-    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
-    "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test",
-    "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
-    "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
-    "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
-    "neuralmagic/Llama-3.2-1B-quantized.w8a8",
-    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
-    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
-    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
-    "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
-    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
-    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym",
-    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
-    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
-    "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
-    "nm-testing/tinyllama-oneshot-w4a16-group128-v2",
-    "nm-testing/tinyllama-oneshot-w8a16-per-channel",
-    "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
-    "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test",
-    "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme",
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing",
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing",
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor",
-    "nm-testing/llama2.c-stories42M-pruned2.4-compressed",
-]
-
-MODEL_WEIGHTS_S3_BUCKET = models_path_prefix
-
-_PromptMultiModalInput = Union[List[_M], List[List[_M]]]
+_PromptMultiModalInput = Union[list[_M], list[list[_M]]]

 PromptImageInput = _PromptMultiModalInput[Image.Image]
-PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]]
+PromptAudioInput = _PromptMultiModalInput[tuple[np.ndarray, int]]
 PromptVideoInput = _PromptMultiModalInput[np.ndarray]


-def _read_prompts(filename: str) -> List[str]:
+def _read_prompts(filename: str) -> list[str]:
    with open(filename) as f:
        prompts = f.readlines()
        return prompts
@@ -144,7 +80,7 @@ class _ImageAssets(_ImageAssetsBase):
            ImageAsset("cherry_blossom"),
        ])

-    def prompts(self, prompts: _ImageAssetPrompts) -> List[str]:
+    def prompts(self, prompts: _ImageAssetPrompts) -> list[str]:
        """
        Convenience method to define the prompt for each test image.

@@ -169,7 +105,7 @@ class _VideoAssets(_VideoAssetsBase):
            VideoAsset("sample_demo_1.mp4"),
        ])

-    def prompts(self, prompts: _VideoAssetPrompts) -> List[str]:
+    def prompts(self, prompts: _VideoAssetPrompts) -> list[str]:
        return [prompts["sample_demo_1"]]


@@ -179,6 +115,26 @@ VIDEO_ASSETS = _VideoAssets()
 """Singleton instance of :class:`_VideoAssets`."""


+@pytest.fixture(scope="function", autouse=True)
+def cleanup_VLLM_USE_V1(monkeypatch):
+    """
+    The V1 oracle sets "VLLM_USE_V1" during loading. This means
+    that each invocation of a test change the env variable.
+
+    If we touch "VLLM_USE_V1" with monkeypatch, then any changes
+    made during the test run by vLLM will be cleaned up.
+
+    This fixture is used by every test.
+    """
+
+    # If VLLM_USE_V1 is not set, set then delete. This will
+    # cause monkeypatch to clean up VLLM_USE_V1 upon exit
+    # if VLLM modifies the value of envs.VLLM_USE_V1.
+    if "VLLM_USE_V1" not in os.environ:
+        monkeypatch.setenv("VLLM_USE_V1", "")
+        monkeypatch.delenv("VLLM_USE_V1")
+
+
 @pytest.fixture(params=[True, False])
 def run_with_both_engines(request, monkeypatch):
    # Automatically runs tests twice, once with V1 and once without
@@ -242,7 +198,7 @@ def dynamo_reset():


 @pytest.fixture
-def example_prompts() -> List[str]:
+def example_prompts() -> list[str]:
    prompts = []
    for filename in _TEST_PROMPTS:
        prompts += _read_prompts(filename)
@@ -264,7 +220,7 @@ class DecoderPromptType(Enum):

 @pytest.fixture
 def example_encoder_decoder_prompts(
-) -> Dict[DecoderPromptType, List[ExplicitEncoderDecoderPrompt]]:
+) -> dict[DecoderPromptType, list[ExplicitEncoderDecoderPrompt]]:
    '''
    Returns an encoder prompt list and a decoder prompt list, wherein each pair
    of same-index entries in both lists corresponds to an (encoder prompt,
@@ -296,7 +252,7 @@ def example_encoder_decoder_prompts(


 @pytest.fixture
-def example_long_prompts() -> List[str]:
+def example_long_prompts() -> list[str]:
    prompts = []
    for filename in _LONG_PROMPTS:
        prompts += _read_prompts(filename)
@@ -319,13 +275,17 @@ _R = TypeVar("_R")

 class HfRunner:

-    def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
+    def get_default_device(self):
        from vllm.platforms import current_platform
+
+        return ("cpu" if current_platform.is_cpu() else "cuda")
+
+    def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
        if x is None or isinstance(x, (bool, )):
            return x

        if device is None:
-            device = "cpu" if current_platform.is_cpu() else "cuda"
+            device = self.device

        if isinstance(x, dict):
            return {k: self.wrap_device(v, device) for k, v in x.items()}
@@ -338,45 +298,59 @@ class HfRunner:
    def __init__(
        self,
        model_name: str,
-        dtype: str = "half",
+        dtype: str = "auto",
        *,
-        model_kwargs: Optional[Dict[str, Any]] = None,
+        model_kwargs: Optional[dict[str, Any]] = None,
        is_sentence_transformer: bool = False,
        is_cross_encoder: bool = False,
        skip_tokenizer_init: bool = False,
-        auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM,
-        postprocess_inputs: Callable[..., BatchEncoding] = identity,
+        auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
    ) -> None:
-        torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
-
        self.model_name = model_name

+        self.config = AutoConfig.from_pretrained(
+            model_name,
+            trust_remote_code=True,
+        )
+        self.device = self.get_default_device()
+        self.dtype = torch_dtype = _get_and_verify_dtype(self.config, dtype)
+
+        model_kwargs = model_kwargs if model_kwargs is not None else {}
+        model_kwargs.setdefault("torch_dtype", torch_dtype)
+
        if is_sentence_transformer:
            # Lazy init required for AMD CI
            from sentence_transformers import SentenceTransformer
-            self.model = self.wrap_device(
-                SentenceTransformer(
-                    model_name,
-                    device="cpu",
-                    trust_remote_code=True,
-                ).to(dtype=torch_dtype))
+
+            self.model = SentenceTransformer(
+                model_name,
+                device=self.device,
+                model_kwargs=model_kwargs,
+                trust_remote_code=True,
+            )
        elif is_cross_encoder:
            # Lazy init required for AMD CI
            from sentence_transformers import CrossEncoder
-            self.model = CrossEncoder(model_name,
-                                      device="cpu",
-                                      trust_remote_code=True)
-            self.model.model = self.wrap_device(self.model.model)\
-                .to(dtype=torch_dtype)
+
+            self.model = CrossEncoder(
+                model_name,
+                device=self.device,
+                automodel_args=model_kwargs,
+                trust_remote_code=True,
+            )
        else:
-            model_kwargs = model_kwargs if model_kwargs is not None else {}
-            self.model = self.wrap_device(
-                auto_cls.from_pretrained(
-                    model_name,
-                    torch_dtype=torch_dtype,
-                    trust_remote_code=True,
-                    **model_kwargs,
-                ))
+            model = auto_cls.from_pretrained(
+                model_name,
+                trust_remote_code=True,
+                **model_kwargs,
+            )
+
+            if (getattr(model, "quantization_method", None) != "bitsandbytes"
+                    and len({p.device
+                             for p in model.parameters()}) < 2):
+                model = model.to(self.device)
+
+            self.model = model

        if not skip_tokenizer_init:
            self.tokenizer = AutoTokenizer.from_pretrained(
@@ -396,16 +370,13 @@ class HfRunner:
        if skip_tokenizer_init:
            self.tokenizer = self.processor.tokenizer

-        self.dtype = dtype
-        self.postprocess_inputs = postprocess_inputs
-
    def get_inputs(
        self,
-        prompts: List[str],
+        prompts: list[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
-    ) -> List[BatchEncoding]:
+    ) -> list[Union[BatchFeature, BatchEncoding]]:
        if images is not None:
            assert len(prompts) == len(images)

@@ -415,9 +386,9 @@ class HfRunner:
        if audios is not None:
            assert len(prompts) == len(audios)

-        all_inputs: List[BatchEncoding] = []
+        all_inputs: list[Union[BatchFeature, BatchEncoding]] = []
        for i, prompt in enumerate(prompts):
-            processor_kwargs: Dict[str, Any] = {
+            processor_kwargs: dict[str, Any] = {
                "text": prompt,
                "return_tensors": "pt",
            }
@@ -431,13 +402,14 @@ class HfRunner:
                processor_kwargs["sampling_rate"] = sr

            inputs = self.processor(**processor_kwargs)
-            inputs = self.postprocess_inputs(inputs, dtype=self.dtype)
+            if isinstance(inputs, BatchFeature):
+                inputs = inputs.to(dtype=self.dtype)

            all_inputs.append(inputs)

        return all_inputs

-    def classify(self, prompts: List[str]) -> List[str]:
+    def classify(self, prompts: list[str]) -> list[str]:
        # output is final logits
        all_inputs = self.get_inputs(prompts)
        outputs = []
@@ -450,21 +422,21 @@ class HfRunner:

    def generate(
        self,
-        prompts: List[str],
+        prompts: list[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
-    ) -> List[Tuple[List[List[int]], List[str]]]:
+    ) -> list[tuple[list[list[int]], list[str]]]:
        all_inputs = self.get_inputs(prompts,
                                     images=images,
                                     videos=videos,
                                     audios=audios)

-        outputs: List[Tuple[List[List[int]], List[str]]] = []
+        outputs: list[tuple[list[list[int]], list[str]]] = []
        for inputs in all_inputs:
            output_ids = self.model.generate(
-                **self.wrap_device(inputs, device=self.model.device.type),
+                **self.wrap_device(inputs),
                use_cache=True,
                **kwargs,
            )
@@ -479,13 +451,13 @@ class HfRunner:

    def generate_greedy(
        self,
-        prompts: List[str],
+        prompts: list[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
-    ) -> List[Tuple[List[int], str]]:
+    ) -> list[tuple[list[int], str]]:
        outputs = self.generate(prompts,
                                do_sample=False,
                                max_new_tokens=max_tokens,
@@ -499,10 +471,10 @@ class HfRunner:

    def generate_beam_search(
        self,
-        prompts: List[str],
+        prompts: list[str],
        beam_width: int,
        max_tokens: int,
-    ) -> List[Tuple[List[List[int]], List[str]]]:
+    ) -> list[tuple[list[list[int]], list[str]]]:
        outputs = self.generate(prompts,
                                do_sample=False,
                                max_new_tokens=max_tokens,
@@ -520,22 +492,22 @@ class HfRunner:

    def generate_greedy_logprobs(
        self,
-        prompts: List[str],
+        prompts: list[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
-    ) -> List[List[torch.Tensor]]:
+    ) -> list[list[torch.Tensor]]:
        all_inputs = self.get_inputs(prompts,
                                     images=images,
                                     videos=videos,
                                     audios=audios)

-        all_logprobs: List[List[torch.Tensor]] = []
+        all_logprobs: list[list[torch.Tensor]] = []
        for inputs in all_inputs:
            output = self.model.generate(
-                **self.wrap_device(inputs, device=self.model.device.type),
+                **self.wrap_device(inputs),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
@@ -550,11 +522,11 @@ class HfRunner:

    def _hidden_states_to_seq_logprobs(
        self,
-        hidden_states: Tuple[Tuple[torch.Tensor, ...], ...],
-    ) -> List[torch.Tensor]:
+        hidden_states: tuple[tuple[torch.Tensor, ...], ...],
+    ) -> list[torch.Tensor]:
        output_embeddings = self.model.get_output_embeddings()

-        seq_logprobs: List[torch.Tensor] = []
+        seq_logprobs: list[torch.Tensor] = []
        for _, hidden_state in enumerate(hidden_states):
            last_hidden_states = hidden_state[-1][0]
            logits = torch.matmul(
@@ -570,14 +542,14 @@ class HfRunner:

    def _hidden_states_to_logprobs(
        self,
-        hidden_states: Tuple[Tuple[torch.Tensor, ...], ...],
+        hidden_states: tuple[tuple[torch.Tensor, ...], ...],
        num_logprobs: int,
-    ) -> Tuple[List[Dict[int, float]], int]:
+    ) -> tuple[list[dict[int, float]], int]:
        seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states)
        output_len = len(hidden_states)

        # convert to dict
-        seq_logprobs_lst: List[Dict[int, float]] = []
+        seq_logprobs_lst: list[dict[int, float]] = []
        for tok_idx, tok_logprobs in enumerate(seq_logprobs):
            # drop prompt logprobs
            if tok_idx == 0:
@@ -597,26 +569,26 @@ class HfRunner:

    def generate_greedy_logprobs_limit(
        self,
-        prompts: List[str],
+        prompts: list[str],
        max_tokens: int,
        num_logprobs: int,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        **kwargs: Any,
-    ) -> List[TokensTextLogprobs]:
+    ) -> list[TokensTextLogprobs]:
        all_inputs = self.get_inputs(prompts,
                                     images=images,
                                     videos=videos,
                                     audios=audios)

-        all_logprobs: List[List[Dict[int, float]]] = []
-        all_output_ids: List[List[int]] = []
-        all_output_strs: List[str] = []
+        all_logprobs: list[list[dict[int, float]]] = []
+        all_output_ids: list[list[int]] = []
+        all_output_strs: list[str] = []

        for inputs in all_inputs:
            output = self.model.generate(
-                **self.wrap_device(inputs, device=self.model.device.type),
+                **self.wrap_device(inputs),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
@@ -644,51 +616,47 @@ class HfRunner:

    def generate_encoder_decoder_greedy_logprobs_limit(
        self,
-        encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
+        encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
        max_tokens: int,
        num_logprobs: int,
        images: Optional[PromptImageInput] = None,
        **kwargs: Any,
-    ) -> List[TokensTextLogprobs]:
+    ) -> list[TokensTextLogprobs]:
        '''
        Greedy logprobs generation for vLLM encoder/decoder models
        '''

-        all_logprobs: List[List[Dict[int, float]]] = []
-        all_output_ids: List[List[int]] = []
-        all_output_strs: List[str] = []
+        all_logprobs: list[list[dict[int, float]]] = []
+        all_output_ids: list[list[int]] = []
+        all_output_strs: list[str] = []

        for i, (encoder_prompt, decoder_prompt) in enumerate(
                to_enc_dec_tuple_list(encoder_decoder_prompts)):
-            processor_kwargs: Dict[str, Any] = {
+            processor_kwargs: dict[str, Any] = {
                "text": encoder_prompt,
                "return_tensors": "pt",
            }
            if images is not None and images[i] is not None:
                processor_kwargs["images"] = images[i]

-            encoder_input_ids = self.wrap_device(
-                self.processor(**processor_kwargs).input_ids,
-                device=self.model.device.type,
-            )
+            encoder_inputs = self.processor(**processor_kwargs)
+            encoder_inputs = self.wrap_device(encoder_inputs)

            if decoder_prompt is None:
                decoder_input_ids = None
            else:
-                decoder_input_ids = self.wrap_device(
-                    self.tokenizer(decoder_prompt,
-                                   return_tensors="pt").input_ids,
-                    device=self.model.device.type,
-                )
+                decoder_inputs = self.tokenizer(decoder_prompt,
+                                                return_tensors="pt")
+                decoder_input_ids = self.wrap_device(decoder_inputs.input_ids)

            output = self.model.generate(
-                encoder_input_ids,
                decoder_input_ids=decoder_input_ids,
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,
                return_dict_in_generate=True,
+                **encoder_inputs,
                **kwargs,
            )

@@ -708,10 +676,10 @@ class HfRunner:
        return [(output_ids, output_str, output_logprobs)
                for output_ids, output_str, output_logprobs in outputs]

-    def encode(self, prompts: List[str]) -> List[List[torch.Tensor]]:
+    def encode(self, prompts: list[str]) -> list[list[torch.Tensor]]:
        return self.model.encode(prompts)

-    def predict(self, prompts: List[List[str]]) -> torch.Tensor:
+    def predict(self, prompts: list[list[str]]) -> torch.Tensor:
        return self.model.predict(prompts, convert_to_tensor=True)

    def __enter__(self):
@@ -728,6 +696,18 @@ def hf_runner():


 class VllmRunner:
+    """
+    The default value of some arguments have been modified from
+    :class:`~vllm.LLM` as follows:
+
+    - `trust_remote_code`: Set to `True` instead of `False` for convenience.
+    - `seed`: Set to `0` instead of `None` for test reproducibility.
+    - `max_model_len`: Set to `1024` instead of `None` to reduce memory usage.
+    - `block_size`: Set to `16` instead of `None` to reduce memory usage.
+    - `enable_chunked_prefill`: Set to `False` instead of `None` for
+      test reproducibility.
+    - `enforce_eager`: Set to `False` instead of `None` to test CUDA graph.
+    """

    def __init__(
        self,
@@ -735,31 +715,26 @@ class VllmRunner:
        task: TaskOption = "auto",
        tokenizer_name: Optional[str] = None,
        tokenizer_mode: str = "auto",
-        # Use smaller max model length, otherwise bigger model cannot run due
-        # to kv cache size limit.
+        trust_remote_code: bool = True,
+        seed: Optional[int] = 0,
        max_model_len: int = 1024,
-        dtype: str = "half",
+        dtype: str = "auto",
        disable_log_stats: bool = True,
        tensor_parallel_size: int = 1,
        block_size: int = 16,
-        enable_chunked_prefill: bool = False,
+        enable_chunked_prefill: Optional[bool] = False,
        swap_space: int = 4,
        enforce_eager: Optional[bool] = False,
-        load_format: Optional[LoadFormat] = None,
        **kwargs,
    ) -> None:
-        if model_name in MODELS_ON_S3 and not load_format:
-            model_name = (f"{MODEL_WEIGHTS_S3_BUCKET}/{model_name}")
-            load_format = LoadFormat.RUNAI_STREAMER
-        if not load_format:
-            load_format = LoadFormat.AUTO
        self.model = LLM(
            model=model_name,
            task=task,
            tokenizer=tokenizer_name,
            tokenizer_mode=tokenizer_mode,
-            trust_remote_code=True,
+            trust_remote_code=trust_remote_code,
            dtype=dtype,
+            seed=seed,
            swap_space=swap_space,
            enforce_eager=enforce_eager,
            disable_log_stats=disable_log_stats,
@@ -767,17 +742,16 @@ class VllmRunner:
            max_model_len=max_model_len,
            block_size=block_size,
            enable_chunked_prefill=enable_chunked_prefill,
-            load_format=load_format,
            **kwargs,
        )

    def get_inputs(
        self,
-        prompts: List[str],
+        prompts: list[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
-    ) -> List[TextPrompt]:
+    ) -> list[TextPrompt]:
        if images is not None:
            assert len(prompts) == len(images)

@@ -807,13 +781,13 @@ class VllmRunner:

    def generate(
        self,
-        prompts: List[str],
+        prompts: list[str],
        sampling_params: SamplingParams,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
-    ) -> List[Tuple[List[List[int]], List[str]]]:
+    ) -> list[tuple[list[list[int]], list[str]]]:
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
@@ -823,12 +797,12 @@ class VllmRunner:
                                          sampling_params=sampling_params,
                                          **kwargs)

-        outputs: List[Tuple[List[List[int]], List[str]]] = []
+        outputs: list[tuple[list[list[int]], list[str]]] = []
        for req_output in req_outputs:
            prompt_str = req_output.prompt
            prompt_ids = req_output.prompt_token_ids
-            req_sample_output_ids: List[List[int]] = []
-            req_sample_output_strs: List[str] = []
+            req_sample_output_ids: list[list[int]] = []
+            req_sample_output_strs: list[str] = []
            for sample in req_output.outputs:
                output_str = sample.text
                output_ids = list(sample.token_ids)
@@ -839,9 +813,9 @@ class VllmRunner:

    @staticmethod
    def _final_steps_generate_w_logprobs(
-        req_outputs: List[RequestOutput],
-    ) -> List[TokensTextLogprobsPromptLogprobs]:
-        outputs: List[TokensTextLogprobsPromptLogprobs] = []
+        req_outputs: list[RequestOutput],
+    ) -> list[TokensTextLogprobsPromptLogprobs]:
+        outputs: list[TokensTextLogprobsPromptLogprobs] = []
        for req_output in req_outputs:
            assert len(req_output.outputs) > 0
            for sample in req_output.outputs:
@@ -854,14 +828,14 @@ class VllmRunner:

    def generate_w_logprobs(
        self,
-        prompts: List[str],
+        prompts: list[str],
        sampling_params: SamplingParams,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        **kwargs: Any,
-    ) -> Union[List[TokensTextLogprobs],
-               List[TokensTextLogprobsPromptLogprobs]]:
+    ) -> Union[list[TokensTextLogprobs],
+               list[TokensTextLogprobsPromptLogprobs]]:
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
@@ -880,10 +854,10 @@ class VllmRunner:

    def generate_encoder_decoder_w_logprobs(
        self,
-        encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
+        encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
        sampling_params: SamplingParams,
-    ) -> Union[List[TokensTextLogprobs],
-               List[TokensTextLogprobsPromptLogprobs]]:
+    ) -> Union[list[TokensTextLogprobs],
+               list[TokensTextLogprobsPromptLogprobs]]:
        '''
        Logprobs generation for vLLM encoder/decoder models
        '''
@@ -900,13 +874,13 @@ class VllmRunner:

    def generate_greedy(
        self,
-        prompts: List[str],
+        prompts: list[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
-    ) -> List[Tuple[List[int], str]]:
+    ) -> list[tuple[list[int], str]]:
        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
        outputs = self.generate(prompts,
                                greedy_params,
@@ -919,18 +893,18 @@ class VllmRunner:

    def generate_greedy_logprobs(
        self,
-        prompts: List[str],
+        prompts: list[str],
        max_tokens: int,
        num_logprobs: int,
        num_prompt_logprobs: Optional[int] = None,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
-        stop_token_ids: Optional[List[int]] = None,
-        stop: Optional[List[str]] = None,
+        stop_token_ids: Optional[list[int]] = None,
+        stop: Optional[list[str]] = None,
        **kwargs: Any,
-    ) -> Union[List[TokensTextLogprobs],
-               List[TokensTextLogprobsPromptLogprobs]]:
+    ) -> Union[list[TokensTextLogprobs],
+               list[TokensTextLogprobsPromptLogprobs]]:
        greedy_logprobs_params = SamplingParams(
            temperature=0.0,
            max_tokens=max_tokens,
@@ -948,12 +922,12 @@ class VllmRunner:

    def generate_encoder_decoder_greedy_logprobs(
        self,
-        encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
+        encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
        max_tokens: int,
        num_logprobs: int,
        num_prompt_logprobs: Optional[int] = None,
-    ) -> Union[List[TokensTextLogprobs],
-               List[TokensTextLogprobsPromptLogprobs]]:
+    ) -> Union[list[TokensTextLogprobs],
+               list[TokensTextLogprobsPromptLogprobs]]:
        greedy_logprobs_params = SamplingParams(
            temperature=0.0,
            max_tokens=max_tokens,
@@ -969,10 +943,10 @@ class VllmRunner:

    def generate_beam_search(
        self,
-        prompts: Union[List[str], List[List[int]]],
+        prompts: Union[list[str], list[list[int]]],
        beam_width: int,
        max_tokens: int,
-    ) -> List[Tuple[List[List[int]], List[str]]]:
+    ) -> list[tuple[list[list[int]], list[str]]]:
        if is_list_of(prompts, str, check="all"):
            prompts = [TextPrompt(prompt=prompt) for prompt in prompts]
        else:
@@ -989,17 +963,17 @@ class VllmRunner:
            returned_outputs.append((token_ids, texts))
        return returned_outputs

-    def classify(self, prompts: List[str]) -> List[List[float]]:
+    def classify(self, prompts: list[str]) -> list[list[float]]:
        req_outputs = self.model.classify(prompts)
        return [req_output.outputs.probs for req_output in req_outputs]

    def encode(
        self,
-        prompts: List[str],
+        prompts: list[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
-    ) -> List[List[float]]:
+    ) -> list[list[float]]:
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
@@ -1010,9 +984,9 @@ class VllmRunner:

    def score(
        self,
-        text_1: Union[str, List[str]],
-        text_2: Union[str, List[str]],
-    ) -> List[float]:
+        text_1: Union[str, list[str]],
+        text_2: Union[str, list[str]],
+    ) -> list[float]:
        req_outputs = self.model.score(text_1, text_2)
        return [req_output.outputs.score for req_output in req_outputs]