Merge remote-tracking branch 'mirror/releases/v0.9.0' into v0.9.0-ori

4eabe123 · zhuwenwen · 45840cd2 · 58738772 · 4eabe123 · 4eabe123
Commit 4eabe123 authored May 28, 2025 by zhuwenwen
20 changed files
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -33,6 +33,7 @@ num2words # required for smolvlm test
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]==0.4.8 # required for model evaluation test
+mteb>=1.38.11, <2 # required for mteb test
 transformers==4.51.3
 tokenizers==0.21.1
 huggingface-hub[hf_xet]>=0.30.0  # Required for Xet downloads.

--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -99,6 +99,7 @@ datasets==3.0.2
    # via
    #   evaluate
    #   lm-eval
+    #   mteb
 decorator==5.1.1
    # via librosa
 dill==0.3.8
@@ -124,6 +125,8 @@ email-validator==2.2.0
    # via pydantic
 encodec==0.1.1
    # via vocos
+eval-type-backport==0.2.2
+    # via mteb
 evaluate==0.4.3
    # via lm-eval
 fastparquet==2024.11.0
@@ -291,6 +294,8 @@ msgpack==1.1.0
    # via
    #   librosa
    #   ray
+mteb==1.38.11
+    # via -r requirements/test.in
 multidict==6.1.0
    # via
    #   aiohttp
@@ -331,6 +336,7 @@ numpy==1.26.4
    #   librosa
    #   matplotlib
    #   mistral-common
+    #   mteb
    #   numba
    #   numexpr
    #   opencv-python-headless
@@ -443,6 +449,8 @@ plotly==5.24.1
    # via genai-perf
 pluggy==1.5.0
    # via pytest
+polars==1.29.0
+    # via mteb
 pooch==1.8.2
    # via librosa
 portalocker==2.10.1
@@ -476,6 +484,7 @@ pydantic==2.9.2
    # via
    #   datamodel-code-generator
    #   mistral-common
+    #   mteb
 pydantic-core==2.23.4
    # via pydantic
 pygments==2.18.0
@@ -522,6 +531,8 @@ python-dateutil==2.9.0.post0
    #   typepy
 python-rapidjson==1.20
    # via tritonclient
+pytrec-eval-terrier==0.5.7
+    # via mteb
 pytz==2024.2
    # via
    #   pandas
@@ -564,6 +575,7 @@ requests==2.32.3
    #   huggingface-hub
    #   lm-eval
    #   mistral-common
+    #   mteb
    #   pooch
    #   ray
    #   responses
@@ -580,6 +592,7 @@ rfc3987==1.3.8
 rich==13.9.4
    # via
    #   genai-perf
+    #   mteb
    #   typer
 rouge-score==0.1.2
    # via lm-eval
@@ -607,16 +620,20 @@ scikit-learn==1.5.2
    # via
    #   librosa
    #   lm-eval
+    #   mteb
    #   sentence-transformers
 scipy==1.13.1
    # via
    #   librosa
+    #   mteb
    #   scikit-learn
    #   sentence-transformers
    #   statsmodels
    #   vocos
 sentence-transformers==3.2.1
-    # via -r requirements/test.in
+    # via
+    #   -r requirements/test.in
+    #   mteb
 sentencepiece==0.2.0
    # via mistral-common
 setuptools==77.0.3
@@ -696,6 +713,7 @@ torch==2.7.0+cu128
    #   fastsafetensors
    #   lm-eval
    #   mamba-ssm
+    #   mteb
    #   peft
    #   runai-model-streamer
    #   sentence-transformers
@@ -720,6 +738,7 @@ tqdm==4.66.6
    #   evaluate
    #   huggingface-hub
    #   lm-eval
+    #   mteb
    #   nltk
    #   peft
    #   pqdm
@@ -759,6 +778,7 @@ typing-extensions==4.12.2
    #   huggingface-hub
    #   librosa
    #   mistral-common
+    #   mteb
    #   pqdm
    #   pydantic
    #   pydantic-core

--- a/requirements/tpu.txt
+++ b/requirements/tpu.txt
@@ -18,9 +18,9 @@ setuptools==78.1.0
 --find-links https://storage.googleapis.com/libtpu-releases/index.html
 --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
 --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
-torch==2.8.0.dev20250430
-torchvision==0.22.0.dev20250430
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250430-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250430-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250430-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
+torch==2.8.0.dev20250518
+torchvision==0.22.0.dev20250518
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250518-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250518-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250518-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"

--- a/setup.py
+++ b/setup.py
@@ -5,12 +5,12 @@ import importlib.util
 import json
 import logging
 import os
-import re
 import subprocess
 import sys
 from pathlib import Path
 from shutil import which

+import regex as re
 import torch
 from packaging.version import Version, parse
 from setuptools import Extension, setup
@@ -389,7 +389,6 @@ class repackage_wheel(build_ext):
            # vllm_flash_attn python code:
            # Regex from
            #  `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)`
-            import re
            compiled_regex = re.compile(
                r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
            file_members += list(

--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -8,12 +8,13 @@ import weakref
 from unittest.mock import Mock

 import pytest
+import torch

-from vllm import LLM
+from vllm import LLM, envs
 from vllm.platforms import current_platform
 from vllm.v1.engine.llm_engine import LLMEngine as LLMEngineV1

-from ..conftest import VllmRunner
+from ..conftest import HfRunner, VllmRunner
 from ..models.utils import check_outputs_equal
 from ..utils import multi_gpu_test

@@ -43,11 +44,26 @@ def test_vllm_gc_ed():
    assert weak_llm() is None


+def _fix_prompt_embed_outputs(
+        vllm_outputs: list[tuple[list[int], str]], hf_model: HfRunner,
+        example_prompts: list[str]) -> list[tuple[list[int], str]]:
+    """Splice the HF prompt back into outputs generated from prompt embeds.
+
+    For every prompt, the returned token-id list is the HF input ids for
+    that prompt followed by the vLLM ids located past the prompt length,
+    and the returned text is the prompt string followed by the vLLM text.
+    Presumably needed because vLLM cannot recover the prompt tokens/text
+    when it is only given embeddings -- TODO confirm.
+
+    Args:
+        vllm_outputs: ``(token_ids, text)`` pairs, one per prompt.
+        hf_model: Runner whose ``get_inputs`` yields the tokenized prompts.
+        example_prompts: The prompt strings, in the same order.
+
+    Returns:
+        A new list of ``(token_ids, text)`` pairs including the prompt.
+    """
+    hf_inputs = hf_model.get_inputs(example_prompts)
+    repaired = []
+    for output, hf_input, prompt in zip(vllm_outputs, hf_inputs,
+                                        example_prompts):
+        # input_ids is indexed as a batch; take its first (only) row.
+        prompt_ids = hf_input["input_ids"].tolist()[0]
+        generated_ids = output[0][len(prompt_ids):]
+        repaired.append((prompt_ids + generated_ids, prompt + output[1]))
+    return repaired
+
+
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("backend", ["FLASH_ATTN"])
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [5])
 @pytest.mark.parametrize("enforce_eager", [False])
+@pytest.mark.parametrize("enable_prompt_embeds", [True, False])
 def test_models(
    monkeypatch: pytest.MonkeyPatch,
    hf_runner,
@@ -56,8 +72,13 @@ def test_models(
    dtype: str,
    max_tokens: int,
    enforce_eager: bool,
+    enable_prompt_embeds: bool,
 ) -> None:

+    if enable_prompt_embeds and envs.is_set(
+            "VLLM_USE_V1") and envs.VLLM_USE_V1:
+        pytest.skip("enable_prompt_embeds is not supported in v1.")
+
    if backend == "FLASHINFER" and current_platform.is_rocm():
        pytest.skip("Flashinfer does not support ROCm/HIP.")

@@ -78,14 +99,25 @@ def test_models(

        with hf_runner(model, dtype=dtype) as hf_model:
            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+            if enable_prompt_embeds:
+                with torch.no_grad():
+                    prompt_embeds = hf_model.get_prompt_embeddings(
+                        example_prompts)

        with VllmRunner(model,
                        max_model_len=8192,
                        dtype=dtype,
                        enforce_eager=enforce_eager,
+                        enable_prompt_embeds=enable_prompt_embeds,
                        gpu_memory_utilization=0.7) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(example_prompts,
-                                                      max_tokens)
+            if enable_prompt_embeds:
+                vllm_outputs = vllm_model.generate_greedy(
+                    prompt_embeds, max_tokens)
+                vllm_outputs = _fix_prompt_embed_outputs(
+                    vllm_outputs, hf_model, example_prompts)
+            else:
+                vllm_outputs = vllm_model.generate_greedy(
+                    example_prompts, max_tokens)

        check_outputs_equal(
            outputs_0_lst=hf_outputs,
@@ -108,6 +140,7 @@ def test_models(
        ("distilbert/distilgpt2", "mp", "FLASHINFER", "A100"),
        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
    ])
+@pytest.mark.parametrize("enable_prompt_embeds", [True, False])
 def test_models_distributed(
    monkeypatch: pytest.MonkeyPatch,
    hf_runner,
@@ -117,14 +150,22 @@ def test_models_distributed(
    distributed_executor_backend: str,
    attention_backend: str,
    test_suite: str,
+    enable_prompt_embeds: bool,
 ) -> None:

+    if enable_prompt_embeds and envs.is_set(
+            "VLLM_USE_V1") and envs.VLLM_USE_V1:
+        pytest.skip("enable_prompt_embeds is not supported in v1.")
+
    if test_suite != TARGET_TEST_SUITE:
        pytest.skip(f"Skip test for {test_suite}")

    with monkeypatch.context() as monkeypatch_context:
        if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
-            # test Ray Compiled Graph
+            if enable_prompt_embeds:
+                pytest.skip(
+                    "enable_prompt_embeds does not work with ray compiled dag."
+                )
            monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
            monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")

@@ -147,12 +188,26 @@ def test_models_distributed(
                dtype=dtype,
                tensor_parallel_size=2,
                distributed_executor_backend=distributed_executor_backend,
+                enable_prompt_embeds=enable_prompt_embeds,
+                gpu_memory_utilization=0.7,
        ) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(example_prompts,
-                                                      max_tokens)
-
+            if enable_prompt_embeds:
                with hf_runner(model, dtype=dtype) as hf_model:
-            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+                    with torch.no_grad():
+                        prompt_embeds = hf_model.get_prompt_embeddings(
+                            example_prompts)
+                    vllm_outputs = vllm_model.generate_greedy(
+                        prompt_embeds, max_tokens)
+                    vllm_outputs = _fix_prompt_embed_outputs(
+                        vllm_outputs, hf_model, example_prompts)
+                    hf_outputs = hf_model.generate_greedy(
+                        example_prompts, max_tokens)
+            else:
+                vllm_outputs = vllm_model.generate_greedy(
+                    example_prompts, max_tokens)
+                with hf_runner(model, dtype=dtype) as hf_model:
+                    hf_outputs = hf_model.generate_greedy(
+                        example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=hf_outputs,

--- a/tests/compile/backend.py
+++ b/tests/compile/backend.py
@@ -5,6 +5,8 @@ from typing import Callable, Union

 from torch import fx

+from vllm.compilation.fx_utils import (find_specified_fn,
+                                       find_specified_fn_maybe)
 from vllm.compilation.inductor_pass import InductorPass
 from vllm.config import get_current_vllm_config

@@ -44,3 +46,19 @@ class TestBackend:
        self.graph_post_pass = deepcopy(graph)
        # assign by reference, will reflect the final state of the graph
        self.final_graph = graph
+
+    def check_before_ops(self,
+                         ops,
+                         find_fn=find_specified_fn,
+                         find_fn_maybe=find_specified_fn_maybe,
+                         ops_fully_replaced=True):
+        """Check the ops that the model contains before the pass runs.
+
+        Args:
+            ops: Ops to look up in the pre-pass graph.
+            find_fn: Lookup called as ``find_fn(nodes, op)``; presumably it
+                fails when the op is missing -- confirm in fx_utils.
+            find_fn_maybe: Lookup whose result is compared against ``None``.
+            ops_fully_replaced: When true, additionally assert that each op
+                can no longer be found in the post-pass graph.
+        """
+        for op in ops:
+            find_fn(self.graph_pre_pass.nodes, op)
+            if not ops_fully_replaced:
+                continue
+            assert find_fn_maybe(self.graph_post_pass.nodes, op) is None
+
+    def check_after_ops(self,
+                        ops,
+                        find_fn=find_specified_fn,
+                        find_fn_maybe=find_specified_fn_maybe):
+        """Check the ops that the pass is expected to introduce.
+
+        Each op is looked up in the post-pass graph with ``find_fn`` and must
+        not be found in the pre-pass graph (``find_fn_maybe`` returns None).
+
+        Args:
+            ops: Ops expected only after the pass.
+            find_fn: Lookup called as ``find_fn(nodes, op)``; presumably it
+                fails when the op is missing -- confirm in fx_utils.
+            find_fn_maybe: Lookup whose result is compared against ``None``.
+        """
+        for op in ops:
+            find_fn(self.graph_post_pass.nodes, op)
+            found_before = find_fn_maybe(self.graph_pre_pass.nodes, op)
+            assert found_before is None
--- a/tests/compile/test_async_tp.py
+++ b/tests/compile/test_async_tp.py
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+
+import pytest
+import torch
+
+import vllm.envs as envs
+from vllm.compilation.collective_fusion import AsyncTPPass
+from vllm.config import (CompilationConfig, DeviceConfig, ModelConfig,
+                         PassConfig, VllmConfig)
+from vllm.distributed import (tensor_model_parallel_all_gather,
+                              tensor_model_parallel_reduce_scatter)
+from vllm.distributed.parallel_state import (init_distributed_environment,
+                                             initialize_model_parallel)
+from vllm.platforms import current_platform
+from vllm.utils import update_environment_variables
+
+from ..models.registry import HF_EXAMPLE_MODELS
+from ..utils import (compare_two_settings, create_new_process_for_each_test,
+                     multi_gpu_test)
+from .backend import TestBackend
+
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+
+
+class TestMMRSModel(torch.nn.Module):
+    """Toy module whose forward is a matmul followed by a reduce-scatter."""
+
+    def __init__(self, hidden_size=16):
+        super().__init__()
+        self.hidden_size = hidden_size
+        # Frozen weight of shape (2 * hidden_size, hidden_size), filled with
+        # normally distributed values right below.
+        weight = torch.empty((self.hidden_size * 2, hidden_size))
+        self.gate_proj = torch.nn.Parameter(weight, requires_grad=False)
+        torch.nn.init.normal_(self.gate_proj, std=0.02)
+
+    def forward(self, hidden_states):
+        """Produce the mm + reduce-scatter sequence in the FX graph."""
+        # Flatten to 2-D so the input can be fed to torch.mm.
+        flat = hidden_states.reshape(-1, self.hidden_size)
+        transposed_weight = self.gate_proj.permute(1, 0)
+        product = torch.mm(flat, transposed_weight)
+        return tensor_model_parallel_reduce_scatter(product, dim=0)
+
+    def ops_in_model_before(self):
+        """Return the ops looked up in the graph before the pass."""
+        return [torch.ops.vllm.reduce_scatter.default]
+
+    def ops_in_model_after(self):
+        """Return the fused ops looked up in the graph after the pass."""
+        return [torch.ops.symm_mem.fused_matmul_reduce_scatter.default]
+
+
+class TestAGMMModel(torch.nn.Module):
+    """Toy module whose forward is an all-gather followed by a matmul."""
+
+    def __init__(self, hidden_size=16):
+        super().__init__()
+        self.hidden_size = hidden_size
+        # Frozen square weight, filled with normally distributed values
+        # right below.
+        weight = torch.empty((hidden_size, hidden_size))
+        self.weight = torch.nn.Parameter(weight, requires_grad=False)
+        torch.nn.init.normal_(self.weight, std=0.02)
+
+    def forward(self, hidden_states):
+        """Produce the all-gather + mm sequence in the FX graph."""
+        # Flatten to 2-D so the gathered result can be fed to torch.mm.
+        flat = hidden_states.reshape(-1, self.hidden_size)
+        gathered = tensor_model_parallel_all_gather(flat, dim=0)
+        transposed_weight = self.weight.permute(1, 0)
+        return torch.mm(gathered, transposed_weight)
+
+    def ops_in_model_before(self):
+        """Return the ops looked up in the graph before the pass."""
+        return [torch.ops.vllm.all_gather.default]
+
+    def ops_in_model_after(self):
+        """Return the fused ops looked up in the graph after the pass."""
+        return [torch.ops.symm_mem.fused_all_gather_matmul.default]
+
+
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize("test_model", [TestMMRSModel, TestAGMMModel])
+@pytest.mark.parametrize("batch_size", [8])
+@pytest.mark.parametrize("seq_len", [16])
+@pytest.mark.parametrize("hidden_size", [16])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"],
+                    reason="Only test on CUDA")
+def test_async_tp_pass_replace(test_model: type[torch.nn.Module],
+                               batch_size: int, seq_len: int,
+                               hidden_size: int, dtype: torch.dtype):
+    """Run the AsyncTPPass graph check on two spawned worker processes.
+
+    Args:
+        test_model: The toy module *class* (not an instance or a name); the
+            worker instantiates it.
+        batch_size: Forwarded to the worker to size the input.
+        seq_len: Forwarded to the worker to size the input.
+        hidden_size: Forwarded to the worker and to the model constructor.
+        dtype: Forwarded to the worker as the tensor dtype.
+    """
+    num_processes = 2
+
+    # need to use torch.mp.spawn otherwise will have problems with
+    # torch.distributed and cuda. spawn() passes the process index as the
+    # first positional argument, followed by ``args``.
+    worker_args = (num_processes, test_model, batch_size, seq_len,
+                   hidden_size, dtype)
+    torch.multiprocessing.spawn(async_tp_pass_on_test_model,
+                                args=worker_args,
+                                nprocs=num_processes)
+
+
+def async_tp_pass_on_test_model(local_rank: int, world_size: int,
+                                test_model_cls: type[torch.nn.Module],
+                                batch_size: int, seq_len: int,
+                                hidden_size: int, dtype: torch.dtype):
+    """Per-rank worker: compile a toy model with AsyncTPPass and check it.
+
+    Args:
+        local_rank: Index of this process; also used as the CUDA device
+            index and as RANK/LOCAL_RANK.
+        world_size: Number of processes; used as the tensor-parallel size.
+        test_model_cls: Module class, instantiated as
+            ``test_model_cls(hidden_size)``.
+        batch_size: With ``seq_len``, gives the number of input rows.
+        seq_len: See ``batch_size``.
+        hidden_size: Width of the input and of the model.
+        dtype: Default dtype and dtype of the input tensor.
+    """
+    current_platform.seed_everything(0)
+
+    device = torch.device(f"cuda:{local_rank}")
+    torch.cuda.set_device(device)
+    torch.set_default_device(device)
+    torch.set_default_dtype(dtype)
+
+    # NOTE(review): the rendezvous port is fixed, so this may collide with
+    # another job using port 12345 on the same host -- confirm acceptable.
+    update_environment_variables({
+        'RANK': str(local_rank),
+        'LOCAL_RANK': str(local_rank),
+        'WORLD_SIZE': str(world_size),
+        'MASTER_ADDR': 'localhost',
+        'MASTER_PORT': '12345',
+    })
+
+    # initialize distributed
+    init_distributed_environment()
+    initialize_model_parallel(tensor_model_parallel_size=world_size)
+
+    # configure vllm config for AsyncTPPass
+    vllm_config = VllmConfig()
+    vllm_config.compilation_config = CompilationConfig(pass_config=PassConfig(
+        enable_async_tp=True, ), )
+    vllm_config.device_config = DeviceConfig(device=torch.device("cuda"))
+
+    # this is a fake model name to construct the model config
+    # in the vllm_config, it's not really used.
+    model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"
+    vllm_config.model_config = ModelConfig(model=model_name,
+                                           task="auto",
+                                           tokenizer=model_name,
+                                           tokenizer_mode="auto",
+                                           trust_remote_code=True,
+                                           dtype=dtype,
+                                           seed=42)
+
+    async_tp_pass = AsyncTPPass(vllm_config)
+    backend = TestBackend(async_tp_pass)
+
+    model = test_model_cls(hidden_size)
+
+    hidden_states = torch.randn((batch_size * seq_len, hidden_size),
+                                dtype=dtype,
+                                requires_grad=False)
+
+    compiled_model = torch.compile(model, backend=backend)
+    compiled_model(hidden_states)
+
+    # In pre-nodes, all gather or reduce scatter should exist.
+    # ops_fully_replaced=False: do not additionally require that these ops
+    # are absent from the post-pass graph.
+    backend.check_before_ops(model.ops_in_model_before(),
+                             ops_fully_replaced=False)
+
+    # In post-nodes, fused_matmul_reduce_scatter or
+    # fused_all_gather_matmul should exist; in pre-nodes they should not.
+    backend.check_after_ops(model.ops_in_model_after())
+
+
+@create_new_process_for_each_test()
+@pytest.mark.parametrize("model_id", ["meta-llama/Llama-3.2-1B-Instruct"])
+@pytest.mark.parametrize("tp_size", [2])
+@pytest.mark.parametrize("async_tp_enabled", [True])
+@pytest.mark.parametrize("distributed_backend", ["mp"])
+@pytest.mark.parametrize("eager_mode", [False, True])
+def test_async_tp_pass_correctness(
+    model_id: str,
+    tp_size: int,
+    async_tp_enabled: bool,
+    distributed_backend: str,
+    eager_mode: bool,
+    num_gpus_available: int,
+):
+    """Compare generation with and without the async-TP compilation pass.
+
+    Two argument sets are built for the same model: one passing a
+    compilation config with ``enable_async_tp`` set, and a plain
+    tensor-parallel baseline. Both are handed to ``compare_two_settings``
+    with ``method="generate"``.
+    """
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
+    model_info.check_transformers_version(on_fail="skip")
+    model_info.check_available_online(on_fail="skip")
+
+    pp_size = 1
+    # Keep the condition in step with the message, which mentions both sizes.
+    if num_gpus_available < tp_size * pp_size:
+        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
+
+    common_args = [
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "2048",
+        "--max-num-seqs",
+        "8",
+    ]
+    if eager_mode:
+        common_args.append("--enforce-eager")
+
+    compilation_config = {
+        'level': 3,
+        'compile_sizes': [2, 4, 8],
+        'splitting_ops': [],
+        'pass_config': {
+            'enable_async_tp': async_tp_enabled
+        },
+    }
+
+    # Separate dict objects so a mutation of one environment mapping cannot
+    # leak into the other setting.
+    async_tp_env = {
+        "VLLM_USE_V1": "1",
+    }
+    tp_env = dict(async_tp_env)
+
+    async_tp_args = [
+        *common_args,
+        "--tensor-parallel-size",
+        str(tp_size),
+        "--distributed-executor-backend",
+        distributed_backend,
+        "--compilation_config",
+        json.dumps(compilation_config),
+    ]
+
+    tp_args = [
+        *common_args,
+        "--tensor-parallel-size",
+        str(tp_size),
+        "--distributed-executor-backend",
+        "mp",
+    ]
+
+    compare_two_settings(model_id,
+                         async_tp_args,
+                         tp_args,
+                         async_tp_env,
+                         tp_env,
+                         method="generate")
--- a/tests/compile/test_fusion.py
+++ b/tests/compile/test_fusion.py
@@ -29,6 +29,10 @@ class TestModel(torch.nn.Module):
        self.cutlass_fp8_enabled = cutlass_fp8_enabled
        self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)]
        self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(2)]
+        self.key = QuantKey(dtype=FP8_DTYPE,
+                            static=static,
+                            per_tensor=static,
+                            symmetric=True)
        if static:
            self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(2)]
        else:
@@ -59,6 +63,15 @@ class TestModel(torch.nn.Module):
        y3, resid = self.norm[2](x3, resid)  # use resid here
        return y3

+    def ops_in_model_before(self):
+        """Return the quant op registered in QUANT_OPS for ``self.key``."""
+        quant_op = QUANT_OPS[self.key]
+        return [quant_op]
+
+    def ops_in_model_after(self):
+        """Return the two FUSED_OPS entries for ``self.key``.
+
+        The entries are keyed by ``FusedRMSQuantKey(self.key, False)`` and
+        ``FusedRMSQuantKey(self.key, True)``, returned in that order.
+        """
+        rms_quant_key = FusedRMSQuantKey(self.key, False)
+        add_rms_quant_key = FusedRMSQuantKey(self.key, True)
+        return [FUSED_OPS[rms_quant_key], FUSED_OPS[add_rms_quant_key]]
+

 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("hidden_size", [64, 3392, 4096])
@@ -107,25 +120,10 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,

        torch.testing.assert_close(result, result2, atol=ATOL, rtol=RTOL)

-        # Check substitution worked
-        pre_nodes = backend.graph_pre_pass.nodes
-        post_nodes = backend.graph_post_pass.nodes
-
-        # static is per-tensor, dynamic is per-token
-        key = QuantKey(dtype=FP8_DTYPE,
-                       static=static,
-                       per_tensor=static,
-                       symmetric=True)
-        rms_quant = FUSED_OPS[FusedRMSQuantKey(key, False)]
-        add_rms_quant = FUSED_OPS[FusedRMSQuantKey(key, True)]
-        fp8_quant = QUANT_OPS[key]
-
        # In pre-nodes, fp8 quant should be there and fused kernels should not
-        assert find_auto_fn_maybe(pre_nodes, rms_quant) is None
-        assert find_auto_fn_maybe(pre_nodes, add_rms_quant) is None
-        find_auto_fn(pre_nodes, fp8_quant)
+        backend.check_before_ops(model.ops_in_model_before(), find_auto_fn,
+                                 find_auto_fn_maybe)

        # In post-nodes, fused kernels should be there and fp8 quant should not
-        find_auto_fn(post_nodes, rms_quant)
-        find_auto_fn(post_nodes, add_rms_quant)
-        assert find_auto_fn_maybe(post_nodes, fp8_quant) is None
+        backend.check_after_ops(model.ops_in_model_after(), find_auto_fn,
+                                find_auto_fn_maybe)
--- a/tests/compile/test_sequence_parallelism.py
+++ b/tests/compile/test_sequence_parallelism.py
@@ -5,9 +5,7 @@ import torch

 import vllm.envs as envs
 from vllm.compilation.fix_functionalization import FixFunctionalizationPass
-from vllm.compilation.fx_utils import (find_auto_fn, find_auto_fn_maybe,
-                                       find_specified_fn,
-                                       find_specified_fn_maybe, is_func)
+from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func
 from vllm.compilation.sequence_parallelism import SequenceParallelismPass
 from vllm.config import (CompilationConfig, DeviceConfig, ModelConfig,
                         PassConfig, VllmConfig)
@@ -21,17 +19,6 @@ from vllm.utils import update_environment_variables
 from ..utils import multi_gpu_test
 from .backend import TestBackend

-OPS_IN_MODEL_BEFORE = [
-    torch.ops.vllm.all_reduce.default,
-]
-
-OPS_IN_MODEL_AFTER = [
-    torch.ops.vllm.reduce_scatter.default,
-    torch.ops.vllm.all_gather.default,
-]
-
-OPS_IN_MODEL = [torch.ops._C.fused_add_rms_norm.default]
-
 prompts = [
    "Hello, my name is",
    "The president of the United States is",
@@ -78,6 +65,18 @@ class TestModel(torch.nn.Module):

        return norm_output, residual_output

+    def ops_in_model_before(self):
+        """Return the ops looked up in the graph before the pass."""
+        all_reduce = torch.ops.vllm.all_reduce.default
+        return [all_reduce]
+
+    def ops_in_model_after(self):
+        """Return the ops looked up in the graph after the pass."""
+        reduce_scatter = torch.ops.vllm.reduce_scatter.default
+        all_gather = torch.ops.vllm.all_gather.default
+        return [reduce_scatter, all_gather]
+
+    def ops_in_model(self):
+        """Return the ops used by the functionalization checks."""
+        fused_add_rms_norm = torch.ops._C.fused_add_rms_norm.default
+        return [fused_add_rms_norm]
+

 @multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize("batch_size", [8])
@@ -156,26 +155,16 @@ def sequence_parallelism_pass_on_test_model(local_rank: int, world_size: int,
    compiled_model_func = torch.compile(model, backend=backend_func)
    compiled_model_func(hidden_states, residual)

-    # Check substitution worked
-    pre_nodes = backend_no_func.graph_pre_pass.nodes
-    post_nodes = backend_no_func.graph_post_pass.nodes
-
    # In pre-nodes, all reduce should be there,
    # reduce scatter and all gather should not
-    for op in OPS_IN_MODEL_BEFORE:
-        find_specified_fn(pre_nodes, op)
-    for op in OPS_IN_MODEL_AFTER:
-        assert find_specified_fn_maybe(pre_nodes, op) is None
+    backend_no_func.check_before_ops(model.ops_in_model_before())

    # In post-nodes, reduce scatter and all gather should be there,
    # all reduce should not
-    for op in OPS_IN_MODEL_AFTER:
-        find_specified_fn(post_nodes, op)
-    for op in OPS_IN_MODEL_BEFORE:
-        assert find_specified_fn_maybe(post_nodes, op) is None
+    backend_no_func.check_after_ops(model.ops_in_model_after())

    # check if the functionalization pass is applied
-    for op in OPS_IN_MODEL:
+    for op in model.ops_in_model():
        find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
        assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes,
                                  op) is None  # noqa: E501
@@ -183,7 +172,7 @@ def sequence_parallelism_pass_on_test_model(local_rank: int, world_size: int,
    # make sure the ops were all de-functionalized
    found = dict()
    for node in backend_func.graph_post_pass.nodes:
-        for op in OPS_IN_MODEL:
+        for op in model.ops_in_model():
            if is_func(node, op):
                found[op] = True
-    assert all(found[op] for op in OPS_IN_MODEL)
+    assert all(found[op] for op in model.ops_in_model())
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -430,6 +430,15 @@ class HfRunner:

        return all_inputs

+    def get_prompt_embeddings(self, prompts: list[str]) -> list[torch.Tensor]:
+        """Return one input-embedding tensor per prompt.
+
+        Each prompt's ``input_ids`` are passed through ``wrap_device`` and
+        then through the model's input embedding layer; ``squeeze(0)`` is
+        applied to the result (presumably dropping a batch dimension of
+        size 1 -- confirm against ``get_inputs``).
+        """
+        prompt_embeds = []
+        for hf_input in self.get_inputs(prompts):
+            token_ids = self.wrap_device(hf_input)["input_ids"]
+            embed_layer = self.model.get_input_embeddings()
+            prompt_embeds.append(embed_layer(token_ids).squeeze(0))
+        return prompt_embeds
+
    def classify(self, prompts: list[str]) -> list[str]:
        # output is final logits
        all_inputs = self.get_inputs(prompts)

--- a/tests/distributed/test_events.py
+++ b/tests/distributed/test_events.py
@@ -119,13 +119,12 @@ def test_topic_filtering(publisher_config):
    """
    publisher_config.replay_endpoint = None

-    cfg = publisher_config.model_copy()
-    cfg.topic = "foo"
-    pub = EventPublisherFactory.create(cfg)
+    publisher_config.topic = "foo"
+    pub = EventPublisherFactory.create(publisher_config)

    from .conftest import MockSubscriber
-    sub_foo = MockSubscriber(cfg.endpoint, None, "foo")
-    sub_bar = MockSubscriber(cfg.endpoint, None, "bar")
+    sub_foo = MockSubscriber(publisher_config.endpoint, None, "foo")
+    sub_bar = MockSubscriber(publisher_config.endpoint, None, "bar")

    try:
        time.sleep(0.1)

--- a/tests/distributed/test_shm_broadcast.py
+++ b/tests/distributed/test_shm_broadcast.py
@@ -9,7 +9,7 @@ import torch.distributed as dist

 from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
 from vllm.distributed.utils import StatelessProcessGroup
-from vllm.utils import get_ip, get_open_port, update_environment_variables
+from vllm.utils import get_open_port, update_environment_variables


 def get_arrays(n: int, seed: int = 0) -> list[np.ndarray]:
@@ -60,12 +60,12 @@ def worker_fn():
    rank = dist.get_rank()
    if rank == 0:
        port = get_open_port()
-        ip = get_ip()
+        ip = '127.0.0.1'
        dist.broadcast_object_list([ip, port], src=0)
    else:
        recv = [None, None]
        dist.broadcast_object_list(recv, src=0)
-        ip, port = recv
+        ip, port = recv  # type: ignore

    stateless_pg = StatelessProcessGroup.create(ip, port, rank,
                                                dist.get_world_size())
@@ -107,10 +107,10 @@ def worker_fn():

        if pg == dist.group.WORLD:
            dist.barrier()
-            print("torch distributed passed the test!")
+            print(f"torch distributed passed the test! Rank {rank}")
        else:
            pg.barrier()
-            print("StatelessProcessGroup passed the test!")
+            print(f"StatelessProcessGroup passed the test! Rank {rank}")


 def test_shm_broadcast():

--- a/tests/entrypoints/llm/test_guided_generate.py
+++ b/tests/entrypoints/llm/test_guided_generate.py
 # SPDX-License-Identifier: Apache-2.0

 import json
-import re
 import weakref
 from enum import Enum

 import jsonschema
 import pytest
+import regex as re
 from pydantic import BaseModel

 from vllm.distributed import cleanup_dist_env_and_memory

--- a/tests/entrypoints/openai/correctness/test_mteb.py
+++ b/tests/entrypoints/openai/correctness/test_mteb.py
+# SPDX-License-Identifier: Apache-2.0
+import os
+
+import pytest
+
+from tests.models.language.pooling.mteb_utils import (MTEB_EMBED_TASKS,
+                                                      OpenAIClientMtebEncoder,
+                                                      run_mteb_embed_task,
+                                                      run_mteb_embed_task_st)
+from tests.utils import RemoteOpenAIServer
+
+os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
+
+MODEL_NAME = "BAAI/bge-m3"
+DTYPE = "float16"
+MAIN_SCORE = 0.7873427091972599
+
+
+@pytest.fixture(scope="module")
+def server():
+    """Module-scoped fixture yielding a RemoteOpenAIServer for MODEL_NAME."""
+    server_args = [
+        "--task",
+        "embed",
+        "--dtype",
+        DTYPE,
+        "--enforce-eager",
+        "--max-model-len",
+        "512",
+    ]
+
+    # The context manager stays open while the dependent tests run.
+    with RemoteOpenAIServer(MODEL_NAME, server_args) as remote_server:
+        yield remote_server
+
+
+def test_mteb(server):
+    """Check that the served model's MTEB score matches the reference.
+
+    The reference is ``MAIN_SCORE`` when it is truthy; otherwise it is
+    computed with ``run_mteb_embed_task_st``. The two scores must agree
+    within a relative tolerance of 1e-4.
+    """
+    encoder = OpenAIClientMtebEncoder(MODEL_NAME, server.get_client())
+    vllm_main_score = run_mteb_embed_task(encoder, MTEB_EMBED_TASKS)
+
+    if MAIN_SCORE:
+        st_main_score = MAIN_SCORE
+    else:
+        st_main_score = run_mteb_embed_task_st(MODEL_NAME, MTEB_EMBED_TASKS)
+
+    print("VLLM main score: ", vllm_main_score)
+    print("SentenceTransformer main score: ", st_main_score)
+    print("Difference: ", st_main_score - vllm_main_score)
+
+    assert st_main_score == pytest.approx(vllm_main_score, rel=1e-4)
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -2,13 +2,13 @@

 # imports for guided decoding tests
 import json
-import re
 from typing import Optional

 import jsonschema
 import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
+import regex as re
 import requests
 import torch
 from openai import BadRequestError, OpenAI

--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
 # SPDX-License-Identifier: Apache-2.0
-
 # imports for guided decoding tests
 import json
-import re
 import shutil
 from tempfile import TemporaryDirectory
 from typing import Optional
@@ -11,6 +9,7 @@ import jsonschema
 import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
+import regex as re
 # downloading lora to test lora requests
 from huggingface_hub import snapshot_download
 from openai import BadRequestError

--- a/tests/entrypoints/openai/test_openai_schema.py
+++ b/tests/entrypoints/openai/test_openai_schema.py
 # SPDX-License-Identifier: Apache-2.0
+from typing import Final
+
 import pytest
 import schemathesis
+from hypothesis import settings
 from schemathesis import GenerationConfig

 from ...utils import RemoteOpenAIServer
@@ -9,6 +12,8 @@ schemathesis.experimental.OPEN_API_3_1.enable()

 MODEL_NAME = "HuggingFaceTB/SmolVLM-256M-Instruct"
 MAXIMUM_IMAGES = 2
+DEFAULT_TIMEOUT_SECONDS: Final[int] = 10
+LONG_TIMEOUT_SECONDS: Final[int] = 60


 @pytest.fixture(scope="module")
@@ -42,8 +47,58 @@ def get_schema(server):
 schema = schemathesis.from_pytest_fixture("get_schema")


+@schemathesis.hook
+def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
+    """Schemathesis hook that filters the case-generation strategy.
+
+    Returns `strategy` filtered by `no_file_type`, so generated cases that
+    the filter rejects are never sent to the server.
+    """
+    op = context.operation
+    assert op is not None
+
+    def no_file_type(case: schemathesis.models.Case):
+        """
+        This filter skips test cases for the `POST /tokenize` endpoint where the
+        HTTP request body uses `"type": "file"` in any message's content.
+        We expect these cases to fail because that type isn't implemented here
+        https://github.com/vllm-project/vllm/blob/0b34593017953051b3225b1483ce0f4670e3eb0e/vllm/entrypoints/chat_utils.py#L1038-L1095
+
+        Example test cases that are skipped:
+        curl -X POST -H 'Content-Type: application/json' \
+            -d '{"messages": [{"role": "assistant"}, {"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
+            http://localhost:8000/tokenize
+
+        curl -X POST -H 'Content-Type: application/json' \
+            -d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
+            http://localhost:8000/tokenize
+        """  # noqa: E501
+        # Only POST /tokenize is filtered; every other operation is kept.
+        if op.method.lower() != "post" or op.path != "/tokenize":
+            return True
+
+        body = getattr(case, "body", None)
+        if not isinstance(body, dict):
+            return True
+
+        messages = body.get("messages")
+        if not isinstance(messages, list):
+            return True
+
+        for message in messages:
+            if not isinstance(message, dict):
+                continue
+            content = message.get("content", [])
+            if not isinstance(content, list):
+                continue
+            # Content items may not be dicts (messages are already guarded
+            # the same way), so check the type before calling .get() to
+            # avoid an AttributeError inside the filter.
+            if any(
+                    isinstance(item, dict) and item.get("type") == "file"
+                    for item in content):
+                return False
+        return True
+
+    return strategy.filter(no_file_type)
+
+
 @schema.parametrize()
 @schema.override(headers={"Content-Type": "application/json"})
+# Hypothesis deadlines are expressed in milliseconds, hence the * 1000.
+@settings(deadline=LONG_TIMEOUT_SECONDS * 1000)
 def test_openapi_stateless(case: schemathesis.Case):
+    # Look up the per-request timeout by (HTTP method, path); operations not
+    # listed in the mapping fall back to DEFAULT_TIMEOUT_SECONDS.
+    key = (
+        case.operation.method.upper(),
+        case.operation.path,
+    )
+    timeout = {
+        # requires a longer timeout
+        ("POST", "/v1/chat/completions"):
+        LONG_TIMEOUT_SECONDS,
+    }.get(key, DEFAULT_TIMEOUT_SECONDS)
+
    #No need to verify SSL certificate for localhost
-    case.call_and_validate(verify=False)
+    case.call_and_validate(verify=False, timeout=timeout)
--- a/tests/entrypoints/openai/test_prompt_validation.py
+++ b/tests/entrypoints/openai/test_prompt_validation.py
 # SPDX-License-Identifier: Apache-2.0

 # imports for guided decoding tests
-import re
-
 import openai
 import pytest
+import regex as re

 from ...utils import RemoteOpenAIServer

@@ -32,7 +31,7 @@ async def test_out_of_vocab_token_ids():
        client = remote_server.get_async_client()

        with pytest.raises(openai.BadRequestError,
-                           match=re.compile('.*out of vocabulary.*')):
+                           match=re.compile('.*out of vocabulary.*').pattern):
            await client.completions.create(model=model_name,
                                            prompt=[999999],
                                            max_tokens=5,
@@ -46,9 +45,10 @@ async def test_reject_multistep_with_guided_decoding():
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()

-        with pytest.raises(openai.BadRequestError,
+        with pytest.raises(
+                openai.BadRequestError,
                match=re.compile(
-                               '.*Guided decoding .* multi-step decoding.*')):
+                    '.*Guided decoding .* multi-step decoding.*').pattern):
            await client.completions.create(
                model=model_name,
                prompt="Hello",

--- a/tests/entrypoints/openai/test_score.py
+++ b/tests/entrypoints/openai/test_score.py
 # SPDX-License-Identifier: Apache-2.0
-
-import math
 from typing import Any

 import pytest
@@ -92,7 +90,7 @@ class TestModel:
        hf_outputs = run_transformers(runner, model, text_pairs)

        for i in range(len(vllm_outputs)):
-            assert math.isclose(hf_outputs[i], vllm_outputs[i], rel_tol=0.01)
+            assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)

    def test_text_1_list_text_2_list(self, server: RemoteOpenAIServer,
                                     model: dict[str, Any], runner):
@@ -124,7 +122,7 @@ class TestModel:
        hf_outputs = run_transformers(runner, model, text_pairs)

        for i in range(len(vllm_outputs)):
-            assert math.isclose(hf_outputs[i], vllm_outputs[i], rel_tol=0.01)
+            assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)

    def test_text_1_str_text_2_str(self, server: RemoteOpenAIServer,
                                   model: dict[str, Any], runner):
@@ -150,7 +148,7 @@ class TestModel:
        hf_outputs = run_transformers(runner, model, text_pairs)

        for i in range(len(vllm_outputs)):
-            assert math.isclose(hf_outputs[i], vllm_outputs[i], rel_tol=0.01)
+            assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)

    def test_score_max_model_len(self, server: RemoteOpenAIServer,
                                 model: dict[str, Any]):

--- a/tests/entrypoints/openai/test_tensorizer_entrypoint.py
+++ b/tests/entrypoints/openai/test_tensorizer_entrypoint.py
+# SPDX-License-Identifier: Apache-2.0
+import gc
+import json
+import tempfile
+
+import openai
+import pytest
+import pytest_asyncio
+import torch.cuda
+
+from vllm.engine.arg_utils import EngineArgs
+from vllm.model_executor.model_loader.tensorizer import (
+    TensorizerConfig, tensorize_lora_adapter, tensorize_vllm_model)
+
+from ...utils import RemoteOpenAIServer
+
+MODEL_NAME = "unsloth/llama-3.2-1b-Instruct"
+LORA_PATH = "davzoku/finqa_adapter_1b"
+
+
+def _cleanup():
+    """Run a garbage-collection pass, then call torch.cuda.empty_cache()."""
+    gc.collect()
+    torch.cuda.empty_cache()
+
+
+@pytest.fixture(autouse=True)
+def cleanup():
+    """Autouse fixture that calls _cleanup() during setup of every test.
+
+    There is no yield, so nothing runs at teardown.
+    """
+    _cleanup()
+
+
+@pytest.fixture(scope='module')
+def tmp_dir():
+    """Yield a temporary directory shared by the module.
+
+    The TemporaryDirectory context is exited at module teardown.
+    """
+    with tempfile.TemporaryDirectory() as scratch_dir:
+        yield scratch_dir
+
+
+@pytest.fixture(scope='module')
+def model_uri(tmp_dir):
+    """Yield the ``model.tensors`` path inside `tmp_dir`.
+
+    Other fixtures in this module pass it as the tensorizer URI.
+    """
+    tensors_path = f"{tmp_dir}/model.tensors"
+    yield tensors_path
+
+
+@pytest.fixture(scope="module")
+def tensorize_model_and_lora(tmp_dir, model_uri):
+    """Tensorize the LoRA adapter and then the model, once per module.
+
+    Both calls share one TensorizerConfig built from `model_uri` and
+    `tmp_dir`. The fixture yields nothing; tests depend on it only for its
+    side effects.
+    """
+    config = TensorizerConfig(tensorizer_uri=model_uri, lora_dir=tmp_dir)
+    engine_args = EngineArgs(model=MODEL_NAME, device="cuda")
+
+    tensorize_lora_adapter(LORA_PATH, config)
+    tensorize_vllm_model(engine_args, config)
+
+    # Call _cleanup() explicitly: the autouse cleanup() fixture is not
+    # guaranteed to run after this fixture when a test requests it.
+    _cleanup()
+    yield
+
+
+@pytest.fixture(scope="module")
+def server(model_uri, tensorize_model_and_lora):
+    """Start one server for MODEL_NAME using the tensorizer load format.
+
+    Requesting `tensorize_model_and_lora` ensures that fixture has run
+    before the server starts. The server is shared by the whole module.
+    """
+    extra_config_json = json.dumps({"tensorizer_uri": model_uri})
+
+    ## Start OpenAI API server
+    cli_args = [
+        "--load-format",
+        "tensorizer",
+        "--device",
+        "cuda",
+        "--model-loader-extra-config",
+        extra_config_json,
+        "--enable-lora",
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client(server):
+    """Yield an async client obtained from `server`.
+
+    The async-with context is exited at fixture teardown.
+    """
+    async with server.get_async_client() as api_client:
+        yield api_client
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
+    _cleanup()
+    completion = await client.completions.create(model=model_name,
+                                                 prompt="Hello, my name is",
+                                                 max_tokens=5,
+                                                 temperature=0.0)
+
+    assert completion.id is not None
+    assert completion.choices is not None and len(completion.choices) == 1
+    assert completion.model == MODEL_NAME
+    assert len(completion.choices) == 1
+    assert len(completion.choices[0].text) >= 5
+    assert completion.choices[0].finish_reason == "length"
+    assert completion.usage == openai.types.CompletionUsage(
+        completion_tokens=5, prompt_tokens=6, total_tokens=11)