sync v0.15.1(tests)

78c1f9e5 · zhuwenwen · 86a65417 · 78c1f9e5 · 78c1f9e5 · 78c1f9e5
Commit 78c1f9e5 authored Feb 05, 2026 by zhuwenwen
20 changed files
--- a/tests/compile/fullgraph/test_basic_correctness.py
+++ b/tests/compile/fullgraph/test_basic_correctness.py
@@ -74,7 +74,7 @@ class TestSetting:
        ),
        pytest.param(
            TestSetting(
-                model="BAAI/bge-base-en-v1.5",
+                model=os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5"),
                model_args=["--runner", "pooling"],
                pp_size=1,
                tp_size=1,

--- a/tests/compile/test_config.py
+++ b/tests/compile/test_config.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import copy
-import logging
 from contextlib import nullcontext
 from unittest.mock import MagicMock, patch
@@ -19,7 +18,6 @@ from vllm.config import (
 )
 from vllm.config.compilation import CompilationMode, PassConfig
 from vllm.engine.arg_utils import EngineArgs
-from vllm.logger import _print_warning_once
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import (
    _is_torch_equal_or_newer,

--- a/tests/entrypoints/openai/test_chat_error.py
+++ b/tests/entrypoints/openai/test_chat_error.py
@@ -42,7 +42,6 @@ class MockModelConfig:
    tokenizer_revision = None
    multimodal_config = MultiModalConfig()
    hf_config = MockHFConfig()
-    hf_text_config = MockHFConfig()
    logits_processor_pattern = None
    logits_processors: list[str] | None = None
    diff_sampling_param: dict | None = None

--- a/tests/entrypoints/openai/test_chat_template.py
+++ b/tests/entrypoints/openai/test_chat_template.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import pytest
-import os
-from vllm.config import ModelConfig
-from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest
-from vllm.tokenizers import get_tokenizer
-from ...models.registry import HF_EXAMPLE_MODELS
-from ...utils import VLLM_PATH, models_path_prefix
-chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
-assert chatml_jinja_path.exists()
-# Define models, templates, and their corresponding expected outputs
-MODEL_TEMPLATE_GENERATION_OUTPUT = [
-    (
-        os.path.join(models_path_prefix, "facebook/opt-125m"),
-        chatml_jinja_path,
-        True,
-        False,
-        """<|im_start|>user
-Hello<|im_end|>
-<|im_start|>assistant
-Hi there!<|im_end|>
-<|im_start|>user
-What is the capital of<|im_end|>
-<|im_start|>assistant
-""",
-    ),
-    (
-        os.path.join(models_path_prefix, "facebook/opt-125m"),
-        chatml_jinja_path,
-        False,
-        False,
-        """<|im_start|>user
-Hello<|im_end|>
-<|im_start|>assistant
-Hi there!<|im_end|>
-<|im_start|>user
-What is the capital of""",
-    ),
-    (
-        os.path.join(models_path_prefix, "facebook/opt-125m"),
-        chatml_jinja_path,
-        False,
-        True,
-        """<|im_start|>user
-Hello<|im_end|>
-<|im_start|>assistant
-Hi there!<|im_end|>
-<|im_start|>user
-What is the capital of<|im_end|>
-<|im_start|>assistant
-The capital of""",
-    ),
-]
-TEST_MESSAGES = [
-    {"role": "user", "content": "Hello"},
-    {"role": "assistant", "content": "Hi there!"},
-    {"role": "user", "content": "What is the capital of"},
-]
-ASSISTANT_MESSAGE_TO_CONTINUE = {"role": "assistant", "content": "The capital of"}
-def test_load_chat_template():
-    # Testing chatml template
-    template_content = load_chat_template(chat_template=chatml_jinja_path)
-    # Test assertions
-    assert template_content is not None
-    # Hard coded value for template_chatml.jinja
-    assert (
-        template_content
-        == """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %}
-{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}"""  # noqa: E501
-    )
-def test_no_load_chat_template_filelike():
-    # Testing chatml template
-    template = "../../examples/does_not_exist"
-    with pytest.raises(ValueError, match="looks like a file path"):
-        load_chat_template(chat_template=template)
-def test_no_load_chat_template_literallike():
-    # Testing chatml template
-    template = "{{ messages }}"
-    template_content = load_chat_template(chat_template=template)
-    assert template_content == template
-@pytest.mark.parametrize(
-    "model,template,add_generation_prompt,continue_final_message,expected_output",
-    MODEL_TEMPLATE_GENERATION_OUTPUT,
-)
-def test_get_gen_prompt(
-    model, template, add_generation_prompt, continue_final_message, expected_output
-):
-    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
-    model_info.check_available_online(on_fail="skip")
-    model_config = ModelConfig(
-        model,
-        tokenizer=model_info.tokenizer or model,
-        tokenizer_mode=model_info.tokenizer_mode,
-        trust_remote_code=model_info.trust_remote_code,
-        revision=model_info.revision,
-        hf_overrides=model_info.hf_overrides,
-        skip_tokenizer_init=model_info.require_embed_inputs,
-        enable_prompt_embeds=model_info.require_embed_inputs,
-        enable_mm_embeds=model_info.require_embed_inputs,
-        enforce_eager=model_info.enforce_eager,
-        dtype=model_info.dtype,
-    )
-    # Initialize the tokenizer
-    tokenizer = get_tokenizer(
-        tokenizer_name=model_config.tokenizer,
-        trust_remote_code=model_config.trust_remote_code,
-    )
-    template_content = load_chat_template(chat_template=template)
-    # Create a mock request object using keyword arguments
-    mock_request = ChatCompletionRequest(
-        model=model,
-        messages=TEST_MESSAGES + [ASSISTANT_MESSAGE_TO_CONTINUE]
-        if continue_final_message
-        else TEST_MESSAGES,
-        add_generation_prompt=add_generation_prompt,
-        continue_final_message=continue_final_message,
-    )
-    # Call the function and get the result
-    result = apply_hf_chat_template(
-        tokenizer=tokenizer,
-        conversation=mock_request.messages,
-        chat_template=mock_request.chat_template or template_content,
-        model_config=model_config,
-        tools=None,
-        add_generation_prompt=mock_request.add_generation_prompt,
-        continue_final_message=mock_request.continue_final_message,
-    )
-    # Test assertion
-    assert result == expected_output, (
-        f"The generated prompt does not match the expected output for "
-        f"model {model} and template {template}"
-    )
--- a/tests/entrypoints/pooling/openai/test_embedding.py
+++ b/tests/entrypoints/pooling/openai/test_embedding.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import base64
-import numpy as np
-import openai
-import pytest
-import os
-import pytest_asyncio
-import requests
-import torch
-import torch.nn.functional as F
-from tests.models.language.pooling.embed_utils import (
-    run_embedding_correctness_test)
-from tests.models.utils import check_embeddings_close
-from tests.utils import RemoteOpenAIServer
-from vllm.entrypoints.openai.protocol import EmbeddingResponse
-from vllm.transformers_utils.tokenizer import get_tokenizer
-from utils import RemoteOpenAIServer, models_path_prefix
-MODEL_NAME = os.path.join(models_path_prefix, "intfloat/multilingual-e5-small")
-DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501
-DTYPE = "bfloat16"
-@pytest.fixture(scope="module")
-def server():
-    args = [
-        "--runner",
-        "pooling",
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        DTYPE,
-        "--enforce-eager",
-        "--max-model-len",
-        "512",
-        "--chat-template",
-        DUMMY_CHAT_TEMPLATE,
-    ]
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-        yield remote_server
-@pytest_asyncio.fixture
-async def client(server):
-    async with server.get_async_client() as async_client:
-        yield async_client
-@pytest.fixture(scope="module")
-def hf_model(hf_runner):
-    with hf_runner(MODEL_NAME, dtype=DTYPE,
-                   is_sentence_transformer=True) as hf_model:
-        yield hf_model
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_single_embedding(hf_model, client: openai.AsyncOpenAI,
-                                model_name: str):
-    input_texts = [
-        "The chef prepared a delicious meal.",
-    ]
-    # test single embedding
-    embedding_response = await client.embeddings.create(
-        model=model_name,
-        input=input_texts,
-        encoding_format="float",
-    )
-    embeddings = EmbeddingResponse.model_validate(
-        embedding_response.model_dump(mode="json"))
-    assert embeddings.id is not None
-    assert len(embeddings.data) == 1
-    assert len(embeddings.data[0].embedding) == 384
-    assert embeddings.usage.completion_tokens == 0
-    assert embeddings.usage.prompt_tokens == 11
-    assert embeddings.usage.total_tokens == 11
-    vllm_outputs = [d.embedding for d in embeddings.data]
-    run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)
-    # test using token IDs
-    input_tokens = [1, 1, 1, 1, 1]
-    embedding_response = await client.embeddings.create(
-        model=model_name,
-        input=input_tokens,
-        encoding_format="float",
-    )
-    embeddings = EmbeddingResponse.model_validate(
-        embedding_response.model_dump(mode="json"))
-    assert embeddings.id is not None
-    assert len(embeddings.data) == 1
-    assert len(embeddings.data[0].embedding) == 384
-    assert embeddings.usage.completion_tokens == 0
-    assert embeddings.usage.prompt_tokens == 5
-    assert embeddings.usage.total_tokens == 5
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI,
-                               model_name: str):
-    # test list[str]
-    input_texts = [
-        "The cat sat on the mat.", "A feline was resting on a rug.",
-        "Stars twinkle brightly in the night sky."
-    ]
-    embedding_response = await client.embeddings.create(
-        model=model_name,
-        input=input_texts,
-        encoding_format="float",
-    )
-    embeddings = EmbeddingResponse.model_validate(
-        embedding_response.model_dump(mode="json"))
-    assert embeddings.id is not None
-    assert len(embeddings.data) == 3
-    assert len(embeddings.data[0].embedding) == 384
-    assert embeddings.usage.completion_tokens == 0
-    assert embeddings.usage.prompt_tokens == 33
-    assert embeddings.usage.total_tokens == 33
-    vllm_outputs = [d.embedding for d in embeddings.data]
-    run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)
-    # test list[list[int]]
-    input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
-                    [25, 32, 64, 77]]
-    embedding_response = await client.embeddings.create(
-        model=model_name,
-        input=input_tokens,
-        encoding_format="float",
-    )
-    embeddings = EmbeddingResponse.model_validate(
-        embedding_response.model_dump(mode="json"))
-    assert embeddings.id is not None
-    assert len(embeddings.data) == 4
-    assert len(embeddings.data[0].embedding) == 384
-    assert embeddings.usage.completion_tokens == 0
-    assert embeddings.usage.prompt_tokens == 17
-    assert embeddings.usage.total_tokens == 17
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_conversation_embedding(server: RemoteOpenAIServer,
-                                      client: openai.AsyncOpenAI,
-                                      model_name: str):
-    messages = [{
-        "role": "user",
-        "content": "The cat sat on the mat.",
-    }, {
-        "role": "assistant",
-        "content": "A feline was resting on a rug.",
-    }, {
-        "role": "user",
-        "content": "Stars twinkle brightly in the night sky.",
-    }]
-    chat_response = requests.post(
-        server.url_for("v1/embeddings"),
-        json={
-            "model": model_name,
-            "messages": messages,
-            "encoding_format": "float",
-        },
-    )
-    chat_response.raise_for_status()
-    chat_embeddings = EmbeddingResponse.model_validate(chat_response.json())
-    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
-    prompt = tokenizer.apply_chat_template(
-        messages,
-        chat_template=DUMMY_CHAT_TEMPLATE,
-        add_generation_prompt=True,
-        continue_final_message=False,
-        tokenize=False,
-    )
-    completion_response = await client.embeddings.create(
-        model=model_name,
-        input=prompt,
-        encoding_format="float",
-        # To be consistent with chat
-        extra_body={"add_special_tokens": False},
-    )
-    completion_embeddings = EmbeddingResponse.model_validate(
-        completion_response.model_dump(mode="json"))
-    assert chat_embeddings.id is not None
-    assert completion_embeddings.id is not None
-    assert chat_embeddings.created <= completion_embeddings.created
-    assert chat_embeddings.model_dump(
-        exclude={"id", "created"}) == (completion_embeddings.model_dump(
-            exclude={"id", "created"}))
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_batch_base64_embedding(hf_model, client: openai.AsyncOpenAI,
-                                      model_name: str):
-    input_texts = [
-        "Hello my name is",
-        "The best thing about vLLM is that it supports many different models"
-    ]
-    responses_float = await client.embeddings.create(input=input_texts,
-                                                     model=model_name,
-                                                     encoding_format="float")
-    float_data = [d.embedding for d in responses_float.data]
-    run_embedding_correctness_test(hf_model, input_texts, float_data)
-    responses_base64 = await client.embeddings.create(input=input_texts,
-                                                      model=model_name,
-                                                      encoding_format="base64")
-    base64_data = []
-    for data in responses_base64.data:
-        base64_data.append(
-            np.frombuffer(base64.b64decode(data.embedding),
-                          dtype="float32").tolist())
-    run_embedding_correctness_test(hf_model, input_texts, base64_data)
-    # Default response is float32 decoded from base64 by OpenAI Client
-    responses_default = await client.embeddings.create(input=input_texts,
-                                                       model=model_name)
-    default_data = [d.embedding for d in responses_default.data]
-    run_embedding_correctness_test(hf_model, input_texts, default_data)
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_single_embedding_truncation(client: openai.AsyncOpenAI,
-                                           model_name: str):
-    input_texts = [
-        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
-    ]
-    # test single embedding
-    embedding_response = await client.embeddings.create(
-        model=model_name,
-        input=input_texts,
-        extra_body={"truncate_prompt_tokens": 10})
-    embeddings = EmbeddingResponse.model_validate(
-        embedding_response.model_dump(mode="json"))
-    assert embeddings.id is not None
-    assert len(embeddings.data) == 1
-    assert len(embeddings.data[0].embedding) == 384
-    assert embeddings.usage.completion_tokens == 0
-    assert embeddings.usage.prompt_tokens == 10
-    assert embeddings.usage.total_tokens == 10
-    input_tokens = [
-        1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728,
-        9901, 340, 2229, 385, 340, 315, 28741, 28804, 2
-    ]
-    embedding_response = await client.embeddings.create(
-        model=model_name,
-        input=input_tokens,
-        extra_body={"truncate_prompt_tokens": 10})
-    embeddings = EmbeddingResponse.model_validate(
-        embedding_response.model_dump(mode="json"))
-    assert embeddings.id is not None
-    assert len(embeddings.data) == 1
-    assert len(embeddings.data[0].embedding) == 384
-    assert embeddings.usage.completion_tokens == 0
-    assert embeddings.usage.prompt_tokens == 10
-    assert embeddings.usage.total_tokens == 10
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI,
-                                                   model_name: str):
-    input_texts = [
-        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
-    ]
-    with pytest.raises(openai.BadRequestError):
-        response = await client.embeddings.create(
-            model=model_name,
-            input=input_texts,
-            extra_body={"truncate_prompt_tokens": 8193})
-        assert "error" in response.object
-        assert "truncate_prompt_tokens value is greater than max_model_len. "\
-               "Please, select a smaller truncation size." in response.message
-@pytest.mark.asyncio
-async def test_invocations(server: RemoteOpenAIServer,
-                           client: openai.AsyncOpenAI):
-    input_texts = [
-        "The chef prepared a delicious meal.",
-    ]
-    request_args = {
-        "model": MODEL_NAME,
-        "input": input_texts,
-        "encoding_format": "float",
-    }
-    completion_response = await client.embeddings.create(**request_args)
-    invocation_response = requests.post(server.url_for("invocations"),
-                                        json=request_args)
-    invocation_response.raise_for_status()
-    completion_output = completion_response.model_dump()
-    invocation_output = invocation_response.json()
-    assert completion_output.keys() == invocation_output.keys()
-    for completion_data, invocation_data in zip(completion_output["data"],
-                                                invocation_output["data"]):
-        assert completion_data.keys() == invocation_data.keys()
-        check_embeddings_close(embeddings_0_lst=[completion_data["embedding"]],
-                               embeddings_1_lst=[invocation_data["embedding"]],
-                               name_0="completion",
-                               name_1="invocation")
-@pytest.mark.asyncio
-async def test_invocations_conversation(server: RemoteOpenAIServer):
-    messages = [{
-        "role": "user",
-        "content": "The cat sat on the mat.",
-    }, {
-        "role": "assistant",
-        "content": "A feline was resting on a rug.",
-    }, {
-        "role": "user",
-        "content": "Stars twinkle brightly in the night sky.",
-    }]
-    request_args = {
-        "model": MODEL_NAME,
-        "messages": messages,
-        "encoding_format": "float",
-    }
-    chat_response = requests.post(server.url_for("v1/embeddings"),
-                                  json=request_args)
-    chat_response.raise_for_status()
-    invocation_response = requests.post(server.url_for("invocations"),
-                                        json=request_args)
-    invocation_response.raise_for_status()
-    chat_output = chat_response.json()
-    invocation_output = invocation_response.json()
-    assert chat_output.keys() == invocation_output.keys()
-    for chat_data, invocation_data in zip(chat_output["data"],
-                                          invocation_output["data"]):
-        assert chat_data.keys() == invocation_data.keys()
-        check_embeddings_close(embeddings_0_lst=[chat_data["embedding"]],
-                               embeddings_1_lst=[invocation_data["embedding"]],
-                               name_0="chat",
-                               name_1="invocation")
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_normalize(server: RemoteOpenAIServer, model_name: str):
-    input_text = ["The chef prepared a delicious meal."]
-    async def get_outputs(normalize):
-        request_args = {
-            "model": MODEL_NAME,
-            "input": input_text,
-            "encoding_format": "float",
-            "normalize": normalize
-        }
-        response = requests.post(server.url_for("v1/embeddings"),
-                                 json=request_args)
-        outputs = response.json()
-        return torch.tensor([x['embedding'] for x in outputs["data"]])
-    default = await get_outputs(normalize=None)
-    w_normal = await get_outputs(normalize=True)
-    wo_normal = await get_outputs(normalize=False)
-    assert torch.allclose(default, w_normal,
-                          atol=1e-2), "Default should use normal."
-    assert not torch.allclose(w_normal, wo_normal,
-                              atol=1e-2), "wo_normal should not use normal."
-    assert torch.allclose(
-        w_normal, F.normalize(wo_normal, p=2, dim=-1),
-        atol=1e-2), "w_normal should be close to normal(wo_normal)."
--- a/tests/entrypoints/pooling/openai/test_score.py
+++ b/tests/entrypoints/pooling/openai/test_score.py
-# SPDX-License-Identifier: Apache-2.0
-import os
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Any
-import pytest
-import requests
-import torch
-import torch.nn.functional as F
-from torch import tensor
-from tests.utils import RemoteOpenAIServer
-from vllm.entrypoints.openai.protocol import ScoreResponse
-from utils import models_path_prefix
-MODELS = [
-    {
-        "name": os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-m3"),
-        "is_cross_encoder": True
-    },
-    {
-        "name": "BAAI/bge-base-en-v1.5",
-        "is_cross_encoder": False
-    },
-]
-DTYPE = "half"
-def run_transformers(hf_model, model, text_pairs):
-    if model["is_cross_encoder"]:
-        return hf_model.predict(text_pairs).tolist()
-    else:
-        hf_embeddings = [
-            hf_model.encode(text_pair) for text_pair in text_pairs
-        ]
-        return [
-            F.cosine_similarity(tensor(pair[0]), tensor(pair[1]), dim=0)
-            for pair in hf_embeddings
-        ]
-@pytest.fixture(scope="class", params=MODELS)
-def model(request):
-    yield request.param
-@pytest.fixture(scope="class")
-def server(model: dict[str, Any]):
-    args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
-    with RemoteOpenAIServer(model["name"], args) as remote_server:
-        yield remote_server
-@pytest.fixture(scope="class")
-def runner(model: dict[str, Any], hf_runner):
-    kwargs = {
-        "dtype": DTYPE,
-        "is_cross_encoder" if model["is_cross_encoder"]\
-              else "is_sentence_transformer": True
-    }
-    with hf_runner(model["name"], **kwargs) as hf_model:
-        yield hf_model
-class TestModel:
-    def test_text_1_str_text_2_list(self, server: RemoteOpenAIServer,
-                                    model: dict[str, Any], runner):
-        text_1 = "What is the capital of France?"
-        text_2 = [
-            "The capital of Brazil is Brasilia.",
-            "The capital of France is Paris."
-        ]
-        score_response = requests.post(server.url_for("score"),
-                                       json={
-                                           "model": model["name"],
-                                           "text_1": text_1,
-                                           "text_2": text_2,
-                                       })
-        score_response.raise_for_status()
-        score = ScoreResponse.model_validate(score_response.json())
-        assert score.id is not None
-        assert score.data is not None
-        assert len(score.data) == 2
-        vllm_outputs = [d.score for d in score.data]
-        text_pairs = [[text_1, text_2[0]], [text_1, text_2[1]]]
-        hf_outputs = run_transformers(runner, model, text_pairs)
-        for i in range(len(vllm_outputs)):
-            assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
-    def test_text_1_list_text_2_list(self, server: RemoteOpenAIServer,
-                                     model: dict[str, Any], runner):
-        text_1 = [
-            "What is the capital of the United States?",
-            "What is the capital of France?"
-        ]
-        text_2 = [
-            "The capital of Brazil is Brasilia.",
-            "The capital of France is Paris."
-        ]
-        score_response = requests.post(server.url_for("score"),
-                                       json={
-                                           "model": model["name"],
-                                           "text_1": text_1,
-                                           "text_2": text_2,
-                                       })
-        score_response.raise_for_status()
-        score = ScoreResponse.model_validate(score_response.json())
-        assert score.id is not None
-        assert score.data is not None
-        assert len(score.data) == 2
-        vllm_outputs = [d.score for d in score.data]
-        text_pairs = [[text_1[0], text_2[0]], [text_1[1], text_2[1]]]
-        hf_outputs = run_transformers(runner, model, text_pairs)
-        for i in range(len(vllm_outputs)):
-            assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
-    def test_text_1_str_text_2_str(self, server: RemoteOpenAIServer,
-                                   model: dict[str, Any], runner):
-        text_1 = "What is the capital of France?"
-        text_2 = "The capital of France is Paris."
-        score_response = requests.post(server.url_for("score"),
-                                       json={
-                                           "model": model["name"],
-                                           "text_1": text_1,
-                                           "text_2": text_2,
-                                       })
-        score_response.raise_for_status()
-        score = ScoreResponse.model_validate(score_response.json())
-        assert score.id is not None
-        assert score.data is not None
-        assert len(score.data) == 1
-        vllm_outputs = [d.score for d in score.data]
-        text_pairs = [[text_1, text_2]]
-        hf_outputs = run_transformers(runner, model, text_pairs)
-        for i in range(len(vllm_outputs)):
-            assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
-    def test_score_max_model_len(self, server: RemoteOpenAIServer,
-                                 model: dict[str, Any]):
-        text_1 = "What is the capital of France?" * 20
-        text_2 = [
-            "The capital of Brazil is Brasilia.",
-            "The capital of France is Paris."
-        ]
-        score_response = requests.post(server.url_for("score"),
-                                       json={
-                                           "model": model["name"],
-                                           "text_1": text_1,
-                                           "text_2": text_2,
-                                       })
-        assert score_response.status_code == 400
-        # Assert just a small fragments of the response
-        assert "Please reduce the length of the input." in \
-            score_response.text
-        # Test truncation
-        score_response = requests.post(server.url_for("score"),
-                                       json={
-                                           "model": model["name"],
-                                           "text_1": text_1,
-                                           "text_2": text_2,
-                                           "truncate_prompt_tokens": 101
-                                       })
-        assert score_response.status_code == 400
-        assert "Please, select a smaller truncation size." in \
-            score_response.text
-    def test_invocations(self, server: RemoteOpenAIServer, model: dict[str,
-                                                                       Any]):
-        text_1 = "What is the capital of France?"
-        text_2 = "The capital of France is Paris."
-        request_args = {
-            "model": model["name"],
-            "text_1": text_1,
-            "text_2": text_2,
-        }
-        score_response = requests.post(server.url_for("score"),
-                                       json=request_args)
-        score_response.raise_for_status()
-        invocation_response = requests.post(server.url_for("invocations"),
-                                            json=request_args)
-        invocation_response.raise_for_status()
-        score_output = score_response.json()
-        invocation_output = invocation_response.json()
-        assert score_output.keys() == invocation_output.keys()
-        for score_data, invocation_data in zip(score_output["data"],
-                                               invocation_output["data"]):
-            assert score_data.keys() == invocation_data.keys()
-            assert score_data["score"] == pytest.approx(
-                invocation_data["score"], rel=0.05)
-            # TODO: reset this tolerance to 0.01 once we find
-            # an alternative to flash_attn with bfloat16
-    def test_activation(self, server: RemoteOpenAIServer, model: dict[str,
-                                                                      Any]):
-        def get_outputs(activation):
-            text_1 = "What is the capital of France?"
-            text_2 = "The capital of France is Paris."
-            response = requests.post(server.url_for("score"),
-                                     json={
-                                         "model": model["name"],
-                                         "text_1": text_1,
-                                         "text_2": text_2,
-                                         "activation": activation
-                                     })
-            if response.status_code != 200:
-                return response
-            outputs = response.json()
-            return torch.tensor([x['score'] for x in outputs["data"]])
-        if model["is_cross_encoder"]:
-            default = get_outputs(activation=None)
-            w_activation = get_outputs(activation=True)
-            wo_activation = get_outputs(activation=False)
-            assert torch.allclose(default, w_activation,
-                                  atol=1e-2), "Default should use activation."
-            assert not torch.allclose(
-                w_activation, wo_activation,
-                atol=1e-2), "wo_activation should not use activation."
-            assert torch.allclose(
-                F.sigmoid(wo_activation), w_activation, atol=1e-2
-            ), "w_activation should be close to activation(wo_activation)."
-        else:
-            get_outputs(activation=None)
-            # The activation parameter only works for the is_cross_encoder model
-            response = get_outputs(activation=True)
-            assert response.status_code == 400
--- a/tests/kernels/attention/test_attention.py
+++ b/tests/kernels/attention/test_attention.py
@@ -15,9 +15,6 @@ from vllm.platforms import current_platform
 from vllm.utils.mem_utils import get_max_shared_memory_bytes
 from vllm.utils.torch_utils import set_random_seed
-if current_platform.is_rocm():
-    from flash_attn import vllm_flash_attn_with_kvcache
 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 # This will change depending on the compute capability.
 # - 512 as a buffer

--- a/tests/kernels/attention/test_cache.py
+++ b/tests/kernels/attention/test_cache.py
@@ -162,7 +162,6 @@ def test_reshape_and_cache(
        torch.testing.assert_close(key_cache, cloned_key_cache)
        torch.testing.assert_close(value_cache, cloned_value_cache)
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("num_heads", NUM_HEADS)
 @pytest.mark.parametrize("head_size", HEAD_SIZES)

--- a/tests/kernels/attention/test_flashmla.py
+++ b/tests/kernels/attention/test_flashmla.py
@@ -104,16 +104,18 @@ def test_flash_mla(
        descale_k = None
    def flash_mla():
-        return flash_mla_with_kvcache(q,
+        return flash_mla_with_kvcache(
-                                      blocked_k,
+            q,
-                                      block_table,
+            blocked_k,
-                                      cache_seqlens,
+            block_table,
-                                      dv,
+            cache_seqlens,
-                                      tile_scheduler_metadata,
+            dv,
-                                      num_splits,
+            tile_scheduler_metadata,
-                                      causal=causal,
+            num_splits,
-                                      descale_q=descale_q,
+            causal=causal,
-                                      descale_k=descale_k)
+            descale_q=descale_q,
+            descale_k=descale_k,
+        )
    def scaled_dot_product_attention(query, key, value, is_causal=False):
        query = query.float()

--- a/tests/kernels/attention/test_triton_decode_attention.py
+++ b/tests/kernels/attention/test_triton_decode_attention.py
@@ -53,7 +53,6 @@ def test_decode_attention(B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE):
        dtype=torch.float32,
        device="cuda",
    )
-    best_config = None
    # Call the original implementation.
    decode_attention_fwd(
@@ -67,7 +66,6 @@ def test_decode_attention(B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE):
        attn_logits,
        num_kv_splits,
        sm_scale,
-        best_config,
    )
    # Page size can be larger than 1.
@@ -88,7 +86,6 @@ def test_decode_attention(B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE):
        attn_logits,
        num_kv_splits,
        sm_scale,
-        best_config,
        PAGE_SIZE,
    )

--- a/tests/kernels/core/test_layernorm.py
+++ b/tests/kernels/core/test_layernorm.py
@@ -146,84 +146,9 @@ def test_fused_rms_norm_quant(
            (out_quant_fused, x, weight, quant_scale_t, 1e-6),
        )
-# @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
+    torch.testing.assert_close(
-# @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
+        out_quant.to(dtype=torch.float32),
-# @pytest.mark.parametrize("add_residual", ADD_RESIDUAL)
+        out_quant_fused.to(dtype=torch.float32),
-# @pytest.mark.parametrize("dtype", DTYPES)
+        atol=1e-3,
-# @pytest.mark.parametrize("quant_scale", [0.01, 1.0, 10.0])
+        rtol=1e-3,
-# @pytest.mark.parametrize("seed", SEEDS)
+    )
-# @pytest.mark.parametrize("device", CUDA_DEVICES)
\ No newline at end of file
-# @pytest.mark.parametrize("strided_input", [False, True])
-# def test_fused_rms_norm_quant(
-#     num_tokens: int,
-#     hidden_size: int,
-#     add_residual: bool,
-#     dtype: torch.dtype,
-#     quant_scale: float,
-#     seed: int,
-#     device: str,
-#     strided_input: bool,
-# ) -> None:
-#     current_platform.seed_everything(seed)
-#     torch.set_default_device(device)
-#     weight = torch.empty(hidden_size, dtype=dtype).normal_(mean=1.0, std=0.1)
-#     scale = 1 / (2 * hidden_size)
-#     last_dim = 2 * hidden_size if strided_input else hidden_size
-#     x_base = torch.randn(num_tokens, last_dim, dtype=dtype)
-#     x = x_base[..., :hidden_size]
-#     assert x.is_contiguous() != strided_input
-#     x *= scale
-#     if add_residual:
-#         residual = torch.randn_like(x) * scale
-#         residual_fused = residual.clone()
-#     else:
-#         residual = residual_fused = None
-#     out_norm = torch.empty_like(x)
-#     out_quant = torch.empty_like(x, dtype=FP8_DTYPE)
-#     out_quant_fused = torch.empty_like(out_quant)
-#     quant_scale_t = torch.tensor(quant_scale, dtype=torch.float32)
-#     if add_residual:
-#         torch.ops._C.fused_add_rms_norm_static_fp8_quant(
-#             out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6
-#         )
-#         # Unfused kernel is in-place so it goes second
-#         # Also use a separate clone of x to avoid modifying the input
-#         x_unfused_base = x_base.clone()
-#         x_unfused = x_unfused_base[..., :hidden_size]
-#         assert x_unfused.is_contiguous() != strided_input
-#         torch.ops._C.fused_add_rms_norm(x_unfused, residual, weight, 1e-6)
-#         torch.ops._C.static_scaled_fp8_quant(
-#             out_quant, x_unfused.contiguous(), quant_scale_t
-#         )
-#         torch.cuda.synchronize()
-#         torch.testing.assert_close(residual_fused, residual, atol=1e-2, rtol=1e-2)
-#         opcheck(
-#             torch.ops._C.fused_add_rms_norm_static_fp8_quant,
-#             (out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6),
-#         )
-#     else:
-#         torch.ops._C.rms_norm_static_fp8_quant(
-#             out_quant_fused, x, weight, quant_scale_t, 1e-6
-#         )
-#         torch.ops._C.rms_norm(out_norm, x, weight, 1e-6)
-#         torch.ops._C.static_scaled_fp8_quant(out_quant, out_norm, quant_scale_t)
-#         opcheck(
-#             torch.ops._C.rms_norm_static_fp8_quant,
-#             (out_quant_fused, x, weight, quant_scale_t, 1e-6),
-#         )
-#     torch.testing.assert_close(
-#         out_quant.to(dtype=torch.float32),
-#         out_quant_fused.to(dtype=torch.float32),
-#         atol=1e-3,
-#         rtol=1e-3,
-#     )
--- a/tests/kernels/moe/modular_kernel_tools/common.py
+++ b/tests/kernels/moe/modular_kernel_tools/common.py
@@ -22,9 +22,6 @@ from vllm.distributed import (
 )
 from vllm.forward_context import set_forward_context
 from vllm.model_executor.layers.fused_moe import fused_topk
-from vllm.model_executor.layers.fused_moe.all2all_utils import (
-    maybe_make_prepare_finalize,
-)
 from vllm.model_executor.layers.fused_moe.config import (
    FusedMoEConfig,
    FusedMoEParallelConfig,
@@ -43,6 +40,7 @@ from .mk_objects import (
    TestMoEQuantConfig,
    expert_info,
    make_fused_experts,
+    make_prepare_finalize,
    prepare_finalize_info,
 )
 from .parallel_utils import ProcessGroupInfo
@@ -605,12 +603,10 @@ def make_modular_kernel(
        routing_method=RoutingMethodType.DeepSeekV3,
    )
-    prepare_finalize = maybe_make_prepare_finalize(
+    # make modular kernel
-        moe=moe,
+    prepare_finalize = make_prepare_finalize(
-        quant_config=quant_config,
+        config.prepare_finalize_type, config.all2all_backend(), moe, quant_config
-        allow_new_interface=True,
    )
-    assert prepare_finalize is not None
    fused_experts = make_fused_experts(
        config.fused_experts_type,
@@ -692,4 +688,4 @@ def run_modular_kernel(
    ):
        out = mk.forward(**mk_kwargs)
    return out
\ No newline at end of file
--- a/tests/kernels/moe/modular_kernel_tools/mk_objects.py
+++ b/tests/kernels/moe/modular_kernel_tools/mk_objects.py
@@ -7,6 +7,9 @@ import torch
 # Fused experts and PrepareFinalize imports
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.model_executor.layers.fused_moe import TritonExperts
+from vllm.model_executor.layers.fused_moe.all2all_utils import (
+    maybe_make_prepare_finalize,
+)
 from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
    BatchedDeepGemmExperts,
 )
@@ -252,12 +255,13 @@ if has_pplx():
    )
 if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability(100):
-    from vllm.model_executor.layers.fused_moe.flashinfer_a2a_prepare_finalize import (  # noqa: E501
-        FlashInferCutlassMoEPrepareAndFinalize,
-    )
    from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
        FlashInferExperts,
    )
+    from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import (  # noqa: E501
+        FlashInferCutlassMoEPrepareAndFinalize,
+        create_flashinfer_prepare_finalize,
+    )
    register_prepare_and_finalize(
        FlashInferCutlassMoEPrepareAndFinalize,
@@ -425,6 +429,24 @@ if cutlass_fp4_supported() or has_flashinfer_cutlass_fused_moe():
    ]
+def make_prepare_finalize(
+    prepare_finalize_type: mk.FusedMoEPrepareAndFinalize,
+    backend: str | None,
+    moe: FusedMoEConfig,
+    quant_config: FusedMoEQuantConfig,
+) -> mk.FusedMoEPrepareAndFinalize:
+    if backend != "naive" and backend is not None:
+        prepare_finalize = maybe_make_prepare_finalize(moe, quant_config)
+        assert prepare_finalize is not None
+        return prepare_finalize
+    elif prepare_finalize_type == FlashInferCutlassMoEPrepareAndFinalize:
+        return create_flashinfer_prepare_finalize(
+            use_dp=moe.moe_parallel_config.dp_size > 1
+        )
+    else:
+        return MoEPrepareAndFinalizeNoEP()
 def _slice(rank: int, num_local_experts: int, t: torch.Tensor) -> torch.Tensor:
    s = rank * num_local_experts
    e = s + num_local_experts
@@ -473,4 +495,4 @@ def make_fused_experts(
    torch.set_printoptions(threshold=1000, edgeitems=5, linewidth=80)
    return experts
\ No newline at end of file
--- a/tests/kernels/moe/test_flashinfer.py
+++ b/tests/kernels/moe/test_flashinfer.py
@@ -294,7 +294,12 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
        )
        kernel = mk.FusedMoEModularKernel(
-            MoEPrepareAndFinalizeNoEP(),
+            MoEPrepareAndFinalizeNoEP(
+                defer_input_quant=FlashInferExperts.expects_unquantized_inputs(
+                    moe_config=moe_config,
+                    quant_config=quant_config,
+                )
+            ),
            FlashInferExperts(
                moe_config=moe_config,
                quant_config=quant_config,
@@ -315,4 +320,4 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
        )
        torch.testing.assert_close(
            output, flashinfer_cutlass_output, atol=5.5e-2, rtol=1e-2
        )
\ No newline at end of file
--- a/tests/kernels/moe/test_flashinfer_moe.py
+++ b/tests/kernels/moe/test_flashinfer_moe.py
@@ -106,7 +106,12 @@ def test_flashinfer_fp4_moe_no_graph(
        )
        flashinfer_experts = FusedMoEModularKernel(
-            MoEPrepareAndFinalizeNoEP(),
+            MoEPrepareAndFinalizeNoEP(
+                defer_input_quant=FlashInferExperts.expects_unquantized_inputs(
+                    moe_config=moe_config,
+                    quant_config=quant_config,
+                )
+            ),
            FlashInferExperts(moe_config=moe_config, quant_config=quant_config),
        )
@@ -169,4 +174,4 @@ def test_flashinfer_fp4_moe_no_graph(
 if __name__ == "__main__":
    test_flashinfer_fp4_moe_no_graph((2, 1024, 1024), 40, 1, torch.half)
\ No newline at end of file
--- a/tests/kernels/quantization/test_awq_triton.py
+++ b/tests/kernels/quantization/test_awq_triton.py
@@ -15,7 +15,7 @@ from vllm.model_executor.layers.quantization.awq_triton import (
 )
 from vllm.utils.torch_utils import set_random_seed
-device = "cuda"  
+device = "cuda"
 def reverse_awq_order(t: torch.Tensor):
@@ -168,4 +168,4 @@ def test_gemm(N, K, M, splitK, group_size):
    torch.testing.assert_close(
        output_triton.cpu(), output_torch.cpu(), atol=1e-1, rtol=1e-1
    )
\ No newline at end of file
--- a/tests/kernels/quantization/test_int8_quant.py
+++ b/tests/kernels/quantization/test_int8_quant.py
@@ -8,6 +8,7 @@ from tests.kernels.quant_utils import ref_dynamic_per_token_quant
 from tests.kernels.utils import opcheck
 from vllm._custom_ops import scaled_int8_quant
 from vllm.utils.torch_utils import set_random_seed
+from vllm.platforms import current_platform
 DTYPES = [torch.bfloat16, torch.float]
 HIDDEN_SIZES = [17, 1024, 1025, 1026, 5137, 8193]
@@ -63,7 +64,6 @@ def test_dynamic_scaled_int8_quant(
    opcheck_int8_quant_dynamic(ops_out, x)
 @pytest.mark.skipif(current_platform.is_rocm(),
                    reason="Currently, there is not supported on ROCm.")
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@@ -169,7 +169,6 @@ def test_static_scaled_int8_azp_quant(
    torch.testing.assert_close(out1, out2, atol=1, rtol=0.0)
    opcheck_int8_quant_static(out2, x, scale_arg, azp_arg)
 @pytest.mark.parametrize("is_max", [True, False])
 @torch.inference_mode()

--- a/tests/kernels/untest_triton_flash_attention.py
+++ b/tests/kernels/untest_triton_flash_attention.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Tests for the triton_flash_attention kernel
-Run `pytest tests/kernels/test_triton_flash_attention.py`.
-"""
-import pytest
-import torch
-from vllm.attention.ops.triton_flash_attention import (SUPPORTED_LAYOUTS,
-                                                       MetaData,
-                                                       compute_alibi_tensor,
-                                                       scale_fp8,
-                                                       triton_attention_rocm)
-from vllm.platforms import current_platform
-class ReferenceAttention:
-    def __init__(self, Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, use_alibi, dtype,
-                 input_metadata):
-        self.Z = Z
-        self.HQ = HQ
-        self.HK = HK
-        self.N_CTX_Q = N_CTX_Q
-        self.N_CTX_K = N_CTX_K
-        self.D_HEAD = D_HEAD
-        self.use_alibi = use_alibi
-        self.dtype = dtype
-        self.input_metadata = input_metadata
-    def fwd(self, q, k, v):
-        scores = torch.einsum('bhqd,bhkd->bhqk', q,
-                              k).float() * self.input_metadata.sm_scale
-        if self.input_metadata.causal:
-            mask = torch.tril(torch.ones(self.N_CTX_Q,
-                                         self.N_CTX_K,
-                                         device="cuda"),
-                              diagonal=self.N_CTX_K - self.N_CTX_Q)
-            scores[:, :, mask == 0] = float("-inf")
-        if self.input_metadata.bias is not None:
-            scores += self.input_metadata.bias
-        if self.use_alibi:
-            scores += compute_alibi_tensor(self.input_metadata.alibi_slopes,
-                                           self.N_CTX_Q, self.N_CTX_K)
-        p = torch.softmax(scores, dim=-1)
-        if self.input_metadata.causal:
-            # If N_CTX_Q > N_CTX_K, there's at least one row of all -infs going
-            # into softmax. This creates a row of NaNs as -inf - -inf == NaN.
-            # So we fix this by converting the NaNs to 0s, which is what they
-            # should be out of the softmax.
-            nan_mask = torch.isnan(p)
-            p[nan_mask == 1] = 0
-        ref_out = torch.einsum('bhqk,bhkd->bhqd', p.to(self.dtype), v)
-        # compare
-        if self.input_metadata.layout == 'bshd':
-            ref_out = ref_out.transpose(1, 2).clone()
-        return ref_out
-    # def fwd_fp8(self, q_quantized, k_quantized, v_quantized):
-    #     q = (q_quantized.to(torch.float16) * self.input_metadata.q_descale).to(
-    #         self.dtype)
-    #     k = (k_quantized.to(torch.float16) * self.input_metadata.k_descale).to(
-    #         self.dtype)
-    #     v = (v_quantized.to(torch.float16) * self.input_metadata.v_descale).to(
-    #         self.dtype)
-    #     result = self.fwd(q, k, v)
-    #     if self.input_metadata.o_scale is not None:
-    #         result, _ = scale_fp8(result, self.input_metadata.o_scale)
-    #     return result
-    # def fwd_fp8_kv(self, q, k_quantized, v_quantized):
-    #     k_descale, v_descale = (self.input_metadata.k_descale,
-    #                             self.input_metadata.v_descale)
-    #     k_dequantized = (k_quantized.to(torch.float32) *
-    #                      k_descale.to(torch.float32)).to(self.dtype)
-    #     v_dequantized = (v_quantized.to(torch.float32) *
-    #                      v_descale.to(torch.float32)).to(self.dtype)
-    #     return self.fwd(q, k_dequantized, v_dequantized)
-    def varlen_fwd(self, q, k, v, is_mqa=False):
-        ref_out = torch.empty_like(q)
-        if is_mqa:
-            # Make KV look like HQ/HK "groups" of HK. Later, we will reshape so
-            # the size aligns with Q.
-            k_ref = k.view(k.shape[0], k.shape[1], 1,
-                           k.shape[2]).expand(-1, -1, self.HQ // self.HK, -1)
-            v_ref = v.view(v.shape[0], v.shape[1], 1,
-                           v.shape[2]).expand(-1, -1, self.HQ // self.HK, -1)
-        else:
-            k_ref = k
-            v_ref = v
-        for i in range(0, self.input_metadata.num_contexts):
-            start_q, start_k = self.input_metadata.cu_seqlens_q[
-                i], self.input_metadata.cu_seqlens_k[i]
-            end_q, end_k = self.input_metadata.cu_seqlens_q[
-                i + 1], self.input_metadata.cu_seqlens_k[i + 1]
-            k_curr = k_ref[start_k:end_k]
-            v_curr = v_ref[start_k:end_k]
-            if is_mqa:
-                k_curr = k_curr.reshape(k_curr.shape[0], -1, k_curr.shape[3])
-                v_curr = v_curr.reshape(v_curr.shape[0], -1, v_curr.shape[3])
-            scores = torch.einsum('qhd,khd->qhk', q[start_q:end_q],
-                                  k_curr).float()
-            p = torch.softmax(scores * self.input_metadata.sm_scale,
-                              dim=-1).half()
-            ref_out[start_q:end_q] = torch.einsum('qhk,khd->qhd', p, v_curr)
-        return ref_out
-def quantize_input(q, k, v, fp8_kv=False, use_o_scale=False):
-    q_descale = None
-    if not fp8_kv:
-        q, q_descale = scale_fp8(q)
-    k, k_descale = scale_fp8(k)
-    v, v_descale = scale_fp8(v)
-    # In real world use case, the p scale would be a parameter trained by the
-    # model.
-    p_scale = None
-    o_scale = torch.rand(1, device="cuda",
-                         requires_grad=False) if use_o_scale else None
-    return q, k, v, q_descale, k_descale, v_descale, p_scale, o_scale
-def input_helper(
-    Z,
-    HQ,
-    HK,
-    N_CTX_Q,
-    N_CTX_K,
-    D_HEAD,
-    dtype,
-    layout=None,
-    use_alibi=None,
-    causal=None,
-    is_fp8=False,
-    fp8_kv=False,
-    use_o_scale=False,
-    use_bias=False,
-):
-    assert layout in SUPPORTED_LAYOUTS, "Got unsupported layout."
-    current_platform.seed_everything(0)
-    # Initialize q, k, v
-    if layout == 'bhsd':
-        q_tensor_shape = (Z, HQ, N_CTX_Q, D_HEAD)
-        k_tensor_shape = (Z, HK, N_CTX_K, D_HEAD)
-    elif layout == 'bshd':
-        q_tensor_shape = (Z, N_CTX_Q, HQ, D_HEAD)
-        k_tensor_shape = (Z, N_CTX_K, HK, D_HEAD)
-    if use_alibi:
-        # for n heads the set of slopes is the geometric sequence that starts
-        # 2^(-8/n)
-        alibi_slopes = torch.tensor(
-            [2**(-8 / HQ * i) for i in range(1, HQ + 1)],
-            dtype=torch.float32,
-            device="cuda").repeat(Z, 1)
-    else:
-        alibi_slopes = None
-    if use_bias:
-        bias = torch.randn((1, HQ, N_CTX_Q, N_CTX_K),
-                           dtype=dtype,
-                           device="cuda",
-                           requires_grad=False)
-    else:
-        bias = None
-    q = torch.randn(q_tensor_shape,
-                    dtype=dtype,
-                    device="cuda",
-                    requires_grad=False)
-    k = torch.randn(k_tensor_shape,
-                    dtype=dtype,
-                    device="cuda",
-                    requires_grad=False)
-    v = torch.randn(k_tensor_shape,
-                    dtype=dtype,
-                    device="cuda",
-                    requires_grad=False)
-    if is_fp8:
-        (q, k, v, q_descale, k_descale, v_descale, p_scale,
-         o_scale) = quantize_input(q,
-                                   k,
-                                   v,
-                                   use_o_scale=use_o_scale,
-                                   fp8_kv=fp8_kv)
-    else:
-        q_descale = k_descale = v_descale = p_scale = o_scale = None
-    input_metadata = MetaData(sm_scale=D_HEAD**-0.5,
-                              max_seqlens_q=N_CTX_Q,
-                              max_seqlens_k=N_CTX_K,
-                              layout=layout,
-                              alibi_slopes=alibi_slopes,
-                              alibi_batch=Z,
-                              alibi_nheads=HQ,
-                              q_descale=q_descale,
-                              k_descale=k_descale,
-                              v_descale=v_descale,
-                              p_scale=p_scale,
-                              o_scale=o_scale,
-                              bias=bias,
-                              seqlen_q=N_CTX_Q,
-                              seqlen_k=N_CTX_K)
-    return q, k, v, input_metadata
-def varlen_input_helper(Z,
-                        HQ,
-                        HK,
-                        N_CTX_Q,
-                        N_CTX_K,
-                        D_HEAD,
-                        dtype,
-                        equal_seqlens=False):
-    current_platform.seed_everything(0)
-    # Random sequence lengths. Using N_CTX as kind of max of sum of individual
-    # seqs
-    if not equal_seqlens:
-        max_seqlens_q = N_CTX_Q // Z
-        max_seqlens_k = N_CTX_K // Z
-        seqlens_q = torch.randint(1,
-                                  max_seqlens_q + 1, (Z, ),
-                                  dtype=torch.int32)
-        seqlens_k = torch.randint(1,
-                                  max_seqlens_k + 1, (Z, ),
-                                  dtype=torch.int32)
-    else:
-        seqlens_q = torch.full((Z, ), N_CTX_Q // Z)
-        seqlens_k = torch.full((Z, ), N_CTX_K // Z)
-    # Calculate cumulative sequence lengths
-    cu_seqlens_q = torch.cat([
-        torch.tensor([0], dtype=torch.int32),
-        seqlens_q.cumsum(dim=0, dtype=torch.int32)
-    ])
-    cu_seqlens_k = torch.cat([
-        torch.tensor([0], dtype=torch.int32),
-        seqlens_k.cumsum(dim=0, dtype=torch.int32)
-    ])
-    cu_seqlens_q = cu_seqlens_q.to(device="cuda")
-    cu_seqlens_k = cu_seqlens_k.to(device="cuda")
-    # Initialize q, k, v with variable lengths
-    total_q = cu_seqlens_q[-1].item()
-    total_k = cu_seqlens_k[-1].item()
-    q = torch.randn((total_q, HQ, D_HEAD), dtype=dtype,
-                    device="cuda").normal_(mean=0., std=0.5).requires_grad_()
-    k = torch.randn((total_k, HK, D_HEAD), dtype=dtype,
-                    device="cuda").normal_(mean=0., std=0.5).requires_grad_()
-    v = torch.randn((total_k, HK, D_HEAD), dtype=dtype,
-                    device="cuda").normal_(mean=0., std=0.5).requires_grad_()
-    sm_scale = D_HEAD**-0.5
-    input_metadata = MetaData(sm_scale=sm_scale)
-    input_metadata.set_varlen_params(cu_seqlens_q, cu_seqlens_k)
-    return q, k, v, input_metadata
-@pytest.mark.parametrize('Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD', [
-    (1, 48, 12, 1, 1, 64),
-    (4, 4, 4, 128, 128, 65),
-    (16, 48, 48, 1, 1, 128),
-    (64, 48, 24, 3, 3, 128),
-    (4, 4, 4, 113, 123, 1),
-])
-@pytest.mark.parametrize('causal', [True, False])
-@pytest.mark.parametrize('use_alibi', [True, False])
-@pytest.mark.parametrize('layout', ['bshd'])
-def test_op_fwd(Z,
-                HQ,
-                HK,
-                N_CTX_Q,
-                N_CTX_K,
-                D_HEAD,
-                causal,
-                use_alibi,
-                layout,
-                dtype=torch.float16):
-    current_platform.seed_everything(0)
-    q, k, v, input_metadata = input_helper(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD,
-                                           dtype, layout, use_alibi, causal)
-    o = torch.empty_like(q)
-    # triton implementation
-    tri_out, _ = triton_attention_rocm(q, k, v, o, input_metadata)
-    # Transpose here if layout is bshd so we have same reference code for all
-    # layouts
-    if layout == 'bshd':
-        q = q.transpose(1, 2).clone()
-        k = k.transpose(1, 2).clone()
-        v = v.transpose(1, 2).clone()
-    # Replicate K and V if using MQA/GQA
-    if HQ != HK:
-        k = k.view(k.shape[0], k.shape[1], -1, k.shape[2],
-                   k.shape[3]).expand(-1, -1, HQ // HK, -1,
-                                      -1).reshape(k.shape[0], -1, k.shape[2],
-                                                  k.shape[3])
-        v = v.view(v.shape[0], v.shape[1], -1, v.shape[2],
-                   v.shape[3]).expand(-1, -1, HQ // HK, -1,
-                                      -1).reshape(v.shape[0], -1, v.shape[2],
-                                                  v.shape[3])
-    ref_impl = ReferenceAttention(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD,
-                                  use_alibi, dtype, input_metadata)
-    ref_out = ref_impl.fwd(q, k, v)
-    torch.testing.assert_close(ref_out, tri_out, atol=2e-2, rtol=2e-2)
-# @pytest.mark.parametrize('Z, H, N_CTX_Q, N_CTX_K, D_HEAD', [
-#     (4, 48, 1, 1, 64),
-#     (4, 48, 1, 1, 128),
-#     (4, 48, 3, 3, 128),
-#     (4, 4, 128, 128, 65),
-# ])
-# @pytest.mark.parametrize('causal', [True, False])
-# @pytest.mark.parametrize('layout', ['bhsd'])
-# @pytest.mark.parametrize('use_o_scale', [True, False])
-# @pytest.mark.skipif(torch.cuda.get_device_capability() < (9, 0),
-#                     reason="Triton FP8 requires CUDA 9.0 or higher")
-# def test_op_fwd_fp8(Z,
-#                     H,
-#                     N_CTX_Q,
-#                     N_CTX_K,
-#                     D_HEAD,
-#                     causal,
-#                     layout,
-#                     use_o_scale,
-#                     dtype=torch.float32):
-#     current_platform.seed_everything(0)
-#     # Disable grad to save memory it won't run into OOM on CI machine.
-#     # q, k, v, input_metadata = input_helper(Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD,
-#     # dtype, layout)
-#     q_quantized, k_quantized, v_quantized, input_metadata = input_helper(
-#         Z,
-#         H,
-#         H,
-#         N_CTX_Q,
-#         N_CTX_K,
-#         D_HEAD,
-#         dtype,
-#         causal=causal,
-#         layout=layout,
-#         is_fp8=True,
-#         use_o_scale=use_o_scale)
-#     o = torch.empty_like(q_quantized) if use_o_scale else None
-#     tri_out, _ = triton_attention_rocm(q_quantized, k_quantized, v_quantized,
-#                                        o, input_metadata)
-#     ref_impl = ReferenceAttention(Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD, False,
-#                                   dtype, input_metadata)
-#     ref_out = ref_impl.fwd_fp8(q_quantized, k_quantized, v_quantized)
-#     # compare
-#     torch.testing.assert_close(ref_out.to(torch.float32),
-#                                tri_out.to(torch.float32),
-#                                atol=7e-2,
-#                                rtol=2e-1)
-# @pytest.mark.parametrize('Z, H, N_CTX_Q, N_CTX_K, D_HEAD', [
-#     (4, 48, 1, 1, 64),
-#     (4, 48, 1, 1, 128),
-#     (4, 48, 3, 3, 128),
-#     (4, 4, 128, 128, 65),
-#     (4, 4, 113, 123, 1),
-# ])
-# @pytest.mark.parametrize('causal', [True, False])
-# @pytest.mark.parametrize('layout', ['bhsd'])
-# def test_op_fwd_fp8_kv(Z,
-#                        H,
-#                        N_CTX_Q,
-#                        N_CTX_K,
-#                        D_HEAD,
-#                        causal,
-#                        layout,
-#                        dtype=torch.float32):
-#     current_platform.seed_everything(0)
-#     q, k_quantized, v_quantized, input_metadata = input_helper(Z,
-#                                                                H,
-#                                                                H,
-#                                                                N_CTX_Q,
-#                                                                N_CTX_K,
-#                                                                D_HEAD,
-#                                                                dtype,
-#                                                                causal=causal,
-#                                                                layout=layout,
-#                                                                is_fp8=True,
-#                                                                fp8_kv=True)
-#     o = torch.empty_like(q)
-#     tri_out, _ = triton_attention_rocm(q, k_quantized, v_quantized, o,
-#                                        input_metadata)
-#     ref_impl = ReferenceAttention(Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD, False,
-#                                   dtype, input_metadata)
-#     ref_out = ref_impl.fwd_fp8_kv(q, k_quantized, v_quantized)
-#     torch.testing.assert_close(ref_out, tri_out, atol=3e-2, rtol=8e-1)
-@pytest.mark.parametrize('Z, H, N_CTX_Q, N_CTX_K, D_HEAD', [
-    (4, 48, 1, 1, 64),
-    (4, 48, 1, 1, 128),
-    (4, 48, 3, 3, 128),
-    (4, 4, 128, 128, 65),
-])
-@pytest.mark.parametrize('causal', [True, False])
-@pytest.mark.parametrize('use_bias', [True])
-@pytest.mark.parametrize('dtype', [torch.bfloat16])
-def test_op_fwd_bias(Z, H, N_CTX_Q, N_CTX_K, D_HEAD, causal, use_bias, dtype):
-    current_platform.seed_everything(0)
-    q, k, v, input_metadata = input_helper(Z,
-                                           H,
-                                           H,
-                                           N_CTX_Q,
-                                           N_CTX_K,
-                                           D_HEAD,
-                                           dtype,
-                                           layout='bhsd',
-                                           causal=causal,
-                                           use_bias=use_bias)
-    o = torch.empty_like(q)
-    # triton implementation
-    tri_out, _ = triton_attention_rocm(q, k, v, o, input_metadata)
-    ref_impl = ReferenceAttention(Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD, False,
-                                  dtype, input_metadata)
-    ref_out = ref_impl.fwd(q, k, v)
-    # compare
-    torch.testing.assert_close(ref_out, tri_out, atol=2e-2, rtol=2e-2)
-# NOTE: Uses thd layout, so also tests thd.
-@pytest.mark.parametrize('Z, H, N_CTX, D_HEAD', [(1, 48, 256, 64),
-                                                 (4, 48, 512, 64),
-                                                 (16, 48, 512, 64),
-                                                 (64, 48, 128, 128)])
-@pytest.mark.parametrize('causal', [True, False])
-def test_op_varlen_fwd(Z, H, N_CTX, D_HEAD, causal, dtype=torch.float16):
-    q, k, v, input_metadata = varlen_input_helper(Z, H, H, N_CTX, N_CTX,
-                                                  D_HEAD, dtype)
-    tri_out = torch.empty_like(q)
-    triton_attention_rocm(q, k, v, tri_out, input_metadata)
-    ref_impl = ReferenceAttention(Z, H, H, N_CTX, N_CTX, D_HEAD, False, dtype,
-                                  input_metadata)
-    ref_out = ref_impl.varlen_fwd(q, k, v, is_mqa=False)
-    torch.testing.assert_close(ref_out, tri_out, atol=2e-2, rtol=2e-2)
-# NOTE: Uses thd layout, so also tests thd.
-@pytest.mark.parametrize('Z, HQ, HK, N_CTX, D_HEAD', [(2, 48, 24, 128, 64),
-                                                      (4, 48, 12, 256, 64),
-                                                      (4, 48, 4, 512, 64),
-                                                      (4, 64, 16, 128, 128)])
-@pytest.mark.parametrize('causal', [False])
-def test_op_varlen_mqa_fwd(Z,
-                           HQ,
-                           HK,
-                           N_CTX,
-                           D_HEAD,
-                           causal,
-                           dtype=torch.float16):
-    q, k, v, input_metadata = varlen_input_helper(Z, HQ, HK, N_CTX, N_CTX,
-                                                  D_HEAD, dtype)
-    tri_out = torch.empty_like(q)
-    triton_attention_rocm(q, k, v, tri_out, input_metadata)
-    ref_impl = ReferenceAttention(Z, HQ, HK, N_CTX, N_CTX, D_HEAD, False,
-                                  dtype, input_metadata)
-    ref_out = ref_impl.varlen_fwd(q, k, v, is_mqa=True)
-    torch.testing.assert_close(ref_out, tri_out, atol=2e-2, rtol=2e-2)
--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -22,7 +22,7 @@ from vllm.lora.request import LoRARequest
 from vllm.v1.worker.gpu_worker import Worker
 from ..utils import models_path_prefix
-MODEL_PATH = "Qwen/Qwen3-0.6B"
+MODEL_PATH = os.path.join(models_path_prefix, "Qwen/Qwen3-0.6B")
 NUM_LORAS = 16

--- a/tests/models/language/pooling/test_token_classification.py
+++ b/tests/models/language/pooling/test_token_classification.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import random
+import os
 import numpy as np
 import pytest
 import torch
@@ -9,23 +10,10 @@ from transformers import AutoModelForTokenClassification
 from tests.models.utils import softmax
 from vllm.platforms import current_platform
+from ....utils import models_path_prefix
-@pytest.fixture(autouse=True)
+@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "boltuix/NeuroBERT-NER")])
-def seed_everything():
-    """Seed all random number generators for reproducibility."""
-    seed = 0
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed_all(seed)
-    torch.backends.cudnn.deterministic = True
-    torch.backends.cudnn.benchmark = False
-    yield
-@pytest.mark.parametrize("model", ["boltuix/NeuroBERT-NER"])
 # The float32 is required for this tiny model to pass the test.
 @pytest.mark.parametrize("dtype", ["float"])
 @torch.inference_mode
@@ -68,7 +56,6 @@ def test_bert_models(
 @pytest.mark.parametrize("model", ["disham993/electrical-ner-ModernBERT-base"])
 @pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.flaky(reruns=3)
 @torch.inference_mode
 def test_modernbert_models(
    hf_runner,
@@ -77,14 +64,6 @@ def test_modernbert_models(
    model: str,
    dtype: str,
 ) -> None:
-    # NOTE: https://github.com/vllm-project/vllm/pull/32403
-    # `disham993/electrical-ner-ModernBERT-base` is a randomly initialized
-    # model, which can cause numerical precision variance and edge cases.
-    # We use @flaky(reruns=3) to mitigate intermittent failures.
-    print(
-        f"\n[NOTE] Testing {model} (randomly initialized weights) - "
-        "flaky tolerance enabled due to numerical precision variance."
-    )
    with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.token_classify(example_prompts)