Merge tag 'v0.9.2' into v0.9.2-dev

a40a133c · zhuwenwen · 1a9a61d7 · a5dd03c1 · a40a133c · a40a133c
Commit a40a133c authored Jul 18, 2025 by zhuwenwen
20 changed files
--- a/tests/models/multimodal/pooling/test_dse_qwen2_vl.py
+++ b/tests/models/multimodal/pooling/test_dse_qwen2_vl.py
@@ -99,7 +99,7 @@ def _run_test(
                     max_model_len=8192) as vllm_model:
        tokenizer = vllm_model.model.get_tokenizer()
        texts = [
-            # this is necessary because vllm_model.encode will not apply any
+            # this is necessary because vllm_model.embed will not apply any
            # templating to the prompt, and therefore lacks an image_pad
            # token unless one is inserted beforehand (the (28,28) image
            # above is converted to an image pad token by the chat template).
@@ -110,7 +110,7 @@ def _run_test(
            # vllm will replace the pad token with the actual image,
            # which may be a placeholder image, later.
        ]
-        vllm_outputs = vllm_model.encode(texts, images=input_images)
+        vllm_outputs = vllm_model.embed(texts, images=input_images)

    hf_outputs = []
    with hf_runner(model,

--- a/tests/models/multimodal/pooling/test_llava_next.py
+++ b/tests/models/multimodal/pooling/test_llava_next.py
@@ -69,7 +69,7 @@ def _run_test(
                     dtype=dtype,
                     max_model_len=4096,
                     enforce_eager=True) as vllm_model:
-        vllm_outputs = vllm_model.encode(input_texts, images=input_images)
+        vllm_outputs = vllm_model.embed(input_texts, images=input_images)

    with hf_runner(model, dtype=dtype,
                   auto_cls=AutoModelForImageTextToText) as hf_model:

--- a/tests/models/multimodal/pooling/test_phi3v.py
+++ b/tests/models/multimodal/pooling/test_phi3v.py
@@ -47,7 +47,7 @@ def _run_test(
    # will hurt multiprocessing backend with fork method (the default method).
    with vllm_runner(model, task="embed", dtype=dtype,
                     enforce_eager=True) as vllm_model:
-        vllm_outputs = vllm_model.encode(input_texts, images=input_images)
+        vllm_outputs = vllm_model.embed(input_texts, images=input_images)

    # use eager mode for hf runner, since phi3_v didn't work with flash_attn
    hf_model_kwargs = {"_attn_implementation": "eager"}

--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -26,6 +26,22 @@ from ...registry import HF_EXAMPLE_MODELS
 from ....utils import models_path_prefix


+def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
+    """Attach synthetic video metadata to multimodal data for GLM4.1V.
+
+    ``mm_data`` is modified in place and returned.
+    """
+    if "video" not in mm_data:
+        return mm_data
+    frames = mm_data["video"]
+    num_frames = len(frames)
+    # The frame count doubles as fps, paired with a duration of 1.
+    metadata = {"total_num_frames": num_frames, "fps": num_frames,
+                "duration": 1, "video_backend": "opencv"}
+    mm_data["video"] = (frames, metadata)
+    return mm_data
+
+
 def _test_processing_correctness(
    model_id: str,
    hit_rate: float,
@@ -156,6 +172,11 @@ _IGNORE_MM_KEYS = {
    "ultravox": {"audio_features"},
 }

+MM_DATA_PATCHES = {
+    # GLM4.1V requires video metadata to be included in the input
+    "glm4v": glm4_1v_patch_mm_data,
+}
+

 def _test_processing_correctness_one(
    model_config: ModelConfig,
@@ -168,6 +189,8 @@ def _test_processing_correctness_one(
 ):
    model_type = model_config.hf_config.model_type
    ignore_mm_keys = _IGNORE_MM_KEYS.get(model_type, set[str]())
+    if model_type in MM_DATA_PATCHES:
+        mm_data = MM_DATA_PATCHES[model_type](mm_data)

    if isinstance(prompt, str):
        text_prompt = prompt
@@ -247,6 +270,7 @@ def _test_processing_correctness_one(
    os.path.join(models_path_prefix, "adept/fuyu-8b"),
    os.path.join(models_path_prefix, "google/gemma-3-4b-it"),
    os.path.join(models_path_prefix, "THUDM/glm-4v-9b"),
+    os.path.join(models_path_prefix, "THUDM/GLM-4.1V-9B-Thinking"),
    os.path.join(models_path_prefix, "ibm-granite/granite-speech-3.3-2b"),
    os.path.join(models_path_prefix, "h2oai/h2ovl-mississippi-800m"),
    os.path.join(models_path_prefix, "OpenGVLab/InternVL2-1B"),
@@ -286,6 +310,7 @@ def _test_processing_correctness_one(
    os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_5-llama-3_2-1b"),
    os.path.join(models_path_prefix, "openai/whisper-large-v3"),
    os.path.join(models_path_prefix, "omni-research/Tarsier-7b"),
+    os.path.join(models_path_prefix, "omni-research/Tarsier2-Recap-7b")
 ])
 @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
 @pytest.mark.parametrize("num_batches", [32])

--- a/tests/models/multimodal/test_mapping.py
+++ b/tests/models/multimodal/test_mapping.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Iterable
+
+import pytest
+import torch
+import transformers
+from transformers import AutoConfig, PreTrainedModel
+
+from vllm.config import ModelConfig
+from vllm.model_executor.models.utils import WeightsMapper
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.transformers_utils.config import try_get_safetensors_metadata
+
+from ..registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS
+
+
+def create_repo_dummy_weights(repo: str) -> Iterable[tuple[str, torch.Tensor]]:
+    """Create dummy weights from safetensors checkpoint metadata.
+
+    Args:
+        repo: Model repository (or path) whose safetensors metadata is read.
+
+    Returns:
+        One ``(weight_name, empty_tensor)`` pair per key of the metadata's
+        ``weight_map``; only the names carry information.
+    """
+    metadata = try_get_safetensors_metadata(repo)
+    weight_names = list(metadata.weight_map.keys())
+    with torch.device('meta'):
+        # Build the pairs eagerly: a lazy generator expression would only
+        # call ``torch.empty`` once it is consumed, i.e. after this block has
+        # exited and the 'meta' device context no longer applies.
+        return [(name, torch.empty(0)) for name in weight_names]
+
+
+def create_model_dummy_weights(
+    repo: str,
+    model_arch: str,
+) -> Iterable[tuple[str, torch.Tensor]]:
+    """
+    Return the named parameters of a dummy HF model built on the meta device.
+
+    The class named ``model_arch`` is looked up on ``transformers`` and
+    instantiated from the config of ``repo``, so the yielded names are the
+    ones the HF model itself exposes.
+    """
+    hf_model_cls = getattr(transformers, model_arch)
+    hf_config = AutoConfig.from_pretrained(repo)
+    with torch.device("meta"):
+        dummy_model = hf_model_cls._from_config(hf_config)
+    return dummy_model.named_parameters()
+
+
+def model_architectures_for_test() -> list[str]:
+    """
+    Collect the multimodal example architectures that need no remote code and
+    whose ``transformers`` class has a truthy
+    ``_checkpoint_conversion_mapping``.
+    """
+    selected: list[str] = []
+    for arch_name, example in _MULTIMODAL_EXAMPLE_MODELS.items():
+        if example.trust_remote_code:
+            continue
+        if not hasattr(transformers, arch_name):
+            continue
+        hf_cls = getattr(transformers, arch_name)
+        if getattr(hf_cls, "_checkpoint_conversion_mapping", None):
+            selected.append(arch_name)
+    return selected
+
+
+@pytest.mark.core_model
+@pytest.mark.parametrize("model_arch", model_architectures_for_test())
+def test_hf_model_weights_mapper(model_arch: str):
+    """
+    Check that ``hf_to_vllm_mapper`` maps the checkpoint weight names and the
+    HF model parameter names onto the same set of names.
+    """
+    model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
+    model_info.check_available_online(on_fail="skip")
+    model_info.check_transformers_version(on_fail="skip")
+
+    model_id = model_info.default
+
+    model_config = ModelConfig(
+        model_id,
+        task="auto",
+        tokenizer=model_info.tokenizer or model_id,
+        tokenizer_mode=model_info.tokenizer_mode,
+        trust_remote_code=model_info.trust_remote_code,
+        seed=0,
+        dtype="auto",
+        revision=None,
+        hf_overrides=model_info.hf_overrides,
+    )
+    vllm_model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
+
+    checkpoint_weights = create_repo_dummy_weights(model_id)
+    hf_model_weights = create_model_dummy_weights(model_id, model_arch)
+    mapper: WeightsMapper = vllm_model_cls.hf_to_vllm_mapper
+
+    mapped_checkpoint_weights = mapper.apply(checkpoint_weights)
+    mapped_hf_model_weights = mapper.apply(hf_model_weights)
+
+    # Only the mapped names are compared; the tensors are placeholders.
+    ref_weight_names = {item[0] for item in mapped_checkpoint_weights}
+    weight_names = {item[0] for item in mapped_hf_model_weights}
+
+    weights_missing = ref_weight_names.difference(weight_names)
+    weights_unmapped = weight_names.difference(ref_weight_names)
+    assert not (weights_missing or weights_unmapped), (
+        f"Following weights are not mapped correctly: {weights_unmapped}, "
+        f"Missing expected weights: {weights_missing}.")
--- a/tests/models/quantization/test_gguf.py
+++ b/tests/models/quantization/test_gguf.py
@@ -80,11 +80,11 @@ DOLPHIN_CONFIG = GGUFTestConfig(
 )

 MODELS = [
-    LLAMA_CONFIG,
+    # LLAMA_CONFIG, # broken: https://github.com/vllm-project/vllm/issues/19458
    QWEN2_CONFIG,
    PHI3_CONFIG,
    GPT2_CONFIG,
-    # STABLELM_CONFIG,  # enable this when v1 support head_size=80
+    STABLELM_CONFIG,
    DOLPHIN_CONFIG,
    # STARCODER_CONFIG, # broken
 ]

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -74,6 +74,12 @@ class _HfExamplesInfo:
    length that is too large to fit into memory in CI.
    """

+    revision: Optional[str] = None
+    """
+    The specific revision (commit hash, tag, or branch) to use for the model.
+    If not specified, the default revision will be used.
+    """
+
    def check_transformers_version(
        self,
        *,
@@ -160,14 +166,20 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                         trust_remote_code=True),
    "DeepseekV3ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "deepseek-ai/DeepSeek-V3"),  # noqa: E501
                                         trust_remote_code=True),
+    "Ernie4_5_ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"baidu/ERNIE-4.5-0.3B-PT"),
+                                        trust_remote_code=True),
+    "Ernie4_5_MoeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"baidu/ERNIE-4.5-21B-A3B-PT"),
+                                        trust_remote_code=True),
    "ExaoneForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct")),  # noqa: E501
    "Fairseq2LlamaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"mgleize/fairseq2-dummy-Llama-3.2-1B")),  # noqa: E501
    "FalconForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"tiiuae/falcon-7b")),
-    "FalconH1ForCausalLM":_HfExamplesInfo(os.path.join(models_path_prefix,"tiiuae/Falcon-H1-1.5B-Instruct"),
+    "FalconH1ForCausalLM":_HfExamplesInfo(os.path.join(models_path_prefix,"tiiuae/Falcon-H1-0.5B-Base"),
                                          min_transformers_version="4.53"),
    "GemmaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"google/gemma-1.1-2b-it")),
    "Gemma2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"google/gemma-2-9b")),
    "Gemma3ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"google/gemma-3-1b-it")),
+    "Gemma3nForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix,"google/gemma-3n-E2B-it"),    # noqa: E501
+                                          min_transformers_version="4.53"),
    "GlmForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"THUDM/glm-4-9b-chat-hf")),
    "Glm4ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"THUDM/GLM-4-9B-0414")),
    "GPT2LMHeadModel": _HfExamplesInfo(os.path.join(models_path_prefix,"openai-community/gpt2"),
@@ -184,7 +196,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "GraniteMoeSharedForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"ibm-research/moe-7b-1b-active-shared-experts")),  # noqa: E501
    "Grok1ModelForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"hpcai-tech/grok-1"),
                                             trust_remote_code=True),
-    "InternLMForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "internlm/internlm-chat-7b"),
+    "HunYuanMoEV1ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"tencent/Hunyuan-A13B-Instruct"),
+                                               trust_remote_code=True),
+    "InternLMForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"internlm/internlm-chat-7b"),
                                           trust_remote_code=True),
    "InternLM2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "internlm/internlm2-chat-7b"),
                                            trust_remote_code=True),
@@ -196,8 +210,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "JambaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"ai21labs/AI21-Jamba-1.5-Mini"),
                                        extras={"tiny": os.path.join(models_path_prefix,"ai21labs/Jamba-tiny-dev")}),  # noqa: E501
    "LlamaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"meta-llama/Llama-3.2-1B-Instruct"),
-                                        extras={"guard": os.path.join(models_path_prefix,"meta-llama/Llama-Guard-3-1B"),  # noqa: E501
-                                                "hermes": os.path.join(models_path_prefix,"NousResearch/Hermes-3-Llama-3.1-8B")}),  # noqa: E501
+                                        extras={"guard": os.path.join(models_path_prefix,"meta-llama/Llama-Guard-3-1B"),  # noqa: E501
+                                                "hermes": os.path.join(models_path_prefix,"NousResearch/Hermes-3-Llama-3.1-8B"), # noqa: E501
+                                                "fp8": os.path.join(models_path_prefix,"RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8")}),  # noqa: E501
    "LLaMAForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"decapoda-research/llama-7b-hf"),
                                        is_available_online=False),
    "MambaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"state-spaces/mamba-130m-hf")),
@@ -208,9 +223,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "MiniCPM3ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "openbmb/MiniCPM3-4B"),
                                         trust_remote_code=True),
    "MiniMaxText01ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "MiniMaxAI/MiniMax-Text-01"),
-                                                trust_remote_code=True),
+                                                trust_remote_code=True,
+                                                revision="a59aa9cbc53b9fb8742ca4e9e1531b9802b6fdc3"),  # noqa: E501
    "MiniMaxM1ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "MiniMaxAI/MiniMax-M1-40k"),
-                                                trust_remote_code=True),
+                                            trust_remote_code=True),
    "MistralForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "mistralai/Mistral-7B-Instruct-v0.1")),
    "MixtralForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "mistralai/Mixtral-8x7B-Instruct-v0.1"),  # noqa: E501
                                          {"tiny": os.path.join(models_path_prefix, "TitanML/tiny-mixtral")}),  # noqa: E501
@@ -227,31 +243,31 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                      {"1b": os.path.join(models_path_prefix, "facebook/opt-iml-max-1.3b")}),
    "OrionForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "OrionStarAI/Orion-14B-Chat"),
                                        trust_remote_code=True),
-    "PersimmonForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "adept/persimmon-8b-chat")),
-    "PhiForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "microsoft/phi-2"), v0_only=True),
-    "Phi3ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "microsoft/Phi-3-mini-4k-instruct")),
-    "Phi3SmallForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "microsoft/Phi-3-small-8k-instruct"),
+    "PersimmonForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"adept/persimmon-8b-chat")),
+    "PhiForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"microsoft/phi-2")),
+    "Phi3ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"microsoft/Phi-3-mini-4k-instruct")),
+    # Blocksparse attention not supported in V1 yet
+    "Phi3SmallForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"microsoft/Phi-3-small-8k-instruct"),
                                            trust_remote_code=True,
                                            v0_only=True),
-    "PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct",
+    "PhiMoEForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"microsoft/Phi-3.5-MoE-instruct"),
                                         trust_remote_code=True),
    "Plamo2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "pfnet/plamo-2-1b"),
                                        trust_remote_code=True),
    "QWenLMHeadModel": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen-7B-Chat"),
                                       trust_remote_code=True),
-    "Qwen2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen2-0.5B-Instruct"),
-                                        extras={"2.5": os.path.join(models_path_prefix, "Qwen/Qwen2.5-0.5B-Instruct")}), # noqa: E501
-    "Qwen2MoeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen1.5-MoE-A2.7B-Chat")),
-    "Qwen3ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen3-8B")),
-    "Qwen3MoeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen3-30B-A3B")),
-    "RWForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "tiiuae/falcon-40b")),
-    "StableLMEpochForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "stabilityai/stablelm-zephyr-3b"),  # noqa: E501
-                                                v0_only=True),
-    "StableLmForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "stabilityai/stablelm-3b-4e1t"),
-                                           v0_only=True),
-    "Starcoder2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "bigcode/starcoder2-3b")),
-    "SolarForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "upstage/solar-pro-preview-instruct")),
-    "TeleChat2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "Tele-AI/TeleChat2-3B"),
+    "Qwen2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"Qwen/Qwen2-0.5B-Instruct"),
+                                        extras={"2.5": os.path.join(models_path_prefix,"Qwen/Qwen2.5-0.5B-Instruct")}), # noqa: E501
+    "Qwen2MoeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"Qwen/Qwen1.5-MoE-A2.7B-Chat")),
+    "Qwen3ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"Qwen/Qwen3-8B")),
+    "Qwen3MoeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"Qwen/Qwen3-30B-A3B")),
+    "Qwen3ForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix,"tomaarsen/Qwen3-Reranker-0.6B-seq-cls")),  # noqa: E501
+    "RWForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"tiiuae/falcon-40b")),
+    "StableLMEpochForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"stabilityai/stablelm-zephyr-3b")),  # noqa: E501
+    "StableLmForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"stabilityai/stablelm-3b-4e1t")),
+    "Starcoder2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"bigcode/starcoder2-3b")),
+    "SolarForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"upstage/solar-pro-preview-instruct")),
+    "TeleChat2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"Tele-AI/TeleChat2-3B"),
                                            trust_remote_code=True),
    "TeleFLMForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "CofeAI/FLM-2-52B-Instruct-2407"),
                                            trust_remote_code=True),
@@ -265,6 +281,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                        trust_remote_code=True),
    "MiMoForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "XiaomiMiMo/MiMo-7B-RL"),
                                        trust_remote_code=True),
+    "Dots1ForCausalLM": _HfExamplesInfo("rednote-hilab/dots.llm1.inst",
+                                        min_transformers_version="4.53"),
    # [Encoder-decoder]
    "BartModel": _HfExamplesInfo(os.path.join(models_path_prefix, "facebook/bart-base")),
    "BartForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "facebook/bart-large-cnn")),
@@ -272,30 +290,31 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {

 _EMBEDDING_EXAMPLE_MODELS = {
    # [Text-only]
-    "BertModel": _HfExamplesInfo(os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5")),
-    "Gemma2Model": _HfExamplesInfo(os.path.join(models_path_prefix, "BAAI/bge-multilingual-gemma2")),
-    "GritLM": _HfExamplesInfo(os.path.join(models_path_prefix, "parasail-ai/GritLM-7B-vllm")),
-    "GteModel": _HfExamplesInfo(os.path.join(models_path_prefix, "Snowflake/snowflake-arctic-embed-m-v2.0"),
+    "BertModel": _HfExamplesInfo(os.path.join(models_path_prefix,"BAAI/bge-base-en-v1.5"), v0_only=True),
+    "Gemma2Model": _HfExamplesInfo(os.path.join(models_path_prefix,"BAAI/bge-multilingual-gemma2"), v0_only=True),  # noqa: E501
+    "GPT2ForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix,"nie3e/sentiment-polish-gpt2-small")),  # noqa: E501
+    "GritLM": _HfExamplesInfo(os.path.join(models_path_prefix,"parasail-ai/GritLM-7B-vllm")),
+    "GteModel": _HfExamplesInfo(os.path.join(models_path_prefix,"Snowflake/snowflake-arctic-embed-m-v2.0"),
                                               trust_remote_code=True),
    "GteNewModel": _HfExamplesInfo(os.path.join(models_path_prefix, "Alibaba-NLP/gte-base-en-v1.5"),
                                   trust_remote_code=True,
                                   hf_overrides={"architectures": ["GteNewModel"]}),  # noqa: E501
    "InternLM2ForRewardModel": _HfExamplesInfo(os.path.join(models_path_prefix, "internlm/internlm2-1_8b-reward"),
                                               trust_remote_code=True),
-    "JambaForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-reward-dev")),  # noqa: E501
-    "LlamaModel": _HfExamplesInfo(os.path.join(models_path_prefix, "llama", is_available_online=False),
-    "MistralModel": _HfExamplesInfo(os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct")),
-    "ModernBertModel": _HfExamplesInfo(os.path.join(models_path_prefix, "Alibaba-NLP/gte-modernbert-base"),
-                                trust_remote_code=True),
-    "NomicBertModel": _HfExamplesInfo(os.path.join(models_path_prefix, "nomic-ai/nomic-embed-text-v2-moe"),
-                                               trust_remote_code=True),
-    "Qwen2Model": _HfExamplesInfo(os.path.join(models_path_prefix, "ssmits/Qwen2-7B-Instruct-embed-base")),
-    "Qwen2ForRewardModel": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen2.5-Math-RM-72B")),
-    "Qwen2ForProcessRewardModel": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen2.5-Math-PRM-7B")),
-    "Qwen2ForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "jason9693/Qwen2.5-1.5B-apeach")),  # noqa: E501
-    "RobertaModel": _HfExamplesInfo(os.path.join(models_path_prefix, "sentence-transformers/stsb-roberta-base-v2")),  # noqa: E501
-    "RobertaForMaskedLM": _HfExamplesInfo(os.path.join(models_path_prefix, "sentence-transformers/all-roberta-large-v1")),  # noqa: E501
-    "XLMRobertaModel": _HfExamplesInfo(os.path.join(models_path_prefix, "intfloat/multilingual-e5-small")),
+    "JambaForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix,"ai21labs/Jamba-tiny-reward-dev")),  # noqa: E501
+    "LlamaModel": _HfExamplesInfo(os.path.join(models_path_prefix,"llama"), is_available_online=False),
+    "MistralModel": _HfExamplesInfo(os.path.join(models_path_prefix,"intfloat/e5-mistral-7b-instruct")),
+    "ModernBertModel": _HfExamplesInfo(os.path.join(models_path_prefix,"Alibaba-NLP/gte-modernbert-base"),
+                                trust_remote_code=True, v0_only=True),
+    "NomicBertModel": _HfExamplesInfo(os.path.join(models_path_prefix,"nomic-ai/nomic-embed-text-v2-moe"),
+                                               trust_remote_code=True, v0_only=True),  # noqa: E501
+    "Qwen2Model": _HfExamplesInfo(os.path.join(models_path_prefix,"ssmits/Qwen2-7B-Instruct-embed-base")),
+    "Qwen2ForRewardModel": _HfExamplesInfo(os.path.join(models_path_prefix,"Qwen/Qwen2.5-Math-RM-72B")),
+    "Qwen2ForProcessRewardModel": _HfExamplesInfo(os.path.join(models_path_prefix,"Qwen/Qwen2.5-Math-PRM-7B")),
+    "Qwen2ForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix,"jason9693/Qwen2.5-1.5B-apeach")),  # noqa: E501
+    "RobertaModel": _HfExamplesInfo(os.path.join(models_path_prefix,"sentence-transformers/stsb-roberta-base-v2"), v0_only=True),  # noqa: E501
+    "RobertaForMaskedLM": _HfExamplesInfo(os.path.join(models_path_prefix,"sentence-transformers/all-roberta-large-v1"), v0_only=True),  # noqa: E501
+    "XLMRobertaModel": _HfExamplesInfo(os.path.join(models_path_prefix,"intfloat/multilingual-e5-small"), v0_only=True),  # noqa: E501
    # [Multimodal]
    "LlavaNextForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "royokong/e5-v")),
    "Phi3VForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "TIGER-Lab/VLM2Vec-Full"),
@@ -307,10 +326,10 @@ _EMBEDDING_EXAMPLE_MODELS = {

 _CROSS_ENCODER_EXAMPLE_MODELS = {
    # [Text-only]
-    "BertForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "cross-encoder/ms-marco-MiniLM-L-6-v2")),  # noqa: E501
-    "RobertaForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "cross-encoder/quora-roberta-base")),  # noqa: E501
-    "XLMRobertaForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-m3")),  # noqa: E501
-    "ModernBertForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "Alibaba-NLP/gte-reranker-modernbert-base")),  # noqa: E501
+    "BertForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "cross-encoder/ms-marco-MiniLM-L-6-v2"), v0_only=True),  # noqa: E501
+    "RobertaForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "cross-encoder/quora-roberta-base"), v0_only=True),  # noqa: E501
+    "XLMRobertaForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-m3"), v0_only=True),  # noqa: E501
+    "ModernBertForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "Alibaba-NLP/gte-reranker-modernbert-base"), v0_only=True),  # noqa: E501
 }

 _MULTIMODAL_EXAMPLE_MODELS = {
@@ -318,8 +337,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    "AriaForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix,"rhymes-ai/Aria")),
    "AyaVisionForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix,"CohereForAI/aya-vision-8b")), # noqa: E501
    "Blip2ForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix,"Salesforce/blip2-opt-2.7b"),  # noqa: E501
-                                                     extras={"6b": os.path.join(models_path_prefix,"Salesforce/blip2-opt-6.7b")},  # noqa: E501
-                                                     v0_only=True),
+                                                     extras={"6b": os.path.join(models_path_prefix,"Salesforce/blip2-opt-6.7b")}),  # noqa: E501
    "ChameleonForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix,"facebook/chameleon-7b")),  # noqa: E501
    "DeepseekVLV2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"deepseek-ai/deepseek-vl2-tiny"),  # noqa: E501
                                                extras={"fork": os.path.join(models_path_prefix,"Isotr0py/deepseek-vl2-tiny")},  # noqa: E501
@@ -332,8 +350,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    "GLM4VForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"THUDM/glm-4v-9b"),
                                        trust_remote_code=True,
                                        hf_overrides={"architectures": ["GLM4VForCausalLM"]}),  # noqa: E501
-    "H2OVLChatModel": _HfExamplesInfo(os.path.join(models_path_prefix, "h2oai/h2ovl-mississippi-800m"),
-                                      extras={"2b": os.path.join(models_path_prefix, "h2oai/h2ovl-mississippi-2b")},  # noqa: E501
+    "Glm4vForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix,"THUDM/GLM-4.1V-9B-Thinking"), min_transformers_version="4.53"),  # noqa: E501
+    "H2OVLChatModel": _HfExamplesInfo(os.path.join(models_path_prefix,"h2oai/h2ovl-mississippi-800m"),
+                                      extras={"2b": os.path.join(models_path_prefix,"h2oai/h2ovl-mississippi-2b")},  # noqa: E501
                                      max_transformers_version="4.48",  # noqa: E501
                                      transformers_version_reason="HF model is not compatible."),  # noqa: E501
    "InternVLChatModel": _HfExamplesInfo(os.path.join(models_path_prefix, "OpenGVLab/InternVL2-1B"),
@@ -342,11 +361,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                         trust_remote_code=True),
    "Idefics3ForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "HuggingFaceM4/Idefics3-8B-Llama3"),  # noqa: E501
                                                        {"tiny": os.path.join(models_path_prefix, "HuggingFaceTB/SmolVLM-256M-Instruct")}),  # noqa: E501
+    "KeyeForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "Kwai-Keye/Keye-VL-8B-Preview"), # noqa: E501
+                                                    trust_remote_code=True),
    "KimiVLForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "moonshotai/Kimi-VL-A3B-Instruct"),  # noqa: E501
                                                      extras={"thinking": os.path.join(models_path_prefix, "moonshotai/Kimi-VL-A3B-Thinking")},  # noqa: E501
-                                                      trust_remote_code=True,
-                                                      v0_only=True),
-    "Llama4ForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct"),   # noqa: E501
+                                                      trust_remote_code=True),
+    "Llama4ForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct"),   # noqa: E501
                                                      max_model_len=10240),
    "LlavaForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"),
                                                     extras={"mistral": os.path.join(models_path_prefix, "mistral-community/pixtral-12b"), # noqa: E501
@@ -404,6 +424,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                     trust_remote_code=True),
    "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b",  # noqa: E501
                                                        hf_overrides={"architectures": ["TarsierForConditionalGeneration"]}),  # noqa: E501
+    "Tarsier2ForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier2-Recap-7b",  # noqa: E501
+                                                        hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}),  # noqa: E501
    # [Encoder-decoder]
    # Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
    # Therefore, we borrow the BartTokenizer from the original Bart model

--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -22,7 +22,8 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
    model_info.check_transformers_version(on_fail="skip")

    # FIXME: Possible memory leak in the previous tests?
-    if model_arch == "GraniteSpeechForConditionalGeneration":
+    if model_arch in ("GraniteSpeechForConditionalGeneration",
+                      "KimiVLForConditionalGeneration"):
        pytest.skip("Avoid OOM")

    # Avoid OOM and reduce initialization time by only using 1 layer
@@ -31,12 +32,21 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):

        text_config = hf_config.get_text_config()

+        # Ensure at least 2 experts per group
+        # since `grouped_topk` assumes top-2
+        n_group = getattr(text_config, 'n_group', None)
+        num_experts = n_group * 2 if n_group is not None else 2
+
        text_config.update({
            "num_layers": 1,
            "num_hidden_layers": 1,
-            "num_experts": 2,
+            "num_experts": num_experts,
            "num_experts_per_tok": 2,
-            "num_local_experts": 2,
+            "num_local_experts": num_experts,
+            # Otherwise there will not be any expert layers
+            "first_k_dense_replace": 0,
+            # To avoid OOM on DeepSeek-V3
+            "n_routed_experts": num_experts,
        })

        if hasattr(hf_config, "vision_config"):
@@ -80,6 +90,7 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
            model_info.default,
            tokenizer=model_info.tokenizer,
            tokenizer_mode=model_info.tokenizer_mode,
+            revision=model_info.revision,
            speculative_config={
                "model": model_info.speculative_model,
                "num_speculative_tokens": 1,

--- a/tests/models/test_oot_registration.py
+++ b/tests/models/test_oot_registration.py
@@ -53,7 +53,9 @@ def test_oot_registration_embedding(
    with monkeypatch.context() as m:
        m.setenv("VLLM_PLUGINS", "register_dummy_model")
        prompts = ["Hello, my name is", "The text does not matter"]
-        llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy")
+        llm = LLM(model=dummy_gemma2_embedding_path,
+                  load_format="dummy",
+                  max_model_len=2048)
        outputs = llm.embed(prompts)

        for output in outputs:

--- a/tests/models/test_registry.py
+++ b/tests/models/test_registry.py
@@ -10,9 +10,9 @@ import torch.cuda
 from vllm.model_executor.models import (is_pooling_model,
                                        is_text_generation_model,
                                        supports_multimodal)
-from vllm.model_executor.models.adapters import (as_classification_model,
-                                                 as_embedding_model,
-                                                 as_reward_model)
+from vllm.model_executor.models.adapters import (as_embedding_model,
+                                                 as_reward_model,
+                                                 as_seq_cls_model)
 from vllm.model_executor.models.registry import (_MULTIMODAL_MODELS,
                                                 _SPECULATIVE_DECODING_MODELS,
                                                 _TEXT_GENERATION_MODELS,
@@ -46,7 +46,7 @@ def test_registry_imports(model_arch):
        assert is_text_generation_model(model_cls)

    # All vLLM models should be convertible to a pooling model
-    assert is_pooling_model(as_classification_model(model_cls))
+    assert is_pooling_model(as_seq_cls_model(model_cls))
    assert is_pooling_model(as_embedding_model(model_cls))
    assert is_pooling_model(as_reward_model(model_cls))


--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -336,3 +336,10 @@ class EmbedModelInfo(NamedTuple):
    architecture: str = ""
    dtype: str = "auto"
    enable_test: bool = True
+
+
+class RerankModelInfo(NamedTuple):
+    name: str
+    architecture: str = ""
+    dtype: str = "auto"
+    enable_test: bool = True
--- a/tests/mq_llm_engine/test_error_handling.py
+++ b/tests/mq_llm_engine/test_error_handling.py
@@ -68,7 +68,7 @@ async def test_evil_forward(tmp_socket):
        with pytest.raises(MQEngineDeadError):
            async for _ in client.generate(prompt="Hello my name is",
                                           sampling_params=SamplingParams(),
-                                           request_id=uuid.uuid4()):
+                                           request_id=str(uuid.uuid4())):
                pass
        assert client.errored

@@ -117,7 +117,7 @@ async def test_failed_health_check(tmp_socket):
        with pytest.raises(MQEngineDeadError):
            async for _ in client.generate(prompt="Hello my name is",
                                           sampling_params=SamplingParams(),
-                                           request_id=uuid.uuid4()):
+                                           request_id=str(uuid.uuid4())):
                pass

        client.close()
@@ -159,7 +159,7 @@ async def test_failed_abort(tmp_socket):
            async for _ in client.generate(
                    prompt="Hello my name is",
                    sampling_params=SamplingParams(max_tokens=10),
-                    request_id=uuid.uuid4()):
+                    request_id=str(uuid.uuid4())):
                pass
        assert "KeyError" in repr(execinfo.value)
        assert client.errored
@@ -191,7 +191,7 @@ async def test_batch_error(tmp_socket):
            params = SamplingParams(min_tokens=2048, max_tokens=2048)
            async for _ in client.generate(prompt="Hello my name is",
                                           sampling_params=params,
-                                           request_id=uuid.uuid4()):
+                                           request_id=str(uuid.uuid4())):
                pass

        tasks = [asyncio.create_task(do_generate(client)) for _ in range(10)]
@@ -291,7 +291,7 @@ async def test_engine_process_death(tmp_socket):
        with pytest.raises(MQEngineDeadError):
            async for _ in client.generate(prompt="Hello my name is",
                                           sampling_params=SamplingParams(),
-                                           request_id=uuid.uuid4()):
+                                           request_id=str(uuid.uuid4())):
                pass

        # And the health check should show the engine is dead

--- a/tests/multi_step/untest_correctness_llm.py
+++ b/tests/multi_step/untest_correctness_llm.py
@@ -9,6 +9,7 @@ from typing import Optional
 import pytest
 import os

+from vllm.platforms import current_platform
 from vllm.utils import STR_BACKEND_ENV_VAR

 from ..models.utils import check_logprobs_close, check_outputs_equal
@@ -73,6 +74,12 @@ def test_multi_step_llm(
      num_logprobs: corresponds to the `logprobs` argument to the OpenAI
                    completions endpoint; `None` -> 1 logprob returned.
    """
+    if current_platform.is_rocm() and \
+        (attention_backend == "FLASHINFER" or enable_chunked_prefill):
+        pytest.skip(
+            "Multi-Step with FLASHINFER or Chunked-Prefill is not supported "
+            "on ROCm")
+
    with monkeypatch.context() as m:
        m.setenv(STR_BACKEND_ENV_VAR, attention_backend)

@@ -223,6 +230,9 @@ def test_multi_step_llm_w_prompt_logprobs(
 @pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
 @pytest.mark.parametrize("num_logprobs", [None, 5])
 @pytest.mark.parametrize("attention_backend", ["FLASH_ATTN"])
+@pytest.mark.skipif(
+    current_platform.is_rocm(),
+    reason="Multi-Step + Chunked-Prefill not supported on ROCm")
 def test_multi_step_llm_chunked_prefill_prefix_cache(
    vllm_runner,
    example_prompts,

--- a/tests/multimodal/test_hasher.py
+++ b/tests/multimodal/test_hasher.py
@@ -60,3 +60,15 @@ def test_hash_collision_array_shape():

    hasher = MultiModalHasher
    assert hasher.hash_kwargs(data=arr1) != hasher.hash_kwargs(data=arr2)
+
+
+def test_hash_non_contiguous_array():
+    arr = np.arange(24).reshape(4, 6).T
+    assert not arr.flags.c_contiguous
+
+    arr_c = np.ascontiguousarray(arr)
+    assert arr_c.flags.c_contiguous
+
+    hasher = MultiModalHasher
+    # Both should be hashable and produce the same hashes
+    assert hasher.hash_kwargs(data=arr) == hasher.hash_kwargs(data=arr_c)
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -1086,6 +1086,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
        prompt="",
        mm_data={},
        mm_kwargs=call_kwargs,
+        tok_kwargs={},
    )

    assert out_kwargs == expected_kwargs
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -169,12 +169,15 @@ async def test_fetch_image_error_conversion():
 @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
 @pytest.mark.parametrize("num_frames", [-1, 32, 1800])
 async def test_fetch_video_http(video_url: str, num_frames: int):
-    connector = MediaConnector()
+    connector = MediaConnector(
+        media_io_kwargs={"video": {
+            "num_frames": num_frames,
+        }})

-    video_sync = connector.fetch_video(video_url, num_frames=num_frames)
-    video_async = await connector.fetch_video_async(video_url,
-                                                    num_frames=num_frames)
+    video_sync, metadata_sync = connector.fetch_video(video_url)
+    video_async, metadata_async = await connector.fetch_video_async(video_url)
    assert np.array_equal(video_sync, video_async)
+    assert metadata_sync == metadata_async


 # Used for the next two tests related to `merge_and_sort_multimodal_metadata`.

--- a/tests/multimodal/test_video.py
+++ b/tests/multimodal/test_video.py
@@ -4,7 +4,10 @@ import numpy as np
 import numpy.typing as npt
 import pytest

-from vllm.multimodal.video import VIDEO_LOADER_REGISTRY, VideoLoader
+from vllm import envs
+from vllm.multimodal.image import ImageMediaIO
+from vllm.multimodal.video import (VIDEO_LOADER_REGISTRY, VideoLoader,
+                                   VideoMediaIO)

 NUM_FRAMES = 10
 FAKE_OUTPUT_1 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
@@ -40,3 +43,46 @@ def test_video_loader_registry():
 def test_video_loader_type_doesnt_exist():
    with pytest.raises(AssertionError):
        VIDEO_LOADER_REGISTRY.load("non_existing_video_loader")
+
+
+@VIDEO_LOADER_REGISTRY.register("assert_10_frames_1_fps")
+class Assert10Frames1FPSVideoLoader(VideoLoader):
+
+    @classmethod
+    def load_bytes(cls,
+                   data: bytes,
+                   num_frames: int = -1,
+                   fps: float = -1.0,
+                   **kwargs) -> npt.NDArray:
+        assert num_frames == 10, "bad num_frames"
+        assert fps == 1.0, "bad fps"
+        return FAKE_OUTPUT_2
+
+
+def test_video_media_io_kwargs():
+    envs.VLLM_VIDEO_LOADER_BACKEND = "assert_10_frames_1_fps"
+    imageio = ImageMediaIO()
+
+    # Verify that different args pass/fail assertions as expected.
+    videoio = VideoMediaIO(imageio, **{"num_frames": 10, "fps": 1.0})
+    _ = videoio.load_bytes(b"test")
+
+    videoio = VideoMediaIO(
+        imageio, **{
+            "num_frames": 10,
+            "fps": 1.0,
+            "not_used": "not_used"
+        })
+    _ = videoio.load_bytes(b"test")
+
+    with pytest.raises(AssertionError, match="bad num_frames"):
+        videoio = VideoMediaIO(imageio, **{})
+        _ = videoio.load_bytes(b"test")
+
+    with pytest.raises(AssertionError, match="bad num_frames"):
+        videoio = VideoMediaIO(imageio, **{"num_frames": 9, "fps": 1.0})
+        _ = videoio.load_bytes(b"test")
+
+    with pytest.raises(AssertionError, match="bad fps"):
+        videoio = VideoMediaIO(imageio, **{"num_frames": 10, "fps": 2.0})
+        _ = videoio.load_bytes(b"test")
--- a/tests/neuron/1_core/test_prefix_prefill.py
+++ b/tests/neuron/1_core/test_prefix_prefill.py
@@ -7,6 +7,8 @@ import pytest
 import torch
 import torch.nn.functional as F

+from vllm.utils import cdiv
+

 class BlockDiagonalCausalFromBottomRightMask:

@@ -398,11 +400,8 @@ def test_contexted_kv_attention(
        assert (large_tile_size >= B_P_SIZE
                ), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}"

-        def ceil_div(a, b):
-            return (a + b - 1) // b
-
        def pad_to_multiple(a, b):
-            return ceil_div(a, b) * b
+            return cdiv(a, b) * b

        def pad_to_next_power_of_2(a):
            assert a > 0
@@ -411,7 +410,7 @@ def test_contexted_kv_attention(
        # calculate input shapes
        max_num_queries = pad_to_next_power_of_2(sum(query_lens))
        context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens)
-        num_active_blocks = ceil_div(context_lens, block_size).sum().item()
+        num_active_blocks = cdiv(context_lens, block_size).sum().item()
        num_active_blocks = pad_to_multiple(num_active_blocks,
                                            large_tile_size // block_size)
        context_kv_len = num_active_blocks * block_size

--- a/tests/plugins/vllm_add_dummy_platform/setup.py
+++ b/tests/plugins/vllm_add_dummy_platform/setup.py
@@ -10,5 +10,7 @@ setup(
    entry_points={
        'vllm.platform_plugins': [
            "dummy_platform_plugin = vllm_add_dummy_platform:dummy_platform_plugin"  # noqa
-        ]
+        ],
+        "vllm.general_plugins":
+        ["dummy_custom_ops = vllm_add_dummy_platform:register_ops"],
    })
--- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
@@ -6,3 +6,7 @@ from typing import Optional

 def dummy_platform_plugin() -> Optional[str]:
    return "vllm_add_dummy_platform.dummy_platform.DummyPlatform"
+
+
+def register_ops():
+    import vllm_add_dummy_platform.dummy_custom_ops  # noqa