sync v0.15.1

c721b814 · zhuwenwen · d53fe7e5 · c721b814 · c721b814 · c721b814
Commit c721b814 authored Feb 05, 2026 by zhuwenwen
20 changed files
--- a/tests/kernels/attention/test_flashmla.py
+++ b/tests/kernels/attention/test_flashmla.py
@@ -104,16 +104,18 @@ def test_flash_mla(
        descale_k = None
    def flash_mla():
-        return flash_mla_with_kvcache(q,
+        return flash_mla_with_kvcache(
-                                      blocked_k,
+            q,
-                                      block_table,
+            blocked_k,
-                                      cache_seqlens,
+            block_table,
-                                      dv,
+            cache_seqlens,
-                                      tile_scheduler_metadata,
+            dv,
-                                      num_splits,
+            tile_scheduler_metadata,
-                                      causal=causal,
+            num_splits,
-                                      descale_q=descale_q,
+            causal=causal,
-                                      descale_k=descale_k)
+            descale_q=descale_q,
+            descale_k=descale_k,
+        )
    def scaled_dot_product_attention(query, key, value, is_causal=False):
        query = query.float()

--- a/tests/kernels/moe/modular_kernel_tools/common.py
+++ b/tests/kernels/moe/modular_kernel_tools/common.py
@@ -22,9 +22,6 @@ from vllm.distributed import (
 )
 from vllm.forward_context import set_forward_context
 from vllm.model_executor.layers.fused_moe import fused_topk
-from vllm.model_executor.layers.fused_moe.all2all_utils import (
-    maybe_make_prepare_finalize,
-)
 from vllm.model_executor.layers.fused_moe.config import (
    FusedMoEConfig,
    FusedMoEParallelConfig,
@@ -43,6 +40,7 @@ from .mk_objects import (
    TestMoEQuantConfig,
    expert_info,
    make_fused_experts,
+    make_prepare_finalize,
    prepare_finalize_info,
 )
 from .parallel_utils import ProcessGroupInfo
@@ -605,10 +603,9 @@ def make_modular_kernel(
        routing_method=RoutingMethodType.DeepSeekV3,
    )
-    prepare_finalize = maybe_make_prepare_finalize(
+    # make modular kernel
-        moe=moe,
+    prepare_finalize = make_prepare_finalize(
-        quant_config=quant_config,
+        config.prepare_finalize_type, config.all2all_backend(), moe, quant_config
-        allow_new_interface=True,
    )
    assert prepare_finalize is not None

--- a/tests/kernels/moe/modular_kernel_tools/mk_objects.py
+++ b/tests/kernels/moe/modular_kernel_tools/mk_objects.py
@@ -7,6 +7,9 @@ import torch
 # Fused experts and PrepareFinalize imports
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.model_executor.layers.fused_moe import TritonExperts
+from vllm.model_executor.layers.fused_moe.all2all_utils import (
+    maybe_make_prepare_finalize,
+)
 from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
    BatchedDeepGemmExperts,
 )
@@ -252,12 +255,13 @@ if has_pplx():
    )
 if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability(100):
-    from vllm.model_executor.layers.fused_moe.flashinfer_a2a_prepare_finalize import (  # noqa: E501
-        FlashInferCutlassMoEPrepareAndFinalize,
-    )
    from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
        FlashInferExperts,
    )
+    from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import (  # noqa: E501
+        FlashInferCutlassMoEPrepareAndFinalize,
+        create_flashinfer_prepare_finalize,
+    )
    register_prepare_and_finalize(
        FlashInferCutlassMoEPrepareAndFinalize,
@@ -425,6 +429,24 @@ if cutlass_fp4_supported() or has_flashinfer_cutlass_fused_moe():
    ]
+def make_prepare_finalize(
+    prepare_finalize_type: mk.FusedMoEPrepareAndFinalize,
+    backend: str | None,
+    moe: FusedMoEConfig,
+    quant_config: FusedMoEQuantConfig,
+) -> mk.FusedMoEPrepareAndFinalize:
+    if backend != "naive" and backend is not None:
+        prepare_finalize = maybe_make_prepare_finalize(moe, quant_config)
+        assert prepare_finalize is not None
+        return prepare_finalize
+    elif prepare_finalize_type == FlashInferCutlassMoEPrepareAndFinalize:
+        return create_flashinfer_prepare_finalize(
+            use_dp=moe.moe_parallel_config.dp_size > 1
+        )
+    else:
+        return MoEPrepareAndFinalizeNoEP()
 def _slice(rank: int, num_local_experts: int, t: torch.Tensor) -> torch.Tensor:
    s = rank * num_local_experts
    e = s + num_local_experts

--- a/tests/kernels/moe/test_flashinfer.py
+++ b/tests/kernels/moe/test_flashinfer.py
@@ -294,7 +294,12 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
        )
        kernel = mk.FusedMoEModularKernel(
-            MoEPrepareAndFinalizeNoEP(),
+            MoEPrepareAndFinalizeNoEP(
+                defer_input_quant=FlashInferExperts.expects_unquantized_inputs(
+                    moe_config=moe_config,
+                    quant_config=quant_config,
+                )
+            ),
            FlashInferExperts(
                moe_config=moe_config,
                quant_config=quant_config,

--- a/tests/kernels/moe/test_flashinfer_moe.py
+++ b/tests/kernels/moe/test_flashinfer_moe.py
@@ -106,7 +106,12 @@ def test_flashinfer_fp4_moe_no_graph(
        )
        flashinfer_experts = FusedMoEModularKernel(
-            MoEPrepareAndFinalizeNoEP(),
+            MoEPrepareAndFinalizeNoEP(
+                defer_input_quant=FlashInferExperts.expects_unquantized_inputs(
+                    moe_config=moe_config,
+                    quant_config=quant_config,
+                )
+            ),
            FlashInferExperts(moe_config=moe_config, quant_config=quant_config),
        )

--- a/tests/kernels/moe/test_nvfp4_moe.py
+++ b/tests/kernels/moe/test_nvfp4_moe.py
@@ -90,7 +90,7 @@ def test_cutlass_fp4_moe_no_graph(
        )
        kernel = mk.FusedMoEModularKernel(
-            MoEPrepareAndFinalizeNoEP(),
+            MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
            CutlassExpertsFp4(
                moe_config=make_dummy_moe_config(),
                quant_config=quant_config,

--- a/tests/models/language/pooling/test_token_classification.py
+++ b/tests/models/language/pooling/test_token_classification.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import random
-import numpy as np
 import pytest
 import torch
 from transformers import AutoModelForTokenClassification
@@ -11,20 +9,6 @@ from tests.models.utils import softmax
 from vllm.platforms import current_platform
-@pytest.fixture(autouse=True)
-def seed_everything():
-    """Seed all random number generators for reproducibility."""
-    seed = 0
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed_all(seed)
-    torch.backends.cudnn.deterministic = True
-    torch.backends.cudnn.benchmark = False
-    yield
 @pytest.mark.parametrize("model", ["boltuix/NeuroBERT-NER"])
 # The float32 is required for this tiny model to pass the test.
 @pytest.mark.parametrize("dtype", ["float"])
@@ -68,7 +52,6 @@ def test_bert_models(
 @pytest.mark.parametrize("model", ["disham993/electrical-ner-ModernBERT-base"])
 @pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.flaky(reruns=3)
 @torch.inference_mode
 def test_modernbert_models(
    hf_runner,
@@ -77,14 +60,6 @@ def test_modernbert_models(
    model: str,
    dtype: str,
 ) -> None:
-    # NOTE: https://github.com/vllm-project/vllm/pull/32403
-    # `disham993/electrical-ner-ModernBERT-base` is a randomly initialized
-    # model, which can cause numerical precision variance and edge cases.
-    # We use @flaky(reruns=3) to mitigate intermittent failures.
-    print(
-        f"\n[NOTE] Testing {model} (randomly initialized weights) - "
-        "flaky tolerance enabled due to numerical precision variance."
-    )
    with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.token_classify(example_prompts)

--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -458,20 +458,6 @@ VLM_TEST_SETTINGS = {
        ],
        marks=[large_gpu_mark(min_gb=32)],
    ),
-    "glm_ocr": VLMTestInfo(
-        models=["zai-org/GLM-OCR"],
-        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt: f"[gMASK]<|user|>\n{img_prompt}<|assistant|>\n",  # noqa: E501
-        img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>",
-        video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>",
-        max_model_len=2048,
-        max_num_seqs=2,
-        get_stop_token_ids=lambda tok: [151329, 151336, 151338],
-        num_logprobs=10,
-        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
-        auto_cls=AutoModelForImageTextToText,
-        marks=[large_gpu_mark(min_gb=32)],
-    ),
    "h2ovl": VLMTestInfo(
        models=[
            "h2oai/h2ovl-mississippi-800m",
@@ -587,21 +573,6 @@ VLM_TEST_SETTINGS = {
        vllm_output_post_proc=model_utils.kimiv_vl_vllm_to_hf_output,
        marks=[large_gpu_mark(min_gb=48)],
    ),
-    "llama4": VLMTestInfo(
-        models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
-        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n",  # noqa: E501
-        img_idx_to_prompt=lambda _: "<|image|>",
-        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        distributed_executor_backend="mp",
-        image_size_factors=[(0.25, 0.5, 1.0)],
-        hf_model_kwargs={"device_map": "auto"},
-        max_model_len=8192,
-        max_num_seqs=4,
-        dtype="bfloat16",
-        auto_cls=AutoModelForImageTextToText,
-        tensor_parallel_size=4,
-        marks=multi_gpu_marks(num_gpus=4),
-    ),
    "llava_next": VLMTestInfo(
        models=["llava-hf/llava-v1.6-mistral-7b-hf"],
        test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),

--- a/tests/models/multimodal/generation/test_vit_backend_functionality.py
+++ b/tests/models/multimodal/generation/test_vit_backend_functionality.py
@@ -91,19 +91,6 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
        "use_processor": True,
        "question": "What is the content of each image?",
    },
-    "glm_ocr": {
-        "model_name": "zai-org/GLM-OCR",
-        "interface": "llm_generate",
-        "max_model_len": 131072,
-        "max_num_seqs": 2,
-        "sampling_params": {
-            "temperature": 0.0,
-            "max_tokens": 256,
-            "stop_token_ids": None,
-        },
-        "use_processor": True,
-        "question": "Text Recognition:",
-    },
    "keye_vl": {
        "model_name": "Kwai-Keye/Keye-VL-8B-Preview",
        "interface": "llm_generate",

--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -122,7 +122,6 @@ MM_DATA_PATCHES = {
    "ernie4_5_moe_vl": qwen3_vl_patch_mm_data,
    "glm4v": glm4_1v_patch_mm_data,
    "glm4v_moe": glm4_1v_patch_mm_data,
-    "glm_ocr": glm4_1v_patch_mm_data,
    "glmasr": glmasr_patch_mm_data,
    "molmo2": qwen3_vl_patch_mm_data,
    "qwen3_vl": qwen3_vl_patch_mm_data,

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -256,7 +256,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    ),
    "Exaone4ForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-4.0-32B"),
    "ExaoneMoEForCausalLM": _HfExamplesInfo(
-        "LGAI-EXAONE/K-EXAONE-236B-A23B", min_transformers_version="5.1.0"
+        "LGAI-EXAONE/K-EXAONE-236B-A23B", min_transformers_version="5.0.0"
    ),
    "Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"),
    "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"),
@@ -273,7 +273,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "Glm4MoeForCausalLM": _HfExamplesInfo("zai-org/GLM-4.5"),
    "Glm4MoeLiteForCausalLM": _HfExamplesInfo(
        "zai-org/GLM-4.7-Flash",
-        min_transformers_version="5.0.0",
+        min_transformers_version="5.0.0.dev",
+        is_available_online=False,
    ),
    "GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2", {"alias": "gpt2"}),
    "GPTBigCodeForCausalLM": _HfExamplesInfo(
@@ -653,7 +654,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    # [Decoder-only]
    "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
    "AudioFlamingo3ForConditionalGeneration": _HfExamplesInfo(
-        "nvidia/audio-flamingo-3-hf", min_transformers_version="5.0.0"
+        "nvidia/audio-flamingo-3-hf", min_transformers_version="5.0.0.dev"
    ),
    "AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereLabs/aya-vision-8b"),
    "BagelForConditionalGeneration": _HfExamplesInfo("ByteDance-Seed/BAGEL-7B-MoT"),
@@ -696,7 +697,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    "GlmAsrForConditionalGeneration": _HfExamplesInfo(
        "zai-org/GLM-ASR-Nano-2512",
        trust_remote_code=True,
-        min_transformers_version="5.0.0",
+        min_transformers_version="5.0",
    ),
    "GraniteVision": _HfExamplesInfo("ibm-granite/granite-vision-3.3-2b"),
    "GraniteSpeechForConditionalGeneration": _HfExamplesInfo(
@@ -709,11 +710,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    ),
    "Glm4vForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.1V-9B-Thinking"),
    "Glm4vMoeForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.5V"),
-    "GlmOcrForConditionalGeneration": _HfExamplesInfo(
-        "zai-org/GLM-OCR",
-        is_available_online=False,
-        min_transformers_version="5.1.0",
-    ),
    "H2OVLChatModel": _HfExamplesInfo(
        "h2oai/h2ovl-mississippi-800m",
        trust_remote_code=True,
@@ -1056,7 +1052,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
    "ExaoneMoeMTP": _HfExamplesInfo(
        "LGAI-EXAONE/K-EXAONE-236B-A23B",
        speculative_model="LGAI-EXAONE/K-EXAONE-236B-A23B",
-        min_transformers_version="5.1.0",
+        min_transformers_version="5.0.0",
    ),
    "Glm4MoeMTPModel": _HfExamplesInfo(
        "zai-org/GLM-4.5",
@@ -1067,12 +1063,6 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
        speculative_model="zai-org/GLM-4.7-Flash",
        min_transformers_version="5.0.0",
    ),
-    "GlmOcrMTPModel": _HfExamplesInfo(
-        "zai-org/GLM-OCR",
-        speculative_model="zai-org/GLM-OCR",
-        is_available_online=False,
-        min_transformers_version="5.1.0",
-    ),
    "LongCatFlashMTPModel": _HfExamplesInfo(
        "meituan-longcat/LongCat-Flash-Chat",
        trust_remote_code=True,
@@ -1104,27 +1094,27 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
 _TRANSFORMERS_BACKEND_MODELS = {
    "TransformersEmbeddingModel": _HfExamplesInfo(
-        "BAAI/bge-base-en-v1.5", min_transformers_version="5.0.0"
+        "BAAI/bge-base-en-v1.5", min_transformers_version="5.0.0.dev"
    ),
    "TransformersForSequenceClassification": _HfExamplesInfo(
        "papluca/xlm-roberta-base-language-detection",
-        min_transformers_version="5.0.0",
+        min_transformers_version="5.0.0.dev",
    ),
    "TransformersForCausalLM": _HfExamplesInfo(
        "hmellor/Ilama-3.2-1B", trust_remote_code=True
    ),
    "TransformersMultiModalForCausalLM": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
    "TransformersMoEForCausalLM": _HfExamplesInfo(
-        "allenai/OLMoE-1B-7B-0924", min_transformers_version="5.0.0"
+        "allenai/OLMoE-1B-7B-0924", min_transformers_version="5.0.0.dev"
    ),
    "TransformersMultiModalMoEForCausalLM": _HfExamplesInfo(
-        "Qwen/Qwen3-VL-30B-A3B-Instruct", min_transformers_version="5.0.0"
+        "Qwen/Qwen3-VL-30B-A3B-Instruct", min_transformers_version="5.0.0.dev"
    ),
    "TransformersMoEEmbeddingModel": _HfExamplesInfo(
-        "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0"
+        "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0.dev"
    ),
    "TransformersMoEForSequenceClassification": _HfExamplesInfo(
-        "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0"
+        "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0.dev"
    ),
    "TransformersMultiModalEmbeddingModel": _HfExamplesInfo("google/gemma-3-4b-it"),
    "TransformersMultiModalForSequenceClassification": _HfExamplesInfo(

--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -88,6 +88,7 @@ def can_initialize(
            [10 * GiB_bytes],
        )
        scheduler_kv_cache_config = generate_scheduler_kv_cache_config(kv_cache_configs)
        # gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config
        return 1, 0, scheduler_kv_cache_config

--- a/tests/models/test_transformers.py
+++ b/tests/models/test_transformers.py
@@ -78,7 +78,7 @@ def test_models(
    from packaging.version import Version
    installed = Version(transformers.__version__)
-    required = Version("5.0.0")
+    required = Version("5.0.0.dev")
    if model == "allenai/OLMoE-1B-7B-0924" and installed < required:
        pytest.skip(
            "MoE models with the Transformers modeling backend require "

--- a/tests/plugins/lora_resolvers/test_hf_hub_resolver.py
+++ b/tests/plugins/lora_resolvers/test_hf_hub_resolver.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os
-import pytest
-from huggingface_hub.constants import HF_HUB_CACHE
-from vllm.plugins.lora_resolvers.hf_hub_resolver import HfHubResolver
-LORA_LIB_MODEL_NAME = "ibm-granite/granite-3.3-8b-instruct"
-# Repo with multiple LoRAs contained in it
-LORA_LIB = "ibm-granite/granite-3.3-8b-rag-agent-lib"
-LORA_NAME = "ibm-granite/granite-3.3-8b-rag-agent-lib/answerability_prediction_lora"  # noqa: E501
-NON_LORA_SUBPATH = "ibm-granite/granite-3.3-8b-rag-agent-lib/README.md"
-LIB_DOWNLOAD_DIR = os.path.join(
-    HF_HUB_CACHE, "models--ibm-granite--granite-3.3-8b-rag-agent-lib"
-)
-INVALID_REPO_NAME = "thisrepodoesnotexist"
-# Repo with only one LoRA in the root dir
-LORA_REPO_MODEL_NAME = "meta-llama/Llama-2-7b-hf"
-LORA_REPO = "yard1/llama-2-7b-sql-lora-test"
-REPO_DOWNLOAD_DIR = os.path.join(
-    HF_HUB_CACHE, "models--yard1--llama-2-7b-sql-lora-test"
-)
-@pytest.mark.asyncio
-async def test_hf_resolver_with_direct_path():
-    hf_resolver = HfHubResolver([LORA_REPO])
-    assert hf_resolver is not None
-    lora_request = await hf_resolver.resolve_lora(LORA_REPO_MODEL_NAME, LORA_REPO)
-    assert lora_request.lora_name == LORA_REPO
-    assert REPO_DOWNLOAD_DIR in lora_request.lora_path
-    assert "adapter_config.json" in os.listdir(lora_request.lora_path)
-@pytest.mark.asyncio
-async def test_hf_resolver_with_nested_paths():
-    hf_resolver = HfHubResolver([LORA_LIB])
-    assert hf_resolver is not None
-    lora_request = await hf_resolver.resolve_lora(LORA_LIB_MODEL_NAME, LORA_NAME)
-    assert lora_request is not None
-    assert lora_request.lora_name == LORA_NAME
-    assert LIB_DOWNLOAD_DIR in lora_request.lora_path
-    assert "adapter_config.json" in os.listdir(lora_request.lora_path)
-@pytest.mark.asyncio
-async def test_hf_resolver_with_multiple_repos():
-    hf_resolver = HfHubResolver([LORA_LIB, LORA_REPO])
-    assert hf_resolver is not None
-    lora_request = await hf_resolver.resolve_lora(LORA_LIB_MODEL_NAME, LORA_NAME)
-    assert lora_request is not None
-    assert lora_request.lora_name == LORA_NAME
-    assert LIB_DOWNLOAD_DIR in lora_request.lora_path
-    assert "adapter_config.json" in os.listdir(lora_request.lora_path)
-@pytest.mark.asyncio
-async def test_missing_adapter():
-    hf_resolver = HfHubResolver([LORA_LIB])
-    assert hf_resolver is not None
-    missing_lora_request = await hf_resolver.resolve_lora(LORA_LIB_MODEL_NAME, "foobar")
-    assert missing_lora_request is None
-@pytest.mark.asyncio
-async def test_nonlora_adapter():
-    hf_resolver = HfHubResolver([LORA_LIB])
-    assert hf_resolver is not None
-    readme_request = await hf_resolver.resolve_lora(
-        LORA_LIB_MODEL_NAME, NON_LORA_SUBPATH
-    )
-    assert readme_request is None
-@pytest.mark.asyncio
-async def test_invalid_repo():
-    hf_resolver = HfHubResolver([LORA_LIB])
-    assert hf_resolver is not None
-    invalid_repo_req = await hf_resolver.resolve_lora(
-        INVALID_REPO_NAME,
-        f"{INVALID_REPO_NAME}/foo",
-    )
-    assert invalid_repo_req is None
-@pytest.mark.asyncio
-async def test_trailing_slash():
-    hf_resolver = HfHubResolver([LORA_LIB])
-    assert hf_resolver is not None
-    lora_request = await hf_resolver.resolve_lora(
-        LORA_LIB_MODEL_NAME,
-        f"{LORA_NAME}/",
-    )
-    assert lora_request is not None
-    assert lora_request.lora_name == f"{LORA_NAME}/"
-    assert LIB_DOWNLOAD_DIR in lora_request.lora_path
-    assert "adapter_config.json" in os.listdir(lora_request.lora_path)
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
@@ -36,7 +36,7 @@ class MyGemma2Embedding(nn.Module):
    def forward(
        self,
-        input_ids: torch.Tensor | None,
+        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,

--- a/tests/test_access_log_filter.py
+++ b/tests/test_access_log_filter.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-Tests for the UvicornAccessLogFilter class.
-"""
-import logging
-from vllm.logging_utils.access_log_filter import (
-    UvicornAccessLogFilter,
-    create_uvicorn_log_config,
-)
-class TestUvicornAccessLogFilter:
-    """Test cases for UvicornAccessLogFilter."""
-    def test_filter_allows_all_when_no_excluded_paths(self):
-        """Filter should allow all logs when no paths are excluded."""
-        filter = UvicornAccessLogFilter(excluded_paths=[])
-        record = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "GET", "/v1/completions", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record) is True
-    def test_filter_allows_all_when_excluded_paths_is_none(self):
-        """Filter should allow all logs when excluded_paths is None."""
-        filter = UvicornAccessLogFilter(excluded_paths=None)
-        record = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "GET", "/health", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record) is True
-    def test_filter_excludes_health_endpoint(self):
-        """Filter should exclude /health endpoint when configured."""
-        filter = UvicornAccessLogFilter(excluded_paths=["/health"])
-        record = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "GET", "/health", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record) is False
-    def test_filter_excludes_metrics_endpoint(self):
-        """Filter should exclude /metrics endpoint when configured."""
-        filter = UvicornAccessLogFilter(excluded_paths=["/metrics"])
-        record = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "GET", "/metrics", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record) is False
-    def test_filter_allows_non_excluded_endpoints(self):
-        """Filter should allow endpoints not in the excluded list."""
-        filter = UvicornAccessLogFilter(excluded_paths=["/health", "/metrics"])
-        record = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "POST", "/v1/completions", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record) is True
-    def test_filter_excludes_multiple_endpoints(self):
-        """Filter should exclude multiple configured endpoints."""
-        filter = UvicornAccessLogFilter(excluded_paths=["/health", "/metrics", "/ping"])
-        # Test /health
-        record_health = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "GET", "/health", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record_health) is False
-        # Test /metrics
-        record_metrics = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "GET", "/metrics", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record_metrics) is False
-        # Test /ping
-        record_ping = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "GET", "/ping", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record_ping) is False
-    def test_filter_with_query_parameters(self):
-        """Filter should exclude endpoints even with query parameters."""
-        filter = UvicornAccessLogFilter(excluded_paths=["/health"])
-        record = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "GET", "/health?verbose=true", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record) is False
-    def test_filter_different_http_methods(self):
-        """Filter should exclude endpoints regardless of HTTP method."""
-        filter = UvicornAccessLogFilter(excluded_paths=["/ping"])
-        # Test GET
-        record_get = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "GET", "/ping", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record_get) is False
-        # Test POST
-        record_post = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "POST", "/ping", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record_post) is False
-    def test_filter_with_different_status_codes(self):
-        """Filter should exclude endpoints regardless of status code."""
-        filter = UvicornAccessLogFilter(excluded_paths=["/health"])
-        for status_code in [200, 500, 503]:
-            record = logging.LogRecord(
-                name="uvicorn.access",
-                level=logging.INFO,
-                pathname="",
-                lineno=0,
-                msg='%s - "%s %s HTTP/%s" %d',
-                args=("127.0.0.1:12345", "GET", "/health", "1.1", status_code),
-                exc_info=None,
-            )
-            assert filter.filter(record) is False
-class TestCreateUvicornLogConfig:
-    """Test cases for create_uvicorn_log_config function."""
-    def test_creates_valid_config_structure(self):
-        """Config should have required logging configuration keys."""
-        config = create_uvicorn_log_config(excluded_paths=["/health"])
-        assert "version" in config
-        assert config["version"] == 1
-        assert "disable_existing_loggers" in config
-        assert "formatters" in config
-        assert "handlers" in config
-        assert "loggers" in config
-        assert "filters" in config
-    def test_config_includes_access_log_filter(self):
-        """Config should include the access log filter."""
-        config = create_uvicorn_log_config(excluded_paths=["/health", "/metrics"])
-        assert "access_log_filter" in config["filters"]
-        filter_config = config["filters"]["access_log_filter"]
-        assert filter_config["()"] == UvicornAccessLogFilter
-        assert filter_config["excluded_paths"] == ["/health", "/metrics"]
-    def test_config_applies_filter_to_access_handler(self):
-        """Config should apply the filter to the access handler."""
-        config = create_uvicorn_log_config(excluded_paths=["/health"])
-        assert "access" in config["handlers"]
-        assert "filters" in config["handlers"]["access"]
-        assert "access_log_filter" in config["handlers"]["access"]["filters"]
-    def test_config_with_custom_log_level(self):
-        """Config should respect custom log level."""
-        config = create_uvicorn_log_config(
-            excluded_paths=["/health"], log_level="debug"
-        )
-        assert config["loggers"]["uvicorn"]["level"] == "DEBUG"
-        assert config["loggers"]["uvicorn.access"]["level"] == "DEBUG"
-        assert config["loggers"]["uvicorn.error"]["level"] == "DEBUG"
-    def test_config_with_empty_excluded_paths(self):
-        """Config should work with empty excluded paths."""
-        config = create_uvicorn_log_config(excluded_paths=[])
-        assert config["filters"]["access_log_filter"]["excluded_paths"] == []
-    def test_config_with_none_excluded_paths(self):
-        """Config should work with None excluded paths."""
-        config = create_uvicorn_log_config(excluded_paths=None)
-        assert config["filters"]["access_log_filter"]["excluded_paths"] == []
-class TestIntegration:
-    """Integration tests for the access log filter."""
-    def test_filter_with_real_logger(self):
-        """Test filter works with a real Python logger simulating uvicorn."""
-        # Create a logger with our filter (simulating uvicorn.access)
-        logger = logging.getLogger("uvicorn.access")
-        logger.setLevel(logging.INFO)
-        # Clear any existing handlers
-        logger.handlers = []
-        # Create a custom handler that tracks messages
-        logged_messages: list[str] = []
-        class TrackingHandler(logging.Handler):
-            def emit(self, record):
-                logged_messages.append(record.getMessage())
-        handler = TrackingHandler()
-        handler.setLevel(logging.INFO)
-        filter = UvicornAccessLogFilter(excluded_paths=["/health", "/metrics"])
-        handler.addFilter(filter)
-        logger.addHandler(handler)
-        # Log using uvicorn's format with args tuple
-        # Format: '%s - "%s %s HTTP/%s" %d'
-        logger.info(
-            '%s - "%s %s HTTP/%s" %d',
-            "127.0.0.1:12345",
-            "GET",
-            "/health",
-            "1.1",
-            200,
-        )
-        logger.info(
-            '%s - "%s %s HTTP/%s" %d',
-            "127.0.0.1:12345",
-            "GET",
-            "/v1/completions",
-            "1.1",
-            200,
-        )
-        logger.info(
-            '%s - "%s %s HTTP/%s" %d',
-            "127.0.0.1:12345",
-            "GET",
-            "/metrics",
-            "1.1",
-            200,
-        )
-        logger.info(
-            '%s - "%s %s HTTP/%s" %d',
-            "127.0.0.1:12345",
-            "POST",
-            "/v1/chat/completions",
-            "1.1",
-            200,
-        )
-        # Verify only non-excluded endpoints were logged
-        assert len(logged_messages) == 2
-        assert "/v1/completions" in logged_messages[0]
-        assert "/v1/chat/completions" in logged_messages[1]
-    def test_filter_allows_non_uvicorn_access_logs(self):
-        """Test filter allows logs from non-uvicorn.access loggers."""
-        filter = UvicornAccessLogFilter(excluded_paths=["/health"])
-        # Log record from a different logger name
-        record = logging.LogRecord(
-            name="uvicorn.error",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg="Some error message about /health",
-            args=(),
-            exc_info=None,
-        )
-        # Should allow because it's not from uvicorn.access
-        assert filter.filter(record) is True
-    def test_filter_handles_malformed_args(self):
-        """Test filter handles log records with unexpected args format."""
-        filter = UvicornAccessLogFilter(excluded_paths=["/health"])
-        # Log record with insufficient args
-        record = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg="Some message",
-            args=("only", "two"),
-            exc_info=None,
-        )
-        # Should allow because args doesn't have expected format
-        assert filter.filter(record) is True
-    def test_filter_handles_non_tuple_args(self):
-        """Test filter handles log records with non-tuple args."""
-        filter = UvicornAccessLogFilter(excluded_paths=["/health"])
-        # Log record with None args
-        record = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg="Some message without args",
-            args=None,
-            exc_info=None,
-        )
-        # Should allow because args is None
-        assert filter.filter(record) is True
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -455,7 +455,7 @@ def test_eagle_correctness(
        from packaging.version import Version
        installed = Version(transformers.__version__)
-        required = Version("5.0.0")
+        required = Version("5.0.0.dev")
        if installed < required:
            pytest.skip(
                "Eagle3 with the Transformers modeling backend requires "

--- a/tests/v1/kv_connector/unit/utils.py
+++ b/tests/v1/kv_connector/unit/utils.py
@@ -112,13 +112,6 @@ def create_vllm_config(
        enable_chunked_prefill=enable_chunked_prefill,
        is_encoder_decoder=model_config.is_encoder_decoder,
    )
-    scheduler_config = SchedulerConfig(
-        max_num_seqs=max_num_seqs,
-        max_num_batched_tokens=max_num_batched_tokens,
-        max_model_len=max_model_len,
-        enable_chunked_prefill=enable_chunked_prefill,
-        is_encoder_decoder=model_config.is_encoder_decoder,
-    )
    # Cache config, optionally force APC
    cache_config = CacheConfig(
        block_size=block_size,

--- a/tests/v1/spec_decode/test_eagle.py
+++ b/tests/v1/spec_decode/test_eagle.py
@@ -372,8 +372,6 @@ def test_load_model(
    all_indx_layers: dict[str, mock.MagicMock] = {}
-    all_indx_layers: dict[str, mock.MagicMock] = {}
    # Make mock_get_layers return different values for each call
    mock_get_layers.side_effect = [
        target_attn_layers,

--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -2831,13 +2831,13 @@ if hasattr(torch.ops._C, "int8_scaled_mm_with_quant"):
 class CPUDNNLGEMMHandler:
    def __init__(self) -> None:
-        self.handler_tensor: torch.Tensor | None = None
+        self.handler: int | None = None
        self.n = -1
        self.k = -1
    def __del__(self):
-        if self.handler_tensor is not None:
+        if self.handler is not None:
-            torch.ops._C.release_dnnl_matmul_handler(self.handler_tensor.item())
+            torch.ops._C.release_dnnl_matmul_handler(self.handler)
 _supports_onednn = bool(hasattr(torch.ops._C, "create_onednn_mm_handler"))
@@ -2853,10 +2853,8 @@ def create_onednn_mm(
 ) -> CPUDNNLGEMMHandler:
    handler = CPUDNNLGEMMHandler()
    handler.k, handler.n = weight.size()
-    # store the handler pointer in a tensor it doesn't get inlined
+    handler.handler = torch.ops._C.create_onednn_mm_handler(
-    handler.handler_tensor = torch.tensor(
+        weight, primitive_cache_size
-        torch.ops._C.create_onednn_mm_handler(weight, primitive_cache_size),
-        dtype=torch.int64,
    )
    return handler
@@ -2884,17 +2882,8 @@ def create_onednn_scaled_mm(
 ) -> CPUDNNLGEMMHandler:
    handler = CPUDNNLGEMMHandler()
    handler.k, handler.n = weight.size()
-    # store the handler pointer in a tensor so it doesn't get inlined
+    handler.handler = torch.ops._C.create_onednn_scaled_mm_handler(
-    handler.handler_tensor = torch.tensor(
+        weight, weight_scales, output_type, dynamic_quant, use_azp, primitive_cache_size
-        torch.ops._C.create_onednn_scaled_mm_handler(
-            weight,
-            weight_scales,
-            output_type,
-            dynamic_quant,
-            use_azp,
-            primitive_cache_size,
-        ),
-        dtype=torch.int64,
    )
    return handler
@@ -2947,13 +2936,7 @@ def onednn_scaled_mm(
    bias: torch.Tensor | None,
 ) -> torch.Tensor:
    torch.ops._C.onednn_scaled_mm(
-        output,
+        output, x, input_scale, input_zp, input_zp_adj, bias, dnnl_handler.handler
-        x,
-        input_scale,
-        input_zp,
-        input_zp_adj,
-        bias,
-        dnnl_handler.handler_tensor,
    )
    return output