Merge tag 'v0.15.0rc1' into v0.15.0rc1-dev

d76fc11e · zhuwenwen · 38166ec4 · 58996f35 · d76fc11e · d76fc11e
Commit d76fc11e authored Jan 28, 2026 by zhuwenwen
20 changed files
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -1182,6 +1182,32 @@ def load_step3(question: str, image_urls: list[str]) -> ModelRequestData:
    )


+def load_step_vl(question: str, image_urls: list[str]) -> ModelRequestData:
+    """Build the multi-image request for ``stepfun-ai/Step3-VL-10B``.
+
+    Args:
+        question: Text inserted into the prompt after the image placeholders.
+        image_urls: URLs of the input images; each is loaded with
+            ``fetch_image``.
+
+    Returns:
+        A ``ModelRequestData`` carrying the engine arguments, the formatted
+        prompt and the fetched images.
+    """
+    model_name = "stepfun-ai/Step3-VL-10B"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_num_batched_tokens=4096,
+        # Allow exactly as many images per prompt as were supplied.
+        limit_mm_per_prompt={"image": len(image_urls)},
+        # NOTE(review): presumably switches off the vision tower's patch
+        # mode - confirm against the model's config.
+        hf_overrides={"vision_config": {"enable_patch": False}},
+        trust_remote_code=True,
+        reasoning_parser="deepseek_r1",
+    )
+
+    # One <im_patch> placeholder per image, followed by the question. The
+    # prompt ends right after an opening <think> tag in the assistant turn.
+    prompt = (
+        "<｜begin▁of▁sentence｜> You are a helpful assistant.<|BOT|>user\n "
+        f"{'<im_patch>' * len(image_urls)}{question}<|EOT|><|BOT|>"
+        "assistant\n<think>\n"
+    )
+    image_data = [fetch_image(url) for url in image_urls]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=image_data,
+    )
+
+
 def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "omni-research/Tarsier-7b"

@@ -1374,6 +1400,7 @@ model_example_map = {
    "rvl": load_r_vl,
    "smolvlm": load_smolvlm,
    "step3": load_step3,
+    "stepvl": load_step_vl,
    "tarsier": load_tarsier,
    "tarsier2": load_tarsier2,
    "glm4_5v": load_glm4_5v,

--- a/examples/others/logging_configuration.md
+++ b/examples/others/logging_configuration.md
@@ -157,6 +157,37 @@ VLLM_CONFIGURE_LOGGING=0 \
    vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048
 ```

+### Example 4: Disable access logs for health check endpoints
+
+In production environments, health check endpoints like `/health`, `/metrics`,
+and `/ping` are frequently called by load balancers and monitoring systems,
+generating a large volume of repetitive access logs. To reduce log noise while
+keeping logs for other endpoints, use the `--disable-access-log-for-endpoints`
+option.
+
+**Disable access logs for health and metrics endpoints:**
+
+```bash
+vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048 \
+    --disable-access-log-for-endpoints /health,/metrics,/ping
+```
+
+**Common endpoints to consider filtering:**
+
+| Endpoint   | Description            | Typical Caller                                       |
+| ---------- | ---------------------- | ---------------------------------------------------- |
+| `/health`  | Health check           | Kubernetes liveness/readiness probes, load balancers |
+| `/metrics` | Prometheus metrics     | Prometheus scraper (every 15-60s)                    |
+| `/ping`    | SageMaker health check | SageMaker infrastructure                             |
+| `/load`    | Server load metrics    | Custom monitoring                                    |
+
+**Notes:**
+
+- This option only affects uvicorn access logs, not vLLM application logs
+- Specify multiple endpoints by separating them with commas (no spaces)
+- The filter uses exact path matching; query parameters are ignored (e.g., `/health?verbose=true` matches `/health`)
+- If you need to completely disable all access logs, use `--disable-uvicorn-access-log` instead
+
 ## Additional resources

 - [`logging.config` Dictionary Schema Details](https://docs.python.org/3/library/logging.config.html#dictionary-schema-details)
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -44,6 +44,7 @@ vllm = "vllm.entrypoints.cli.main:main"

 [project.entry-points."vllm.general_plugins"]
 lora_filesystem_resolver = "vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver"
+lora_hf_hub_resolver = "vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver"

 [tool.setuptools_scm]
 # no extra settings needed, presence enables setuptools-scm

--- a/tests/entrypoints/openai/responses/test_harmony.py
+++ b/tests/entrypoints/openai/responses/test_harmony.py
@@ -992,7 +992,7 @@ async def test_mcp_tool_multi_turn(client: OpenAI, model_name: str, server):
    # First turn - make a calculation
    response1 = await client.responses.create(
        model=model_name,
-        input="Calculate 123 * 456 using python and print the result.",
+        input="Calculate 1234 * 4567 using python tool and print the result.",
        tools=tools,
        temperature=0.0,
        instructions=(

--- a/tests/entrypoints/openai/test_chat_error.py
+++ b/tests/entrypoints/openai/test_chat_error.py
@@ -42,6 +42,7 @@ class MockModelConfig:
    tokenizer_revision = None
    multimodal_config = MultiModalConfig()
    hf_config = MockHFConfig()
+    hf_text_config = MockHFConfig()
    logits_processor_pattern = None
    logits_processors: list[str] | None = None
    diff_sampling_param: dict | None = None

--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -520,6 +520,7 @@ class MockModelConfig:
    tokenizer_revision = None
    multimodal_config = MultiModalConfig()
    hf_config = MockHFConfig()
+    hf_text_config = MockHFConfig()
    logits_processors: list[str] | None = None
    logits_processor_pattern = None
    diff_sampling_param: dict | None = None

--- a/tests/kernels/moe/modular_kernel_tools/common.py
+++ b/tests/kernels/moe/modular_kernel_tools/common.py
@@ -22,6 +22,9 @@ from vllm.distributed import (
 )
 from vllm.forward_context import set_forward_context
 from vllm.model_executor.layers.fused_moe import fused_topk
+from vllm.model_executor.layers.fused_moe.all2all_utils import (
+    maybe_make_prepare_finalize,
+)
 from vllm.model_executor.layers.fused_moe.config import (
    FusedMoEConfig,
    FusedMoEParallelConfig,
@@ -40,7 +43,6 @@ from .mk_objects import (
    TestMoEQuantConfig,
    expert_info,
    make_fused_experts,
-    make_prepare_finalize,
    prepare_finalize_info,
 )
 from .parallel_utils import ProcessGroupInfo
@@ -603,10 +605,12 @@ def make_modular_kernel(
        routing_method=RoutingMethodType.DeepSeekV3,
    )

-    # make modular kernel
-    prepare_finalize = make_prepare_finalize(
-        config.prepare_finalize_type, config.all2all_backend(), moe, quant_config
+    prepare_finalize = maybe_make_prepare_finalize(
+        moe=moe,
+        quant_config=quant_config,
+        allow_new_interface=True,
    )
+    assert prepare_finalize is not None

    fused_experts = make_fused_experts(
        config.fused_experts_type,

--- a/tests/kernels/moe/modular_kernel_tools/mk_objects.py
+++ b/tests/kernels/moe/modular_kernel_tools/mk_objects.py
@@ -7,9 +7,6 @@ import torch
 # Fused experts and PrepareFinalize imports
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.model_executor.layers.fused_moe import TritonExperts
-from vllm.model_executor.layers.fused_moe.all2all_utils import (
-    maybe_make_prepare_finalize,
-)
 from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
    BatchedDeepGemmExperts,
 )
@@ -255,13 +252,12 @@ if has_pplx():
    )

 if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability(100):
+    from vllm.model_executor.layers.fused_moe.flashinfer_a2a_prepare_finalize import (  # noqa: E501
+        FlashInferCutlassMoEPrepareAndFinalize,
+    )
    from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
        FlashInferExperts,
    )
-    from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import (  # noqa: E501
-        FlashInferCutlassMoEPrepareAndFinalize,
-        create_flashinfer_prepare_finalize,
-    )

    register_prepare_and_finalize(
        FlashInferCutlassMoEPrepareAndFinalize,
@@ -429,24 +425,6 @@ if cutlass_fp4_supported() or has_flashinfer_cutlass_fused_moe():
    ]


-def make_prepare_finalize(
-    prepare_finalize_type: mk.FusedMoEPrepareAndFinalize,
-    backend: str | None,
-    moe: FusedMoEConfig,
-    quant_config: FusedMoEQuantConfig,
-) -> mk.FusedMoEPrepareAndFinalize:
-    if backend != "naive" and backend is not None:
-        prepare_finalize = maybe_make_prepare_finalize(moe, quant_config)
-        assert prepare_finalize is not None
-        return prepare_finalize
-    elif prepare_finalize_type == FlashInferCutlassMoEPrepareAndFinalize:
-        return create_flashinfer_prepare_finalize(
-            use_dp=moe.moe_parallel_config.dp_size > 1
-        )
-    else:
-        return MoEPrepareAndFinalizeNoEP()
-
-
 def _slice(rank: int, num_local_experts: int, t: torch.Tensor) -> torch.Tensor:
    s = rank * num_local_experts
    e = s + num_local_experts

--- a/tests/kernels/moe/test_flashinfer.py
+++ b/tests/kernels/moe/test_flashinfer.py
@@ -294,12 +294,7 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
        )

        kernel = mk.FusedMoEModularKernel(
-            MoEPrepareAndFinalizeNoEP(
-                defer_input_quant=FlashInferExperts.expects_unquantized_inputs(
-                    moe_config=moe_config,
-                    quant_config=quant_config,
-                )
-            ),
+            MoEPrepareAndFinalizeNoEP(),
            FlashInferExperts(
                moe_config=moe_config,
                quant_config=quant_config,

--- a/tests/kernels/moe/test_flashinfer_moe.py
+++ b/tests/kernels/moe/test_flashinfer_moe.py
@@ -106,12 +106,7 @@ def test_flashinfer_fp4_moe_no_graph(
        )

        flashinfer_experts = FusedMoEModularKernel(
-            MoEPrepareAndFinalizeNoEP(
-                defer_input_quant=FlashInferExperts.expects_unquantized_inputs(
-                    moe_config=moe_config,
-                    quant_config=quant_config,
-                )
-            ),
+            MoEPrepareAndFinalizeNoEP(),
            FlashInferExperts(moe_config=moe_config, quant_config=quant_config),
        )


--- a/tests/kernels/moe/untest_nvfp4_moe.py
+++ b/tests/kernels/moe/untest_nvfp4_moe.py
@@ -90,7 +90,7 @@ def test_cutlass_fp4_moe_no_graph(
        )

        kernel = mk.FusedMoEModularKernel(
-            MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
+            MoEPrepareAndFinalizeNoEP(),
            CutlassExpertsFp4(
                moe_config=make_dummy_moe_config(),
                quant_config=quant_config,

--- a/tests/models/language/pooling/test_token_classification.py
+++ b/tests/models/language/pooling/test_token_classification.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import random
+
+import numpy as np
 import pytest
 import torch
 from transformers import AutoModelForTokenClassification
@@ -8,6 +11,20 @@ from tests.models.utils import softmax
 from vllm.platforms import current_platform


+@pytest.fixture(autouse=True)
+def seed_everything():
+    """Seed all random number generators for reproducibility.
+
+    Because the fixture is ``autouse``, it runs before every test in this
+    module. It seeds Python's ``random``, NumPy and PyTorch (plus every CUDA
+    device when CUDA is available) with the fixed seed 0, and sets the cuDNN
+    flags for deterministic execution. Nothing is restored afterwards.
+    """
+    seed = 0
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+    # Ask cuDNN for deterministic algorithms and disable its benchmarking
+    # mode so algorithm selection does not vary between runs.
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+    # The test body runs at this yield; there is no teardown step.
+    yield
+
+
 @pytest.mark.parametrize("model", ["boltuix/NeuroBERT-NER"])
 # The float32 is required for this tiny model to pass the test.
 @pytest.mark.parametrize("dtype", ["float"])
@@ -51,6 +68,7 @@ def test_bert_models(

 @pytest.mark.parametrize("model", ["disham993/electrical-ner-ModernBERT-base"])
 @pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.flaky(reruns=3)
 @torch.inference_mode
 def test_modernbert_models(
    hf_runner,
@@ -59,6 +77,15 @@ def test_modernbert_models(
    model: str,
    dtype: str,
 ) -> None:
+    # NOTE: https://github.com/vllm-project/vllm/pull/32403
+    # `disham993/electrical-ner-ModernBERT-base` is a randomly initialized
+    # model, which can cause numerical precision variance and edge cases.
+    # We use @flaky(reruns=3) to mitigate intermittent failures.
+    print(
+        f"\n[NOTE] Testing {model} (randomly initialized weights) - "
+        "flaky tolerance enabled due to numerical precision variance."
+    )
+
    with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.token_classify(example_prompts)


--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -461,6 +461,20 @@ VLM_TEST_SETTINGS = {
        ],
        marks=[large_gpu_mark(min_gb=32)],
    ),
+    "glm_ocr": VLMTestInfo(
+        models=["zai-org/GLM-OCR"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"[gMASK]<|user|>\n{img_prompt}<|assistant|>\n",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>",
+        video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>",
+        max_model_len=2048,
+        max_num_seqs=2,
+        get_stop_token_ids=lambda tok: [151329, 151336, 151338],
+        num_logprobs=10,
+        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        auto_cls=AutoModelForImageTextToText,
+        marks=[large_gpu_mark(min_gb=32)],
+    ),
    "h2ovl": VLMTestInfo(
        models=[
            os.path.join(models_path_prefix,"h2oai/h2ovl-mississippi-800m"),

--- a/tests/models/multimodal/generation/test_vit_backend_functionality.py
+++ b/tests/models/multimodal/generation/test_vit_backend_functionality.py
@@ -91,6 +91,19 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
        "use_processor": True,
        "question": "What is the content of each image?",
    },
+    "glm_ocr": {
+        "model_name": "zai-org/GLM-OCR",
+        "interface": "llm_generate",
+        "max_model_len": 131072,
+        "max_num_seqs": 2,
+        "sampling_params": {
+            "temperature": 0.0,
+            "max_tokens": 256,
+            "stop_token_ids": None,
+        },
+        "use_processor": True,
+        "question": "Text Recognition:",
+    },
    "keye_vl": {
        "model_name": "Kwai-Keye/Keye-VL-8B-Preview",
        "interface": "llm_generate",

--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -124,6 +124,7 @@ MM_DATA_PATCHES = {
    "ernie4_5_moe_vl": qwen3_vl_patch_mm_data,
    "glm4v": glm4_1v_patch_mm_data,
    "glm4v_moe": glm4_1v_patch_mm_data,
+    "glm_ocr": glm4_1v_patch_mm_data,
    "glmasr": glmasr_patch_mm_data,
    "molmo2": qwen3_vl_patch_mm_data,
    "qwen3_vl": qwen3_vl_patch_mm_data,

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -260,7 +260,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    ),
    "Exaone4ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "LGAI-EXAONE/EXAONE-4.0-32B")),
    "ExaoneMoEForCausalLM": _HfExamplesInfo(
-        "LGAI-EXAONE/K-EXAONE-236B-A23B", min_transformers_version="5.0.0"
+        "LGAI-EXAONE/K-EXAONE-236B-A23B", min_transformers_version="5.1.0"
    ),
    "Fairseq2LlamaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "mgleize/fairseq2-dummy-Llama-3.2-1B")),
    "FalconForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "tiiuae/falcon-7b")),
@@ -277,8 +277,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "Glm4MoeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "zai-org/GLM-4.5")),
    "Glm4MoeLiteForCausalLM": _HfExamplesInfo(
        os.path.join(models_path_prefix, "zai-org/GLM-4.7-Flash"),
-        min_transformers_version="5.0.0.dev",
-        is_available_online=False,
+        min_transformers_version="5.0.0",
    ),
    "GPT2LMHeadModel": _HfExamplesInfo(os.path.join(models_path_prefix, "openai-community/gpt2"), {"alias": os.path.join(models_path_prefix, "gpt2")}),
    "GPTBigCodeForCausalLM": _HfExamplesInfo(
@@ -659,7 +658,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    # [Decoder-only]
    "AriaForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "rhymes-ai/Aria")),
    "AudioFlamingo3ForConditionalGeneration": _HfExamplesInfo(
-        os.path.join(models_path_prefix, "nvidia/audio-flamingo-3-hf"), min_transformers_version="5.0.0.dev"
+        os.path.join(models_path_prefix, "nvidia/audio-flamingo-3-hf"), min_transformers_version="5.0.0"
    ),
    "AyaVisionForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "CohereLabs/aya-vision-8b")),
    "BagelForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "ByteDance-Seed/BAGEL-7B-MoT")),
@@ -702,7 +701,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    "GlmAsrForConditionalGeneration": _HfExamplesInfo(
        os.path.join(models_path_prefix, "zai-org/GLM-ASR-Nano-2512"),
        trust_remote_code=True,
-        min_transformers_version="5.0",
+        min_transformers_version="5.0.0",
    ),
    "GraniteVision": _HfExamplesInfo("ibm-granite/granite-vision-3.3-2b"),
    "GraniteSpeechForConditionalGeneration": _HfExamplesInfo(
@@ -715,6 +714,11 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    ),
    "Glm4vForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "zai-org/GLM-4.1V-9B-Thinking")),
    "Glm4vMoeForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "zai-org/GLM-4.5V")),
+    "GlmOcrForConditionalGeneration": _HfExamplesInfo(
+        os.path.join(models_path_prefix, "zai-org/GLM-OCR"),
+        is_available_online=False,
+        min_transformers_version="5.1.0",
+    ),
    "H2OVLChatModel": _HfExamplesInfo(
        os.path.join(models_path_prefix, "h2oai/h2ovl-mississippi-800m"),
        trust_remote_code=True,
@@ -779,6 +783,11 @@ _MULTIMODAL_EXAMPLE_MODELS = {
            )
        },
    ),
+    "KimiK25ForConditionalGeneration": _HfExamplesInfo(
+        "moonshotai/Kimi-K2.5",
+        trust_remote_code=True,
+        is_available_online=False,
+    ),
    "LightOnOCRForConditionalGeneration": _HfExamplesInfo(
        os.path.join(models_path_prefix, "lightonai/LightOnOCR-1B-1025")
    ),
@@ -1052,7 +1061,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
    "ExaoneMoeMTP": _HfExamplesInfo(
        "LGAI-EXAONE/K-EXAONE-236B-A23B",
        speculative_model="LGAI-EXAONE/K-EXAONE-236B-A23B",
-        min_transformers_version="5.0.0",
+        min_transformers_version="5.1.0",
    ),
    "Glm4MoeMTPModel": _HfExamplesInfo(
        os.path.join(models_path_prefix, "zai-org/GLM-4.5"),
@@ -1061,7 +1070,13 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
    "Glm4MoeLiteMTPModel": _HfExamplesInfo(
        "zai-org/GLM-4.7-Flash",
        speculative_model="zai-org/GLM-4.7-Flash",
+        min_transformers_version="5.0.0",
+    ),
+    "GlmOcrMTPModel": _HfExamplesInfo(
+        "zai-org/GLM-OCR",
+        speculative_model="zai-org/GLM-OCR",
        is_available_online=False,
+        min_transformers_version="5.1.0",
    ),
    "LongCatFlashMTPModel": _HfExamplesInfo(
        os.path.join(models_path_prefix, os.path.join(models_path_prefix, "meituan-longcat/LongCat-Flash-Chat")),
@@ -1088,27 +1103,27 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {

 _TRANSFORMERS_BACKEND_MODELS = {
    "TransformersEmbeddingModel": _HfExamplesInfo(
-        os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5"), min_transformers_version="5.0.0.dev"
+        os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5"), min_transformers_version="5.0.0"
    ),
    "TransformersForSequenceClassification": _HfExamplesInfo(
        os.path.join(models_path_prefix, "papluca/xlm-roberta-base-language-detection"),
-        min_transformers_version="5.0.0.dev",
+        min_transformers_version="5.0.0",
    ),
    "TransformersForCausalLM": _HfExamplesInfo(
        os.path.join(models_path_prefix, "hmellor/Ilama-3.2-1B"), trust_remote_code=True
    ),
    "TransformersMultiModalForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "BAAI/Emu3-Chat-hf")),
    "TransformersMoEForCausalLM": _HfExamplesInfo(
-        os.path.join(models_path_prefix, "allenai/OLMoE-1B-7B-0924"), min_transformers_version="5.0.0.dev"
+        os.path.join(models_path_prefix, "allenai/OLMoE-1B-7B-0924"), min_transformers_version="5.0.0"
    ),
    "TransformersMultiModalMoEForCausalLM": _HfExamplesInfo(
-        os.path.join(models_path_prefix, "Qwen/Qwen3-VL-30B-A3B-Instruct"), min_transformers_version="5.0.0.dev"
+        os.path.join(models_path_prefix, "Qwen/Qwen3-VL-30B-A3B-Instruct"), min_transformers_version="5.0.0"
    ),
    "TransformersMoEEmbeddingModel": _HfExamplesInfo(
-        os.path.join(models_path_prefix, "Qwen/Qwen3-30B-A3B"), min_transformers_version="5.0.0.dev"
+        os.path.join(models_path_prefix, "Qwen/Qwen3-30B-A3B"), min_transformers_version="5.0.0"
    ),
    "TransformersMoEForSequenceClassification": _HfExamplesInfo(
-        os.path.join(models_path_prefix, "Qwen/Qwen3-30B-A3B"), min_transformers_version="5.0.0.dev"
+        os.path.join(models_path_prefix, "Qwen/Qwen3-30B-A3B"), min_transformers_version="5.0.0"
    ),
    "TransformersMultiModalEmbeddingModel": _HfExamplesInfo(os.path.join(models_path_prefix, "google/gemma-3-4b-it")),
    "TransformersMultiModalForSequenceClassification": _HfExamplesInfo(

--- a/tests/models/test_transformers.py
+++ b/tests/models/test_transformers.py
@@ -80,7 +80,7 @@ def test_models(
    from packaging.version import Version

    installed = Version(transformers.__version__)
-    required = Version("5.0.0.dev")
+    required = Version("5.0.0")
    if model == "allenai/OLMoE-1B-7B-0924" and installed < required:
        pytest.skip(
            "MoE models with the Transformers modeling backend require "

--- a/tests/plugins/lora_resolvers/test_hf_hub_resolver.py
+++ b/tests/plugins/lora_resolvers/test_hf_hub_resolver.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+
+import pytest
+from huggingface_hub.constants import HF_HUB_CACHE
+
+from vllm.plugins.lora_resolvers.hf_hub_resolver import HfHubResolver
+
+# Model name passed as the first argument to resolve_lora() in the tests
+# that use the multi-LoRA repo below.
+LORA_LIB_MODEL_NAME = "ibm-granite/granite-3.3-8b-instruct"
+# Repo with multiple LoRAs contained in it
+LORA_LIB = "ibm-granite/granite-3.3-8b-rag-agent-lib"
+# One adapter inside that repo, written as "<repo id>/<subdirectory>".
+LORA_NAME = "ibm-granite/granite-3.3-8b-rag-agent-lib/answerability_prediction_lora"  # noqa: E501
+# A path inside the same repo that is not a LoRA adapter.
+NON_LORA_SUBPATH = "ibm-granite/granite-3.3-8b-rag-agent-lib/README.md"
+# Hub-cache directory the tests expect resolved paths for LORA_LIB to contain.
+LIB_DOWNLOAD_DIR = os.path.join(
+    HF_HUB_CACHE, "models--ibm-granite--granite-3.3-8b-rag-agent-lib"
+)
+# Used by test_invalid_repo as a name that must not resolve to anything.
+INVALID_REPO_NAME = "thisrepodoesnotexist"
+
+# Repo with only one LoRA in the root dir
+LORA_REPO_MODEL_NAME = "meta-llama/Llama-2-7b-hf"
+LORA_REPO = "yard1/llama-2-7b-sql-lora-test"
+# Hub-cache directory the tests expect resolved paths for LORA_REPO to contain.
+REPO_DOWNLOAD_DIR = os.path.join(
+    HF_HUB_CACHE, "models--yard1--llama-2-7b-sql-lora-test"
+)
+
+
+@pytest.mark.asyncio
+async def test_hf_resolver_with_direct_path():
+    """A repo holding a single adapter in its root resolves by repo id.
+
+    The resolved request must keep the requested name, point inside the
+    expected hub-cache directory, and contain an ``adapter_config.json``.
+    """
+    hf_resolver = HfHubResolver([LORA_REPO])
+    assert hf_resolver is not None
+
+    lora_request = await hf_resolver.resolve_lora(LORA_REPO_MODEL_NAME, LORA_REPO)
+    # resolve_lora() returns None when nothing is found; check that first,
+    # as the sibling tests do, so a failure here is a clear assertion
+    # instead of an AttributeError on the next line.
+    assert lora_request is not None
+    assert lora_request.lora_name == LORA_REPO
+    assert REPO_DOWNLOAD_DIR in lora_request.lora_path
+    assert "adapter_config.json" in os.listdir(lora_request.lora_path)
+
+
+@pytest.mark.asyncio
+async def test_hf_resolver_with_nested_paths():
+    """An adapter stored in a subdirectory of a multi-LoRA repo resolves.
+
+    The adapter is requested as ``<repo id>/<subdirectory>``; the resolved
+    path must lie inside the repo's hub-cache directory and contain an
+    ``adapter_config.json``.
+    """
+    hf_resolver = HfHubResolver([LORA_LIB])
+    assert hf_resolver is not None
+
+    lora_request = await hf_resolver.resolve_lora(LORA_LIB_MODEL_NAME, LORA_NAME)
+    assert lora_request is not None
+    assert lora_request.lora_name == LORA_NAME
+    assert LIB_DOWNLOAD_DIR in lora_request.lora_path
+    assert "adapter_config.json" in os.listdir(lora_request.lora_path)
+
+
+@pytest.mark.asyncio
+async def test_hf_resolver_with_multiple_repos():
+    """Nested-adapter resolution still works with several repos configured.
+
+    The resolver is built with both the multi-LoRA repo and the single-LoRA
+    repo; the nested adapter from the former must still be found.
+    """
+    hf_resolver = HfHubResolver([LORA_LIB, LORA_REPO])
+    assert hf_resolver is not None
+
+    lora_request = await hf_resolver.resolve_lora(LORA_LIB_MODEL_NAME, LORA_NAME)
+    assert lora_request is not None
+    assert lora_request.lora_name == LORA_NAME
+    assert LIB_DOWNLOAD_DIR in lora_request.lora_path
+    assert "adapter_config.json" in os.listdir(lora_request.lora_path)
+
+
+@pytest.mark.asyncio
+async def test_missing_adapter():
+    """An adapter name that matches nothing ("foobar") resolves to None."""
+    hf_resolver = HfHubResolver([LORA_LIB])
+    assert hf_resolver is not None
+
+    missing_lora_request = await hf_resolver.resolve_lora(LORA_LIB_MODEL_NAME, "foobar")
+    assert missing_lora_request is None
+
+
+@pytest.mark.asyncio
+async def test_nonlora_adapter():
+    """A path inside a configured repo that is not an adapter yields None.
+
+    The request targets the repo's ``README.md`` rather than a LoRA
+    directory.
+    """
+    hf_resolver = HfHubResolver([LORA_LIB])
+    assert hf_resolver is not None
+
+    readme_request = await hf_resolver.resolve_lora(
+        LORA_LIB_MODEL_NAME, NON_LORA_SUBPATH
+    )
+    assert readme_request is None
+
+
+@pytest.mark.asyncio
+async def test_invalid_repo():
+    """A request built on a nonexistent repo name resolves to None.
+
+    The resolver itself is configured with a valid repo; only the names
+    passed to ``resolve_lora`` use the invalid one.
+    """
+    hf_resolver = HfHubResolver([LORA_LIB])
+    assert hf_resolver is not None
+
+    invalid_repo_req = await hf_resolver.resolve_lora(
+        INVALID_REPO_NAME,
+        f"{INVALID_REPO_NAME}/foo",
+    )
+    assert invalid_repo_req is None
+
+
+@pytest.mark.asyncio
+async def test_trailing_slash():
+    """An adapter name with a trailing slash still resolves.
+
+    The returned request keeps the name exactly as requested (slash
+    included) and points at a directory containing ``adapter_config.json``.
+    """
+    hf_resolver = HfHubResolver([LORA_LIB])
+    assert hf_resolver is not None
+
+    lora_request = await hf_resolver.resolve_lora(
+        LORA_LIB_MODEL_NAME,
+        f"{LORA_NAME}/",
+    )
+    assert lora_request is not None
+    assert lora_request.lora_name == f"{LORA_NAME}/"
+    assert LIB_DOWNLOAD_DIR in lora_request.lora_path
+    assert "adapter_config.json" in os.listdir(lora_request.lora_path)
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
@@ -36,7 +36,7 @@ class MyGemma2Embedding(nn.Module):

    def forward(
        self,
-        input_ids: torch.Tensor,
+        input_ids: torch.Tensor | None,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,

--- a/tests/test_access_log_filter.py
+++ b/tests/test_access_log_filter.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Tests for the UvicornAccessLogFilter class.
+"""
+
+import logging
+
+from vllm.logging_utils.access_log_filter import (
+    UvicornAccessLogFilter,
+    create_uvicorn_log_config,
+)
+
+
+class TestUvicornAccessLogFilter:
+    """Test cases for UvicornAccessLogFilter.
+
+    Every test feeds the filter a ``logging.LogRecord`` shaped like a uvicorn
+    access-log entry and checks the boolean returned by ``filter()``:
+    ``True`` keeps the record, ``False`` drops it.
+    """
+
+    @staticmethod
+    def _make_record(
+        path: str,
+        method: str = "GET",
+        status_code: int = 200,
+    ) -> logging.LogRecord:
+        """Build a record in the uvicorn access-log layout.
+
+        Args:
+            path: Request target placed in the record (may include a query
+                string).
+            method: HTTP method placed in the record.
+            status_code: Response status placed in the record.
+
+        Returns:
+            A ``LogRecord`` named ``uvicorn.access`` whose ``args`` are
+            ``(client address, method, path, HTTP version, status code)``.
+        """
+        return logging.LogRecord(
+            name="uvicorn.access",
+            level=logging.INFO,
+            pathname="",
+            lineno=0,
+            msg='%s - "%s %s HTTP/%s" %d',
+            args=("127.0.0.1:12345", method, path, "1.1", status_code),
+            exc_info=None,
+        )
+
+    def test_filter_allows_all_when_no_excluded_paths(self):
+        """Filter should allow all logs when no paths are excluded."""
+        log_filter = UvicornAccessLogFilter(excluded_paths=[])
+
+        record = self._make_record("/v1/completions")
+
+        assert log_filter.filter(record) is True
+
+    def test_filter_allows_all_when_excluded_paths_is_none(self):
+        """Filter should allow all logs when excluded_paths is None."""
+        log_filter = UvicornAccessLogFilter(excluded_paths=None)
+
+        record = self._make_record("/health")
+
+        assert log_filter.filter(record) is True
+
+    def test_filter_excludes_health_endpoint(self):
+        """Filter should exclude /health endpoint when configured."""
+        log_filter = UvicornAccessLogFilter(excluded_paths=["/health"])
+
+        record = self._make_record("/health")
+
+        assert log_filter.filter(record) is False
+
+    def test_filter_excludes_metrics_endpoint(self):
+        """Filter should exclude /metrics endpoint when configured."""
+        log_filter = UvicornAccessLogFilter(excluded_paths=["/metrics"])
+
+        record = self._make_record("/metrics")
+
+        assert log_filter.filter(record) is False
+
+    def test_filter_allows_non_excluded_endpoints(self):
+        """Filter should allow endpoints not in the excluded list."""
+        log_filter = UvicornAccessLogFilter(excluded_paths=["/health", "/metrics"])
+
+        record = self._make_record("/v1/completions", method="POST")
+
+        assert log_filter.filter(record) is True
+
+    def test_filter_excludes_multiple_endpoints(self):
+        """Filter should exclude multiple configured endpoints."""
+        excluded = ["/health", "/metrics", "/ping"]
+        log_filter = UvicornAccessLogFilter(excluded_paths=excluded)
+
+        # Each configured path must be dropped, in the same order as before.
+        for path in ("/health", "/metrics", "/ping"):
+            assert log_filter.filter(self._make_record(path)) is False
+
+    def test_filter_with_query_parameters(self):
+        """Filter should exclude endpoints even with query parameters."""
+        log_filter = UvicornAccessLogFilter(excluded_paths=["/health"])
+
+        record = self._make_record("/health?verbose=true")
+
+        assert log_filter.filter(record) is False
+
+    def test_filter_different_http_methods(self):
+        """Filter should exclude endpoints regardless of HTTP method."""
+        log_filter = UvicornAccessLogFilter(excluded_paths=["/ping"])
+
+        for method in ("GET", "POST"):
+            record = self._make_record("/ping", method=method)
+            assert log_filter.filter(record) is False
+
+    def test_filter_with_different_status_codes(self):
+        """Filter should exclude endpoints regardless of status code."""
+        log_filter = UvicornAccessLogFilter(excluded_paths=["/health"])
+
+        for status_code in [200, 500, 503]:
+            record = self._make_record("/health", status_code=status_code)
+            assert log_filter.filter(record) is False
+
+
+class TestCreateUvicornLogConfig:
+    """Test cases for create_uvicorn_log_config function."""
+
+    def test_creates_valid_config_structure(self):
+        """Config should have required logging configuration keys."""
+        config = create_uvicorn_log_config(excluded_paths=["/health"])
+
+        assert "version" in config
+        assert config["version"] == 1
+        assert "disable_existing_loggers" in config
+        assert "formatters" in config
+        assert "handlers" in config
+        assert "loggers" in config
+        assert "filters" in config
+
+    def test_config_includes_access_log_filter(self):
+        """Config should include the access log filter."""
+        config = create_uvicorn_log_config(excluded_paths=["/health", "/metrics"])
+
+        assert "access_log_filter" in config["filters"]
+        filter_config = config["filters"]["access_log_filter"]
+        assert filter_config["()"] == UvicornAccessLogFilter
+        assert filter_config["excluded_paths"] == ["/health", "/metrics"]
+
+    def test_config_applies_filter_to_access_handler(self):
+        """Config should apply the filter to the access handler."""
+        config = create_uvicorn_log_config(excluded_paths=["/health"])
+
+        assert "access" in config["handlers"]
+        assert "filters" in config["handlers"]["access"]
+        assert "access_log_filter" in config["handlers"]["access"]["filters"]
+
+    def test_config_with_custom_log_level(self):
+        """Config should respect custom log level."""
+        config = create_uvicorn_log_config(
+            excluded_paths=["/health"], log_level="debug"
+        )
+
+        assert config["loggers"]["uvicorn"]["level"] == "DEBUG"
+        assert config["loggers"]["uvicorn.access"]["level"] == "DEBUG"
+        assert config["loggers"]["uvicorn.error"]["level"] == "DEBUG"
+
+    def test_config_with_empty_excluded_paths(self):
+        """Config should work with empty excluded paths."""
+        config = create_uvicorn_log_config(excluded_paths=[])
+
+        assert config["filters"]["access_log_filter"]["excluded_paths"] == []
+
+    def test_config_with_none_excluded_paths(self):
+        """Config should work with None excluded paths."""
+        config = create_uvicorn_log_config(excluded_paths=None)
+
+        assert config["filters"]["access_log_filter"]["excluded_paths"] == []
+
+
+class TestIntegration:
+    """Integration tests for the access log filter."""
+
+    def test_filter_with_real_logger(self):
+        """Test filter works with a real Python logger simulating uvicorn."""
+        # Create a logger with our filter (simulating uvicorn.access)
+        logger = logging.getLogger("uvicorn.access")
+        logger.setLevel(logging.INFO)
+
+        # Clear any existing handlers
+        logger.handlers = []
+
+        # Create a custom handler that tracks messages
+        logged_messages: list[str] = []
+
+        class TrackingHandler(logging.Handler):
+            def emit(self, record):
+                logged_messages.append(record.getMessage())
+
+        handler = TrackingHandler()
+        handler.setLevel(logging.INFO)
+        filter = UvicornAccessLogFilter(excluded_paths=["/health", "/metrics"])
+        handler.addFilter(filter)
+        logger.addHandler(handler)
+
+        # Log using uvicorn's format with args tuple
+        # Format: '%s - "%s %s HTTP/%s" %d'
+        logger.info(
+            '%s - "%s %s HTTP/%s" %d',
+            "127.0.0.1:12345",
+            "GET",
+            "/health",
+            "1.1",
+            200,
+        )
+        logger.info(
+            '%s - "%s %s HTTP/%s" %d',
+            "127.0.0.1:12345",
+            "GET",
+            "/v1/completions",
+            "1.1",
+            200,
+        )
+        logger.info(
+            '%s - "%s %s HTTP/%s" %d',
+            "127.0.0.1:12345",
+            "GET",
+            "/metrics",
+            "1.1",
+            200,
+        )
+        logger.info(
+            '%s - "%s %s HTTP/%s" %d',
+            "127.0.0.1:12345",
+            "POST",
+            "/v1/chat/completions",
+            "1.1",
+            200,
+        )
+
+        # Verify only non-excluded endpoints were logged
+        assert len(logged_messages) == 2
+        assert "/v1/completions" in logged_messages[0]
+        assert "/v1/chat/completions" in logged_messages[1]
+
+    def test_filter_allows_non_uvicorn_access_logs(self):
+        """Test filter allows logs from non-uvicorn.access loggers."""
+        filter = UvicornAccessLogFilter(excluded_paths=["/health"])
+
+        # Log record from a different logger name
+        record = logging.LogRecord(
+            name="uvicorn.error",
+            level=logging.INFO,
+            pathname="",
+            lineno=0,
+            msg="Some error message about /health",
+            args=(),
+            exc_info=None,
+        )
+
+        # Should allow because it's not from uvicorn.access
+        assert filter.filter(record) is True
+
+    def test_filter_handles_malformed_args(self):
+        """Test filter handles log records with unexpected args format."""
+        filter = UvicornAccessLogFilter(excluded_paths=["/health"])
+
+        # Log record with insufficient args
+        record = logging.LogRecord(
+            name="uvicorn.access",
+            level=logging.INFO,
+            pathname="",
+            lineno=0,
+            msg="Some message",
+            args=("only", "two"),
+            exc_info=None,
+        )
+
+        # Should allow because args doesn't have expected format
+        assert filter.filter(record) is True
+
+    def test_filter_handles_non_tuple_args(self):
+        """Test filter handles log records with non-tuple args."""
+        filter = UvicornAccessLogFilter(excluded_paths=["/health"])
+
+        # Log record with None args
+        record = logging.LogRecord(
+            name="uvicorn.access",
+            level=logging.INFO,
+            pathname="",
+            lineno=0,
+            msg="Some message without args",
+            args=None,
+            exc_info=None,
+        )
+
+        # Should allow because args is None
+        assert filter.filter(record) is True