sync v0.15.1 (tests)

78c1f9e5 · zhuwenwen · 86a65417 · 78c1f9e5 · 78c1f9e5 · 86a65417
Commit 78c1f9e5 authored Feb 05, 2026 by zhuwenwen
15 changed files
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -311,7 +311,7 @@ VLM_TEST_SETTINGS = {
        vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}},
    ),
    "aya_vision-multi_image": VLMTestInfo(
-        models=["CohereLabs/aya-vision-8b"],
+        models=[os.path.join(models_path_prefix, "CohereLabs/aya-vision-8b")],
        test_type=(VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts(
@@ -328,7 +328,7 @@ VLM_TEST_SETTINGS = {
        marks=[large_gpu_mark(min_gb=32)],
    ),
    "blip2": VLMTestInfo(
-        models=[os.path.join(models_path_prefix,"Salesforce/blip2-opt-2.7b")],
+        models=[os.path.join(models_path_prefix, "Salesforce/blip2-opt-2.7b")],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
        img_idx_to_prompt=lambda idx: "",
@@ -352,7 +352,7 @@ VLM_TEST_SETTINGS = {
        dtype="bfloat16",
    ),
    "deepseek_vl_v2": VLMTestInfo(
-        models=["Isotr0py/deepseek-vl2-tiny"],  # model repo using dynamic module
+        models=[os.path.join(models_path_prefix, "Isotr0py/deepseek-vl2-tiny")],  # model repo using dynamic module
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ",  # noqa: E501
        max_model_len=4096,
@@ -401,7 +401,7 @@ VLM_TEST_SETTINGS = {
        patch_hf_runner=model_utils.gemma3_patch_hf_runner,
    ),
    "granite_vision": VLMTestInfo(
-        models=["ibm-granite/granite-vision-3.3-2b"],
+        models=[os.path.join(models_path_prefix, "ibm-granite/granite-vision-3.3-2b")],
        test_type=(VLMTestType.IMAGE),
        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}\n<|assistant|>\n",
        max_model_len=8192,
@@ -445,7 +445,7 @@ VLM_TEST_SETTINGS = {
        marks=[large_gpu_mark(min_gb=32)],
    ),
    "glm4_1v-video": VLMTestInfo(
-        models=["zai-org/GLM-4.1V-9B-Thinking"],
+        models=[os.path.join(models_path_prefix, "zai-org/GLM-4.1V-9B-Thinking")],
        # GLM4.1V require include video metadata for input
        test_type=VLMTestType.CUSTOM_INPUTS,
        prompt_formatter=lambda vid_prompt: f"[gMASK]<|user|>\n{vid_prompt}<|assistant|>\n",  # noqa: E501
@@ -461,20 +461,6 @@ VLM_TEST_SETTINGS = {
        ],
        marks=[large_gpu_mark(min_gb=32)],
    ),
-    "glm_ocr": VLMTestInfo(
-        models=["zai-org/GLM-OCR"],
-        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt: f"[gMASK]<|user|>\n{img_prompt}<|assistant|>\n",  # noqa: E501
-        img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>",
-        video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>",
-        max_model_len=2048,
-        max_num_seqs=2,
-        get_stop_token_ids=lambda tok: [151329, 151336, 151338],
-        num_logprobs=10,
-        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
-        auto_cls=AutoModelForImageTextToText,
-        marks=[large_gpu_mark(min_gb=32)],
-    ),
    "h2ovl": VLMTestInfo(
        models=[
            os.path.join(models_path_prefix,"h2oai/h2ovl-mississippi-800m"),
@@ -526,7 +512,7 @@ VLM_TEST_SETTINGS = {
    ),
    "intern_vl-video": VLMTestInfo(
        models=[
-            "OpenGVLab/InternVL3-1B",
+            os.path.join(models_path_prefix, "OpenGVLab/InternVL3-1B"),
        ],
        test_type=VLMTestType.VIDEO,
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
@@ -537,7 +523,7 @@ VLM_TEST_SETTINGS = {
        num_logprobs=10 if current_platform.is_rocm() else 5,
    ),
    "intern_vl-hf": VLMTestInfo(
-        models=["OpenGVLab/InternVL3-1B-hf"],
+        models=[os.path.join(models_path_prefix, "OpenGVLab/InternVL3-1B-hf")],
        test_type=(
            VLMTestType.IMAGE,
            VLMTestType.MULTI_IMAGE,
@@ -552,8 +538,8 @@ VLM_TEST_SETTINGS = {
    ),
    "isaac": VLMTestInfo(
        models=[
-            "PerceptronAI/Isaac-0.1",
+            os.path.join(models_path_prefix, "PerceptronAI/Isaac-0.1"),
-            "PerceptronAI/Isaac-0.2-2B-Preview",
+            os.path.join(models_path_prefix, "PerceptronAI/Isaac-0.2-2B-Preview"),
        ],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: (
@@ -579,7 +565,7 @@ VLM_TEST_SETTINGS = {
        image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
    ),
    "kimi_vl": VLMTestInfo(
-        models=["moonshotai/Kimi-VL-A3B-Instruct"],
+        models=[os.path.join(models_path_prefix, "moonshotai/Kimi-VL-A3B-Instruct")],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_user|>user<|im_middle|>{img_prompt}<|im_end|><|im_assistant|>assistant<|im_middle|>",  # noqa: E501
        img_idx_to_prompt=lambda _: "<|media_start|>image<|media_content|><|media_pad|><|media_end|>",  # noqa: E501
@@ -590,21 +576,6 @@ VLM_TEST_SETTINGS = {
        vllm_output_post_proc=model_utils.kimiv_vl_vllm_to_hf_output,
        marks=[large_gpu_mark(min_gb=48)],
    ),
-    "llama4": VLMTestInfo(
-        models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
-        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n",  # noqa: E501
-        img_idx_to_prompt=lambda _: "<|image|>",
-        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        distributed_executor_backend="mp",
-        image_size_factors=[(0.25, 0.5, 1.0)],
-        hf_model_kwargs={"device_map": "auto"},
-        max_model_len=8192,
-        max_num_seqs=4,
-        dtype="bfloat16",
-        auto_cls=AutoModelForImageTextToText,
-        tensor_parallel_size=4,
-        marks=multi_gpu_marks(num_gpus=4),
-    ),
    "llava_next": VLMTestInfo(
        models=[os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")],
        test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
@@ -675,7 +646,7 @@ VLM_TEST_SETTINGS = {
        marks=[pytest.mark.skip("HF import fails")],
    ),
    "minicpmo_26": VLMTestInfo(
-        models=["openbmb/MiniCPM-o-2_6"],
+        models=[os.path.join(models_path_prefix, "openbmb/MiniCPM-o-2_6")],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
@@ -703,7 +674,7 @@ VLM_TEST_SETTINGS = {
        patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
    ),
    "minimax_vl_01": VLMTestInfo(
-        models=["MiniMaxAI/MiniMax-VL-01"],
+        models=[os.path.join(models_path_prefix, "MiniMaxAI/MiniMax-VL-01")],
        prompt_formatter=lambda img_prompt: f"<beginning_of_sentence>user: {img_prompt} assistant:<end_of_sentence>",  # noqa: E501
        img_idx_to_prompt=lambda _: "<image>",
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
@@ -726,7 +697,7 @@ VLM_TEST_SETTINGS = {
        ],
    ),
    "molmo": VLMTestInfo(
-        models=["allenai/Molmo-7B-D-0924"],
+        models=[os.path.join(models_path_prefix, "allenai/Molmo-7B-D-0924")],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=identity,
        max_model_len=4096,
@@ -734,7 +705,7 @@ VLM_TEST_SETTINGS = {
        patch_hf_runner=model_utils.molmo_patch_hf_runner,
    ),
    "ovis1_6-gemma2": VLMTestInfo(
-        models=["AIDC-AI/Ovis1.6-Gemma2-9B"],
+        models=[os.path.join(models_path_prefix, "AIDC-AI/Ovis1.6-Gemma2-9B")],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>\n",
@@ -747,7 +718,7 @@ VLM_TEST_SETTINGS = {
        marks=[large_gpu_mark(min_gb=32)],
    ),
    "ovis2": VLMTestInfo(
-        models=["AIDC-AI/Ovis2-1B"],
+        models=[os.path.join(models_path_prefix, "AIDC-AI/Ovis2-1B")],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>\n",
@@ -759,7 +730,7 @@ VLM_TEST_SETTINGS = {
        patch_hf_runner=model_utils.ovis_patch_hf_runner,
    ),
    "ovis2_5": VLMTestInfo(
-        models=["AIDC-AI/Ovis2.5-2B"],
+        models=[os.path.join(models_path_prefix, "AIDC-AI/Ovis2.5-2B")],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
        prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>\n",
@@ -772,7 +743,7 @@ VLM_TEST_SETTINGS = {
        hf_model_kwargs={"revision": "refs/pr/5"},
    ),
    "paddleocr_vl": VLMTestInfo(
-        models=["PaddlePaddle/PaddleOCR-VL"],
+        models=[os.path.join(models_path_prefix, "PaddlePaddle/PaddleOCR-VL")],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        img_idx_to_prompt=lambda idx: (
@@ -795,7 +766,7 @@ VLM_TEST_SETTINGS = {
        ],
    ),
    "phi3v": VLMTestInfo(
-        models=["microsoft/Phi-3.5-vision-instruct"],
+        models=[os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct")],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n",
@@ -835,7 +806,7 @@ VLM_TEST_SETTINGS = {
        prompt_path_encoder=model_utils.qwen_prompt_path_encoder,
    ),
    "qwen2_vl": VLMTestInfo(
-        models=["Qwen/Qwen2-VL-2B-Instruct"],
+        models=[os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct")],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
@@ -849,7 +820,7 @@ VLM_TEST_SETTINGS = {
        marks=[pytest.mark.cpu_model],
    ),
    "skywork_r1v": VLMTestInfo(
-        models=["Skywork/Skywork-R1V-38B"],
+        models=[os.path.join(models_path_prefix, "Skywork/Skywork-R1V-38B")],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<｜begin▁of▁sentence｜><｜User｜>\n{img_prompt}<｜Assistant｜><think>\n",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts(
@@ -865,7 +836,7 @@ VLM_TEST_SETTINGS = {
        marks=[large_gpu_mark(min_gb=80)],
    ),
    "smolvlm": VLMTestInfo(
-        models=["HuggingFaceTB/SmolVLM2-2.2B-Instruct"],
+        models=[os.path.join(models_path_prefix, "HuggingFaceTB/SmolVLM2-2.2B-Instruct")],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>",
@@ -876,7 +847,7 @@ VLM_TEST_SETTINGS = {
        num_logprobs=10,
    ),
    "tarsier": VLMTestInfo(
-        models=["omni-research/Tarsier-7b"],
+        models=[os.path.join(models_path_prefix, "omni-research/Tarsier-7b")],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt} ASSISTANT:",
        max_model_len=4096,
@@ -885,7 +856,7 @@ VLM_TEST_SETTINGS = {
        patch_hf_runner=model_utils.tarsier_patch_hf_runner,
    ),
    "tarsier2": VLMTestInfo(
-        models=["omni-research/Tarsier2-Recap-7b"],
+        models=[os.path.join(models_path_prefix, "omni-research/Tarsier2-Recap-7b")],
        test_type=(
            VLMTestType.IMAGE,
            VLMTestType.MULTI_IMAGE,
@@ -953,7 +924,7 @@ VLM_TEST_SETTINGS = {
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
-            "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
+            os.path.join(models_path_prefix, "llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
        ),
        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
        custom_test_opts=[
@@ -973,7 +944,7 @@ VLM_TEST_SETTINGS = {
    ),
    # regression test for https://github.com/vllm-project/vllm/issues/15122
    "qwen2_5_vl-windows-attention": VLMTestInfo(
-        models=["Qwen/Qwen2.5-VL-3B-Instruct"],
+        models=[os.path.join(models_path_prefix, "Qwen/Qwen2.5-VL-3B-Instruct")],
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=4096,
        max_num_seqs=2,

--- a/tests/models/multimodal/generation/test_vit_backend_functionality.py
+++ b/tests/models/multimodal/generation/test_vit_backend_functionality.py
@@ -7,6 +7,7 @@ This test validates that each multimodal model can successfully generate outputs
 using different ViT attention backends. Tests are parametrized by model and backend.
 """
+import os
 from dataclasses import asdict
 from typing import Any
@@ -19,7 +20,7 @@ from vllm.multimodal.video import sample_frames_from_video
 from vllm.platforms import current_platform
 from vllm.v1.attention.backends.registry import AttentionBackendEnum
-from ....utils import create_new_process_for_each_test
+from ....utils import create_new_process_for_each_test, models_path_prefix
 from ...utils import dummy_hf_overrides
 # Dots.OCR prompt from official repository
@@ -50,7 +51,7 @@ VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"
 # Model configurations
 MODEL_CONFIGS: dict[str, dict[str, Any]] = {
    "dots_ocr": {
-        "model_name": "rednote-hilab/dots.ocr",
+        "model_name": os.path.join(models_path_prefix, "rednote-hilab/dots.ocr"),
        "interface": "llm_chat",
        "max_model_len": 32768,
        "max_num_seqs": 1,
@@ -66,7 +67,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
        "output_validator": lambda x: len(x) > 10 and "stop" in x.lower(),
    },
    "ernie45_vl": {
-        "model_name": "baidu/ERNIE-4.5-VL-28B-A3B-PT",
+        "model_name": os.path.join(models_path_prefix, "baidu/ERNIE-4.5-VL-28B-A3B-PT"),
        "interface": "llm_generate",
        "max_model_len": 16384,
        "max_num_seqs": 2,
@@ -79,7 +80,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
        "question": "What is the content of each image?",
    },
    "glm4_1v": {
-        "model_name": "zai-org/GLM-4.1V-9B-Thinking",
+        "model_name": os.path.join(models_path_prefix, "zai-org/GLM-4.1V-9B-Thinking"),
        "interface": "llm_generate",
        "max_model_len": 32768,
        "max_num_seqs": 2,
@@ -91,21 +92,8 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
        "use_processor": True,
        "question": "What is the content of each image?",
    },
-    "glm_ocr": {
-        "model_name": "zai-org/GLM-OCR",
-        "interface": "llm_generate",
-        "max_model_len": 131072,
-        "max_num_seqs": 2,
-        "sampling_params": {
-            "temperature": 0.0,
-            "max_tokens": 256,
-            "stop_token_ids": None,
-        },
-        "use_processor": True,
-        "question": "Text Recognition:",
-    },
    "keye_vl": {
-        "model_name": "Kwai-Keye/Keye-VL-8B-Preview",
+        "model_name": os.path.join(models_path_prefix, "Kwai-Keye/Keye-VL-8B-Preview"),
        "interface": "llm_generate",
        "max_model_len": 8192,
        "max_num_seqs": 5,
@@ -122,7 +110,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
        "question": "What is the content of each image?",
    },
    "ovis2_5": {
-        "model_name": "AIDC-AI/Ovis2.5-2B",
+        "model_name": os.path.join(models_path_prefix, "AIDC-AI/Ovis2.5-2B"),
        "interface": "llm_generate",
        "max_model_len": 8192,
        "max_num_seqs": 2,
@@ -135,7 +123,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
        "question": "What is the content of each image?",
    },
    "qwen2_5_vl": {
-        "model_name": "Qwen/Qwen2.5-VL-3B-Instruct",
+        "model_name": os.path.join(models_path_prefix, "Qwen/Qwen2.5-VL-3B-Instruct"),
        "interface": "vllm_runner",
        "media_type": "video",
        "max_model_len": 4000,
@@ -154,7 +142,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
        },
    },
    "qwen2_5_omni": {
-        "model_name": "Qwen/Qwen2.5-Omni-3B",
+        "model_name": os.path.join(models_path_prefix, "Qwen/Qwen2.5-Omni-3B"),
        "interface": "llm_generate",
        "max_model_len": 32768,
        "max_num_seqs": 2,
@@ -169,7 +157,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
        "question": "What is the content of each image?",
    },
    "qwen3_omni": {
-        "model_name": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
+        "model_name": os.path.join(models_path_prefix, "Qwen/Qwen3-Omni-30B-A3B-Instruct"),
        "interface": "llm_generate",
        "max_model_len": 32768,
        "max_num_seqs": 2,

--- a/tests/plugins/lora_resolvers/test_hf_hub_resolver.py
+++ b/tests/plugins/lora_resolvers/test_hf_hub_resolver.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os
-import pytest
-from huggingface_hub.constants import HF_HUB_CACHE
-from vllm.plugins.lora_resolvers.hf_hub_resolver import HfHubResolver
-LORA_LIB_MODEL_NAME = "ibm-granite/granite-3.3-8b-instruct"
-# Repo with multiple LoRAs contained in it
-LORA_LIB = "ibm-granite/granite-3.3-8b-rag-agent-lib"
-LORA_NAME = "ibm-granite/granite-3.3-8b-rag-agent-lib/answerability_prediction_lora"  # noqa: E501
-NON_LORA_SUBPATH = "ibm-granite/granite-3.3-8b-rag-agent-lib/README.md"
-LIB_DOWNLOAD_DIR = os.path.join(
-    HF_HUB_CACHE, "models--ibm-granite--granite-3.3-8b-rag-agent-lib"
-)
-INVALID_REPO_NAME = "thisrepodoesnotexist"
-# Repo with only one LoRA in the root dir
-LORA_REPO_MODEL_NAME = "meta-llama/Llama-2-7b-hf"
-LORA_REPO = "yard1/llama-2-7b-sql-lora-test"
-REPO_DOWNLOAD_DIR = os.path.join(
-    HF_HUB_CACHE, "models--yard1--llama-2-7b-sql-lora-test"
-)
-@pytest.mark.asyncio
-async def test_hf_resolver_with_direct_path():
-    hf_resolver = HfHubResolver([LORA_REPO])
-    assert hf_resolver is not None
-    lora_request = await hf_resolver.resolve_lora(LORA_REPO_MODEL_NAME, LORA_REPO)
-    assert lora_request.lora_name == LORA_REPO
-    assert REPO_DOWNLOAD_DIR in lora_request.lora_path
-    assert "adapter_config.json" in os.listdir(lora_request.lora_path)
-@pytest.mark.asyncio
-async def test_hf_resolver_with_nested_paths():
-    hf_resolver = HfHubResolver([LORA_LIB])
-    assert hf_resolver is not None
-    lora_request = await hf_resolver.resolve_lora(LORA_LIB_MODEL_NAME, LORA_NAME)
-    assert lora_request is not None
-    assert lora_request.lora_name == LORA_NAME
-    assert LIB_DOWNLOAD_DIR in lora_request.lora_path
-    assert "adapter_config.json" in os.listdir(lora_request.lora_path)
-@pytest.mark.asyncio
-async def test_hf_resolver_with_multiple_repos():
-    hf_resolver = HfHubResolver([LORA_LIB, LORA_REPO])
-    assert hf_resolver is not None
-    lora_request = await hf_resolver.resolve_lora(LORA_LIB_MODEL_NAME, LORA_NAME)
-    assert lora_request is not None
-    assert lora_request.lora_name == LORA_NAME
-    assert LIB_DOWNLOAD_DIR in lora_request.lora_path
-    assert "adapter_config.json" in os.listdir(lora_request.lora_path)
-@pytest.mark.asyncio
-async def test_missing_adapter():
-    hf_resolver = HfHubResolver([LORA_LIB])
-    assert hf_resolver is not None
-    missing_lora_request = await hf_resolver.resolve_lora(LORA_LIB_MODEL_NAME, "foobar")
-    assert missing_lora_request is None
-@pytest.mark.asyncio
-async def test_nonlora_adapter():
-    hf_resolver = HfHubResolver([LORA_LIB])
-    assert hf_resolver is not None
-    readme_request = await hf_resolver.resolve_lora(
-        LORA_LIB_MODEL_NAME, NON_LORA_SUBPATH
-    )
-    assert readme_request is None
-@pytest.mark.asyncio
-async def test_invalid_repo():
-    hf_resolver = HfHubResolver([LORA_LIB])
-    assert hf_resolver is not None
-    invalid_repo_req = await hf_resolver.resolve_lora(
-        INVALID_REPO_NAME,
-        f"{INVALID_REPO_NAME}/foo",
-    )
-    assert invalid_repo_req is None
-@pytest.mark.asyncio
-async def test_trailing_slash():
-    hf_resolver = HfHubResolver([LORA_LIB])
-    assert hf_resolver is not None
-    lora_request = await hf_resolver.resolve_lora(
-        LORA_LIB_MODEL_NAME,
-        f"{LORA_NAME}/",
-    )
-    assert lora_request is not None
-    assert lora_request.lora_name == f"{LORA_NAME}/"
-    assert LIB_DOWNLOAD_DIR in lora_request.lora_path
-    assert "adapter_config.json" in os.listdir(lora_request.lora_path)
--- a/tests/samplers/test_ranks.py
+++ b/tests/samplers/test_ranks.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import pytest
-import os
-from vllm import SamplingParams
-from ..utils import models_path_prefix
-MODELS = [os.path.join(models_path_prefix, "distilbert/distilgpt2")]
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
-def test_ranks(
-    vllm_runner,
-    model,
-    dtype,
-    example_prompts,
-):
-    max_tokens = 5
-    num_top_logprobs = 5
-    num_prompt_logprobs = 5
-    with vllm_runner(model, dtype=dtype,
-                     max_logprobs=num_top_logprobs) as vllm_model:
-        ## Test greedy logprobs ranks
-        vllm_sampling_params = SamplingParams(
-            temperature=0.0,
-            top_p=1.0,
-            max_tokens=max_tokens,
-            logprobs=num_top_logprobs,
-            prompt_logprobs=num_prompt_logprobs)
-        vllm_results = vllm_model.generate_w_logprobs(example_prompts,
-                                                      vllm_sampling_params)
-        ## Test non-greedy logprobs ranks
-        sampling_params = SamplingParams(temperature=1.0,
-                                         top_p=1.0,
-                                         max_tokens=max_tokens,
-                                         logprobs=num_top_logprobs,
-                                         prompt_logprobs=num_prompt_logprobs)
-        res = vllm_model.generate_w_logprobs(example_prompts, sampling_params)
-    for result in vllm_results:
-        assert result[2] is not None
-        assert len(result[2]) == len(result[0])
-        # check whether all chosen tokens have ranks = 1
-        for token, logprobs in zip(result[0], result[2]):
-            assert token in logprobs
-            assert logprobs[token].rank == 1
-    for result in res:
-        assert result[2] is not None
-        assert len(result[2]) == len(result[0])
-        # check whether all chosen tokens have ranks
-        for token, logprobs in zip(result[0], result[2]):
-            assert logprobs[token].rank >= 1
--- a/tests/test_access_log_filter.py
+++ b/tests/test_access_log_filter.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-Tests for the UvicornAccessLogFilter class.
-"""
-import logging
-from vllm.logging_utils.access_log_filter import (
-    UvicornAccessLogFilter,
-    create_uvicorn_log_config,
-)
-class TestUvicornAccessLogFilter:
-    """Test cases for UvicornAccessLogFilter."""
-    def test_filter_allows_all_when_no_excluded_paths(self):
-        """Filter should allow all logs when no paths are excluded."""
-        filter = UvicornAccessLogFilter(excluded_paths=[])
-        record = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "GET", "/v1/completions", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record) is True
-    def test_filter_allows_all_when_excluded_paths_is_none(self):
-        """Filter should allow all logs when excluded_paths is None."""
-        filter = UvicornAccessLogFilter(excluded_paths=None)
-        record = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "GET", "/health", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record) is True
-    def test_filter_excludes_health_endpoint(self):
-        """Filter should exclude /health endpoint when configured."""
-        filter = UvicornAccessLogFilter(excluded_paths=["/health"])
-        record = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "GET", "/health", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record) is False
-    def test_filter_excludes_metrics_endpoint(self):
-        """Filter should exclude /metrics endpoint when configured."""
-        filter = UvicornAccessLogFilter(excluded_paths=["/metrics"])
-        record = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "GET", "/metrics", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record) is False
-    def test_filter_allows_non_excluded_endpoints(self):
-        """Filter should allow endpoints not in the excluded list."""
-        filter = UvicornAccessLogFilter(excluded_paths=["/health", "/metrics"])
-        record = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "POST", "/v1/completions", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record) is True
-    def test_filter_excludes_multiple_endpoints(self):
-        """Filter should exclude multiple configured endpoints."""
-        filter = UvicornAccessLogFilter(excluded_paths=["/health", "/metrics", "/ping"])
-        # Test /health
-        record_health = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "GET", "/health", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record_health) is False
-        # Test /metrics
-        record_metrics = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "GET", "/metrics", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record_metrics) is False
-        # Test /ping
-        record_ping = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "GET", "/ping", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record_ping) is False
-    def test_filter_with_query_parameters(self):
-        """Filter should exclude endpoints even with query parameters."""
-        filter = UvicornAccessLogFilter(excluded_paths=["/health"])
-        record = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "GET", "/health?verbose=true", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record) is False
-    def test_filter_different_http_methods(self):
-        """Filter should exclude endpoints regardless of HTTP method."""
-        filter = UvicornAccessLogFilter(excluded_paths=["/ping"])
-        # Test GET
-        record_get = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "GET", "/ping", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record_get) is False
-        # Test POST
-        record_post = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "POST", "/ping", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record_post) is False
-    def test_filter_with_different_status_codes(self):
-        """Filter should exclude endpoints regardless of status code."""
-        filter = UvicornAccessLogFilter(excluded_paths=["/health"])
-        for status_code in [200, 500, 503]:
-            record = logging.LogRecord(
-                name="uvicorn.access",
-                level=logging.INFO,
-                pathname="",
-                lineno=0,
-                msg='%s - "%s %s HTTP/%s" %d',
-                args=("127.0.0.1:12345", "GET", "/health", "1.1", status_code),
-                exc_info=None,
-            )
-            assert filter.filter(record) is False
-class TestCreateUvicornLogConfig:
-    """Test cases for create_uvicorn_log_config function."""
-    def test_creates_valid_config_structure(self):
-        """Config should have required logging configuration keys."""
-        config = create_uvicorn_log_config(excluded_paths=["/health"])
-        assert "version" in config
-        assert config["version"] == 1
-        assert "disable_existing_loggers" in config
-        assert "formatters" in config
-        assert "handlers" in config
-        assert "loggers" in config
-        assert "filters" in config
-    def test_config_includes_access_log_filter(self):
-        """Config should include the access log filter."""
-        config = create_uvicorn_log_config(excluded_paths=["/health", "/metrics"])
-        assert "access_log_filter" in config["filters"]
-        filter_config = config["filters"]["access_log_filter"]
-        assert filter_config["()"] == UvicornAccessLogFilter
-        assert filter_config["excluded_paths"] == ["/health", "/metrics"]
-    def test_config_applies_filter_to_access_handler(self):
-        """Config should apply the filter to the access handler."""
-        config = create_uvicorn_log_config(excluded_paths=["/health"])
-        assert "access" in config["handlers"]
-        assert "filters" in config["handlers"]["access"]
-        assert "access_log_filter" in config["handlers"]["access"]["filters"]
-    def test_config_with_custom_log_level(self):
-        """Config should respect custom log level."""
-        config = create_uvicorn_log_config(
-            excluded_paths=["/health"], log_level="debug"
-        )
-        assert config["loggers"]["uvicorn"]["level"] == "DEBUG"
-        assert config["loggers"]["uvicorn.access"]["level"] == "DEBUG"
-        assert config["loggers"]["uvicorn.error"]["level"] == "DEBUG"
-    def test_config_with_empty_excluded_paths(self):
-        """Config should work with empty excluded paths."""
-        config = create_uvicorn_log_config(excluded_paths=[])
-        assert config["filters"]["access_log_filter"]["excluded_paths"] == []
-    def test_config_with_none_excluded_paths(self):
-        """Config should work with None excluded paths."""
-        config = create_uvicorn_log_config(excluded_paths=None)
-        assert config["filters"]["access_log_filter"]["excluded_paths"] == []
-class TestIntegration:
-    """Integration tests for the access log filter."""
-    def test_filter_with_real_logger(self):
-        """Test filter works with a real Python logger simulating uvicorn."""
-        # Create a logger with our filter (simulating uvicorn.access)
-        logger = logging.getLogger("uvicorn.access")
-        logger.setLevel(logging.INFO)
-        # Clear any existing handlers
-        logger.handlers = []
-        # Create a custom handler that tracks messages
-        logged_messages: list[str] = []
-        class TrackingHandler(logging.Handler):
-            def emit(self, record):
-                logged_messages.append(record.getMessage())
-        handler = TrackingHandler()
-        handler.setLevel(logging.INFO)
-        filter = UvicornAccessLogFilter(excluded_paths=["/health", "/metrics"])
-        handler.addFilter(filter)
-        logger.addHandler(handler)
-        # Log using uvicorn's format with args tuple
-        # Format: '%s - "%s %s HTTP/%s" %d'
-        logger.info(
-            '%s - "%s %s HTTP/%s" %d',
-            "127.0.0.1:12345",
-            "GET",
-            "/health",
-            "1.1",
-            200,
-        )
-        logger.info(
-            '%s - "%s %s HTTP/%s" %d',
-            "127.0.0.1:12345",
-            "GET",
-            "/v1/completions",
-            "1.1",
-            200,
-        )
-        logger.info(
-            '%s - "%s %s HTTP/%s" %d',
-            "127.0.0.1:12345",
-            "GET",
-            "/metrics",
-            "1.1",
-            200,
-        )
-        logger.info(
-            '%s - "%s %s HTTP/%s" %d',
-            "127.0.0.1:12345",
-            "POST",
-            "/v1/chat/completions",
-            "1.1",
-            200,
-        )
-        # Verify only non-excluded endpoints were logged
-        assert len(logged_messages) == 2
-        assert "/v1/completions" in logged_messages[0]
-        assert "/v1/chat/completions" in logged_messages[1]
-    def test_filter_allows_non_uvicorn_access_logs(self):
-        """Test filter allows logs from non-uvicorn.access loggers."""
-        filter = UvicornAccessLogFilter(excluded_paths=["/health"])
-        # Log record from a different logger name
-        record = logging.LogRecord(
-            name="uvicorn.error",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg="Some error message about /health",
-            args=(),
-            exc_info=None,
-        )
-        # Should allow because it's not from uvicorn.access
-        assert filter.filter(record) is True
-    def test_filter_handles_malformed_args(self):
-        """Test filter handles log records with unexpected args format."""
-        filter = UvicornAccessLogFilter(excluded_paths=["/health"])
-        # Log record with insufficient args
-        record = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg="Some message",
-            args=("only", "two"),
-            exc_info=None,
-        )
-        # Should allow because args doesn't have expected format
-        assert filter.filter(record) is True
-    def test_filter_handles_non_tuple_args(self):
-        """Test filter handles log records with non-tuple args."""
-        filter = UvicornAccessLogFilter(excluded_paths=["/health"])
-        # Log record with None args
-        record = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg="Some message without args",
-            args=None,
-            exc_info=None,
-        )
-        # Should allow because args is None
-        assert filter.filter(record) is True
--- a/tests/tokenization/test_tokenizer.py
+++ b/tests/tokenization/test_tokenizer.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import pytest
-import os
-from transformers import PreTrainedTokenizerBase
-from vllm.transformers_utils.tokenizer import get_tokenizer
-from ..utils import models_path_prefix
-# TOKENIZER_NAMES = [
-#     os.path.join(models_path_prefix, "facebook/opt-125m"),
-#     os.path.join(models_path_prefix, "gpt2"),
-# ]
-# export HF_ENDPOINT=https://hf-mirror.com
-TOKENIZER_NAMES = [
-    "facebook/opt-125m",
-    "gpt2",
-]
-@pytest.mark.parametrize("tokenizer_name", TOKENIZER_NAMES)
-def test_tokenizer_revision(tokenizer_name: str):
-    # Assume that "main" branch always exists
-    # tokenizer = get_tokenizer(tokenizer_name, revision="main")
-    tokenizer = get_tokenizer(tokenizer_name)
-    assert isinstance(tokenizer, PreTrainedTokenizerBase)
-    # Assume that "never" branch always does not exist
-    with pytest.raises(OSError, match='not a valid git identifier'):
-        get_tokenizer(tokenizer_name, revision="never")
--- a/tests/tool_parsers/test_glm4_moe_tool_parser.py
+++ b/tests/tool_parsers/test_glm4_moe_tool_parser.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # ruff: noqa: E501
+import os
 import json
 import pytest
@@ -11,10 +12,11 @@ from vllm.tokenizers import get_tokenizer
 from vllm.tool_parsers.glm4_moe_tool_parser import (
    Glm4MoeModelToolParser,
 )
+from ..utils import models_path_prefix
 pytest.skip("skip glm4_moe parser test", allow_module_level=True)
 # Use a common model that is likely to be available
-MODEL = "zai-org/GLM-4.5"
+MODEL = os.path.join(models_path_prefix, "zai-org/GLM-4.5")
 @pytest.fixture(scope="module")
@@ -225,6 +227,7 @@ def test_extract_tool_calls(
 def test_extract_tool_calls_with_thinking_tags(glm4_moe_tool_parser):
    """Test tool extraction when thinking tags are present."""
    model_output = """<think>I want to get the weather.</think>
 I will help you get the weather.
 <tool_call>get_weather
 <arg_key>city</arg_key>
@@ -242,6 +245,7 @@ I will help you get the weather.
    assert extracted_tool_calls.tool_calls[0].function.name == "get_weather"
    expected_content = """<think>I want to get the weather.</think>
 I will help you get the weather."""
    assert extracted_tool_calls.content == expected_content
@@ -285,6 +289,7 @@ def test_extract_tool_calls_empty_arguments(glm4_moe_tool_parser):
 def test_extract_tool_calls_mixed_content(glm4_moe_tool_parser):
    """Test extraction with mixed content and multiple tool calls."""
    model_output = """I will help you get the weather info.
 <tool_call>get_weather
 <arg_key>city</arg_key>
 <arg_value>Beijing</arg_value>

--- a/tests/tpu/lora/untest_lora.py
+++ b/tests/tpu/lora/untest_lora.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import pytest
-from torch_xla._internal import tpu
-import vllm
-from vllm.lora.request import LoRARequest
-# This file contains tests to ensure that LoRA works correctly on the TPU
-# backend. We use a series of custom trained adapters for Qwen2.5-3B-Instruct
-# for this. The adapters are:
-# Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_x_adapter, where x ranges
-# from 1 to 4.
-# These adapters are trained using a standard huggingface peft training script,
-# where all the inputs are "What is 1+1? \n" and all the outputs are "x". We run
-# 100 training iterations with a training batch size of 100.
-def setup_vllm(num_loras: int, tp: int) -> vllm.LLM:
-    return vllm.LLM(
-        model="Qwen/Qwen2.5-3B-Instruct",
-        max_model_len=256,
-        max_num_seqs=8,
-        tensor_parallel_size=tp,
-        enable_lora=True,
-        max_loras=num_loras,
-        max_lora_rank=8,
-    )
-TPU_TENSOR_PARALLEL_SIZES = (
-    [1, tpu.num_available_chips()] if tpu.num_available_chips() > 1 else [1]
-)
-@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
-def test_single_lora(tp: int):
-    """
-    This test ensures we can run a single LoRA adapter on the TPU backend.
-    We run "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_1_adapter" which
-    will force Qwen2.5-3B-Instruct to claim 1+1=1.
-    """
-    llm = setup_vllm(1, tp)
-    prompt = "What is 1+1? \n"
-    lora_request = LoRARequest(
-        "lora_adapter_1",
-        1,
-        "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_1_adapter",
-    )
-    output = (
-        llm.generate(
-            prompt,
-            sampling_params=vllm.SamplingParams(max_tokens=256, temperature=0),
-            lora_request=lora_request,
-        )[0]
-        .outputs[0]
-        .text
-    )
-    answer = output.strip()[0]
-    assert answer.isdigit()
-    assert int(answer) == 1
-@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
-def test_lora_hotswapping(tp: int):
-    """
-    This test ensures we can run multiple LoRA adapters on the TPU backend, even
-    if we only have space to store 1.
-    We run "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_x_adapter" which
-    will force Qwen2.5-3B-Instruct to claim 1+1=x, for a range of x.
-    """
-    lora_name_template = "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_{}_adapter"
-    lora_requests = [
-        LoRARequest(f"lora_adapter_{i}", i, lora_name_template.format(i))
-        for i in range(1, 5)
-    ]
-    llm = setup_vllm(1, tp)
-    prompt = "What is 1+1? \n"
-    for i, req in enumerate(lora_requests):
-        output = (
-            llm.generate(
-                prompt,
-                sampling_params=vllm.SamplingParams(max_tokens=256, temperature=0),
-                lora_request=req,
-            )[0]
-            .outputs[0]
-            .text
-        )
-        answer = output.strip()[0]
-        assert answer.isdigit()
-        assert int(answer) == i + 1
-@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
-def test_multi_lora(tp: int):
-    """
-    This test ensures we can run multiple LoRA adapters on the TPU backend, when
-    we have enough space to store all of them.
-    We run "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_x_adapter" which
-    will force Qwen2.5-3B-Instruct to claim 1+1=x, for a range of x.
-    """
-    lora_name_template = "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_{}_adapter"
-    lora_requests = [
-        LoRARequest(f"lora_adapter_{i}", i, lora_name_template.format(i))
-        for i in range(1, 5)
-    ]
-    llm = setup_vllm(4, tp)
-    prompt = "What is 1+1? \n"
-    for i, req in enumerate(lora_requests):
-        output = (
-            llm.generate(
-                prompt,
-                sampling_params=vllm.SamplingParams(max_tokens=256, temperature=0),
-                lora_request=req,
-            )[0]
-            .outputs[0]
-            .text
-        )
-        answer = output.strip()[0]
-        assert answer.isdigit()
-        assert int(output.strip()[0]) == i + 1
--- a/tests/tpu/untest_compilation.py
+++ b/tests/tpu/untest_compilation.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import glob
-import os
-import tempfile
-import depyf
-def test_tpu_compilation():
-    temp_dir = tempfile.mkdtemp()
-    with depyf.prepare_debug(temp_dir):
-        from vllm import LLM, SamplingParams
-        prompts = [
-            "A robot may not injure a human being",
-            "It is only with the heart that one can see rightly;",
-            "The greatest glory in living lies not in never falling,",
-        ]
-        answers = [
-            " or, through inaction",
-            " what is essential ",
-            " but in rising ",
-        ]
-        # Currently, top-p sampling is disabled. `top_p` should be 1.0.
-        N = 1
-        sampling_params = SamplingParams(temperature=0.7, top_p=1.0, n=N, max_tokens=16)
-        llm = LLM(
-            model="Qwen/Qwen2-1.5B-Instruct",
-            max_num_batched_tokens=256,
-            max_model_len=256,
-            max_num_seqs=32,
-            enforce_eager=False,
-        )
-        outputs = llm.generate(prompts, sampling_params)
-        for output, answer in zip(outputs, answers):
-            prompt = output.prompt
-            generated_text = output.outputs[0].text
-            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-            assert generated_text.startswith(answer)
-    compiled_codes = sorted(
-        glob.glob(os.path.join(temp_dir, "__transformed_code*for_forward.py"))
-    )
-    for i, compiled_code in enumerate(compiled_codes):
-        print("{} file: {}".format(i + 1, compiled_code))
-    # We should only trigger Dynamo compilation 2 times:
-    # 1. Forward pass without kv_caches
-    # 2. Forward pass with kv_caches
-    # Check we have 2 compiled codes
-    assert len(compiled_codes) == 2
-    kv_cache_prefix = "kv_cache"
-    attn_prefix = "ragged_paged_attention"
-    def extract_compiled_index(s):
-        parts = s.replace(".", "_").split("_")
-        numbers = [int(part) for part in parts if part.isdigit()]
-        return numbers[0]
-    # Check all the compilations are as expected. The dump files include the
-    # captured graph for the forward function of the nn.Module.
-    compiled_fns = sorted(
-        glob.glob(os.path.join(temp_dir, "__compiled_fn*Forward_graph*.py")),
-        key=lambda s: extract_compiled_index(s),
-    )
-    for i, compiled_fn in enumerate(compiled_fns):
-        print("{} file: {}".format(i + 1, compiled_fn))
-    # The first compilation should not have any kv_caches
-    with open(compiled_fns[0]) as f:
-        content = f.read()
-        assert kv_cache_prefix not in content
-    # The second compilation should have kv_caches and the
-    # ragged_paged_attention
-    with open(compiled_fns[1]) as f:
-        content = f.read()
-        assert kv_cache_prefix in content and attn_prefix in content
--- a/tests/tpu/untest_custom_dispatcher.py
+++ b/tests/tpu/untest_custom_dispatcher.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os
-import pytest
-from vllm.config import CompilationMode
-from ..utils import compare_two_settings, models_path_prefix
-# --enforce-eager on TPU causes graph compilation
-# this times out default Health Check in the MQLLMEngine,
-# so we set the timeout here to 30s
-def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch):
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_RPC_TIMEOUT", "30000")
-        compare_two_settings(
-            os.path.join(models_path_prefix, "Qwen/Qwen2.5-1.5B-Instruct"),
-            arg1=[
-                "--max-model-len=256",
-                "--max-num-seqs=32",
-                "--enforce-eager",
-                f"-O{CompilationMode.DYNAMO_TRACE_ONCE}",
-            ],
-            arg2=[
-                "--max-model-len=256",
-                "--max-num-seqs=32",
-                "--enforce-eager",
-                f"-O{CompilationMode.STOCK_TORCH_COMPILE}",
-            ],
-            env1={},
-            env2={},
-        )
--- a/tests/tpu/untest_moe_pallas.py
+++ b/tests/tpu/untest_moe_pallas.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Tests for the Pallas MOE implementation.
-Run `pytest tests/kernels/moe/test_moe_pallas.py`.
-"""
-import pytest
-import torch
-import torch_xla
-from vllm.model_executor.layers.fused_moe.moe_pallas import fused_moe as pallas_moe
-from vllm.model_executor.layers.fused_moe.moe_torch_iterative import (
-    fused_moe as torch_moe,
-)
-from vllm.platforms import current_platform
-if not current_platform.is_tpu():
-    pytest.skip("This test needs a TPU.", allow_module_level=True)
-NUM_EXPERTS = [8, 64]
-EP_SIZE = [1]
-TOP_KS = [2, 6]
-# The Pallas GMM kernel requires num_tokens * topk to be a multiple of 16
-@pytest.mark.parametrize("m", [8, 16, 64, 2048])
-@pytest.mark.parametrize("n", [128, 1024, 2048])
-@pytest.mark.parametrize("k", [128, 511, 1024])
-@pytest.mark.parametrize("e", NUM_EXPERTS)
-@pytest.mark.parametrize("topk", TOP_KS)
-@pytest.mark.parametrize("ep_size", EP_SIZE)
-@pytest.mark.parametrize("dtype", [torch.bfloat16])
-def test_pallas_moe(
-    m: int,
-    n: int,
-    k: int,
-    e: int,
-    topk: int,
-    ep_size: int,
-    dtype: torch.dtype,
-):
-    import torch_xla.core.xla_model as xm
-    with torch.device(xm.xla_device()):
-        a = torch.randn((m, k), dtype=dtype) / 10
-        w1 = torch.randn((e, 2 * n, k), dtype=dtype) / 10
-        w2 = torch.randn((e, k, n), dtype=dtype) / 10
-        score = torch.randn((m, e), dtype=dtype)
-        # TODO: Support ep
-        if ep_size > 1:
-            pytest.skip("No support for ep_size > 1 yet")
-        else:
-            e_map = None
-        # Run both implementations
-        torch_output = torch_moe(
-            hidden_states=a,
-            w1=w1,
-            w2=w2,
-            gating_output=score,
-            topk=topk,
-            global_num_experts=e,
-            expert_map=e_map,
-            renormalize=False,
-        )
-        pallas_output = pallas_moe(
-            hidden_states=a,
-            w1=w1,
-            w2=w2,
-            gating_output=score,
-            topk=topk,
-            global_num_experts=e,
-            expert_map=e_map,
-            renormalize=False,
-        )
-        torch_xla.sync(wait=False)
-    # Compare outputs
-    torch.testing.assert_close(
-        pallas_output.cpu(),
-        torch_output.cpu(),
-        atol=2e-2,
-        rtol=0,
-    )
--- a/tests/tpu/untest_quantization_accuracy.py
+++ b/tests/tpu/untest_quantization_accuracy.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from dataclasses import dataclass
-import lm_eval
-import pytest
-TASK = "gsm8k"
-FILTER = "exact_match,strict-match"
-RTOL = 0.03
-@dataclass
-class GSM8KAccuracyTestConfig:
-    model_name: str
-    expected_value: float
-    def get_model_args(self) -> str:
-        return f"pretrained={self.model_name},max_model_len=4096,max_num_seqs=32"
-# NOTE: Accuracy scores measured on GPUs.
-ACCURACY_CONFIGS = [
-    GSM8KAccuracyTestConfig(
-        model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-        expected_value=0.76,
-    ),  # no bias
-    # NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
-    # so only one of these tests can run in a single call to pytest. As
-    # a follow-up, move this into the LM-EVAL section of the CI.
-    # GSM8KAccuracyTestConfig(
-    #     model_name="neuralmagic/Qwen2-7B-Instruct-quantized.w8a8",
-    #     expected_value=0.66),  # bias in QKV layers
-]
-@pytest.mark.parametrize("config", ACCURACY_CONFIGS)
-def test_gsm8k_correctness(config: GSM8KAccuracyTestConfig):
-    results = lm_eval.simple_evaluate(
-        model="vllm",
-        model_args=config.get_model_args(),
-        tasks="gsm8k",
-        batch_size="auto",
-    )
-    EXPECTED_VALUE = config.expected_value
-    measured_value = results["results"][TASK][FILTER]
-    assert (
-        measured_value - RTOL < EXPECTED_VALUE
-        and measured_value + RTOL > EXPECTED_VALUE
-    ), f"Expected: {EXPECTED_VALUE} |  Measured: {measured_value}"
--- a/tests/utils_/test_utils.py
+++ b/tests/utils_/test_utils.py
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -1478,7 +1478,6 @@ def test_kv_connector_handles_preemption(is_async, use_ec_connector, ec_role):
    assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == NUM_BLOCKS - 1
 def make_output(scheduler: Scheduler):
    return ModelRunnerOutput(
        req_ids=[req.request_id for req in scheduler.running],

--- a/tests/v1/kv_connector/unit/utils.py
+++ b/tests/v1/kv_connector/unit/utils.py
@@ -112,13 +112,6 @@ def create_vllm_config(
        enable_chunked_prefill=enable_chunked_prefill,
        is_encoder_decoder=model_config.is_encoder_decoder,
    )
-    scheduler_config = SchedulerConfig(
-        max_num_seqs=max_num_seqs,
-        max_num_batched_tokens=max_num_batched_tokens,
-        max_model_len=max_model_len,
-        enable_chunked_prefill=enable_chunked_prefill,
-        is_encoder_decoder=model_config.is_encoder_decoder,
-    )
    # Cache config, optionally force APC
    cache_config = CacheConfig(
        block_size=block_size,