sync v0.15.1(tests)

78c1f9e5 · zhuwenwen · 86a65417 · 78c1f9e5 · 78c1f9e5 · 86a65417
Commit 78c1f9e5 authored Feb 05, 2026 by zhuwenwen
15 changed files
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -311,7 +311,7 @@ VLM_TEST_SETTINGS = {
        vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}},
    ),
    "aya_vision-multi_image": VLMTestInfo(
-        models=["CohereLabs/aya-vision-8b"],
+        models=[os.path.join(models_path_prefix, "CohereLabs/aya-vision-8b")],
        test_type=(VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts(
@@ -328,7 +328,7 @@ VLM_TEST_SETTINGS = {
        marks=[large_gpu_mark(min_gb=32)],
    ),
    "blip2": VLMTestInfo(
-        models=[os.path.join(models_path_prefix,"Salesforce/blip2-opt-2.7b")],
+        models=[os.path.join(models_path_prefix, "Salesforce/blip2-opt-2.7b")],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
        img_idx_to_prompt=lambda idx: "",
@@ -352,7 +352,7 @@ VLM_TEST_SETTINGS = {
        dtype="bfloat16",
    ),
    "deepseek_vl_v2": VLMTestInfo(
-        models=["Isotr0py/deepseek-vl2-tiny"],  # model repo using dynamic module
+        models=[os.path.join(models_path_prefix, "Isotr0py/deepseek-vl2-tiny")],  # model repo using dynamic module
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ",  # noqa: E501
        max_model_len=4096,
@@ -401,7 +401,7 @@ VLM_TEST_SETTINGS = {
        patch_hf_runner=model_utils.gemma3_patch_hf_runner,
    ),
    "granite_vision": VLMTestInfo(
-        models=["ibm-granite/granite-vision-3.3-2b"],
+        models=[os.path.join(models_path_prefix, "ibm-granite/granite-vision-3.3-2b")],
        test_type=(VLMTestType.IMAGE),
        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}\n<|assistant|>\n",
        max_model_len=8192,
@@ -445,7 +445,7 @@ VLM_TEST_SETTINGS = {
        marks=[large_gpu_mark(min_gb=32)],
    ),
    "glm4_1v-video": VLMTestInfo(
-        models=["zai-org/GLM-4.1V-9B-Thinking"],
+        models=[os.path.join(models_path_prefix, "zai-org/GLM-4.1V-9B-Thinking")],
        # GLM4.1V require include video metadata for input
        test_type=VLMTestType.CUSTOM_INPUTS,
        prompt_formatter=lambda vid_prompt: f"[gMASK]<|user|>\n{vid_prompt}<|assistant|>\n",  # noqa: E501
@@ -461,20 +461,6 @@ VLM_TEST_SETTINGS = {
        ],
        marks=[large_gpu_mark(min_gb=32)],
    ),
-    "glm_ocr": VLMTestInfo(
-        models=["zai-org/GLM-OCR"],
-        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt: f"[gMASK]<|user|>\n{img_prompt}<|assistant|>\n",  # noqa: E501
-        img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>",
-        video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>",
-        max_model_len=2048,
-        max_num_seqs=2,
-        get_stop_token_ids=lambda tok: [151329, 151336, 151338],
-        num_logprobs=10,
-        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
-        auto_cls=AutoModelForImageTextToText,
-        marks=[large_gpu_mark(min_gb=32)],
-    ),
    "h2ovl": VLMTestInfo(
        models=[
            os.path.join(models_path_prefix,"h2oai/h2ovl-mississippi-800m"),
@@ -526,7 +512,7 @@ VLM_TEST_SETTINGS = {
    ),
    "intern_vl-video": VLMTestInfo(
        models=[
-            "OpenGVLab/InternVL3-1B",
+            os.path.join(models_path_prefix, "OpenGVLab/InternVL3-1B"),
        ],
        test_type=VLMTestType.VIDEO,
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
@@ -537,7 +523,7 @@ VLM_TEST_SETTINGS = {
        num_logprobs=10 if current_platform.is_rocm() else 5,
    ),
    "intern_vl-hf": VLMTestInfo(
-        models=["OpenGVLab/InternVL3-1B-hf"],
+        models=[os.path.join(models_path_prefix, "OpenGVLab/InternVL3-1B-hf")],
        test_type=(
            VLMTestType.IMAGE,
            VLMTestType.MULTI_IMAGE,
@@ -552,8 +538,8 @@ VLM_TEST_SETTINGS = {
    ),
    "isaac": VLMTestInfo(
        models=[
-            "PerceptronAI/Isaac-0.1",
+            os.path.join(models_path_prefix, "PerceptronAI/Isaac-0.1"),
-            "PerceptronAI/Isaac-0.2-2B-Preview",
+            os.path.join(models_path_prefix, "PerceptronAI/Isaac-0.2-2B-Preview"),
        ],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: (
@@ -579,7 +565,7 @@ VLM_TEST_SETTINGS = {
        image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
    ),
    "kimi_vl": VLMTestInfo(
-        models=["moonshotai/Kimi-VL-A3B-Instruct"],
+        models=[os.path.join(models_path_prefix, "moonshotai/Kimi-VL-A3B-Instruct")],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_user|>user<|im_middle|>{img_prompt}<|im_end|><|im_assistant|>assistant<|im_middle|>",  # noqa: E501
        img_idx_to_prompt=lambda _: "<|media_start|>image<|media_content|><|media_pad|><|media_end|>",  # noqa: E501
@@ -590,21 +576,6 @@ VLM_TEST_SETTINGS = {
        vllm_output_post_proc=model_utils.kimiv_vl_vllm_to_hf_output,
        marks=[large_gpu_mark(min_gb=48)],
    ),
-    "llama4": VLMTestInfo(
-        models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
-        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n",  # noqa: E501
-        img_idx_to_prompt=lambda _: "<|image|>",
-        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        distributed_executor_backend="mp",
-        image_size_factors=[(0.25, 0.5, 1.0)],
-        hf_model_kwargs={"device_map": "auto"},
-        max_model_len=8192,
-        max_num_seqs=4,
-        dtype="bfloat16",
-        auto_cls=AutoModelForImageTextToText,
-        tensor_parallel_size=4,
-        marks=multi_gpu_marks(num_gpus=4),
-    ),
    "llava_next": VLMTestInfo(
        models=[os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")],
        test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
@@ -675,7 +646,7 @@ VLM_TEST_SETTINGS = {
        marks=[pytest.mark.skip("HF import fails")],
    ),
    "minicpmo_26": VLMTestInfo(
-        models=["openbmb/MiniCPM-o-2_6"],
+        models=[os.path.join(models_path_prefix, "openbmb/MiniCPM-o-2_6")],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
@@ -703,7 +674,7 @@ VLM_TEST_SETTINGS = {
        patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
    ),
    "minimax_vl_01": VLMTestInfo(
-        models=["MiniMaxAI/MiniMax-VL-01"],
+        models=[os.path.join(models_path_prefix, "MiniMaxAI/MiniMax-VL-01")],
        prompt_formatter=lambda img_prompt: f"<beginning_of_sentence>user: {img_prompt} assistant:<end_of_sentence>",  # noqa: E501
        img_idx_to_prompt=lambda _: "<image>",
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
@@ -726,7 +697,7 @@ VLM_TEST_SETTINGS = {
        ],
    ),
    "molmo": VLMTestInfo(
-        models=["allenai/Molmo-7B-D-0924"],
+        models=[os.path.join(models_path_prefix, "allenai/Molmo-7B-D-0924")],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=identity,
        max_model_len=4096,
@@ -734,7 +705,7 @@ VLM_TEST_SETTINGS = {
        patch_hf_runner=model_utils.molmo_patch_hf_runner,
    ),
    "ovis1_6-gemma2": VLMTestInfo(
-        models=["AIDC-AI/Ovis1.6-Gemma2-9B"],
+        models=[os.path.join(models_path_prefix, "AIDC-AI/Ovis1.6-Gemma2-9B")],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>\n",
@@ -747,7 +718,7 @@ VLM_TEST_SETTINGS = {
        marks=[large_gpu_mark(min_gb=32)],
    ),
    "ovis2": VLMTestInfo(
-        models=["AIDC-AI/Ovis2-1B"],
+        models=[os.path.join(models_path_prefix, "AIDC-AI/Ovis2-1B")],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>\n",
@@ -759,7 +730,7 @@ VLM_TEST_SETTINGS = {
        patch_hf_runner=model_utils.ovis_patch_hf_runner,
    ),
    "ovis2_5": VLMTestInfo(
-        models=["AIDC-AI/Ovis2.5-2B"],
+        models=[os.path.join(models_path_prefix, "AIDC-AI/Ovis2.5-2B")],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
        prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>\n",
@@ -772,7 +743,7 @@ VLM_TEST_SETTINGS = {
        hf_model_kwargs={"revision": "refs/pr/5"},
    ),
    "paddleocr_vl": VLMTestInfo(
-        models=["PaddlePaddle/PaddleOCR-VL"],
+        models=[os.path.join(models_path_prefix, "PaddlePaddle/PaddleOCR-VL")],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        img_idx_to_prompt=lambda idx: (
@@ -795,7 +766,7 @@ VLM_TEST_SETTINGS = {
        ],
    ),
    "phi3v": VLMTestInfo(
-        models=["microsoft/Phi-3.5-vision-instruct"],
+        models=[os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct")],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n",
@@ -835,7 +806,7 @@ VLM_TEST_SETTINGS = {
        prompt_path_encoder=model_utils.qwen_prompt_path_encoder,
    ),
    "qwen2_vl": VLMTestInfo(
-        models=["Qwen/Qwen2-VL-2B-Instruct"],
+        models=[os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct")],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
@@ -849,7 +820,7 @@ VLM_TEST_SETTINGS = {
        marks=[pytest.mark.cpu_model],
    ),
    "skywork_r1v": VLMTestInfo(
-        models=["Skywork/Skywork-R1V-38B"],
+        models=[os.path.join(models_path_prefix, "Skywork/Skywork-R1V-38B")],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<｜begin▁of▁sentence｜><｜User｜>\n{img_prompt}<｜Assistant｜><think>\n",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts(
@@ -865,7 +836,7 @@ VLM_TEST_SETTINGS = {
        marks=[large_gpu_mark(min_gb=80)],
    ),
    "smolvlm": VLMTestInfo(
-        models=["HuggingFaceTB/SmolVLM2-2.2B-Instruct"],
+        models=[os.path.join(models_path_prefix, "HuggingFaceTB/SmolVLM2-2.2B-Instruct")],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>",
@@ -876,7 +847,7 @@ VLM_TEST_SETTINGS = {
        num_logprobs=10,
    ),
    "tarsier": VLMTestInfo(
-        models=["omni-research/Tarsier-7b"],
+        models=[os.path.join(models_path_prefix, "omni-research/Tarsier-7b")],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt} ASSISTANT:",
        max_model_len=4096,
@@ -885,7 +856,7 @@ VLM_TEST_SETTINGS = {
        patch_hf_runner=model_utils.tarsier_patch_hf_runner,
    ),
    "tarsier2": VLMTestInfo(
-        models=["omni-research/Tarsier2-Recap-7b"],
+        models=[os.path.join(models_path_prefix, "omni-research/Tarsier2-Recap-7b")],
        test_type=(
            VLMTestType.IMAGE,
            VLMTestType.MULTI_IMAGE,
@@ -953,7 +924,7 @@ VLM_TEST_SETTINGS = {
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
-            "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
+            os.path.join(models_path_prefix, "llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
        ),
        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
        custom_test_opts=[
@@ -973,7 +944,7 @@ VLM_TEST_SETTINGS = {
    ),
    # regression test for https://github.com/vllm-project/vllm/issues/15122
    "qwen2_5_vl-windows-attention": VLMTestInfo(
-        models=["Qwen/Qwen2.5-VL-3B-Instruct"],
+        models=[os.path.join(models_path_prefix, "Qwen/Qwen2.5-VL-3B-Instruct")],
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=4096,
        max_num_seqs=2,

--- a/tests/models/multimodal/generation/test_vit_backend_functionality.py
+++ b/tests/models/multimodal/generation/test_vit_backend_functionality.py
@@ -7,6 +7,7 @@ This test validates that each multimodal model can successfully generate outputs
 using different ViT attention backends. Tests are parametrized by model and backend.
 """
+import os
 from dataclasses import asdict
 from typing import Any
@@ -19,7 +20,7 @@ from vllm.multimodal.video import sample_frames_from_video
 from vllm.platforms import current_platform
 from vllm.v1.attention.backends.registry import AttentionBackendEnum
-from ....utils import create_new_process_for_each_test
+from ....utils import create_new_process_for_each_test, models_path_prefix
 from ...utils import dummy_hf_overrides
 # Dots.OCR prompt from official repository
@@ -50,7 +51,7 @@ VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"
 # Model configurations
 MODEL_CONFIGS: dict[str, dict[str, Any]] = {
    "dots_ocr": {
-        "model_name": "rednote-hilab/dots.ocr",
+        "model_name": os.path.join(models_path_prefix, "rednote-hilab/dots.ocr"),
        "interface": "llm_chat",
        "max_model_len": 32768,
        "max_num_seqs": 1,
@@ -66,7 +67,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
        "output_validator": lambda x: len(x) > 10 and "stop" in x.lower(),
    },
    "ernie45_vl": {
-        "model_name": "baidu/ERNIE-4.5-VL-28B-A3B-PT",
+        "model_name": os.path.join(models_path_prefix, "baidu/ERNIE-4.5-VL-28B-A3B-PT"),
        "interface": "llm_generate",
        "max_model_len": 16384,
        "max_num_seqs": 2,
@@ -79,7 +80,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
        "question": "What is the content of each image?",
    },
    "glm4_1v": {
-        "model_name": "zai-org/GLM-4.1V-9B-Thinking",
+        "model_name": os.path.join(models_path_prefix, "zai-org/GLM-4.1V-9B-Thinking"),
        "interface": "llm_generate",
        "max_model_len": 32768,
        "max_num_seqs": 2,
@@ -91,21 +92,8 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
        "use_processor": True,
        "question": "What is the content of each image?",
    },
-    "glm_ocr": {
-        "model_name": "zai-org/GLM-OCR",
-        "interface": "llm_generate",
-        "max_model_len": 131072,
-        "max_num_seqs": 2,
-        "sampling_params": {
-            "temperature": 0.0,
-            "max_tokens": 256,
-            "stop_token_ids": None,
-        },
-        "use_processor": True,
-        "question": "Text Recognition:",
-    },
    "keye_vl": {
-        "model_name": "Kwai-Keye/Keye-VL-8B-Preview",
+        "model_name": os.path.join(models_path_prefix, "Kwai-Keye/Keye-VL-8B-Preview"),
        "interface": "llm_generate",
        "max_model_len": 8192,
        "max_num_seqs": 5,
@@ -122,7 +110,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
        "question": "What is the content of each image?",
    },
    "ovis2_5": {
-        "model_name": "AIDC-AI/Ovis2.5-2B",
+        "model_name": os.path.join(models_path_prefix, "AIDC-AI/Ovis2.5-2B"),
        "interface": "llm_generate",
        "max_model_len": 8192,
        "max_num_seqs": 2,
@@ -135,7 +123,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
        "question": "What is the content of each image?",
    },
    "qwen2_5_vl": {
-        "model_name": "Qwen/Qwen2.5-VL-3B-Instruct",
+        "model_name": os.path.join(models_path_prefix, "Qwen/Qwen2.5-VL-3B-Instruct"),
        "interface": "vllm_runner",
        "media_type": "video",
        "max_model_len": 4000,
@@ -154,7 +142,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
        },
    },
    "qwen2_5_omni": {
-        "model_name": "Qwen/Qwen2.5-Omni-3B",
+        "model_name": os.path.join(models_path_prefix, "Qwen/Qwen2.5-Omni-3B"),
        "interface": "llm_generate",
        "max_model_len": 32768,
        "max_num_seqs": 2,
@@ -169,7 +157,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
        "question": "What is the content of each image?",
    },
    "qwen3_omni": {
-        "model_name": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
+        "model_name": os.path.join(models_path_prefix, "Qwen/Qwen3-Omni-30B-A3B-Instruct"),
        "interface": "llm_generate",
        "max_model_len": 32768,
        "max_num_seqs": 2,

--- a/tests/plugins/lora_resolvers/test_hf_hub_resolver.py
+++ b/tests/plugins/lora_resolvers/test_hf_hub_resolver.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os
-import pytest
-from huggingface_hub.constants import HF_HUB_CACHE
-from vllm.plugins.lora_resolvers.hf_hub_resolver import HfHubResolver
-LORA_LIB_MODEL_NAME = "ibm-granite/granite-3.3-8b-instruct"
-# Repo with multiple LoRAs contained in it
-LORA_LIB = "ibm-granite/granite-3.3-8b-rag-agent-lib"
-LORA_NAME = "ibm-granite/granite-3.3-8b-rag-agent-lib/answerability_prediction_lora"  # noqa: E501
-NON_LORA_SUBPATH = "ibm-granite/granite-3.3-8b-rag-agent-lib/README.md"
-LIB_DOWNLOAD_DIR = os.path.join(
-    HF_HUB_CACHE, "models--ibm-granite--granite-3.3-8b-rag-agent-lib"
-)
-INVALID_REPO_NAME = "thisrepodoesnotexist"
-# Repo with only one LoRA in the root dir
-LORA_REPO_MODEL_NAME = "meta-llama/Llama-2-7b-hf"
-LORA_REPO = "yard1/llama-2-7b-sql-lora-test"
-REPO_DOWNLOAD_DIR = os.path.join(
-    HF_HUB_CACHE, "models--yard1--llama-2-7b-sql-lora-test"
-)
-@pytest.mark.asyncio
-async def test_hf_resolver_with_direct_path():
-    hf_resolver = HfHubResolver([LORA_REPO])
-    assert hf_resolver is not None
-    lora_request = await hf_resolver.resolve_lora(LORA_REPO_MODEL_NAME, LORA_REPO)
-    assert lora_request.lora_name == LORA_REPO
-    assert REPO_DOWNLOAD_DIR in lora_request.lora_path
-    assert "adapter_config.json" in os.listdir(lora_request.lora_path)
-@pytest.mark.asyncio
-async def test_hf_resolver_with_nested_paths():
-    hf_resolver = HfHubResolver([LORA_LIB])
-    assert hf_resolver is not None
-    lora_request = await hf_resolver.resolve_lora(LORA_LIB_MODEL_NAME, LORA_NAME)
-    assert lora_request is not None
-    assert lora_request.lora_name == LORA_NAME
-    assert LIB_DOWNLOAD_DIR in lora_request.lora_path
-    assert "adapter_config.json" in os.listdir(lora_request.lora_path)
-@pytest.mark.asyncio
-async def test_hf_resolver_with_multiple_repos():
-    hf_resolver = HfHubResolver([LORA_LIB, LORA_REPO])
-    assert hf_resolver is not None
-    lora_request = await hf_resolver.resolve_lora(LORA_LIB_MODEL_NAME, LORA_NAME)
-    assert lora_request is not None
-    assert lora_request.lora_name == LORA_NAME
-    assert LIB_DOWNLOAD_DIR in lora_request.lora_path
-    assert "adapter_config.json" in os.listdir(lora_request.lora_path)
-@pytest.mark.asyncio
-async def test_missing_adapter():
-    hf_resolver = HfHubResolver([LORA_LIB])
-    assert hf_resolver is not None
-    missing_lora_request = await hf_resolver.resolve_lora(LORA_LIB_MODEL_NAME, "foobar")
-    assert missing_lora_request is None
-@pytest.mark.asyncio
-async def test_nonlora_adapter():
-    hf_resolver = HfHubResolver([LORA_LIB])
-    assert hf_resolver is not None
-    readme_request = await hf_resolver.resolve_lora(
-        LORA_LIB_MODEL_NAME, NON_LORA_SUBPATH
-    )
-    assert readme_request is None
-@pytest.mark.asyncio
-async def test_invalid_repo():
-    hf_resolver = HfHubResolver([LORA_LIB])
-    assert hf_resolver is not None
-    invalid_repo_req = await hf_resolver.resolve_lora(
-        INVALID_REPO_NAME,
-        f"{INVALID_REPO_NAME}/foo",
-    )
-    assert invalid_repo_req is None
-@pytest.mark.asyncio
-async def test_trailing_slash():
-    hf_resolver = HfHubResolver([LORA_LIB])
-    assert hf_resolver is not None
-    lora_request = await hf_resolver.resolve_lora(
-        LORA_LIB_MODEL_NAME,
-        f"{LORA_NAME}/",
-    )
-    assert lora_request is not None
-    assert lora_request.lora_name == f"{LORA_NAME}/"
-    assert LIB_DOWNLOAD_DIR in lora_request.lora_path
-    assert "adapter_config.json" in os.listdir(lora_request.lora_path)
--- a/tests/samplers/test_ranks.py
+++ b/tests/samplers/test_ranks.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import pytest
-import os
-from vllm import SamplingParams
-from ..utils import models_path_prefix
-MODELS = [os.path.join(models_path_prefix, "distilbert/distilgpt2")]
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
-def test_ranks(
-    vllm_runner,
-    model,
-    dtype,
-    example_prompts,
-):
-    max_tokens = 5
-    num_top_logprobs = 5
-    num_prompt_logprobs = 5
-    with vllm_runner(model, dtype=dtype,
-                     max_logprobs=num_top_logprobs) as vllm_model:
-        ## Test greedy logprobs ranks
-        vllm_sampling_params = SamplingParams(
-            temperature=0.0,
-            top_p=1.0,
-            max_tokens=max_tokens,
-            logprobs=num_top_logprobs,
-            prompt_logprobs=num_prompt_logprobs)
-        vllm_results = vllm_model.generate_w_logprobs(example_prompts,
-                                                      vllm_sampling_params)
-        ## Test non-greedy logprobs ranks
-        sampling_params = SamplingParams(temperature=1.0,
-                                         top_p=1.0,
-                                         max_tokens=max_tokens,
-                                         logprobs=num_top_logprobs,
-                                         prompt_logprobs=num_prompt_logprobs)
-        res = vllm_model.generate_w_logprobs(example_prompts, sampling_params)
-    for result in vllm_results:
-        assert result[2] is not None
-        assert len(result[2]) == len(result[0])
-        # check whether all chosen tokens have ranks = 1
-        for token, logprobs in zip(result[0], result[2]):
-            assert token in logprobs
-            assert logprobs[token].rank == 1
-    for result in res:
-        assert result[2] is not None
-        assert len(result[2]) == len(result[0])
-        # check whether all chosen tokens have ranks
-        for token, logprobs in zip(result[0], result[2]):
-            assert logprobs[token].rank >= 1
--- a/tests/test_access_log_filter.py
+++ b/tests/test_access_log_filter.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-Tests for the UvicornAccessLogFilter class.
-"""
-import logging
-from vllm.logging_utils.access_log_filter import (
-    UvicornAccessLogFilter,
-    create_uvicorn_log_config,
-)
-class TestUvicornAccessLogFilter:
-    """Test cases for UvicornAccessLogFilter."""
-    def test_filter_allows_all_when_no_excluded_paths(self):
-        """Filter should allow all logs when no paths are excluded."""
-        filter = UvicornAccessLogFilter(excluded_paths=[])
-        record = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "GET", "/v1/completions", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record) is True
-    def test_filter_allows_all_when_excluded_paths_is_none(self):
-        """Filter should allow all logs when excluded_paths is None."""
-        filter = UvicornAccessLogFilter(excluded_paths=None)
-        record = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "GET", "/health", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record) is True
-    def test_filter_excludes_health_endpoint(self):
-        """Filter should exclude /health endpoint when configured."""
-        filter = UvicornAccessLogFilter(excluded_paths=["/health"])
-        record = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "GET", "/health", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record) is False
-    def test_filter_excludes_metrics_endpoint(self):
-        """Filter should exclude /metrics endpoint when configured."""
-        filter = UvicornAccessLogFilter(excluded_paths=["/metrics"])
-        record = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "GET", "/metrics", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record) is False
-    def test_filter_allows_non_excluded_endpoints(self):
-        """Filter should allow endpoints not in the excluded list."""
-        filter = UvicornAccessLogFilter(excluded_paths=["/health", "/metrics"])
-        record = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "POST", "/v1/completions", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record) is True
-    def test_filter_excludes_multiple_endpoints(self):
-        """Filter should exclude multiple configured endpoints."""
-        filter = UvicornAccessLogFilter(excluded_paths=["/health", "/metrics", "/ping"])
-        # Test /health
-        record_health = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "GET", "/health", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record_health) is False
-        # Test /metrics
-        record_metrics = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "GET", "/metrics", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record_metrics) is False
-        # Test /ping
-        record_ping = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "GET", "/ping", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record_ping) is False
-    def test_filter_with_query_parameters(self):
-        """Filter should exclude endpoints even with query parameters."""
-        filter = UvicornAccessLogFilter(excluded_paths=["/health"])
-        record = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "GET", "/health?verbose=true", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record) is False
-    def test_filter_different_http_methods(self):
-        """Filter should exclude endpoints regardless of HTTP method."""
-        filter = UvicornAccessLogFilter(excluded_paths=["/ping"])
-        # Test GET
-        record_get = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "GET", "/ping", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record_get) is False
-        # Test POST
-        record_post = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg='%s - "%s %s HTTP/%s" %d',
-            args=("127.0.0.1:12345", "POST", "/ping", "1.1", 200),
-            exc_info=None,
-        )
-        assert filter.filter(record_post) is False
-    def test_filter_with_different_status_codes(self):
-        """Filter should exclude endpoints regardless of status code."""
-        filter = UvicornAccessLogFilter(excluded_paths=["/health"])
-        for status_code in [200, 500, 503]:
-            record = logging.LogRecord(
-                name="uvicorn.access",
-                level=logging.INFO,
-                pathname="",
-                lineno=0,
-                msg='%s - "%s %s HTTP/%s" %d',
-                args=("127.0.0.1:12345", "GET", "/health", "1.1", status_code),
-                exc_info=None,
-            )
-            assert filter.filter(record) is False
-class TestCreateUvicornLogConfig:
-    """Test cases for create_uvicorn_log_config function."""
-    def test_creates_valid_config_structure(self):
-        """Config should have required logging configuration keys."""
-        config = create_uvicorn_log_config(excluded_paths=["/health"])
-        assert "version" in config
-        assert config["version"] == 1
-        assert "disable_existing_loggers" in config
-        assert "formatters" in config
-        assert "handlers" in config
-        assert "loggers" in config
-        assert "filters" in config
-    def test_config_includes_access_log_filter(self):
-        """Config should include the access log filter."""
-        config = create_uvicorn_log_config(excluded_paths=["/health", "/metrics"])
-        assert "access_log_filter" in config["filters"]
-        filter_config = config["filters"]["access_log_filter"]
-        assert filter_config["()"] == UvicornAccessLogFilter
-        assert filter_config["excluded_paths"] == ["/health", "/metrics"]
-    def test_config_applies_filter_to_access_handler(self):
-        """Config should apply the filter to the access handler."""
-        config = create_uvicorn_log_config(excluded_paths=["/health"])
-        assert "access" in config["handlers"]
-        assert "filters" in config["handlers"]["access"]
-        assert "access_log_filter" in config["handlers"]["access"]["filters"]
-    def test_config_with_custom_log_level(self):
-        """Config should respect custom log level."""
-        config = create_uvicorn_log_config(
-            excluded_paths=["/health"], log_level="debug"
-        )
-        assert config["loggers"]["uvicorn"]["level"] == "DEBUG"
-        assert config["loggers"]["uvicorn.access"]["level"] == "DEBUG"
-        assert config["loggers"]["uvicorn.error"]["level"] == "DEBUG"
-    def test_config_with_empty_excluded_paths(self):
-        """Config should work with empty excluded paths."""
-        config = create_uvicorn_log_config(excluded_paths=[])
-        assert config["filters"]["access_log_filter"]["excluded_paths"] == []
-    def test_config_with_none_excluded_paths(self):
-        """Config should work with None excluded paths."""
-        config = create_uvicorn_log_config(excluded_paths=None)
-        assert config["filters"]["access_log_filter"]["excluded_paths"] == []
-class TestIntegration:
-    """Integration tests for the access log filter."""
-    def test_filter_with_real_logger(self):
-        """Test filter works with a real Python logger simulating uvicorn."""
-        # Create a logger with our filter (simulating uvicorn.access)
-        logger = logging.getLogger("uvicorn.access")
-        logger.setLevel(logging.INFO)
-        # Clear any existing handlers
-        logger.handlers = []
-        # Create a custom handler that tracks messages
-        logged_messages: list[str] = []
-        class TrackingHandler(logging.Handler):
-            def emit(self, record):
-                logged_messages.append(record.getMessage())
-        handler = TrackingHandler()
-        handler.setLevel(logging.INFO)
-        filter = UvicornAccessLogFilter(excluded_paths=["/health", "/metrics"])
-        handler.addFilter(filter)
-        logger.addHandler(handler)
-        # Log using uvicorn's format with args tuple
-        # Format: '%s - "%s %s HTTP/%s" %d'
-        logger.info(
-            '%s - "%s %s HTTP/%s" %d',
-            "127.0.0.1:12345",
-            "GET",
-            "/health",
-            "1.1",
-            200,
-        )
-        logger.info(
-            '%s - "%s %s HTTP/%s" %d',
-            "127.0.0.1:12345",
-            "GET",
-            "/v1/completions",
-            "1.1",
-            200,
-        )
-        logger.info(
-            '%s - "%s %s HTTP/%s" %d',
-            "127.0.0.1:12345",
-            "GET",
-            "/metrics",
-            "1.1",
-            200,
-        )
-        logger.info(
-            '%s - "%s %s HTTP/%s" %d',
-            "127.0.0.1:12345",
-            "POST",
-            "/v1/chat/completions",
-            "1.1",
-            200,
-        )
-        # Verify only non-excluded endpoints were logged
-        assert len(logged_messages) == 2
-        assert "/v1/completions" in logged_messages[0]
-        assert "/v1/chat/completions" in logged_messages[1]
-    def test_filter_allows_non_uvicorn_access_logs(self):
-        """Test filter allows logs from non-uvicorn.access loggers."""
-        filter = UvicornAccessLogFilter(excluded_paths=["/health"])
-        # Log record from a different logger name
-        record = logging.LogRecord(
-            name="uvicorn.error",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg="Some error message about /health",
-            args=(),
-            exc_info=None,
-        )
-        # Should allow because it's not from uvicorn.access
-        assert filter.filter(record) is True
-    def test_filter_handles_malformed_args(self):
-        """Test filter handles log records with unexpected args format."""
-        filter = UvicornAccessLogFilter(excluded_paths=["/health"])
-        # Log record with insufficient args
-        record = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg="Some message",
-            args=("only", "two"),
-            exc_info=None,
-        )
-        # Should allow because args doesn't have expected format
-        assert filter.filter(record) is True
-    def test_filter_handles_non_tuple_args(self):
-        """Test filter handles log records with non-tuple args."""
-        filter = UvicornAccessLogFilter(excluded_paths=["/health"])
-        # Log record with None args
-        record = logging.LogRecord(
-            name="uvicorn.access",
-            level=logging.INFO,
-            pathname="",
-            lineno=0,
-            msg="Some message without args",
-            args=None,
-            exc_info=None,
-        )
-        # Should allow because args is None
-        assert filter.filter(record) is True
--- a/tests/tokenization/test_tokenizer.py
+++ b/tests/tokenization/test_tokenizer.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import pytest
-import os
-from transformers import PreTrainedTokenizerBase
-from vllm.transformers_utils.tokenizer import get_tokenizer
-from ..utils import models_path_prefix
-# TOKENIZER_NAMES = [
-#     os.path.join(models_path_prefix, "facebook/opt-125m"),
-#     os.path.join(models_path_prefix, "gpt2"),
-# ]
-# export HF_ENDPOINT=https://hf-mirror.com
-TOKENIZER_NAMES = [
-    "facebook/opt-125m",
-    "gpt2",
-]
-@pytest.mark.parametrize("tokenizer_name", TOKENIZER_NAMES)
-def test_tokenizer_revision(tokenizer_name: str):
-    # Assume that "main" branch always exists
-    # tokenizer = get_tokenizer(tokenizer_name, revision="main")
-    tokenizer = get_tokenizer(tokenizer_name)
-    assert isinstance(tokenizer, PreTrainedTokenizerBase)
-    # Assume that "never" branch always does not exist
-    with pytest.raises(OSError, match='not a valid git identifier'):
-        get_tokenizer(tokenizer_name, revision="never")
--- a/tests/tool_parsers/test_glm4_moe_tool_parser.py
+++ b/tests/tool_parsers/test_glm4_moe_tool_parser.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # ruff: noqa: E501
+import os
 import json
 import pytest
@@ -11,10 +12,11 @@ from vllm.tokenizers import get_tokenizer
 from vllm.tool_parsers.glm4_moe_tool_parser import (
    Glm4MoeModelToolParser,
 )
+from ..utils import models_path_prefix
 pytest.skip("skip glm4_moe parser test", allow_module_level=True)
 # Use a common model that is likely to be available
-MODEL = "zai-org/GLM-4.5"
+MODEL = os.path.join(models_path_prefix, "zai-org/GLM-4.5")
 @pytest.fixture(scope="module")
@@ -225,6 +227,7 @@ def test_extract_tool_calls(
 def test_extract_tool_calls_with_thinking_tags(glm4_moe_tool_parser):
    """Test tool extraction when thinking tags are present."""
    model_output = """<think>I want to get the weather.</think>
 I will help you get the weather.
 <tool_call>get_weather
 <arg_key>city</arg_key>
@@ -242,6 +245,7 @@ I will help you get the weather.
    assert extracted_tool_calls.tool_calls[0].function.name == "get_weather"
    expected_content = """<think>I want to get the weather.</think>
 I will help you get the weather."""
    assert extracted_tool_calls.content == expected_content
@@ -285,6 +289,7 @@ def test_extract_tool_calls_empty_arguments(glm4_moe_tool_parser):
 def test_extract_tool_calls_mixed_content(glm4_moe_tool_parser):
    """Test extraction with mixed content and multiple tool calls."""
    model_output = """I will help you get the weather info.
 <tool_call>get_weather
 <arg_key>city</arg_key>
 <arg_value>Beijing</arg_value>
@@ -443,4 +448,4 @@ def test_extract_tool_calls_incomplete_tool_call(glm4_moe_tool_parser):
    # Incomplete tool calls should not be extracted
    assert not extracted_tool_calls.tools_called
    assert extracted_tool_calls.tool_calls == []
    assert extracted_tool_calls.content == model_output
\ No newline at end of file
--- a/tests/tpu/lora/untest_lora.py
+++ b/tests/tpu/lora/untest_lora.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import pytest
-from torch_xla._internal import tpu
-import vllm
-from vllm.lora.request import LoRARequest
-# This file contains tests to ensure that LoRA works correctly on the TPU
-# backend. We use a series of custom trained adapters for Qwen2.5-3B-Instruct
-# for this. The adapters are:
-# Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_x_adapter, where x ranges
-# from 1 to 4.
-# These adapters are trained using a standard huggingface peft training script,
-# where all the inputs are "What is 1+1? \n" and all the outputs are "x". We run
-# 100 training iterations with a training batch size of 100.
-def setup_vllm(num_loras: int, tp: int) -> vllm.LLM:
-    return vllm.LLM(
-        model="Qwen/Qwen2.5-3B-Instruct",
-        max_model_len=256,
-        max_num_seqs=8,
-        tensor_parallel_size=tp,
-        enable_lora=True,
-        max_loras=num_loras,
-        max_lora_rank=8,
-    )
-TPU_TENSOR_PARALLEL_SIZES = (
-    [1, tpu.num_available_chips()] if tpu.num_available_chips() > 1 else [1]
-)
-@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
-def test_single_lora(tp: int):
-    """
-    This test ensures we can run a single LoRA adapter on the TPU backend.
-    We run "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_1_adapter" which
-    will force Qwen2.5-3B-Instruct to claim 1+1=1.
-    """
-    llm = setup_vllm(1, tp)
-    prompt = "What is 1+1? \n"
-    lora_request = LoRARequest(
-        "lora_adapter_1",
-        1,
-        "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_1_adapter",
-    )
-    output = (
-        llm.generate(
-            prompt,
-            sampling_params=vllm.SamplingParams(max_tokens=256, temperature=0),
-            lora_request=lora_request,
-        )[0]
-        .outputs[0]
-        .text
-    )
-    answer = output.strip()[0]
-    assert answer.isdigit()
-    assert int(answer) == 1
-@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
-def test_lora_hotswapping(tp: int):
-    """
-    This test ensures we can run multiple LoRA adapters on the TPU backend, even
-    if we only have space to store 1.
-    We run "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_x_adapter" which
-    will force Qwen2.5-3B-Instruct to claim 1+1=x, for a range of x.
-    """
-    lora_name_template = "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_{}_adapter"
-    lora_requests = [
-        LoRARequest(f"lora_adapter_{i}", i, lora_name_template.format(i))
-        for i in range(1, 5)
-    ]
-    llm = setup_vllm(1, tp)
-    prompt = "What is 1+1? \n"
-    for i, req in enumerate(lora_requests):
-        output = (
-            llm.generate(
-                prompt,
-                sampling_params=vllm.SamplingParams(max_tokens=256, temperature=0),
-                lora_request=req,
-            )[0]
-            .outputs[0]
-            .text
-        )
-        answer = output.strip()[0]
-        assert answer.isdigit()
-        assert int(answer) == i + 1
-@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
-def test_multi_lora(tp: int):
-    """
-    This test ensures we can run multiple LoRA adapters on the TPU backend, when
-    we have enough space to store all of them.
-    We run "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_x_adapter" which
-    will force Qwen2.5-3B-Instruct to claim 1+1=x, for a range of x.
-    """
-    lora_name_template = "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_{}_adapter"
-    lora_requests = [
-        LoRARequest(f"lora_adapter_{i}", i, lora_name_template.format(i))
-        for i in range(1, 5)
-    ]
-    llm = setup_vllm(4, tp)
-    prompt = "What is 1+1? \n"
-    for i, req in enumerate(lora_requests):
-        output = (
-            llm.generate(
-                prompt,
-                sampling_params=vllm.SamplingParams(max_tokens=256, temperature=0),
-                lora_request=req,
-            )[0]
-            .outputs[0]
-            .text
-        )
-        answer = output.strip()[0]
-        assert answer.isdigit()
-        assert int(output.strip()[0]) == i + 1
--- a/tests/tpu/untest_compilation.py
+++ b/tests/tpu/untest_compilation.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import glob
-import os
-import tempfile
-import depyf
-def test_tpu_compilation():
-    temp_dir = tempfile.mkdtemp()
-    with depyf.prepare_debug(temp_dir):
-        from vllm import LLM, SamplingParams
-        prompts = [
-            "A robot may not injure a human being",
-            "It is only with the heart that one can see rightly;",
-            "The greatest glory in living lies not in never falling,",
-        ]
-        answers = [
-            " or, through inaction",
-            " what is essential ",
-            " but in rising ",
-        ]
-        # Currently, top-p sampling is disabled. `top_p` should be 1.0.
-        N = 1
-        sampling_params = SamplingParams(temperature=0.7, top_p=1.0, n=N, max_tokens=16)
-        llm = LLM(
-            model="Qwen/Qwen2-1.5B-Instruct",
-            max_num_batched_tokens=256,
-            max_model_len=256,
-            max_num_seqs=32,
-            enforce_eager=False,
-        )
-        outputs = llm.generate(prompts, sampling_params)
-        for output, answer in zip(outputs, answers):
-            prompt = output.prompt
-            generated_text = output.outputs[0].text
-            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-            assert generated_text.startswith(answer)
-    compiled_codes = sorted(
-        glob.glob(os.path.join(temp_dir, "__transformed_code*for_forward.py"))
-    )
-    for i, compiled_code in enumerate(compiled_codes):
-        print("{} file: {}".format(i + 1, compiled_code))
-    # We should only trigger Dynamo compilation 2 times:
-    # 1. Forward pass without kv_caches
-    # 2. Forward pass with kv_caches
-    # Check we have 2 compiled codes
-    assert len(compiled_codes) == 2
-    kv_cache_prefix = "kv_cache"
-    attn_prefix = "ragged_paged_attention"
-    def extract_compiled_index(s):
-        parts = s.replace(".", "_").split("_")
-        numbers = [int(part) for part in parts if part.isdigit()]
-        return numbers[0]
-    # Check all the compilations are as expected. The dump files include the
-    # captured graph for the forward function of the nn.Module.
-    compiled_fns = sorted(
-        glob.glob(os.path.join(temp_dir, "__compiled_fn*Forward_graph*.py")),
-        key=lambda s: extract_compiled_index(s),
-    )
-    for i, compiled_fn in enumerate(compiled_fns):
-        print("{} file: {}".format(i + 1, compiled_fn))
-    # The first compilation should not have any kv_caches
-    with open(compiled_fns[0]) as f:
-        content = f.read()
-        assert kv_cache_prefix not in content
-    # The second compilation should have kv_caches and the
-    # ragged_paged_attention
-    with open(compiled_fns[1]) as f:
-        content = f.read()
-        assert kv_cache_prefix in content and attn_prefix in content
--- a/tests/tpu/untest_custom_dispatcher.py
+++ b/tests/tpu/untest_custom_dispatcher.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os
-import pytest
-from vllm.config import CompilationMode
-from ..utils import compare_two_settings, models_path_prefix
-# --enforce-eager on TPU causes graph compilation
-# this times out default Health Check in the MQLLMEngine,
-# so we set the timeout here to 30s
-def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch):
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_RPC_TIMEOUT", "30000")
-        compare_two_settings(
-            os.path.join(models_path_prefix, "Qwen/Qwen2.5-1.5B-Instruct"),
-            arg1=[
-                "--max-model-len=256",
-                "--max-num-seqs=32",
-                "--enforce-eager",
-                f"-O{CompilationMode.DYNAMO_TRACE_ONCE}",
-            ],
-            arg2=[
-                "--max-model-len=256",
-                "--max-num-seqs=32",
-                "--enforce-eager",
-                f"-O{CompilationMode.STOCK_TORCH_COMPILE}",
-            ],
-            env1={},
-            env2={},
-        )
--- a/tests/tpu/untest_moe_pallas.py
+++ b/tests/tpu/untest_moe_pallas.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Tests for the Pallas MOE implementation.
-Run `pytest tests/kernels/moe/test_moe_pallas.py`.
-"""
-import pytest
-import torch
-import torch_xla
-from vllm.model_executor.layers.fused_moe.moe_pallas import fused_moe as pallas_moe
-from vllm.model_executor.layers.fused_moe.moe_torch_iterative import (
-    fused_moe as torch_moe,
-)
-from vllm.platforms import current_platform
-if not current_platform.is_tpu():
-    pytest.skip("This test needs a TPU.", allow_module_level=True)
-NUM_EXPERTS = [8, 64]
-EP_SIZE = [1]
-TOP_KS = [2, 6]
-# The Pallas GMM kernel requires num_tokens * topk to be a multiple of 16
-@pytest.mark.parametrize("m", [8, 16, 64, 2048])
-@pytest.mark.parametrize("n", [128, 1024, 2048])
-@pytest.mark.parametrize("k", [128, 511, 1024])
-@pytest.mark.parametrize("e", NUM_EXPERTS)
-@pytest.mark.parametrize("topk", TOP_KS)
-@pytest.mark.parametrize("ep_size", EP_SIZE)
-@pytest.mark.parametrize("dtype", [torch.bfloat16])
-def test_pallas_moe(
-    m: int,
-    n: int,
-    k: int,
-    e: int,
-    topk: int,
-    ep_size: int,
-    dtype: torch.dtype,
-):
-    import torch_xla.core.xla_model as xm
-    with torch.device(xm.xla_device()):
-        a = torch.randn((m, k), dtype=dtype) / 10
-        w1 = torch.randn((e, 2 * n, k), dtype=dtype) / 10
-        w2 = torch.randn((e, k, n), dtype=dtype) / 10
-        score = torch.randn((m, e), dtype=dtype)
-        # TODO: Support ep
-        if ep_size > 1:
-            pytest.skip("No support for ep_size > 1 yet")
-        else:
-            e_map = None
-        # Run both implementations
-        torch_output = torch_moe(
-            hidden_states=a,
-            w1=w1,
-            w2=w2,
-            gating_output=score,
-            topk=topk,
-            global_num_experts=e,
-            expert_map=e_map,
-            renormalize=False,
-        )
-        pallas_output = pallas_moe(
-            hidden_states=a,
-            w1=w1,
-            w2=w2,
-            gating_output=score,
-            topk=topk,
-            global_num_experts=e,
-            expert_map=e_map,
-            renormalize=False,
-        )
-        torch_xla.sync(wait=False)
-    # Compare outputs
-    torch.testing.assert_close(
-        pallas_output.cpu(),
-        torch_output.cpu(),
-        atol=2e-2,
-        rtol=0,
-    )
--- a/tests/tpu/untest_quantization_accuracy.py
+++ b/tests/tpu/untest_quantization_accuracy.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from dataclasses import dataclass
-import lm_eval
-import pytest
-TASK = "gsm8k"
-FILTER = "exact_match,strict-match"
-RTOL = 0.03
-@dataclass
-class GSM8KAccuracyTestConfig:
-    model_name: str
-    expected_value: float
-    def get_model_args(self) -> str:
-        return f"pretrained={self.model_name},max_model_len=4096,max_num_seqs=32"
-# NOTE: Accuracy scores measured on GPUs.
-ACCURACY_CONFIGS = [
-    GSM8KAccuracyTestConfig(
-        model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
-        expected_value=0.76,
-    ),  # no bias
-    # NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
-    # so only one of these tests can run in a single call to pytest. As
-    # a follow-up, move this into the LM-EVAL section of the CI.
-    # GSM8KAccuracyTestConfig(
-    #     model_name="neuralmagic/Qwen2-7B-Instruct-quantized.w8a8",
-    #     expected_value=0.66),  # bias in QKV layers
-]
-@pytest.mark.parametrize("config", ACCURACY_CONFIGS)
-def test_gsm8k_correctness(config: GSM8KAccuracyTestConfig):
-    results = lm_eval.simple_evaluate(
-        model="vllm",
-        model_args=config.get_model_args(),
-        tasks="gsm8k",
-        batch_size="auto",
-    )
-    EXPECTED_VALUE = config.expected_value
-    measured_value = results["results"][TASK][FILTER]
-    assert (
-        measured_value - RTOL < EXPECTED_VALUE
-        and measured_value + RTOL > EXPECTED_VALUE
-    ), f"Expected: {EXPECTED_VALUE} |  Measured: {measured_value}"
--- a/tests/utils_/test_utils.py
+++ b/tests/utils_/test_utils.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# ruff: noqa
-import asyncio
-import hashlib
-import json
-import os
-import pickle
-import socket
-import tempfile
-from collections.abc import AsyncIterator
-from pathlib import Path
-from unittest.mock import patch
-import os
-import pytest
-import torch
-import yaml
-import zmq
-from transformers import AutoTokenizer
-from vllm_test_utils.monitor import monitor
-from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
-from vllm.transformers_utils.detokenizer_utils import (
-    convert_ids_list_to_tokens)
-# isort: off
-from vllm.utils import (
-    CacheInfo, FlexibleArgumentParser, LRUCache, MemorySnapshot,
-    PlaceholderModule, bind_kv_cache, common_broadcastable_dtype,
-    current_stream, deprecate_kwargs, get_open_port, get_tcp_uri,
-    is_lossless_cast, join_host_port, make_zmq_path, make_zmq_socket,
-    memory_profiling, merge_async_iterators, sha256, split_host_port,
-    split_zmq_path, supports_kw, swap_dict_values, unique_filepath)
-# isort: on
-from ..utils import create_new_process_for_each_test, error_on_warning, models_path_prefix
-@pytest.mark.asyncio
-async def test_merge_async_iterators():
-    async def mock_async_iterator(idx: int):
-        try:
-            while True:
-                yield f"item from iterator {idx}"
-                await asyncio.sleep(0.1)
-        except asyncio.CancelledError:
-            print(f"iterator {idx} cancelled")
-    iterators = [mock_async_iterator(i) for i in range(3)]
-    merged_iterator = merge_async_iterators(*iterators)
-    async def stream_output(generator: AsyncIterator[tuple[int, str]]):
-        async for idx, output in generator:
-            print(f"idx: {idx}, output: {output}")
-    task = asyncio.create_task(stream_output(merged_iterator))
-    await asyncio.sleep(0.5)
-    task.cancel()
-    with pytest.raises(asyncio.CancelledError):
-        await task
-    for iterator in iterators:
-        try:
-            # Can use anext() in python >= 3.10
-            await asyncio.wait_for(iterator.__anext__(), 1)
-        except StopAsyncIteration:
-            # All iterators should be cancelled and print this message.
-            print("Iterator was cancelled normally")
-        except (Exception, asyncio.CancelledError) as e:
-            raise AssertionError() from e
-def test_deprecate_kwargs_always():
-    @deprecate_kwargs("old_arg", is_deprecated=True)
-    def dummy(*, old_arg: object = None, new_arg: object = None):
-        pass
-    with pytest.warns(DeprecationWarning, match="'old_arg'"):
-        dummy(old_arg=1)
-    with error_on_warning(DeprecationWarning):
-        dummy(new_arg=1)
-def test_deprecate_kwargs_never():
-    @deprecate_kwargs("old_arg", is_deprecated=False)
-    def dummy(*, old_arg: object = None, new_arg: object = None):
-        pass
-    with error_on_warning(DeprecationWarning):
-        dummy(old_arg=1)
-    with error_on_warning(DeprecationWarning):
-        dummy(new_arg=1)
-def test_deprecate_kwargs_dynamic():
-    is_deprecated = True
-    @deprecate_kwargs("old_arg", is_deprecated=lambda: is_deprecated)
-    def dummy(*, old_arg: object = None, new_arg: object = None):
-        pass
-    with pytest.warns(DeprecationWarning, match="'old_arg'"):
-        dummy(old_arg=1)
-    with error_on_warning(DeprecationWarning):
-        dummy(new_arg=1)
-    is_deprecated = False
-    with error_on_warning(DeprecationWarning):
-        dummy(old_arg=1)
-    with error_on_warning(DeprecationWarning):
-        dummy(new_arg=1)
-def test_deprecate_kwargs_additional_message():
-    @deprecate_kwargs("old_arg", is_deprecated=True, additional_message="abcd")
-    def dummy(*, old_arg: object = None, new_arg: object = None):
-        pass
-    with pytest.warns(DeprecationWarning, match="abcd"):
-        dummy(old_arg=1)
-def test_get_open_port(monkeypatch: pytest.MonkeyPatch):
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_PORT", "5678")
-        # make sure we can get multiple ports, even if the env var is set
-        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s1:
-            s1.bind(("localhost", get_open_port()))
-            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s2:
-                s2.bind(("localhost", get_open_port()))
-                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s3:
-                    s3.bind(("localhost", get_open_port()))
-# Tests for FlexibleArgumentParser
-@pytest.fixture
-def parser():
-    parser = FlexibleArgumentParser()
-    parser.add_argument('--image-input-type',
-                        choices=['pixel_values', 'image_features'])
-    parser.add_argument('--model-name')
-    parser.add_argument('--batch-size', type=int)
-    parser.add_argument('--enable-feature', action='store_true')
-    parser.add_argument('--hf-overrides', type=json.loads)
-    parser.add_argument('-O', '--compilation-config', type=json.loads)
-    return parser
-@pytest.fixture
-def parser_with_config():
-    parser = FlexibleArgumentParser()
-    parser.add_argument('serve')
-    parser.add_argument('model_tag', nargs='?')
-    parser.add_argument('--model', type=str)
-    parser.add_argument('--served-model-name', type=str)
-    parser.add_argument('--config', type=str)
-    parser.add_argument('--port', type=int)
-    parser.add_argument('--tensor-parallel-size', type=int)
-    parser.add_argument('--trust-remote-code', action='store_true')
-    return parser
-def test_underscore_to_dash(parser):
-    args = parser.parse_args(['--image_input_type', 'pixel_values'])
-    assert args.image_input_type == 'pixel_values'
-def test_mixed_usage(parser):
-    args = parser.parse_args([
-        '--image_input_type', 'image_features', '--model-name',
-        os.path.join(models_path_prefix, 'facebook/opt-125m')
-    ])
-    assert args.image_input_type == 'image_features'
-    assert args.model_name == os.path.join(models_path_prefix, 'facebook/opt-125m')
-def test_with_equals_sign(parser):
-    model_name_with_path = os.path.join(models_path_prefix, 'facebook/opt-125m')
-    args = parser.parse_args(
-        ['--image_input_type=pixel_values', f'--model-name={model_name_with_path}'])
-    assert args.image_input_type == 'pixel_values'
-    assert args.model_name == os.path.join(models_path_prefix, 'facebook/opt-125m')
-def test_with_int_value(parser):
-    args = parser.parse_args(['--batch_size', '32'])
-    assert args.batch_size == 32
-    args = parser.parse_args(['--batch-size', '32'])
-    assert args.batch_size == 32
-def test_with_bool_flag(parser):
-    args = parser.parse_args(['--enable_feature'])
-    assert args.enable_feature is True
-    args = parser.parse_args(['--enable-feature'])
-    assert args.enable_feature is True
-def test_invalid_choice(parser):
-    with pytest.raises(SystemExit):
-        parser.parse_args(['--image_input_type', 'invalid_choice'])
-def test_missing_required_argument(parser):
-    parser.add_argument('--required-arg', required=True)
-    with pytest.raises(SystemExit):
-        parser.parse_args([])
-def test_cli_override_to_config(parser_with_config, cli_config_file):
-    args = parser_with_config.parse_args([
-        'serve', 'mymodel', '--config', cli_config_file,
-        '--tensor-parallel-size', '3'
-    ])
-    assert args.tensor_parallel_size == 3
-    args = parser_with_config.parse_args([
-        'serve', 'mymodel', '--tensor-parallel-size', '3', '--config',
-        cli_config_file
-    ])
-    assert args.tensor_parallel_size == 3
-    assert args.port == 12312
-    args = parser_with_config.parse_args([
-        'serve', 'mymodel', '--tensor-parallel-size', '3', '--config',
-        cli_config_file, '--port', '666'
-    ])
-    assert args.tensor_parallel_size == 3
-    assert args.port == 666
-def test_config_args(parser_with_config, cli_config_file):
-    args = parser_with_config.parse_args(
-        ['serve', 'mymodel', '--config', cli_config_file])
-    assert args.tensor_parallel_size == 2
-    assert args.trust_remote_code
-def test_config_file(parser_with_config):
-    with pytest.raises(FileNotFoundError):
-        parser_with_config.parse_args(
-            ['serve', 'mymodel', '--config', 'test_config.yml'])
-    with pytest.raises(ValueError):
-        parser_with_config.parse_args(
-            ['serve', 'mymodel', '--config', './data/test_config.json'])
-    with pytest.raises(ValueError):
-        parser_with_config.parse_args([
-            'serve', 'mymodel', '--tensor-parallel-size', '3', '--config',
-            '--batch-size', '32'
-        ])
-def test_no_model_tag(parser_with_config, cli_config_file):
-    with pytest.raises(ValueError):
-        parser_with_config.parse_args(['serve', '--config', cli_config_file])
-def test_dict_args(parser):
-    args = [
-        "--model-name=something.something",
-        "--hf-overrides.key1",
-        "val1",
-        # Test nesting
-        "--hf-overrides.key2.key3",
-        "val2",
-        "--hf-overrides.key2.key4",
-        "val3",
-        # Test compile config and compilation level
-        "-O.use_inductor=true",
-        "-O.backend",
-        "custom",
-        "-O1",
-        # Test = sign
-        "--hf-overrides.key5=val4",
-        # Test underscore to dash conversion
-        "--hf_overrides.key_6",
-        "val5",
-        "--hf_overrides.key-7.key_8",
-        "val6",
-        # Test data type detection
-        "--hf_overrides.key9",
-        "100",
-        "--hf_overrides.key10",
-        "100.0",
-        "--hf_overrides.key11",
-        "true",
-        "--hf_overrides.key12.key13",
-        "null",
-        # Test '-' and '.' in value
-        "--hf_overrides.key14.key15",
-        "-minus.and.dot",
-        # Test array values
-        "-O.custom_ops+",
-        "-quant_fp8",
-        "-O.custom_ops+=+silu_mul,-rms_norm",
-    ]
-    parsed_args = parser.parse_args(args)
-    assert parsed_args.model_name == "something.something"
-    assert parsed_args.hf_overrides == {
-        "key1": "val1",
-        "key2": {
-            "key3": "val2",
-            "key4": "val3",
-        },
-        "key5": "val4",
-        "key_6": "val5",
-        "key-7": {
-            "key_8": "val6",
-        },
-        "key9": 100,
-        "key10": 100.0,
-        "key11": True,
-        "key12": {
-            "key13": None,
-        },
-        "key14": {
-            "key15": "-minus.and.dot",
-        }
-    }
-    assert parsed_args.compilation_config == {
-        "level": 1,
-        "use_inductor": True,
-        "backend": "custom",
-        "custom_ops": ["-quant_fp8", "+silu_mul", "-rms_norm"],
-    }
-def test_duplicate_dict_args(caplog_vllm, parser):
-    args = [
-        "--model-name=something.something",
-        "--hf-overrides.key1",
-        "val1",
-        "--hf-overrides.key1",
-        "val2",
-        "-O1",
-        "-O.level",
-        "2",
-        "-O3",
-    ]
-    parsed_args = parser.parse_args(args)
-    # Should be the last value
-    assert parsed_args.hf_overrides == {"key1": "val2"}
-    assert parsed_args.compilation_config == {"level": 3}
-    assert len(caplog_vllm.records) == 1
-    assert "duplicate" in caplog_vllm.text
-    assert "--hf-overrides.key1" in caplog_vllm.text
-    assert "-O.level" in caplog_vllm.text
-# yapf: enable
-@pytest.mark.parametrize(
-    "callable,kw_name,requires_kw_only,allow_var_kwargs,is_supported",
-    [
-        # Tests for positional argument support
-        (lambda foo: None, "foo", True, True, False),
-        (lambda foo: None, "foo", False, True, True),
-        # Tests for positional or keyword / keyword only
-        (lambda foo=100: None, "foo", True, True, False),
-        (lambda *, foo: None, "foo", False, True, True),
-        # Tests to make sure the names of variadic params are NOT supported
-        (lambda *args: None, "args", False, True, False),
-        (lambda **kwargs: None, "kwargs", False, True, False),
-        # Tests for if we allow var kwargs to add support
-        (lambda foo: None, "something_else", False, True, False),
-        (lambda foo, **kwargs: None, "something_else", False, True, True),
-        (lambda foo, **kwargs: None, "kwargs", True, True, False),
-        (lambda foo, **kwargs: None, "foo", True, True, False),
-    ])
-# yapf: disable
-def test_supports_kw(callable,kw_name,requires_kw_only,
-                     allow_var_kwargs,is_supported):
-    assert supports_kw(
-            callable=callable,
-            kw_name=kw_name,
-            requires_kw_only=requires_kw_only,
-        allow_var_kwargs=allow_var_kwargs
-    ) == is_supported
-@create_new_process_for_each_test()
-def test_memory_profiling():
-    # Fake out some model loading + inference memory usage to test profiling
-    # Memory used by other processes will show up as cuda usage outside of torch
-    from vllm.distributed.device_communicators.cuda_wrapper import (
-        CudaRTLibrary)
-    lib = CudaRTLibrary()
-    # 512 MiB allocation outside of this instance
-    handle1 = lib.cudaMalloc(512 * 1024 * 1024)
-    baseline_snapshot = MemorySnapshot()
-    # load weights
-    weights = torch.randn(128, 1024, 1024, device='cuda', dtype=torch.float32)
-    weights_memory = 128 * 1024 * 1024 * 4 # 512 MiB
-    def measure_current_non_torch():
-        free, total = torch.cuda.mem_get_info()
-        current_used = total - free
-        current_torch = torch.cuda.memory_reserved()
-        current_non_torch = current_used - current_torch
-        return current_non_torch
-    with memory_profiling(baseline_snapshot=baseline_snapshot,
-    weights_memory=weights_memory) as result, \
-        monitor(measure_current_non_torch) as monitored_values:
-        # make a memory spike, 1 GiB
-        spike = torch.randn(256, 1024, 1024, device='cuda', dtype=torch.float32)
-        del spike
-        # Add some extra non-torch memory 256 MiB (simulate NCCL)
-        handle2 = lib.cudaMalloc(256 * 1024 * 1024)
-    # this is an analytic value, it is exact,
-    # we only have 256 MiB non-torch memory increase
-    measured_diff = monitored_values.values[-1] - monitored_values.values[0]
-    assert measured_diff == 256 * 1024 * 1024
-    # Check that the memory usage is within 5% of the expected values
-    # 5% tolerance is caused by cuda runtime.
-    # we cannot control cuda runtime in the granularity of bytes,
-    # which causes a small error (<10 MiB in practice)
-    non_torch_ratio = result.non_torch_increase / (256 * 1024 * 1024) # noqa
-    assert abs(non_torch_ratio - 1) <= 0.05
-    assert result.torch_peak_increase == 1024 * 1024 * 1024
-    del weights
-    lib.cudaFree(handle1)
-    lib.cudaFree(handle2)
-def test_bind_kv_cache():
-    from vllm.attention import Attention
-    ctx = {
-        'layers.0.self_attn': Attention(32, 128, 0.1),
-        'layers.1.self_attn': Attention(32, 128, 0.1),
-        'layers.2.self_attn': Attention(32, 128, 0.1),
-        'layers.3.self_attn': Attention(32, 128, 0.1),
-    }
-    kv_cache = [
-        torch.zeros((1, )),
-        torch.zeros((1, )),
-        torch.zeros((1, )),
-        torch.zeros((1, )),
-    ]
-    bind_kv_cache(ctx, [kv_cache])
-    assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[0]
-    assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[1]
-    assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[2]
-    assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[3]
-def test_bind_kv_cache_kv_sharing():
-    from vllm.attention import Attention
-    ctx = {
-        'layers.0.self_attn': Attention(32, 128, 0.1),
-        'layers.1.self_attn': Attention(32, 128, 0.1),
-        'layers.2.self_attn': Attention(32, 128, 0.1),
-        'layers.3.self_attn': Attention(32, 128, 0.1),
-    }
-    kv_cache = [
-        torch.zeros((1, )),
-        torch.zeros((1, )),
-        torch.zeros((1, )),
-        torch.zeros((1, )),
-    ]
-    shared_kv_cache_layers = {
-        'layers.2.self_attn': 'layers.1.self_attn',
-        'layers.3.self_attn': 'layers.0.self_attn'
-    }
-    bind_kv_cache(ctx, [kv_cache], shared_kv_cache_layers)
-    assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[0]
-    assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[1]
-    assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[1]
-    assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[0]
-def test_bind_kv_cache_non_attention():
-    from vllm.attention import Attention
-    # example from Jamba PP=2
-    ctx = {
-        'model.layers.20.attn': Attention(32, 128, 0.1),
-        'model.layers.28.attn': Attention(32, 128, 0.1),
-    }
-    kv_cache = [
-        torch.zeros((1, )),
-        torch.zeros((1, )),
-    ]
-    bind_kv_cache(ctx, [kv_cache])
-    assert ctx['model.layers.20.attn'].kv_cache[0] is kv_cache[0]
-    assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1]
-def test_bind_kv_cache_pp():
-    with patch("vllm.utils.cuda_device_count_stateless", lambda: 2):
-        # this test runs with 1 GPU, but we simulate 2 GPUs
-        cfg = VllmConfig(
-            parallel_config=ParallelConfig(pipeline_parallel_size=2))
-    with set_current_vllm_config(cfg):
-        from vllm.attention import Attention
-        ctx = {
-            'layers.0.self_attn': Attention(32, 128, 0.1),
-        }
-        kv_cache = [
-            [torch.zeros((1, ))],
-            [torch.zeros((1, ))]
-        ]
-        bind_kv_cache(ctx, kv_cache)
-        assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[0][0]
-        assert ctx['layers.0.self_attn'].kv_cache[1] is kv_cache[1][0]
-class TestLRUCache(LRUCache):
-    def _on_remove(self, key, value):
-        if not hasattr(self, "_remove_counter"):
-            self._remove_counter = 0
-        self._remove_counter += 1
-def test_lru_cache():
-    cache = TestLRUCache(3)
-    assert cache.stat() == CacheInfo(hits=0, total=0)
-    assert cache.stat(delta=True) == CacheInfo(hits=0, total=0)
-    cache.put(1, 1)
-    assert len(cache) == 1
-    cache.put(1, 1)
-    assert len(cache) == 1
-    cache.put(2, 2)
-    assert len(cache) == 2
-    cache.put(3, 3)
-    assert len(cache) == 3
-    assert set(cache.cache) == {1, 2, 3}
-    cache.put(4, 4)
-    assert len(cache) == 3
-    assert set(cache.cache) == {2, 3, 4}
-    assert cache._remove_counter == 1
-    assert cache.get(2) == 2
-    assert cache.stat() == CacheInfo(hits=1, total=1)
-    assert cache.stat(delta=True) == CacheInfo(hits=1, total=1)
-    assert cache[2] == 2
-    assert cache.stat() == CacheInfo(hits=2, total=2)
-    assert cache.stat(delta=True) == CacheInfo(hits=1, total=1)
-    cache.put(5, 5)
-    assert set(cache.cache) == {2, 4, 5}
-    assert cache._remove_counter == 2
-    assert cache.pop(5) == 5
-    assert len(cache) == 2
-    assert set(cache.cache) == {2, 4}
-    assert cache._remove_counter == 3
-    assert cache.get(-1) is None
-    assert cache.stat() == CacheInfo(hits=2, total=3)
-    assert cache.stat(delta=True) == CacheInfo(hits=0, total=1)
-    cache.pop(10)
-    assert len(cache) == 2
-    assert set(cache.cache) == {2, 4}
-    assert cache._remove_counter == 3
-    cache.get(10)
-    assert len(cache) == 2
-    assert set(cache.cache) == {2, 4}
-    assert cache._remove_counter == 3
-    cache.put(6, 6)
-    assert len(cache) == 3
-    assert set(cache.cache) == {2, 4, 6}
-    assert 2 in cache
-    assert 4 in cache
-    assert 6 in cache
-    cache.remove_oldest()
-    assert len(cache) == 2
-    assert set(cache.cache) == {2, 6}
-    assert cache._remove_counter == 4
-    cache.clear()
-    assert len(cache) == 0
-    assert cache._remove_counter == 6
-    assert cache.stat() == CacheInfo(hits=0, total=0)
-    assert cache.stat(delta=True) == CacheInfo(hits=0, total=0)
-    cache._remove_counter = 0
-    cache[1] = 1
-    assert len(cache) == 1
-    cache[1] = 1
-    assert len(cache) == 1
-    cache[2] = 2
-    assert len(cache) == 2
-    cache[3] = 3
-    assert len(cache) == 3
-    assert set(cache.cache) == {1, 2, 3}
-    cache[4] = 4
-    assert len(cache) == 3
-    assert set(cache.cache) == {2, 3, 4}
-    assert cache._remove_counter == 1
-    assert cache[2] == 2
-    cache[5] = 5
-    assert set(cache.cache) == {2, 4, 5}
-    assert cache._remove_counter == 2
-    del cache[5]
-    assert len(cache) == 2
-    assert set(cache.cache) == {2, 4}
-    assert cache._remove_counter == 3
-    cache.pop(10)
-    assert len(cache) == 2
-    assert set(cache.cache) == {2, 4}
-    assert cache._remove_counter == 3
-    cache[6] = 6
-    assert len(cache) == 3
-    assert set(cache.cache) == {2, 4, 6}
-    assert 2 in cache
-    assert 4 in cache
-    assert 6 in cache
-# yapf: disable
-@pytest.mark.parametrize(
-    ("src_dtype", "tgt_dtype", "expected_result"),
-    [
-        # Different precision_levels
-        (torch.bool, torch.int8, True),
-        (torch.bool, torch.float16, True),
-        (torch.bool, torch.complex32, True),
-        (torch.int64, torch.bool, False),
-        (torch.int64, torch.float16, True),
-        (torch.int64, torch.complex32, True),
-        (torch.float64, torch.bool, False),
-        (torch.float64, torch.int8, False),
-        (torch.float64, torch.complex32, True),
-        (torch.complex128, torch.bool, False),
-        (torch.complex128, torch.int8, False),
-        (torch.complex128, torch.float16, False),
-        # precision_level=0
-        (torch.bool, torch.bool, True),
-        # precision_level=1
-        (torch.int8, torch.int16, True),
-        (torch.int16, torch.int8, False),
-        (torch.uint8, torch.int8, False),
-        (torch.int8, torch.uint8, False),
-        # precision_level=2
-        (torch.float16, torch.float32, True),
-        (torch.float32, torch.float16, False),
-        (torch.bfloat16, torch.float32, True),
-        (torch.float32, torch.bfloat16, False),
-        # precision_level=3
-        (torch.complex32, torch.complex64, True),
-        (torch.complex64, torch.complex32, False),
-    ],
-)
-# yapf: enable
-def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
-    assert is_lossless_cast(src_dtype, tgt_dtype) == expected_result
-# yapf: disable
-@pytest.mark.parametrize(
-    ("dtypes", "expected_result"),
-    [
-        ([torch.bool], torch.bool),
-        ([torch.bool, torch.int8], torch.int8),
-        ([torch.bool, torch.int8, torch.float16], torch.float16),
-        ([torch.bool, torch.int8, torch.float16, torch.complex32], torch.complex32),  # noqa: E501
-    ],
-)
-# yapf: enable
-def test_common_broadcastable_dtype(dtypes, expected_result):
-    assert common_broadcastable_dtype(dtypes) == expected_result
-def test_placeholder_module_error_handling():
-    placeholder = PlaceholderModule("placeholder_1234")
-    def build_ctx():
-        return pytest.raises(ModuleNotFoundError, match="No module named")
-    with build_ctx():
-        int(placeholder)
-    with build_ctx():
-        placeholder()
-    with build_ctx():
-        _ = placeholder.some_attr
-    with build_ctx():
-        # Test conflict with internal __name attribute
-        _ = placeholder.name
-    # OK to print the placeholder or use it in a f-string
-    _ = repr(placeholder)
-    _ = str(placeholder)
-    # No error yet; only error when it is used downstream
-    placeholder_attr = placeholder.placeholder_attr("attr")
-    with build_ctx():
-        int(placeholder_attr)
-    with build_ctx():
-        placeholder_attr()
-    with build_ctx():
-        _ = placeholder_attr.some_attr
-    with build_ctx():
-        # Test conflict with internal __module attribute
-        _ = placeholder_attr.module
-# yapf: disable
-@pytest.mark.parametrize(
-    "obj,key1,key2",
-    [
-        # Tests for both keys exist
-        ({1: "a", 2: "b"}, 1, 2),
-        # Tests for one key does not exist
-        ({1: "a", 2: "b"}, 1, 3),
-        # Tests for both keys do not exist
-        ({1: "a", 2: "b"}, 3, 4),
-    ])
-# yapf: enable
-def test_swap_dict_values(obj, key1, key2):
-    original_obj = obj.copy()
-    swap_dict_values(obj, key1, key2)
-    if key1 in original_obj:
-        assert obj[key2] == original_obj[key1]
-    else:
-        assert key2 not in obj
-    if key2 in original_obj:
-        assert obj[key1] == original_obj[key2]
-    else:
-        assert key1 not in obj
-def test_model_specification(parser_with_config, cli_config_file,
-                             cli_config_file_with_model):
-    # Test model in CLI takes precedence over config
-    args = parser_with_config.parse_args(
-        ['serve', 'cli-model', '--config', cli_config_file_with_model])
-    assert args.model_tag == 'cli-model'
-    assert args.served_model_name == 'mymodel'
-    # Test model from config file works
-    args = parser_with_config.parse_args([
-        'serve',
-        '--config',
-        cli_config_file_with_model,
-    ])
-    assert args.model == 'config-model'
-    assert args.served_model_name == 'mymodel'
-    # Test no model specified anywhere raises error
-    with pytest.raises(ValueError, match="No model specified!"):
-        parser_with_config.parse_args(['serve', '--config', cli_config_file])
-    # Test using --model option raises error
-    with pytest.raises(
-            ValueError,
-            match=
-        ("With `vllm serve`, you should provide the model as a positional "
-         "argument or in a config file instead of via the `--model` option."),
-    ):
-        parser_with_config.parse_args(['serve', '--model', 'my-model'])
-    # Test other config values are preserved
-    args = parser_with_config.parse_args([
-        'serve',
-        'cli-model',
-        '--config',
-        cli_config_file_with_model,
-    ])
-    assert args.tensor_parallel_size == 2
-    assert args.trust_remote_code is True
-    assert args.port == 12312
-@pytest.mark.parametrize("input", [(), ("abc", ), (None, ),
-                                   (None, bool, [1, 2, 3])])
-def test_sha256(input: tuple):
-    digest = sha256(input)
-    assert digest is not None
-    assert isinstance(digest, bytes)
-    assert digest != b""
-    input_bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
-    assert digest == hashlib.sha256(input_bytes).digest()
-    # hashing again, returns the same value
-    assert digest == sha256(input)
-    # hashing different input, returns different value
-    assert digest != sha256(input + (1, ))
-@pytest.mark.parametrize(
-    "path,expected",
-    [
-        ("ipc://some_path", ("ipc", "some_path", "")),
-        ("tcp://127.0.0.1:5555", ("tcp", "127.0.0.1", "5555")),
-        ("tcp://[::1]:5555", ("tcp", "::1", "5555")),  # IPv6 address
-        ("inproc://some_identifier", ("inproc", "some_identifier", "")),
-    ])
-def test_split_zmq_path(path, expected):
-    assert split_zmq_path(path) == expected
-@pytest.mark.parametrize(
-    "invalid_path",
-    [
-        "invalid_path",  # Missing scheme
-        "tcp://127.0.0.1",  # Missing port
-        "tcp://[::1]",  # Missing port for IPv6
-        "tcp://:5555",  # Missing host
-    ])
-def test_split_zmq_path_invalid(invalid_path):
-    with pytest.raises(ValueError):
-        split_zmq_path(invalid_path)
-def test_make_zmq_socket_ipv6():
-    # Check if IPv6 is supported by trying to create an IPv6 socket
-    try:
-        sock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
-        sock.close()
-    except socket.error:
-        pytest.skip("IPv6 is not supported on this system")
-    ctx = zmq.Context()
-    ipv6_path = "tcp://[::]:5555"  # IPv6 loopback address
-    socket_type = zmq.REP  # Example socket type
-    # Create the socket
-    zsock: zmq.Socket = make_zmq_socket(ctx, ipv6_path, socket_type)
-    # Verify that the IPV6 option is set
-    assert zsock.getsockopt(
-        zmq.IPV6) == 1, "IPV6 option should be enabled for IPv6 addresses"
-    # Clean up
-    zsock.close()
-    ctx.term()
-def test_make_zmq_path():
-    assert make_zmq_path("tcp", "127.0.0.1", "5555") == "tcp://127.0.0.1:5555"
-    assert make_zmq_path("tcp", "::1", "5555") == "tcp://[::1]:5555"
-def test_get_tcp_uri():
-    assert get_tcp_uri("127.0.0.1", 5555) == "tcp://127.0.0.1:5555"
-    assert get_tcp_uri("::1", 5555) == "tcp://[::1]:5555"
-def test_split_host_port():
-    # valid ipv4
-    assert split_host_port("127.0.0.1:5555") == ("127.0.0.1", 5555)
-    # invalid ipv4
-    with pytest.raises(ValueError):
-        # multi colon
-        assert split_host_port("127.0.0.1::5555")
-    with pytest.raises(ValueError):
-        # tailing colon
-        assert split_host_port("127.0.0.1:5555:")
-    with pytest.raises(ValueError):
-        # no colon
-        assert split_host_port("127.0.0.15555")
-    with pytest.raises(ValueError):
-        # none int port
-        assert split_host_port("127.0.0.1:5555a")
-    # valid ipv6
-    assert split_host_port("[::1]:5555") == ("::1", 5555)
-    # invalid ipv6
-    with pytest.raises(ValueError):
-        # multi colon
-        assert split_host_port("[::1]::5555")
-    with pytest.raises(IndexError):
-        # no colon
-        assert split_host_port("[::1]5555")
-    with pytest.raises(ValueError):
-        # none int port
-        assert split_host_port("[::1]:5555a")
-def test_join_host_port():
-    assert join_host_port("127.0.0.1", 5555) == "127.0.0.1:5555"
-    assert join_host_port("::1", 5555) == "[::1]:5555"
-def test_json_count_leaves():
-    """Test json_count_leaves function from jsontree utility."""
-    from vllm.utils.jsontree import json_count_leaves
-    # Single leaf values
-    assert json_count_leaves(42) == 1
-    assert json_count_leaves("hello") == 1
-    assert json_count_leaves(None) == 1
-    # Empty containers
-    assert json_count_leaves([]) == 0
-    assert json_count_leaves({}) == 0
-    assert json_count_leaves(()) == 0
-    # Flat structures
-    assert json_count_leaves([1, 2, 3]) == 3
-    assert json_count_leaves({"a": 1, "b": 2}) == 2
-    assert json_count_leaves((1, 2, 3)) == 3
-    # Nested structures
-    nested_dict = {"a": 1, "b": {"c": 2, "d": 3}}
-    assert json_count_leaves(nested_dict) == 3
-    nested_list = [1, [2, 3], 4]
-    assert json_count_leaves(nested_list) == 4
-    mixed_nested = {"list": [1, 2], "dict": {"x": 3}, "value": 4}
-    assert json_count_leaves(mixed_nested) == 4
-def test_convert_ids_list_to_tokens():
-    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
-    token_ids = tokenizer.encode("Hello, world!")
-    # token_ids = [9707, 11, 1879, 0]
-    assert tokenizer.convert_ids_to_tokens(token_ids) == [
-        'Hello', ',', 'Ġworld', '!'
-    ]
-    tokens = convert_ids_list_to_tokens(tokenizer, token_ids)
-    assert tokens == ['Hello', ',', ' world', '!']
-def test_current_stream_multithread():
-    import threading
-    if not torch.cuda.is_available():
-        pytest.skip("CUDA not available")
-    main_default_stream = torch.cuda.current_stream()
-    child_stream = torch.cuda.Stream()
-    thread_stream_ready = threading.Event()
-    thread_can_exit = threading.Event()
-    def child_thread_func():
-        with torch.cuda.stream(child_stream):
-            thread_stream_ready.set()
-            thread_can_exit.wait(timeout=10)
-    child_thread = threading.Thread(target=child_thread_func)
-    child_thread.start()
-    try:
-        assert thread_stream_ready.wait(
-            timeout=5), "Child thread failed to enter stream context in time"
-        main_current_stream = current_stream()
-        assert main_current_stream != child_stream, "Main thread's current_stream was contaminated by child thread"
-        assert main_current_stream == main_default_stream, "Main thread's current_stream is not the default stream"
-        # Notify child thread it can exit
-        thread_can_exit.set()
-    finally:
-        # Ensure child thread exits properly
-        child_thread.join(timeout=5)
-        if child_thread.is_alive():
-            pytest.fail("Child thread failed to exit properly")
-def test_load_config_file(tmp_path):
-    # Define the configuration data
-    config_data = {
-        "enable-logging": True,
-        "list-arg": ["item1", "item2"],
-        "port": 12323,
-        "tensor-parallel-size": 4
-    }
-    # Write the configuration data to a temporary YAML file
-    config_file_path = tmp_path / "config.yaml"
-    with open(config_file_path, "w") as config_file:
-        yaml.dump(config_data, config_file)
-    # Initialize the parser
-    parser = FlexibleArgumentParser()
-    # Call the function with the temporary file path
-    processed_args = parser.load_config_file(str(config_file_path))
-    # Expected output
-    expected_args = [
-        "--enable-logging",
-        "--list-arg",
-        "item1",
-        "item2",
-        "--port",
-        "12323",
-        "--tensor-parallel-size",
-        "4",
-    ]
-    # Assert that the processed arguments match the expected output
-    assert processed_args == expected_args
-    os.remove(str(config_file_path))
-def test_unique_filepath():
-    temp_dir = tempfile.mkdtemp()
-    path_fn = lambda i: Path(temp_dir) / f"file_{i}.txt"
-    paths = set()
-    for i in range(10):
-        path = unique_filepath(path_fn)
-        path.write_text("test")
-        paths.add(path)
-    assert len(paths) == 10
-    assert len(list(Path(temp_dir).glob("*.txt"))) == 10
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -1478,7 +1478,6 @@ def test_kv_connector_handles_preemption(is_async, use_ec_connector, ec_role):
    assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == NUM_BLOCKS - 1
 def make_output(scheduler: Scheduler):
    return ModelRunnerOutput(
        req_ids=[req.request_id for req in scheduler.running],

--- a/tests/v1/kv_connector/unit/utils.py
+++ b/tests/v1/kv_connector/unit/utils.py
@@ -112,13 +112,6 @@ def create_vllm_config(
        enable_chunked_prefill=enable_chunked_prefill,
        is_encoder_decoder=model_config.is_encoder_decoder,
    )
-    scheduler_config = SchedulerConfig(
-        max_num_seqs=max_num_seqs,
-        max_num_batched_tokens=max_num_batched_tokens,
-        max_model_len=max_model_len,
-        enable_chunked_prefill=enable_chunked_prefill,
-        is_encoder_decoder=model_config.is_encoder_decoder,
-    )
    # Cache config, optionally force APC
    cache_config = CacheConfig(
        block_size=block_size,