Merge remote-tracking branch 'origin/v0.8.5.post1-dev' into v0.8.5.post1-dev

8e340b4f · yangql · 1cb37dab · a68aef25 · 8e340b4f · 8e340b4f
Commit 8e340b4f authored Jun 05, 2025 by yangql
16 changed files
--- a/tests/runai_model_streamer_test/test_weight_utils.py
+++ b/tests/runai_model_streamer_test/test_weight_utils.py
 # SPDX-License-Identifier: Apache-2.0
+import os
 import glob
 import tempfile
@@ -9,6 +10,7 @@ import torch
 from vllm.model_executor.model_loader.weight_utils import (
    download_weights_from_hf, runai_safetensors_weights_iterator,
    safetensors_weights_iterator)
+from ..utils import models_path_prefix
 def test_runai_model_loader():
@@ -23,10 +25,10 @@ def test_runai_model_loader():
        runai_model_streamer_tensors = {}
        hf_safetensors_tensors = {}
-        for name, tensor in runai_safetensors_weights_iterator(safetensors):
+        for name, tensor in runai_safetensors_weights_iterator(safetensors, False):
            runai_model_streamer_tensors[name] = tensor
-        for name, tensor in safetensors_weights_iterator(safetensors):
+        for name, tensor in safetensors_weights_iterator(safetensors, False):
            hf_safetensors_tensors[name] = tensor
        assert len(runai_model_streamer_tensors) == len(hf_safetensors_tensors)

--- a/tests/samplers/test_no_bad_words.py
+++ b/tests/samplers/test_no_bad_words.py
@@ -43,7 +43,8 @@ def _generate(
 class TestOneTokenBadWord:
-    MODEL = os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-fp16")
+    # MODEL = os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-fp16")
+    MODEL = "TheBloke/Llama-2-7B-fp16"
    PROMPT = "Hi! How are"
    TARGET_TOKEN = "you"

--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -7,16 +7,15 @@ import pathlib
 import subprocess
 from functools import partial
 from unittest.mock import MagicMock, patch
-from typing import List, Tuple, Optional
 import openai
 import pytest
 import torch
 from huggingface_hub import snapshot_download
+from typing import List, Tuple, Optional
 from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
-from vllm.lora.request import LoRARequest
 # yapf conflicts with isort for this docstring
 # yapf: disable
 from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
@@ -26,6 +25,8 @@ from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
                                                         open_stream,
                                                         serialize_vllm_model,
                                                         tensorize_vllm_model)
+from vllm.lora.request import LoRARequest
 # yapf: enable
 from vllm.utils import PlaceholderModule, import_from_path

--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -89,7 +89,7 @@ def tokenizer(tokenizer_name):
            AutoTokenizer.from_pretrained(tokenizer_name))
-@pytest.mark.parametrize("tokenizer_name", ["mistralai/Pixtral-12B-2409"])
+@pytest.mark.parametrize("tokenizer_name", [os.path.join(models_path_prefix, "mistralai/Pixtral-12B-2409")])
 @pytest.mark.parametrize(
    "truth",
    [

--- a/tests/tokenization/test_tokenizer_group.py
+++ b/tests/tokenization/test_tokenizer_group.py
@@ -8,11 +8,13 @@ from ..utils import models_path_prefix
 from vllm.transformers_utils.tokenizer_group import TokenizerGroup
+# export HF_ENDPOINT=https://hf-mirror.com
 @pytest.mark.asyncio
 async def test_tokenizer_group():
-    reference_tokenizer = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, "gpt2"))
+    # reference_tokenizer = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, "gpt2"))
+    reference_tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer_group = TokenizerGroup(
-        tokenizer_id=os.path.join(models_path_prefix, "gpt2"),
+        # tokenizer_id=os.path.join(models_path_prefix, "gpt2"),
        enable_lora=False,
        max_num_seqs=1,
        max_input_length=None,

--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
--- a/tests/v1/e2e/test_correctness_sliding_window.py
+++ b/tests/v1/e2e/test_correctness_sliding_window.py
 # SPDX-License-Identifier: Apache-2.0
 from dataclasses import dataclass
+import os
 import pytest
 from vllm import LLM, SamplingParams
 from ...core.block.e2e.test_correctness_sliding_window import (check_answers,
                                                               prep_prompts)
+from ...utils import models_path_prefix
 @dataclass
@@ -16,16 +18,16 @@ class TestConfig:
 model_config = {
-    "bigcode/starcoder2-3b": TestConfig(4096, (800, 1100)),
+    os.path.join(models_path_prefix, "bigcode/starcoder2-3b"): TestConfig(4096, (800, 1100)),
-    "google/gemma-2-2b-it": TestConfig(4096, (400, 800)),
+    os.path.join(models_path_prefix, "google/gemma-2-2b-it"): TestConfig(4096, (400, 800)),
 }
 @pytest.mark.parametrize(
    "model",
    [
-        "bigcode/starcoder2-3b",  # sliding window only
+        os.path.join(models_path_prefix, "bigcode/starcoder2-3b"),  # sliding window only
-        "google/gemma-2-2b-it",  # sliding window + full attention
+        os.path.join(models_path_prefix, "google/gemma-2-2b-it"),  # sliding window + full attention
    ])
 @pytest.mark.parametrize("batch_size", [5])
 @pytest.mark.parametrize("seed", [1])

--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -4,9 +4,11 @@ from __future__ import annotations
 import random
 from typing import Any
+import os
 import pytest
 from vllm import LLM, SamplingParams
+from ...utils import models_path_prefix
 @pytest.fixture
@@ -49,14 +51,17 @@ def sampling_config():
 @pytest.fixture
 def model_name():
+    # return os.path.join(models_path_prefix, "meta-llama/Llama-3.1-8B-Instruct")
    return "meta-llama/Llama-3.1-8B-Instruct"
 def eagle_model_name():
+    # return os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3.1-Instruct-8B")
    return "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
 def eagle3_model_name():
+    # return os.path.join(models_path_prefix, "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B")
    return "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"

--- a/tests/v1/e2e/test_cascade_attention.py
+++ b/tests/v1/e2e/test_cascade_attention.py
 # SPDX-License-Identifier: Apache-2.0
+import os
 import pytest
 from vllm import LLM, SamplingParams
-from ...utils import fork_new_process_for_each_test
+from ...utils import fork_new_process_for_each_test, models_path_prefix
 @fork_new_process_for_each_test
 @pytest.mark.parametrize("attn_backend",
-                         ["FLASH_ATTN_VLLM_V1", "FLASHINFER_VLLM_V1"])
+                         ["FLASH_ATTN_VLLM_V1"]) #  "FLASHINFER_VLLM_V1"
 def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
    prompt = "\n<User>: Implement fibonacci sequence in Python.\n<Claude>:"
@@ -17,7 +18,7 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
        m.setenv("VLLM_USE_V1", "1")
        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
-        llm = LLM(model="Qwen/Qwen2-1.5B-Instruct")
+        llm = LLM(model=os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct"))
        sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
        # No cascade attention.

--- a/tests/v1/engine/test_llm_engine.py
+++ b/tests/v1/engine/test_llm_engine.py
@@ -3,11 +3,13 @@
 import random
 from typing import Optional
+import os
 import pytest
 from vllm import LLM, SamplingParams
+from ...utils import models_path_prefix
-MODEL = "facebook/opt-125m"
+MODEL = os.path.join(models_path_prefix, "facebook/opt-125m")
 DTYPE = "half"

--- a/tests/v1/engine/test_output_processor.py
+++ b/tests/v1/engine/test_output_processor.py
@@ -20,6 +20,7 @@ from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.output_processor import (OutputProcessor,
                                             RequestOutputCollector)
 from vllm.v1.metrics.stats import IterationStats
+from ...utils import models_path_prefix
 def _ref_convert_id_to_token(
@@ -520,7 +521,7 @@ def test_stop_token(include_stop_str_in_output: bool,
        dummy_test_vectors: dummy engine core outputs and other data structures
    """
    model_id = dummy_test_vectors.tokenizer.name_or_path
-    if model_id != 'meta-llama/Llama-3.2-1B':
+    if model_id != os.path.join(models_path_prefix, 'meta-llama/Llama-3.2-1B'):
        raise AssertionError("Test requires meta-llama/Llama-3.2-1B but "
                             f"{model_id} is in use.")
    do_logprobs = num_sample_logprobs is not None

--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py
@@ -7,6 +7,7 @@ import re
 from enum import Enum
 from typing import Any
+import os
 import jsonschema
 import pytest
 from pydantic import BaseModel
@@ -15,22 +16,23 @@ from vllm.entrypoints.llm import LLM
 from vllm.outputs import RequestOutput
 from vllm.platforms import current_platform
 from vllm.sampling_params import GuidedDecodingParams, SamplingParams
+from ....utils import models_path_prefix
 PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [
-    ("mistralai/Ministral-8B-Instruct-2410", "xgrammar:disable-any-whitespace",
+    (os.path.join(models_path_prefix, "mistralai/Ministral-8B-Instruct-2410"), "xgrammar:disable-any-whitespace",
     "auto"),
-    ("mistralai/Ministral-8B-Instruct-2410", "guidance:disable-any-whitespace",
+    (os.path.join(models_path_prefix, "mistralai/Ministral-8B-Instruct-2410"), "guidance:disable-any-whitespace",
     "auto"),
-    ("mistralai/Ministral-8B-Instruct-2410", "xgrammar:disable-any-whitespace",
+    (os.path.join(models_path_prefix, "mistralai/Ministral-8B-Instruct-2410"), "xgrammar:disable-any-whitespace",
     "mistral"),
-    ("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar:disable-any-whitespace", "auto"),
+    (os.path.join(models_path_prefix, "Qwen/Qwen2.5-1.5B-Instruct"), "xgrammar:disable-any-whitespace", "auto"),
    #FIXME: This test is flaky on CI thus disabled
    #("Qwen/Qwen2.5-1.5B-Instruct", "guidance:disable-any-whitespace", "auto"),
 ]
 PARAMS_MODELS_TOKENIZER_MODE = [
-    ("mistralai/Ministral-8B-Instruct-2410", "auto"),
+    (os.path.join(models_path_prefix, "mistralai/Ministral-8B-Instruct-2410"), "auto"),
-    ("Qwen/Qwen2.5-1.5B-Instruct", "auto"),
+    (os.path.join(models_path_prefix, "Qwen/Qwen2.5-1.5B-Instruct"), "auto"),
 ]

--- a/tests/v1/sample/test_logprobs.py
+++ b/tests/v1/sample/test_logprobs.py
@@ -3,6 +3,7 @@
 import itertools
 from collections.abc import Generator
+import os
 import pytest
 import torch
@@ -13,8 +14,9 @@ from tests.v1.sample.utils import (
 from vllm import SamplingParams
 from ...conftest import HfRunner, VllmRunner
+from ...utils import models_path_prefix
-MODEL = "meta-llama/Llama-3.2-1B-Instruct"
+MODEL = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
 DTYPE = "half"
 NONE = BatchLogprobsComposition.NONE

--- a/tests/v1/sample/test_logprobs_e2e.py
+++ b/tests/v1/sample/test_logprobs_e2e.py
 # SPDX-License-Identifier: Apache-2.0
+import os
 import lm_eval
-from ...utils import RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer, models_path_prefix
 # arc-easy uses prompt_logprobs=1, logprobs=1
 TASK = "arc_easy"
@@ -11,7 +12,7 @@ RTOL = 0.03
 EXPECTED_VALUE = 0.62
 # FIXME(rob): enable prefix caching once supported.
-MODEL = "meta-llama/Llama-3.2-1B-Instruct"
+MODEL = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
 MODEL_ARGS = f"pretrained={MODEL},enforce_eager=True,enable_prefix_caching=False"  # noqa: E501
 SERVER_ARGS = [
    "--enforce_eager", "--no_enable_prefix_caching", "--disable-log-requests"

--- a/tests/v1/sample/test_sampling_params_e2e.py
+++ b/tests/v1/sample/test_sampling_params_e2e.py
@@ -4,11 +4,12 @@ import os
 import pytest
 from vllm import LLM, SamplingParams
+from ...utils import models_path_prefix
 if os.getenv("VLLM_USE_V1", "0") != "1":
    pytest.skip("Test package requires V1", allow_module_level=True)
-MODEL = "meta-llama/Llama-3.2-1B"
+MODEL = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")
 PROMPT = "Hello my name is Robert and I"

--- a/tests/v1/spec_decode/test_max_len.py
+++ b/tests/v1/spec_decode/test_max_len.py