Merge remote-tracking branch 'origin/v0.8.5.post1-dev' into v0.8.5.post1-dev

8e340b4f · yangql · 1cb37dab · a68aef25 · 8e340b4f · 8e340b4f
Commit 8e340b4f authored Jun 05, 2025 by yangql
16 changed files
--- a/tests/runai_model_streamer_test/test_weight_utils.py
+++ b/tests/runai_model_streamer_test/test_weight_utils.py
 # SPDX-License-Identifier: Apache-2.0

+import os
 import glob
 import tempfile

@@ -9,6 +10,7 @@ import torch
 from vllm.model_executor.model_loader.weight_utils import (
    download_weights_from_hf, runai_safetensors_weights_iterator,
    safetensors_weights_iterator)
+from ..utils import models_path_prefix


 def test_runai_model_loader():
@@ -23,10 +25,10 @@ def test_runai_model_loader():
        runai_model_streamer_tensors = {}
        hf_safetensors_tensors = {}

-        for name, tensor in runai_safetensors_weights_iterator(safetensors):
+        for name, tensor in runai_safetensors_weights_iterator(safetensors, False):
            runai_model_streamer_tensors[name] = tensor

-        for name, tensor in safetensors_weights_iterator(safetensors):
+        for name, tensor in safetensors_weights_iterator(safetensors, False):
            hf_safetensors_tensors[name] = tensor

        assert len(runai_model_streamer_tensors) == len(hf_safetensors_tensors)
@@ -38,4 +40,4 @@ def test_runai_model_loader():


 if __name__ == "__main__":
-    test_runai_model_loader()
+    test_runai_model_loader()
\ No newline at end of file
--- a/tests/samplers/test_no_bad_words.py
+++ b/tests/samplers/test_no_bad_words.py
@@ -43,7 +43,8 @@ def _generate(


 class TestOneTokenBadWord:
-    MODEL = os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-fp16")
+    # MODEL = os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-fp16")
+    MODEL = "TheBloke/Llama-2-7B-fp16"

    PROMPT = "Hi! How are"
    TARGET_TOKEN = "you"
@@ -191,4 +192,4 @@ class TestTwoTokenBadWord:
                prompt: str,
                add_special_tokens: bool = True) -> list[int]:
        return self.tokenizer(prompt,
-                              add_special_tokens=add_special_tokens).input_ids
+                              add_special_tokens=add_special_tokens).input_ids
\ No newline at end of file
--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -7,16 +7,15 @@ import pathlib
 import subprocess
 from functools import partial
 from unittest.mock import MagicMock, patch
-from typing import List, Tuple, Optional

 import openai
 import pytest
 import torch
 from huggingface_hub import snapshot_download
+from typing import List, Tuple, Optional

 from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
-from vllm.lora.request import LoRARequest
 # yapf conflicts with isort for this docstring
 # yapf: disable
 from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
@@ -26,6 +25,8 @@ from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
                                                         open_stream,
                                                         serialize_vllm_model,
                                                         tensorize_vllm_model)
+from vllm.lora.request import LoRARequest
+
 # yapf: enable
 from vllm.utils import PlaceholderModule, import_from_path

@@ -245,7 +246,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
        EXAMPLES_PATH / "offline_inference/multilora_inference.py",
    )

-    model_ref = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf")
+    model_ref = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf") 
    # lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
    lora_path = os.path.join(models_path_prefix, "yard1/llama-2-7b-sql-lora-test")
    test_prompts = multilora_inference.create_test_prompts(lora_path)

--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -89,7 +89,7 @@ def tokenizer(tokenizer_name):
            AutoTokenizer.from_pretrained(tokenizer_name))


-@pytest.mark.parametrize("tokenizer_name", ["mistralai/Pixtral-12B-2409"])
+@pytest.mark.parametrize("tokenizer_name", [os.path.join(models_path_prefix, "mistralai/Pixtral-12B-2409")])
 @pytest.mark.parametrize(
    "truth",
    [
@@ -403,4 +403,4 @@ def test_decode_prompt_logprobs_chunked_prefill(
                generated_string += prompt_logprobs[prompt_token].decoded_token

            assert generated_string == example_prompts[idx], (
-                "Detokenized prompt logprobs do not match original prompt")
+                "Detokenized prompt logprobs do not match original prompt")
\ No newline at end of file
--- a/tests/tokenization/test_tokenizer_group.py
+++ b/tests/tokenization/test_tokenizer_group.py
@@ -8,11 +8,13 @@ from ..utils import models_path_prefix
 from vllm.transformers_utils.tokenizer_group import TokenizerGroup


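+# NOTE: "gpt2" is loaded by name below (presumably from the Hugging Face Hub); if huggingface.co is unreachable, try: export HF_ENDPOINT=https://hf-mirror.com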
 @pytest.mark.asyncio
 async def test_tokenizer_group():
-    reference_tokenizer = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, "gpt2"))
+    # reference_tokenizer = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, "gpt2"))
+    reference_tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer_group = TokenizerGroup(
-        tokenizer_id=os.path.join(models_path_prefix, "gpt2"),
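+        # FIXME(review): tokenizer_id is no longer passed to TokenizerGroup; restore it (e.g. tokenizer_id="gpt2") if it is required. Was: tokenizer_id=os.path.join(models_path_prefix, "gpt2"),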
        enable_lora=False,
        max_num_seqs=1,
        max_input_length=None,

--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
--- a/tests/v1/e2e/test_correctness_sliding_window.py
+++ b/tests/v1/e2e/test_correctness_sliding_window.py
 # SPDX-License-Identifier: Apache-2.0
 from dataclasses import dataclass

+import os
 import pytest

 from vllm import LLM, SamplingParams

 from ...core.block.e2e.test_correctness_sliding_window import (check_answers,
                                                               prep_prompts)
+from ...utils import models_path_prefix


 @dataclass
@@ -16,16 +18,16 @@ class TestConfig:


 model_config = {
-    "bigcode/starcoder2-3b": TestConfig(4096, (800, 1100)),
-    "google/gemma-2-2b-it": TestConfig(4096, (400, 800)),
+    os.path.join(models_path_prefix, "bigcode/starcoder2-3b"): TestConfig(4096, (800, 1100)),
+    os.path.join(models_path_prefix, "google/gemma-2-2b-it"): TestConfig(4096, (400, 800)),
 }


 @pytest.mark.parametrize(
    "model",
    [
-        "bigcode/starcoder2-3b",  # sliding window only
-        "google/gemma-2-2b-it",  # sliding window + full attention
+        os.path.join(models_path_prefix, "bigcode/starcoder2-3b"),  # sliding window only
+        os.path.join(models_path_prefix, "google/gemma-2-2b-it"),  # sliding window + full attention
    ])
 @pytest.mark.parametrize("batch_size", [5])
 @pytest.mark.parametrize("seed", [1])

--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -4,9 +4,11 @@ from __future__ import annotations
 import random
 from typing import Any

+import os
 import pytest

 from vllm import LLM, SamplingParams
+from ...utils import models_path_prefix


 @pytest.fixture
@@ -49,14 +51,17 @@ def sampling_config():

 @pytest.fixture
 def model_name():
+    # return os.path.join(models_path_prefix, "meta-llama/Llama-3.1-8B-Instruct")
    return "meta-llama/Llama-3.1-8B-Instruct"


 def eagle_model_name():
+    # return os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3.1-Instruct-8B")
    return "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"


 def eagle3_model_name():
+    # return os.path.join(models_path_prefix, "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B")
    return "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"



--- a/tests/v1/e2e/test_cascade_attention.py
+++ b/tests/v1/e2e/test_cascade_attention.py
 # SPDX-License-Identifier: Apache-2.0

+import os
 import pytest

 from vllm import LLM, SamplingParams

-from ...utils import fork_new_process_for_each_test
+from ...utils import fork_new_process_for_each_test, models_path_prefix


 @fork_new_process_for_each_test
 @pytest.mark.parametrize("attn_backend",
-                         ["FLASH_ATTN_VLLM_V1", "FLASHINFER_VLLM_V1"])
+                         ["FLASH_ATTN_VLLM_V1"]) #  "FLASHINFER_VLLM_V1"
 def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
    prompt = "\n<User>: Implement fibonacci sequence in Python.\n<Claude>:"

@@ -17,7 +18,7 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
        m.setenv("VLLM_USE_V1", "1")
        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)

-        llm = LLM(model="Qwen/Qwen2-1.5B-Instruct")
+        llm = LLM(model=os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct"))
        sampling_params = SamplingParams(temperature=0.0, max_tokens=100)

        # No cascade attention.
@@ -29,4 +30,4 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
        prompts = [example_system_message + prompt] * 64
        responses = llm.generate(prompts, sampling_params)
        for response in responses:
-            assert response.outputs[0].text == ref_output
+            assert response.outputs[0].text == ref_output
\ No newline at end of file
--- a/tests/v1/engine/test_llm_engine.py
+++ b/tests/v1/engine/test_llm_engine.py
@@ -3,11 +3,13 @@
 import random
 from typing import Optional

+import os
 import pytest

 from vllm import LLM, SamplingParams
+from ...utils import models_path_prefix

-MODEL = "facebook/opt-125m"
+MODEL = os.path.join(models_path_prefix, "facebook/opt-125m")
 DTYPE = "half"


@@ -96,4 +98,4 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None:
            }
            raise AssertionError(
                f"{len(completion_counts)} unique completions; expected"
-                f" {n}. Repeats: {repeats}")
+                f" {n}. Repeats: {repeats}")
\ No newline at end of file
--- a/tests/v1/engine/test_output_processor.py
+++ b/tests/v1/engine/test_output_processor.py
@@ -20,6 +20,7 @@ from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.output_processor import (OutputProcessor,
                                             RequestOutputCollector)
 from vllm.v1.metrics.stats import IterationStats
+from ...utils import models_path_prefix


 def _ref_convert_id_to_token(
@@ -520,7 +521,7 @@ def test_stop_token(include_stop_str_in_output: bool,
        dummy_test_vectors: dummy engine core outputs and other data structures
    """
    model_id = dummy_test_vectors.tokenizer.name_or_path
-    if model_id != 'meta-llama/Llama-3.2-1B':
+    if model_id != os.path.join(models_path_prefix, 'meta-llama/Llama-3.2-1B'):
        raise AssertionError("Test requires meta-llama/Llama-3.2-1B but "
                             f"{model_id} is in use.")
    do_logprobs = num_sample_logprobs is not None
@@ -992,4 +993,4 @@ async def test_cumulative_output_collector_n():
    # Third is the one where index is 2
    third = [k for k in result.outputs if k.index == 2]
    assert len(third) == 1
-    assert third[0].text == "c"
+    assert third[0].text == "c"
\ No newline at end of file
--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py
@@ -7,6 +7,7 @@ import re
 from enum import Enum
 from typing import Any

+import os
 import jsonschema
 import pytest
 from pydantic import BaseModel
@@ -15,22 +16,23 @@ from vllm.entrypoints.llm import LLM
 from vllm.outputs import RequestOutput
 from vllm.platforms import current_platform
 from vllm.sampling_params import GuidedDecodingParams, SamplingParams
+from ....utils import models_path_prefix

 PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [
-    ("mistralai/Ministral-8B-Instruct-2410", "xgrammar:disable-any-whitespace",
+    (os.path.join(models_path_prefix, "mistralai/Ministral-8B-Instruct-2410"), "xgrammar:disable-any-whitespace",
     "auto"),
-    ("mistralai/Ministral-8B-Instruct-2410", "guidance:disable-any-whitespace",
+    (os.path.join(models_path_prefix, "mistralai/Ministral-8B-Instruct-2410"), "guidance:disable-any-whitespace",
     "auto"),
-    ("mistralai/Ministral-8B-Instruct-2410", "xgrammar:disable-any-whitespace",
+    (os.path.join(models_path_prefix, "mistralai/Ministral-8B-Instruct-2410"), "xgrammar:disable-any-whitespace",
     "mistral"),
-    ("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar:disable-any-whitespace", "auto"),
+    (os.path.join(models_path_prefix, "Qwen/Qwen2.5-1.5B-Instruct"), "xgrammar:disable-any-whitespace", "auto"),
    #FIXME: This test is flaky on CI thus disabled
    #("Qwen/Qwen2.5-1.5B-Instruct", "guidance:disable-any-whitespace", "auto"),
 ]

 PARAMS_MODELS_TOKENIZER_MODE = [
-    ("mistralai/Ministral-8B-Instruct-2410", "auto"),
-    ("Qwen/Qwen2.5-1.5B-Instruct", "auto"),
+    (os.path.join(models_path_prefix, "mistralai/Ministral-8B-Instruct-2410"), "auto"),
+    (os.path.join(models_path_prefix, "Qwen/Qwen2.5-1.5B-Instruct"), "auto"),
 ]


@@ -572,4 +574,4 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
    assert "a3" in generated
    assert "a4" not in generated
    assert "a5" not in generated
-    assert "a6" not in generated
+    assert "a6" not in generated
\ No newline at end of file
--- a/tests/v1/sample/test_logprobs.py
+++ b/tests/v1/sample/test_logprobs.py
@@ -3,6 +3,7 @@
 import itertools
 from collections.abc import Generator

+import os
 import pytest
 import torch

@@ -13,8 +14,9 @@ from tests.v1.sample.utils import (
 from vllm import SamplingParams

 from ...conftest import HfRunner, VllmRunner
+from ...utils import models_path_prefix

-MODEL = "meta-llama/Llama-3.2-1B-Instruct"
+MODEL = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
 DTYPE = "half"

 NONE = BatchLogprobsComposition.NONE

--- a/tests/v1/sample/test_logprobs_e2e.py
+++ b/tests/v1/sample/test_logprobs_e2e.py
 # SPDX-License-Identifier: Apache-2.0

+import os
 import lm_eval

-from ...utils import RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer, models_path_prefix

 # arc-easy uses prompt_logprobs=1, logprobs=1
 TASK = "arc_easy"
@@ -11,7 +12,7 @@ RTOL = 0.03
 EXPECTED_VALUE = 0.62

 # FIXME(rob): enable prefix caching once supported.
-MODEL = "meta-llama/Llama-3.2-1B-Instruct"
+MODEL = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
 MODEL_ARGS = f"pretrained={MODEL},enforce_eager=True,enable_prefix_caching=False"  # noqa: E501
 SERVER_ARGS = [
    "--enforce_eager", "--no_enable_prefix_caching", "--disable-log-requests"
@@ -49,4 +50,4 @@ def test_promt_logprobs_e2e_server():
        measured_value = results["results"][TASK][FILTER]
        assert (measured_value - RTOL < EXPECTED_VALUE
                and measured_value + RTOL > EXPECTED_VALUE
-                ), f"Expected: {EXPECTED_VALUE} |  Measured: {measured_value}"
+                ), f"Expected: {EXPECTED_VALUE} |  Measured: {measured_value}"
\ No newline at end of file
--- a/tests/v1/sample/test_sampling_params_e2e.py
+++ b/tests/v1/sample/test_sampling_params_e2e.py
@@ -4,11 +4,12 @@ import os
 import pytest

 from vllm import LLM, SamplingParams
+from ...utils import models_path_prefix

 if os.getenv("VLLM_USE_V1", "0") != "1":
    pytest.skip("Test package requires V1", allow_module_level=True)

-MODEL = "meta-llama/Llama-3.2-1B"
+MODEL = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")
 PROMPT = "Hello my name is Robert and I"



--- a/tests/v1/spec_decode/test_max_len.py
+++ b/tests/v1/spec_decode/test_max_len.py