add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from local path...

Add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from a local path instead of the Hugging Face Hub

add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from local path...
Add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from a local path instead of the Hugging Face Hub
3c9817d2 · zhuwenwen · 49204f68 · 3c9817d2 · 3c9817d2 · 3c9817d2
Commit 3c9817d2 authored Nov 27, 2024 by zhuwenwen
20 changed files
--- a/tests/entrypoints/llm/test_generate.py
+++ b/tests/entrypoints/llm/test_generate.py
 import weakref
 from typing import List
+import os

 import pytest

@@ -7,8 +8,9 @@ from vllm import LLM, RequestOutput, SamplingParams

 from ...conftest import cleanup
 from ..openai.test_vision import TEST_IMAGE_URLS
+from ...utils import models_path_prefix

-MODEL_NAME = "facebook/opt-125m"
+MODEL_NAME = os.path.join(models_path_prefix, "facebook/opt-125m")

 PROMPTS = [
    "Hello, my name is",
@@ -145,7 +147,7 @@ def test_multiple_sampling_params(llm: LLM):

 def test_chat():

-    llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
+    llm = LLM(model=os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"))

    prompt1 = "Explain the concept of entropy."
    messages = [
@@ -164,7 +166,7 @@ def test_chat():

 def test_multi_chat():

-    llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
+    llm = LLM(model=os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"))

    prompt1 = "Explain the concept of entropy."
    prompt2 = "Explain what among us is."
@@ -201,7 +203,7 @@ def test_multi_chat():
                         [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
 def test_chat_multi_image(image_urls: List[str]):
    llm = LLM(
-        model="microsoft/Phi-3.5-vision-instruct",
+        model=os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct"),
        dtype="bfloat16",
        max_model_len=4096,
        max_num_seqs=5,

--- a/tests/entrypoints/llm/test_generate_multiple_loras.py
+++ b/tests/entrypoints/llm/test_generate_multiple_loras.py
 import weakref

 import pytest
+import os
 # downloading lora to test lora requests
 from huggingface_hub import snapshot_download

@@ -8,8 +9,9 @@ from vllm import LLM
 from vllm.lora.request import LoRARequest

 from ...conftest import cleanup
+from ...utils import models_path_prefix

-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")

 PROMPTS = [
    "Hello, my name is",
@@ -18,7 +20,7 @@ PROMPTS = [
    "The future of AI is",
 ]

-LORA_NAME = "typeof/zephyr-7b-beta-lora"
+LORA_NAME = os.path.join(models_path_prefix, "typeof/zephyr-7b-beta-lora")


 @pytest.fixture(scope="module")

--- a/tests/entrypoints/llm/test_guided_generate.py
+++ b/tests/entrypoints/llm/test_guided_generate.py
@@ -4,14 +4,16 @@ import weakref

 import jsonschema
 import pytest
+import os

 from vllm.entrypoints.llm import LLM
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams

 from ...conftest import cleanup
+from ...utils import models_path_prefix

-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")


 @pytest.fixture(scope="module")

--- a/tests/entrypoints/llm/test_lazy_outlines.py
+++ b/tests/entrypoints/llm/test_lazy_outlines.py
 import sys
+import os

 from vllm import LLM, SamplingParams
+from ...utils import models_path_prefix


 def test_lazy_outlines(sample_regex):
@@ -14,7 +16,7 @@ def test_lazy_outlines(sample_regex):
    ]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

-    llm = LLM(model="facebook/opt-125m",
+    llm = LLM(model=os.path.join(models_path_prefix, "facebook/opt-125m"),
              enforce_eager=True,
              gpu_memory_utilization=0.3)
    outputs = llm.generate(prompts, sampling_params)
@@ -26,7 +28,7 @@ def test_lazy_outlines(sample_regex):
    # make sure outlines is not imported
    assert 'outlines' not in sys.modules

-    llm = LLM(model="facebook/opt-125m",
+    llm = LLM(model=os.path.join(models_path_prefix, "facebook/opt-125m"),
              enforce_eager=True,
              guided_decoding_backend="lm-format-enforcer",
              gpu_memory_utilization=0.3)

--- a/tests/entrypoints/llm/test_prompt_validation.py
+++ b/tests/entrypoints/llm/test_prompt_validation.py
 import pytest
+import os

 from vllm import LLM
+from ...utils import models_path_prefix


 def test_empty_prompt():
-    llm = LLM(model="gpt2")
+    llm = LLM(model=os.path.join(models_path_prefix, "gpt2"))
    with pytest.raises(ValueError, match='Prompt cannot be empty'):
        llm.generate([""])
--- a/tests/entrypoints/offline_mode/test_offline_mode.py
+++ b/tests/entrypoints/offline_mode/test_offline_mode.py
@@ -2,14 +2,16 @@
 import importlib
 import sys
 import weakref
+import os

 import pytest

 from vllm import LLM

 from ...conftest import cleanup
+from ...utils import models_path_prefix

-MODEL_NAME = "facebook/opt-125m"
+MODEL_NAME = os.path.join(models_path_prefix, "facebook/opt-125m")


 @pytest.fixture(scope="module")

--- a/tests/entrypoints/openai/test_accuracy.py
+++ b/tests/entrypoints/openai/test_accuracy.py
@@ -9,10 +9,12 @@ AsyncLLMEngine are working correctly.

 import lm_eval
 import pytest
+import os

 from ...utils import RemoteOpenAIServer
+from ...utils import models_path_prefix

-MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
+MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct")
 NUM_CONCURRENT = 500
 TASK = "gsm8k"
 FILTER = "exact_match,strict-match"

--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
@@ -2,14 +2,15 @@ from typing import Dict, List

 import openai
 import pytest
+import os
 import pytest_asyncio

 from vllm.assets.audio import AudioAsset
 from vllm.multimodal.utils import encode_audio_base64, fetch_audio

-from ...utils import RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer, models_path_prefix

-MODEL_NAME = "fixie-ai/ultravox-v0_3"
+MODEL_NAME = os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_3")
 TEST_AUDIO_URLS = [
    AudioAsset("winning_call").url,
 ]

--- a/tests/entrypoints/openai/test_basic.py
+++ b/tests/entrypoints/openai/test_basic.py
@@ -7,9 +7,9 @@ import requests

 from vllm.version import __version__ as VLLM_VERSION

-from ...utils import RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer, models_path_prefix

-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")


 @pytest.fixture(scope="module")

--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -6,19 +6,20 @@ from typing import Dict, List, Optional
 import jsonschema
 import openai  # use the official client for correctness check
 import pytest
+import os
 import pytest_asyncio
 import torch
 from openai import BadRequestError

-from ...utils import RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer, models_path_prefix
 from .test_completion import zephyr_lora_added_tokens_files  # noqa: F401
 from .test_completion import zephyr_lora_files  # noqa: F401

 # any model with a chat template should work here
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
 # technically this needs Mistral-7B-v0.1 as base, but we're not testing
 # generation quality here
-LORA_NAME = "typeof/zephyr-7b-beta-lora"
+LORA_NAME = os.path.join(models_path_prefix, "typeof/zephyr-7b-beta-lora") 


 @pytest.fixture(scope="module")

--- a/tests/entrypoints/openai/test_chat_template.py
+++ b/tests/entrypoints/openai/test_chat_template.py
 import pytest
+import os

 from vllm.entrypoints.chat_utils import (apply_hf_chat_template,
                                         load_chat_template)
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 from vllm.transformers_utils.tokenizer import get_tokenizer

-from ...utils import VLLM_PATH
+from ...utils import VLLM_PATH, models_path_prefix

 chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
 assert chatml_jinja_path.exists()

 # Define models, templates, and their corresponding expected outputs
 MODEL_TEMPLATE_GENERATON_OUTPUT = [
-    ("facebook/opt-125m", chatml_jinja_path, True, """<|im_start|>user
+    (os.path.join(models_path_prefix, "facebook/opt-125m"), chatml_jinja_path, True, """<|im_start|>user
 Hello<|im_end|>
 <|im_start|>assistant
 Hi there!<|im_end|>
@@ -20,7 +21,7 @@ Hi there!<|im_end|>
 What is the capital of<|im_end|>
 <|im_start|>assistant
 """),
-    ("facebook/opt-125m", chatml_jinja_path, False, """<|im_start|>user
+    (os.path.join(models_path_prefix, "facebook/opt-125m"), chatml_jinja_path, False, """<|im_start|>user
 Hello<|im_end|>
 <|im_start|>assistant
 Hi there!<|im_end|>

--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -8,22 +8,23 @@ from typing import Dict, List, Optional
 import jsonschema
 import openai  # use the official client for correctness check
 import pytest
+import os
 import pytest_asyncio
 # downloading lora to test lora requests
-from huggingface_hub import snapshot_download
+# from huggingface_hub import snapshot_download
 from openai import BadRequestError
 from transformers import AutoTokenizer

 from vllm.transformers_utils.tokenizer import get_tokenizer

-from ...utils import RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer, models_path_prefix

 # any model with a chat template should work here
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
 # technically these adapters use a different base model,
 # but we're not testing generation quality here
-LORA_NAME = "typeof/zephyr-7b-beta-lora"
-PA_NAME = "swapnilbp/llama_tweet_ptune"
+LORA_NAME = os.path.join(models_path_prefix, "typeof/zephyr-7b-beta-lora")
+PA_NAME = os.path.join(models_path_prefix, "swapnilbp/llama_tweet_ptune")
 # if PA_NAME changes, PA_NUM_VIRTUAL_TOKENS might also
 # need to change to match the prompt adapter
 PA_NUM_VIRTUAL_TOKENS = 8
@@ -31,7 +32,8 @@ PA_NUM_VIRTUAL_TOKENS = 8

 @pytest.fixture(scope="module")
 def zephyr_lora_files():
-    return snapshot_download(repo_id=LORA_NAME)
+    # return snapshot_download(repo_id=LORA_NAME)
+    return LORA_NAME


 @pytest.fixture(scope="module")
@@ -52,7 +54,8 @@ def zephyr_lora_added_tokens_files(zephyr_lora_files):

 @pytest.fixture(scope="module")
 def zephyr_pa_files():
-    return snapshot_download(repo_id=PA_NAME)
+    # return snapshot_download(repo_id=PA_NAME)
+    return PA_NAME


 @pytest.fixture(scope="module")

--- a/tests/entrypoints/openai/test_embedding.py
+++ b/tests/entrypoints/openai/test_embedding.py
@@ -3,11 +3,12 @@ import base64
 import numpy as np
 import openai
 import pytest
+import os
 import pytest_asyncio

-from ...utils import RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer, models_path_prefix

-EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
+EMBEDDING_MODEL_NAME = os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct")


 @pytest.fixture(scope="module")

--- a/tests/entrypoints/openai/test_encoder_decoder.py
+++ b/tests/entrypoints/openai/test_encoder_decoder.py
 import openai
 import pytest
+import os
 import pytest_asyncio

-from ...utils import RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer, models_path_prefix

-MODEL_NAME = "facebook/bart-base"
+MODEL_NAME = os.path.join(models_path_prefix, "facebook/bart-base")


 @pytest.fixture(scope="module")

--- a/tests/entrypoints/openai/test_guided_processors.py
+++ b/tests/entrypoints/openai/test_guided_processors.py
 # This unit test should be moved to a new
 # tests/test_guided_decoding directory.
 import pytest
+import os
 import torch
 from transformers import AutoTokenizer

@@ -9,11 +10,12 @@ from vllm.model_executor.guided_decoding import (
    get_guided_decoding_logits_processor)
 from vllm.model_executor.guided_decoding.outlines_logits_processors import (
    JSONLogitsProcessor, RegexLogitsProcessor)
+from ...utils import models_path_prefix


 def test_guided_logits_processors(sample_regex, sample_json_schema):
    """Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor."""
-    tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta')
+    tokenizer = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, 'HuggingFaceH4/zephyr-7b-beta'))
    regex_LP = RegexLogitsProcessor(sample_regex, tokenizer)
    json_LP = JSONLogitsProcessor(sample_json_schema,
                                  tokenizer,
@@ -41,7 +43,7 @@ def test_guided_logits_processors(sample_regex, sample_json_schema):
 @pytest.mark.parametrize("backend", ["outlines", "lm-format-enforcer"])
 async def test_guided_logits_processor_black_box(backend: str, sample_regex,
                                                 sample_json_schema):
-    tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta')
+    tokenizer = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, 'HuggingFaceH4/zephyr-7b-beta'))
    token_ids = tokenizer.encode(
        f"Give an example IPv4 address with this regex: {sample_regex}")
    regex_request = CompletionRequest(model='test',

--- a/tests/entrypoints/openai/test_lora_lineage.py
+++ b/tests/entrypoints/openai/test_lora_lineage.py
@@ -2,17 +2,18 @@ import json

 import openai  # use the official client for correctness check
 import pytest
+import os
 import pytest_asyncio
 # downloading lora to test lora requests
 from huggingface_hub import snapshot_download

-from ...utils import RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer, models_path_prefix

 # any model with a chat template should work here
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
 # technically this needs Mistral-7B-v0.1 as base, but we're not testing
 # generation quality here
-LORA_NAME = "typeof/zephyr-7b-beta-lora"
+LORA_NAME = os.path.join(models_path_prefix, "typeof/zephyr-7b-beta-lora")


 @pytest.fixture(scope="module")

--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -6,14 +6,15 @@ from http import HTTPStatus

 import openai
 import pytest
+import os
 import pytest_asyncio
 import requests
 from prometheus_client.parser import text_string_to_metric_families
 from transformers import AutoTokenizer

-from ...utils import RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer, models_path_prefix

-MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+MODEL_NAME = os.path.join(models_path_prefix, "TinyLlama/TinyLlama-1.1B-Chat-v1.0")


 @pytest.fixture(scope="module")
@@ -211,7 +212,7 @@ def test_metrics_exist_run_batch():
            "-o",
            output_file.name,
            "--model",
-            "intfloat/e5-mistral-7b-instruct",
+            os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct"),
            "--enable-metrics",
            "--url",
            base_url,

--- a/tests/entrypoints/openai/test_models.py
+++ b/tests/entrypoints/openai/test_models.py
 import openai  # use the official client for correctness check
 import pytest
+import os
 import pytest_asyncio
 # downloading lora to test lora requests
 from huggingface_hub import snapshot_download

-from ...utils import RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer, models_path_prefix

 # any model with a chat template should work here
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
 # technically this needs Mistral-7B-v0.1 as base, but we're not testing
 # generation quality here
-LORA_NAME = "typeof/zephyr-7b-beta-lora"
+LORA_NAME = os.path.join(models_path_prefix, "typeof/zephyr-7b-beta-lora")


 @pytest.fixture(scope="module")
 def zephyr_lora_files():
-    return snapshot_download(repo_id=LORA_NAME)
+    # return snapshot_download(repo_id=LORA_NAME)
+    return LORA_NAME


 @pytest.fixture(scope="module")

--- a/tests/entrypoints/openai/test_oot_registration.py
+++ b/tests/entrypoints/openai/test_oot_registration.py
-from ...utils import VLLM_PATH, RemoteOpenAIServer
-import vllm.envs as envs
+import os
+from ...utils import VLLM_PATH, RemoteOpenAIServer, models_path_prefix, envs

 chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
 assert chatml_jinja_path.exists()
@@ -54,5 +54,5 @@ def run_and_test_dummy_opt_api_server(model, tp=1):


 def test_oot_registration_for_api_server(dummy_opt_path: str):
-    dummy_opt_path="facebook/opt-125m"
+    dummy_opt_path = os.path.join(models_path_prefix, "facebook/opt-125m")
    run_and_test_dummy_opt_api_server(dummy_opt_path)
--- a/tests/entrypoints/openai/test_prompt_validation.py
+++ b/tests/entrypoints/openai/test_prompt_validation.py
@@ -4,12 +4,12 @@ import re
 import openai
 import pytest

-from ...utils import RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer, models_path_prefix


 @pytest.mark.asyncio
 async def test_empty_prompt():
-    model_name = "gpt2"
+    model_name = os.path.join(models_path_prefix, "gpt2")
    server_args = ["--enforce-eager"]
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()