Commit 3c9817d2 authored by zhuwenwen
Browse files

add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from local path...

Add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from a local path instead of the Hugging Face Hub
parent 49204f68
from typing import Any, Dict, List, Optional
import pytest
import os
from transformers import AutoTokenizer
from vllm.sequence import Logprob, SamplingParams, Sequence, SequenceGroup
from vllm.transformers_utils.detokenizer import (Detokenizer,
detokenize_incrementally)
from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
from ..utils import models_path_prefix
TRUTH = [
"Hello here, this is a simple test",
......@@ -14,16 +16,16 @@ TRUTH = [
"我很感谢你的热情"
]
TOKENIZERS = [
"facebook/opt-125m",
"gpt2",
"bigcode/tiny_starcoder_py",
"EleutherAI/gpt-j-6b",
"EleutherAI/pythia-70m",
"bigscience/bloom-560m",
"mosaicml/mpt-7b",
"tiiuae/falcon-7b",
"meta-llama/Llama-2-7b-hf",
"codellama/CodeLlama-7b-hf",
os.path.join(models_path_prefix, "facebook/opt-125m"),
os.path.join(models_path_prefix, "gpt2"),
os.path.join(models_path_prefix, "bigcode/tiny_starcoder_py"),
os.path.join(models_path_prefix, "EleutherAI/gpt-j-6b"),
os.path.join(models_path_prefix, "EleutherAI/pythia-70m"),
os.path.join(models_path_prefix, "bigscience/bloom-560m"),
os.path.join(models_path_prefix, "mosaicml/mpt-7b"),
os.path.join(models_path_prefix, "tiiuae/falcon-7b"),
os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"),
os.path.join(models_path_prefix, "codellama/CodeLlama-7b-hf"),
]
......@@ -225,7 +227,7 @@ def test_decode_prompt_logprobs(complete_sequence_token_ids: List[int],
])
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 7, 16, -1])
def test_decode_prompt_logprobs_chunked_prefill(
vllm_runner,
......
......@@ -5,10 +5,12 @@ only get the `eos_token_id` from the tokenizer as defined by
"""
from vllm.transformers_utils.config import try_get_generation_config
from vllm.transformers_utils.tokenizer import get_tokenizer
from ..utils import models_path_prefix
import os
def test_get_llama3_eos_token():
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
model_name = os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct")
tokenizer = get_tokenizer(model_name)
assert tokenizer.eos_token_id == 128009
......@@ -20,7 +22,7 @@ def test_get_llama3_eos_token():
def test_get_blip2_eos_token():
model_name = "Salesforce/blip2-opt-2.7b"
model_name = os.path.join(models_path_prefix, "Salesforce/blip2-opt-2.7b")
tokenizer = get_tokenizer(model_name)
assert tokenizer.eos_token_id == 2
......
import pytest
import os
from transformers import PreTrainedTokenizerBase
from vllm.transformers_utils.tokenizer import get_tokenizer
from ..utils import models_path_prefix
TOKENIZER_NAMES = [
"facebook/opt-125m",
"gpt2",
os.path.join(models_path_prefix, "facebook/opt-125m"),
os.path.join(models_path_prefix, "gpt2"),
]
......
......@@ -13,6 +13,7 @@ from vllm.transformers_utils.tokenizer_group.ray_tokenizer_group import (
RayTokenizerGroupPool)
from ..conftest import get_tokenizer_pool_config
from ..utils import models_path_prefix
class CustomTokenizerGroup(TokenizerGroup):
......@@ -30,7 +31,7 @@ class CustomTokenizerGroup(TokenizerGroup):
@pytest.mark.parametrize("tokenizer_group_type",
[None, "ray", CustomTokenizerGroup])
async def test_tokenizer_group(tokenizer_group_type):
reference_tokenizer = AutoTokenizer.from_pretrained("gpt2")
reference_tokenizer = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, "gpt2"))
tokenizer_group = get_tokenizer_group(
get_tokenizer_pool_config(tokenizer_group_type),
tokenizer_id="gpt2",
......@@ -54,7 +55,7 @@ async def test_tokenizer_group(tokenizer_group_type):
@pytest.mark.asyncio
@pytest.mark.parametrize("tokenizer_group_type", ["ray"])
async def test_tokenizer_group_pool(tokenizer_group_type):
reference_tokenizer = AutoTokenizer.from_pretrained("gpt2")
reference_tokenizer = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, "gpt2"))
tokenizer_group_pool = get_tokenizer_group(
get_tokenizer_pool_config(tokenizer_group_type),
tokenizer_id="gpt2",
......
import os
from ..utils import compare_two_settings
from ..utils import compare_two_settings, models_path_prefix
# --enforce-eager on TPU causes graph compilation
# this times out default Health Check in the MQLLMEngine,
......@@ -9,7 +9,7 @@ os.environ["VLLM_RPC_TIMEOUT"] = "30000"
def test_custom_dispatcher():
compare_two_settings("google/gemma-2b",
compare_two_settings(os.path.join(models_path_prefix, "google/gemma-2b"),
arg1=["--enforce-eager"],
arg2=["--enforce-eager"],
env1={"VLLM_DYNAMO_USE_CUSTOM_DISPATCHER": "0"},
......
......@@ -15,6 +15,7 @@ from opentelemetry.sdk.environment_variables import (
from vllm import LLM, SamplingParams
from vllm.tracing import SpanAttributes
from ..utils import models_path_prefix
FAKE_TRACE_SERVER_ADDRESS = "localhost:4317"
......@@ -73,7 +74,7 @@ def test_traces(trace_service):
sampling_params = SamplingParams(temperature=0.01,
top_p=0.1,
max_tokens=256)
model = "facebook/opt-125m"
model = os.path.join(models_path_prefix, "facebook/opt-125m")
llm = LLM(
model=model,
otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
......@@ -129,7 +130,7 @@ def test_traces_with_detailed_steps(trace_service):
sampling_params = SamplingParams(temperature=0.01,
top_p=0.1,
max_tokens=256)
model = "facebook/opt-125m"
model = os.path.join(models_path_prefix, "facebook/opt-125m")
llm = LLM(
model=model,
otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
......
......@@ -25,6 +25,11 @@ from vllm.model_executor.model_loader.loader import get_model_loader
from vllm.platforms import current_platform
from vllm.utils import (FlexibleArgumentParser, cuda_device_count_stateless,
get_open_port, is_hip)
import vllm.envs as envs
import os
# Root directory that holds local copies of the test models.
# VLLM_OPTEST_MODELS_PATH takes precedence over OPTEST_MODELS_PATH. When
# neither is set (or both are empty) fall back to "" rather than None, so that
# os.path.join(models_path_prefix, "<org>/<model>") returns the plain model id
# instead of raising TypeError.
models_path_prefix = (os.getenv("VLLM_OPTEST_MODELS_PATH")
                      or os.getenv("OPTEST_MODELS_PATH") or "")
if current_platform.is_rocm():
from amdsmi import (amdsmi_get_gpu_vram_usage,
......
import os
import torch
from ..utils import models_path_prefix
MAX_MODEL_LEN = 1024
MODEL_NAME = os.environ.get("MODEL_NAME",
"robertgshaw2/zephyr-7b-beta-channelwise-gptq")
os.path.join(models_path_prefix, "robertgshaw2/zephyr-7b-beta-channelwise-gptq"))
REVISION = os.environ.get("REVISION", "main")
QUANTIZATION = os.environ.get("QUANTIZATION", "gptq_marlin")
......
......@@ -3,12 +3,14 @@ from typing import List
import pytest
import torch
import os
from vllm.engine.arg_utils import EngineArgs
from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
from vllm.utils import is_cpu, make_tensor_with_pad
from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner
from vllm.worker.model_runner import _get_graph_batch_size
from ..utils import models_path_prefix
BATCH_SIZES = [1, 4, 16, 64, 256]
......@@ -40,7 +42,7 @@ def test_empty_seq_group():
for empty seq group list"""
model_runner = _create_model_runner(
"facebook/bart-base",
os.path.join(models_path_prefix, "facebook/bart-base"),
seed=0,
dtype="float16",
max_num_batched_tokens=100000,
......@@ -99,7 +101,7 @@ def test_prepare_prompt(batch_size):
'''
model_runner = _create_model_runner(
"facebook/bart-base",
os.path.join(models_path_prefix, "facebook/bart-base"),
seed=0,
dtype="float16",
max_num_batched_tokens=100000,
......@@ -291,7 +293,7 @@ def test_prepare_decode(batch_size, multiple_seqs_per_seq_group):
'''
model_runner = _create_model_runner(
"facebook/bart-base",
os.path.join(models_path_prefix, "facebook/bart-base"),
seed=0,
dtype="float16",
max_num_batched_tokens=100000,
......@@ -494,7 +496,7 @@ def test_prepare_decode_cuda_graph(batch_size, multiple_seqs_per_seq_group):
for varying input batch sizes.
"""
model_runner = _create_model_runner(
"facebook/bart-base",
os.path.join(models_path_prefix, "facebook/bart-base"),
seed=0,
dtype="float16",
max_num_batched_tokens=100000,
......
......@@ -2,6 +2,7 @@ from typing import List
import pytest
import torch
import os
from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
init_distributed_environment)
......@@ -10,6 +11,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
from vllm.utils import get_open_port
from vllm.worker.model_runner import ModelRunner, _get_graph_batch_size
from ..utils import models_path_prefix
def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner:
......@@ -33,7 +35,7 @@ def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner:
@pytest.mark.parametrize("batch_size", list(range(1, 257)))
def test_prepare_prompt(batch_size):
model_runner = _create_model_runner(
"facebook/opt-125m",
os.path.join(models_path_prefix, "facebook/opt-125m"),
max_num_batched_tokens=100000,
max_num_seqs=100000,
enable_chunked_prefill=False,
......@@ -147,7 +149,7 @@ def test_prepare_prompt(batch_size):
@pytest.mark.parametrize("batch_size", list(range(1, 257)))
def test_prepare_decode_cuda_graph(batch_size):
model_runner = _create_model_runner(
"facebook/opt-125m",
os.path.join(models_path_prefix, "facebook/opt-125m"),
seed=0,
dtype="float16",
enforce_eager=False,
......@@ -256,7 +258,7 @@ def test_prepare_decode_cuda_graph(batch_size):
def test_empty_seq_group():
"""Verify prepare prompt and decode returns empty output."""
model_runner = _create_model_runner(
"facebook/opt-125m",
os.path.join(models_path_prefix, "facebook/opt-125m"),
seed=0,
dtype="float16",
enforce_eager=False,
......@@ -301,7 +303,7 @@ def distributed_init():
@pytest.mark.parametrize("enforce_eager", [True, False])
def test_hybrid_batches(batch_size, enforce_eager, distributed_init):
model_runner = _create_model_runner(
"facebook/opt-125m",
os.path.join(models_path_prefix, "facebook/opt-125m"),
seed=0,
dtype="float16",
enforce_eager=enforce_eager,
......
import torch
import os
from vllm.engine.arg_utils import EngineArgs
from vllm.sequence import ExecuteModelRequest
from vllm.utils import get_distributed_init_method, get_ip, get_open_port
from vllm.worker.worker import Worker
from ..utils import models_path_prefix
def test_swap() -> None:
# Configure the engine.
engine_args = EngineArgs(model="facebook/opt-125m",
engine_args = EngineArgs(model=os.path.join(models_path_prefix, "facebook/opt-125m"),
dtype="half",
load_format="dummy")
engine_config = engine_args.create_engine_config()
......
......@@ -15,6 +15,7 @@ if TYPE_CHECKING:
VLLM_USE_OPT_OP: bool = False
VLLM_USE_TC_PAGED_ATTN: bool = False
VLLM_USE_PA_PRINT_PARAM: bool = False
VLLM_OPTEST_MODELS_PATH: str = ""
LOCAL_RANK: int = 0
CUDA_VISIBLE_DEVICES: Optional[str] = None
VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60
......@@ -213,6 +214,11 @@ environment_variables: Dict[str, Callable[[], Any]] = {
"VLLM_USE_PA_PRINT_PARAM":
lambda: (os.environ.get("VLLM_USE_PA_PRINT_PARAM", "False").lower() in
("true", "1")),
# Path to the optest models.
# If set, will load models from local path instead of Hugging Face Hub.
'VLLM_OPTEST_MODELS_PATH':
lambda: os.getenv('VLLM_OPTEST_MODELS_PATH', "") or os.getenv("OPTEST_MODELS_PATH", ""),
# If set, allowing the use of deprecated beam search implementation
"VLLM_ALLOW_DEPRECATED_BEAM_SEARCH":
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment