Commit 3c9817d2 authored by zhuwenwen
Browse files

add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from local path...

Add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from a local path instead of the Hugging Face Hub
parent 49204f68
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
import pytest import pytest
import os
from transformers import AutoTokenizer from transformers import AutoTokenizer
from vllm.sequence import Logprob, SamplingParams, Sequence, SequenceGroup from vllm.sequence import Logprob, SamplingParams, Sequence, SequenceGroup
from vllm.transformers_utils.detokenizer import (Detokenizer, from vllm.transformers_utils.detokenizer import (Detokenizer,
detokenize_incrementally) detokenize_incrementally)
from vllm.transformers_utils.tokenizer_group import get_tokenizer_group from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
from ..utils import models_path_prefix
TRUTH = [ TRUTH = [
"Hello here, this is a simple test", "Hello here, this is a simple test",
...@@ -14,16 +16,16 @@ TRUTH = [ ...@@ -14,16 +16,16 @@ TRUTH = [
"我很感谢你的热情" "我很感谢你的热情"
] ]
TOKENIZERS = [ TOKENIZERS = [
"facebook/opt-125m", os.path.join(models_path_prefix, "facebook/opt-125m"),
"gpt2", os.path.join(models_path_prefix, "gpt2"),
"bigcode/tiny_starcoder_py", os.path.join(models_path_prefix, "bigcode/tiny_starcoder_py"),
"EleutherAI/gpt-j-6b", os.path.join(models_path_prefix, "EleutherAI/gpt-j-6b"),
"EleutherAI/pythia-70m", os.path.join(models_path_prefix, "EleutherAI/pythia-70m"),
"bigscience/bloom-560m", os.path.join(models_path_prefix, "bigscience/bloom-560m"),
"mosaicml/mpt-7b", os.path.join(models_path_prefix, "mosaicml/mpt-7b"),
"tiiuae/falcon-7b", os.path.join(models_path_prefix, "tiiuae/falcon-7b"),
"meta-llama/Llama-2-7b-hf", os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"),
"codellama/CodeLlama-7b-hf", os.path.join(models_path_prefix, "codellama/CodeLlama-7b-hf"),
] ]
...@@ -225,7 +227,7 @@ def test_decode_prompt_logprobs(complete_sequence_token_ids: List[int], ...@@ -225,7 +227,7 @@ def test_decode_prompt_logprobs(complete_sequence_token_ids: List[int],
]) ])
@pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 7, 16, -1]) @pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 7, 16, -1])
def test_decode_prompt_logprobs_chunked_prefill( def test_decode_prompt_logprobs_chunked_prefill(
vllm_runner, vllm_runner,
......
...@@ -5,10 +5,12 @@ only get the `eos_token_id` from the tokenizer as defined by ...@@ -5,10 +5,12 @@ only get the `eos_token_id` from the tokenizer as defined by
""" """
from vllm.transformers_utils.config import try_get_generation_config from vllm.transformers_utils.config import try_get_generation_config
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizer import get_tokenizer
from ..utils import models_path_prefix
import os
def test_get_llama3_eos_token(): def test_get_llama3_eos_token():
model_name = "meta-llama/Meta-Llama-3-8B-Instruct" model_name = os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct")
tokenizer = get_tokenizer(model_name) tokenizer = get_tokenizer(model_name)
assert tokenizer.eos_token_id == 128009 assert tokenizer.eos_token_id == 128009
...@@ -20,7 +22,7 @@ def test_get_llama3_eos_token(): ...@@ -20,7 +22,7 @@ def test_get_llama3_eos_token():
def test_get_blip2_eos_token(): def test_get_blip2_eos_token():
model_name = "Salesforce/blip2-opt-2.7b" model_name = os.path.join(models_path_prefix, "Salesforce/blip2-opt-2.7b")
tokenizer = get_tokenizer(model_name) tokenizer = get_tokenizer(model_name)
assert tokenizer.eos_token_id == 2 assert tokenizer.eos_token_id == 2
......
import pytest import pytest
import os
from transformers import PreTrainedTokenizerBase from transformers import PreTrainedTokenizerBase
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizer import get_tokenizer
from ..utils import models_path_prefix
TOKENIZER_NAMES = [ TOKENIZER_NAMES = [
"facebook/opt-125m", os.path.join(models_path_prefix, "facebook/opt-125m"),
"gpt2", os.path.join(models_path_prefix, "gpt2"),
] ]
......
...@@ -13,6 +13,7 @@ from vllm.transformers_utils.tokenizer_group.ray_tokenizer_group import ( ...@@ -13,6 +13,7 @@ from vllm.transformers_utils.tokenizer_group.ray_tokenizer_group import (
RayTokenizerGroupPool) RayTokenizerGroupPool)
from ..conftest import get_tokenizer_pool_config from ..conftest import get_tokenizer_pool_config
from ..utils import models_path_prefix
class CustomTokenizerGroup(TokenizerGroup): class CustomTokenizerGroup(TokenizerGroup):
...@@ -30,7 +31,7 @@ class CustomTokenizerGroup(TokenizerGroup): ...@@ -30,7 +31,7 @@ class CustomTokenizerGroup(TokenizerGroup):
@pytest.mark.parametrize("tokenizer_group_type", @pytest.mark.parametrize("tokenizer_group_type",
[None, "ray", CustomTokenizerGroup]) [None, "ray", CustomTokenizerGroup])
async def test_tokenizer_group(tokenizer_group_type): async def test_tokenizer_group(tokenizer_group_type):
reference_tokenizer = AutoTokenizer.from_pretrained("gpt2") reference_tokenizer = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, "gpt2"))
tokenizer_group = get_tokenizer_group( tokenizer_group = get_tokenizer_group(
get_tokenizer_pool_config(tokenizer_group_type), get_tokenizer_pool_config(tokenizer_group_type),
tokenizer_id="gpt2", tokenizer_id="gpt2",
...@@ -54,7 +55,7 @@ async def test_tokenizer_group(tokenizer_group_type): ...@@ -54,7 +55,7 @@ async def test_tokenizer_group(tokenizer_group_type):
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("tokenizer_group_type", ["ray"]) @pytest.mark.parametrize("tokenizer_group_type", ["ray"])
async def test_tokenizer_group_pool(tokenizer_group_type): async def test_tokenizer_group_pool(tokenizer_group_type):
reference_tokenizer = AutoTokenizer.from_pretrained("gpt2") reference_tokenizer = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, "gpt2"))
tokenizer_group_pool = get_tokenizer_group( tokenizer_group_pool = get_tokenizer_group(
get_tokenizer_pool_config(tokenizer_group_type), get_tokenizer_pool_config(tokenizer_group_type),
tokenizer_id="gpt2", tokenizer_id="gpt2",
......
import os import os
from ..utils import compare_two_settings from ..utils import compare_two_settings, models_path_prefix
# --enforce-eager on TPU causes graph compilation # --enforce-eager on TPU causes graph compilation
# this times out default Health Check in the MQLLMEngine, # this times out default Health Check in the MQLLMEngine,
...@@ -9,7 +9,7 @@ os.environ["VLLM_RPC_TIMEOUT"] = "30000" ...@@ -9,7 +9,7 @@ os.environ["VLLM_RPC_TIMEOUT"] = "30000"
def test_custom_dispatcher(): def test_custom_dispatcher():
compare_two_settings("google/gemma-2b", compare_two_settings(os.path.join(models_path_prefix, "google/gemma-2b"),
arg1=["--enforce-eager"], arg1=["--enforce-eager"],
arg2=["--enforce-eager"], arg2=["--enforce-eager"],
env1={"VLLM_DYNAMO_USE_CUSTOM_DISPATCHER": "0"}, env1={"VLLM_DYNAMO_USE_CUSTOM_DISPATCHER": "0"},
......
...@@ -15,6 +15,7 @@ from opentelemetry.sdk.environment_variables import ( ...@@ -15,6 +15,7 @@ from opentelemetry.sdk.environment_variables import (
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.tracing import SpanAttributes from vllm.tracing import SpanAttributes
from ..utils import models_path_prefix
FAKE_TRACE_SERVER_ADDRESS = "localhost:4317" FAKE_TRACE_SERVER_ADDRESS = "localhost:4317"
...@@ -73,7 +74,7 @@ def test_traces(trace_service): ...@@ -73,7 +74,7 @@ def test_traces(trace_service):
sampling_params = SamplingParams(temperature=0.01, sampling_params = SamplingParams(temperature=0.01,
top_p=0.1, top_p=0.1,
max_tokens=256) max_tokens=256)
model = "facebook/opt-125m" model = os.path.join(models_path_prefix, "facebook/opt-125m")
llm = LLM( llm = LLM(
model=model, model=model,
otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
...@@ -129,7 +130,7 @@ def test_traces_with_detailed_steps(trace_service): ...@@ -129,7 +130,7 @@ def test_traces_with_detailed_steps(trace_service):
sampling_params = SamplingParams(temperature=0.01, sampling_params = SamplingParams(temperature=0.01,
top_p=0.1, top_p=0.1,
max_tokens=256) max_tokens=256)
model = "facebook/opt-125m" model = os.path.join(models_path_prefix, "facebook/opt-125m")
llm = LLM( llm = LLM(
model=model, model=model,
otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
......
...@@ -25,6 +25,11 @@ from vllm.model_executor.model_loader.loader import get_model_loader ...@@ -25,6 +25,11 @@ from vllm.model_executor.model_loader.loader import get_model_loader
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import (FlexibleArgumentParser, cuda_device_count_stateless, from vllm.utils import (FlexibleArgumentParser, cuda_device_count_stateless,
get_open_port, is_hip) get_open_port, is_hip)
import vllm.envs as envs
import os
# Root directory that holds locally stored copies of the test models.
# VLLM_OPTEST_MODELS_PATH takes precedence over OPTEST_MODELS_PATH.
# When neither variable is set (or both are empty) fall back to "" rather
# than None, so that os.path.join(models_path_prefix, "org/model") returns
# the plain model id instead of raising TypeError on a None prefix.
models_path_prefix = (os.getenv("VLLM_OPTEST_MODELS_PATH")
                      or os.getenv("OPTEST_MODELS_PATH")
                      or "")
if current_platform.is_rocm(): if current_platform.is_rocm():
from amdsmi import (amdsmi_get_gpu_vram_usage, from amdsmi import (amdsmi_get_gpu_vram_usage,
......
import os import os
import torch import torch
from ..utils import models_path_prefix
MAX_MODEL_LEN = 1024 MAX_MODEL_LEN = 1024
MODEL_NAME = os.environ.get("MODEL_NAME", MODEL_NAME = os.environ.get("MODEL_NAME",
"robertgshaw2/zephyr-7b-beta-channelwise-gptq") os.path.join(models_path_prefix, "robertgshaw2/zephyr-7b-beta-channelwise-gptq"))
REVISION = os.environ.get("REVISION", "main") REVISION = os.environ.get("REVISION", "main")
QUANTIZATION = os.environ.get("QUANTIZATION", "gptq_marlin") QUANTIZATION = os.environ.get("QUANTIZATION", "gptq_marlin")
......
...@@ -3,12 +3,14 @@ from typing import List ...@@ -3,12 +3,14 @@ from typing import List
import pytest import pytest
import torch import torch
import os
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
from vllm.utils import is_cpu, make_tensor_with_pad from vllm.utils import is_cpu, make_tensor_with_pad
from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner
from vllm.worker.model_runner import _get_graph_batch_size from vllm.worker.model_runner import _get_graph_batch_size
from ..utils import models_path_prefix
BATCH_SIZES = [1, 4, 16, 64, 256] BATCH_SIZES = [1, 4, 16, 64, 256]
...@@ -40,7 +42,7 @@ def test_empty_seq_group(): ...@@ -40,7 +42,7 @@ def test_empty_seq_group():
for empty seq group list""" for empty seq group list"""
model_runner = _create_model_runner( model_runner = _create_model_runner(
"facebook/bart-base", os.path.join(models_path_prefix, "facebook/bart-base"),
seed=0, seed=0,
dtype="float16", dtype="float16",
max_num_batched_tokens=100000, max_num_batched_tokens=100000,
...@@ -99,7 +101,7 @@ def test_prepare_prompt(batch_size): ...@@ -99,7 +101,7 @@ def test_prepare_prompt(batch_size):
''' '''
model_runner = _create_model_runner( model_runner = _create_model_runner(
"facebook/bart-base", os.path.join(models_path_prefix, "facebook/bart-base"),
seed=0, seed=0,
dtype="float16", dtype="float16",
max_num_batched_tokens=100000, max_num_batched_tokens=100000,
...@@ -291,7 +293,7 @@ def test_prepare_decode(batch_size, multiple_seqs_per_seq_group): ...@@ -291,7 +293,7 @@ def test_prepare_decode(batch_size, multiple_seqs_per_seq_group):
''' '''
model_runner = _create_model_runner( model_runner = _create_model_runner(
"facebook/bart-base", os.path.join(models_path_prefix, "facebook/bart-base"),
seed=0, seed=0,
dtype="float16", dtype="float16",
max_num_batched_tokens=100000, max_num_batched_tokens=100000,
...@@ -494,7 +496,7 @@ def test_prepare_decode_cuda_graph(batch_size, multiple_seqs_per_seq_group): ...@@ -494,7 +496,7 @@ def test_prepare_decode_cuda_graph(batch_size, multiple_seqs_per_seq_group):
for varying input batch sizes. for varying input batch sizes.
""" """
model_runner = _create_model_runner( model_runner = _create_model_runner(
"facebook/bart-base", os.path.join(models_path_prefix, "facebook/bart-base"),
seed=0, seed=0,
dtype="float16", dtype="float16",
max_num_batched_tokens=100000, max_num_batched_tokens=100000,
......
...@@ -2,6 +2,7 @@ from typing import List ...@@ -2,6 +2,7 @@ from typing import List
import pytest import pytest
import torch import torch
import os
from vllm.distributed.parallel_state import (ensure_model_parallel_initialized, from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
init_distributed_environment) init_distributed_environment)
...@@ -10,6 +11,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata ...@@ -10,6 +11,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
from vllm.utils import get_open_port from vllm.utils import get_open_port
from vllm.worker.model_runner import ModelRunner, _get_graph_batch_size from vllm.worker.model_runner import ModelRunner, _get_graph_batch_size
from ..utils import models_path_prefix
def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner: def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner:
...@@ -33,7 +35,7 @@ def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner: ...@@ -33,7 +35,7 @@ def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner:
@pytest.mark.parametrize("batch_size", list(range(1, 257))) @pytest.mark.parametrize("batch_size", list(range(1, 257)))
def test_prepare_prompt(batch_size): def test_prepare_prompt(batch_size):
model_runner = _create_model_runner( model_runner = _create_model_runner(
"facebook/opt-125m", os.path.join(models_path_prefix, "facebook/opt-125m"),
max_num_batched_tokens=100000, max_num_batched_tokens=100000,
max_num_seqs=100000, max_num_seqs=100000,
enable_chunked_prefill=False, enable_chunked_prefill=False,
...@@ -147,7 +149,7 @@ def test_prepare_prompt(batch_size): ...@@ -147,7 +149,7 @@ def test_prepare_prompt(batch_size):
@pytest.mark.parametrize("batch_size", list(range(1, 257))) @pytest.mark.parametrize("batch_size", list(range(1, 257)))
def test_prepare_decode_cuda_graph(batch_size): def test_prepare_decode_cuda_graph(batch_size):
model_runner = _create_model_runner( model_runner = _create_model_runner(
"facebook/opt-125m", os.path.join(models_path_prefix, "facebook/opt-125m"),
seed=0, seed=0,
dtype="float16", dtype="float16",
enforce_eager=False, enforce_eager=False,
...@@ -256,7 +258,7 @@ def test_prepare_decode_cuda_graph(batch_size): ...@@ -256,7 +258,7 @@ def test_prepare_decode_cuda_graph(batch_size):
def test_empty_seq_group(): def test_empty_seq_group():
"""Verify prepare prompt and decode returns empty output.""" """Verify prepare prompt and decode returns empty output."""
model_runner = _create_model_runner( model_runner = _create_model_runner(
"facebook/opt-125m", os.path.join(models_path_prefix, "facebook/opt-125m"),
seed=0, seed=0,
dtype="float16", dtype="float16",
enforce_eager=False, enforce_eager=False,
...@@ -301,7 +303,7 @@ def distributed_init(): ...@@ -301,7 +303,7 @@ def distributed_init():
@pytest.mark.parametrize("enforce_eager", [True, False]) @pytest.mark.parametrize("enforce_eager", [True, False])
def test_hybrid_batches(batch_size, enforce_eager, distributed_init): def test_hybrid_batches(batch_size, enforce_eager, distributed_init):
model_runner = _create_model_runner( model_runner = _create_model_runner(
"facebook/opt-125m", os.path.join(models_path_prefix, "facebook/opt-125m"),
seed=0, seed=0,
dtype="float16", dtype="float16",
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
......
import torch import torch
import os
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.sequence import ExecuteModelRequest from vllm.sequence import ExecuteModelRequest
from vllm.utils import get_distributed_init_method, get_ip, get_open_port from vllm.utils import get_distributed_init_method, get_ip, get_open_port
from vllm.worker.worker import Worker from vllm.worker.worker import Worker
from ..utils import models_path_prefix
def test_swap() -> None: def test_swap() -> None:
# Configure the engine. # Configure the engine.
engine_args = EngineArgs(model="facebook/opt-125m", engine_args = EngineArgs(model=os.path.join(models_path_prefix, "facebook/opt-125m"),
dtype="half", dtype="half",
load_format="dummy") load_format="dummy")
engine_config = engine_args.create_engine_config() engine_config = engine_args.create_engine_config()
......
...@@ -15,6 +15,7 @@ if TYPE_CHECKING: ...@@ -15,6 +15,7 @@ if TYPE_CHECKING:
VLLM_USE_OPT_OP: bool = False VLLM_USE_OPT_OP: bool = False
VLLM_USE_TC_PAGED_ATTN: bool = False VLLM_USE_TC_PAGED_ATTN: bool = False
VLLM_USE_PA_PRINT_PARAM: bool = False VLLM_USE_PA_PRINT_PARAM: bool = False
VLLM_OPTEST_MODELS_PATH: str = ""
LOCAL_RANK: int = 0 LOCAL_RANK: int = 0
CUDA_VISIBLE_DEVICES: Optional[str] = None CUDA_VISIBLE_DEVICES: Optional[str] = None
VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60 VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60
...@@ -213,6 +214,11 @@ environment_variables: Dict[str, Callable[[], Any]] = { ...@@ -213,6 +214,11 @@ environment_variables: Dict[str, Callable[[], Any]] = {
"VLLM_USE_PA_PRINT_PARAM": "VLLM_USE_PA_PRINT_PARAM":
lambda: (os.environ.get("VLLM_USE_PA_PRINT_PARAM", "False").lower() in lambda: (os.environ.get("VLLM_USE_PA_PRINT_PARAM", "False").lower() in
("true", "1")), ("true", "1")),
# Path to the optest models.
# If set, will load models from local path instead of Hugging Face Hub.
'VLLM_OPTEST_MODELS_PATH':
lambda: os.getenv('VLLM_OPTEST_MODELS_PATH', "") or os.getenv("OPTEST_MODELS_PATH", ""),
# If set, allowing the use of deprecated beam search implementation # If set, allowing the use of deprecated beam search implementation
"VLLM_ALLOW_DEPRECATED_BEAM_SEARCH": "VLLM_ALLOW_DEPRECATED_BEAM_SEARCH":
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment