Commit 3c9817d2 authored by zhuwenwen
Browse files

add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from local path...

Add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from a local path instead of the Hugging Face Hub
parent 49204f68
...@@ -6,6 +6,8 @@ from pathlib import Path ...@@ -6,6 +6,8 @@ from pathlib import Path
import pytest import pytest
import requests import requests
import os
from ..utils import models_path_prefix
def _query_server(prompt: str, max_tokens: int = 5) -> dict: def _query_server(prompt: str, max_tokens: int = 5) -> dict:
...@@ -30,7 +32,7 @@ def api_server(tokenizer_pool_size: int, worker_use_ray: bool): ...@@ -30,7 +32,7 @@ def api_server(tokenizer_pool_size: int, worker_use_ray: bool):
"api_server_async_engine.py").absolute() "api_server_async_engine.py").absolute()
commands = [ commands = [
sys.executable, "-u", sys.executable, "-u",
str(script_path), "--model", "facebook/opt-125m", "--host", str(script_path), "--model", os.path.join(models_path_prefix, "facebook/opt-125m"), "--host",
"127.0.0.1", "--tokenizer-pool-size", "127.0.0.1", "--tokenizer-pool-size",
str(tokenizer_pool_size) str(tokenizer_pool_size)
] ]
......
...@@ -18,7 +18,8 @@ from vllm.sampling_params import RequestOutputKind ...@@ -18,7 +18,8 @@ from vllm.sampling_params import RequestOutputKind
from ..conftest import cleanup from ..conftest import cleanup
from ..utils import wait_for_gpu_memory_to_clear from ..utils import wait_for_gpu_memory_to_clear
import os
from ..utils import models_path_prefix
@dataclass @dataclass
class RequestOutput: class RequestOutput:
...@@ -136,7 +137,7 @@ def start_engine(): ...@@ -136,7 +137,7 @@ def start_engine():
print(f"Starting engine with num_scheduler_steps={num_scheduler_steps}") print(f"Starting engine with num_scheduler_steps={num_scheduler_steps}")
return AsyncLLMEngine.from_engine_args( return AsyncLLMEngine.from_engine_args(
AsyncEngineArgs(model="facebook/opt-125m", AsyncEngineArgs(model=os.path.join(models_path_prefix, "facebook/opt-125m"),
enforce_eager=True, enforce_eager=True,
num_scheduler_steps=num_scheduler_steps)) num_scheduler_steps=num_scheduler_steps))
......
...@@ -16,10 +16,12 @@ from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata ...@@ -16,10 +16,12 @@ from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata
from ..models.utils import check_outputs_equal from ..models.utils import check_outputs_equal
from ..utils import multi_gpu_test from ..utils import multi_gpu_test
import os
from ..utils import models_path_prefix
MODELS = [ MODELS = [
"facebook/opt-125m", os.path.join(models_path_prefix, "facebook/opt-125m"),
"meta-llama/Llama-2-7b-hf", os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"),
] ]
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4") TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
...@@ -27,7 +29,7 @@ TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4") ...@@ -27,7 +29,7 @@ TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
def test_vllm_gc_ed(): def test_vllm_gc_ed():
"""Verify vllm instance is GC'ed when it is deleted""" """Verify vllm instance is GC'ed when it is deleted"""
llm = LLM("facebook/opt-125m") llm = LLM(os.path.join(models_path_prefix, "facebook/opt-125m"))
weak_llm = weakref.ref(llm) weak_llm = weakref.ref(llm)
del llm del llm
# If there's any circular reference to vllm, this fails # If there's any circular reference to vllm, this fails
...@@ -78,14 +80,14 @@ def test_models( ...@@ -78,14 +80,14 @@ def test_models(
# @pytest.mark.parametrize( # @pytest.mark.parametrize(
# "model, distributed_executor_backend, attention_backend, " # "model, distributed_executor_backend, attention_backend, "
# "test_suite", [ # "test_suite", [
# ("facebook/opt-125m", "ray", "", "L4"), # (os.path.join(models_path_prefix, "facebook/opt-125m"), "ray", "", "L4"),
# ("facebook/opt-125m", "mp", "", "L4"), # (os.path.join(models_path_prefix, "facebook/opt-125m"), "mp", "", "L4"),
# ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"), # (os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"), "ray", "", "L4"),
# ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"), # (os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"), "mp", "", "L4"),
# ("facebook/opt-125m", "ray", "", "A100"), # (os.path.join(models_path_prefix, "facebook/opt-125m"), "ray", "", "A100"),
# ("facebook/opt-125m", "mp", "", "A100"), # (os.path.join(models_path_prefix, "facebook/opt-125m"), "mp", "", "A100"),
# ("facebook/opt-125m", "mp", "FLASHINFER", "A100"), # (os.path.join(models_path_prefix, "facebook/opt-125m"), "mp", "FLASHINFER", "A100"),
# ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"), # (os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "ray", "FLASHINFER", "A100"),
# ]) # ])
# def test_models_distributed( # def test_models_distributed(
# hf_runner, # hf_runner,
...@@ -138,7 +140,7 @@ def test_model_with_failure(vllm_runner) -> None: ...@@ -138,7 +140,7 @@ def test_model_with_failure(vllm_runner) -> None:
with patch("vllm.model_executor.models.opt.OPTForCausalLM.forward", with patch("vllm.model_executor.models.opt.OPTForCausalLM.forward",
side_effect=ValueError()): side_effect=ValueError()):
with pytest.raises(ValueError) as exc_info: with pytest.raises(ValueError) as exc_info:
vllm_runner("facebook/opt-125m", vllm_runner(os.path.join(models_path_prefix, "facebook/opt-125m"),
dtype="half", dtype="half",
enforce_eager=False, enforce_eager=False,
gpu_memory_utilization=0.7) gpu_memory_utilization=0.7)
......
...@@ -13,10 +13,12 @@ import pytest ...@@ -13,10 +13,12 @@ import pytest
from ..models.utils import check_logprobs_close, check_outputs_equal from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import multi_gpu_test from ..utils import multi_gpu_test
import os
from ..utils import models_path_prefix
MODELS = [ MODELS = [
"facebook/opt-125m", os.path.join(models_path_prefix, "facebook/opt-125m"),
"meta-llama/Llama-2-7b-hf", os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"),
] ]
...@@ -207,7 +209,7 @@ def test_with_prefix_caching( ...@@ -207,7 +209,7 @@ def test_with_prefix_caching(
Checks exact match decode with and without prefix caching Checks exact match decode with and without prefix caching
with chunked prefill enabled. with chunked prefill enabled.
""" """
model = "meta-llama/Llama-2-7b-chat-hf" model = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf")
# The common prompt has 142 tokens with Llama-2 tokenizer. # The common prompt has 142 tokens with Llama-2 tokenizer.
common_prompt = "You are a helpful AI assistant " * 20 common_prompt = "You are a helpful AI assistant " * 20
unique_prompts = [ unique_prompts = [
......
from ..utils import compare_two_settings import os
from ..utils import compare_two_settings, models_path_prefix
def test_cpu_offload(): def test_cpu_offload():
compare_two_settings("meta-llama/Llama-2-7b-hf", [], compare_two_settings(os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"), [],
["--cpu-offload-gb", "4"]) ["--cpu-offload-gb", "4"])
...@@ -15,9 +15,12 @@ from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT, ...@@ -15,9 +15,12 @@ from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
from ..models.utils import check_outputs_equal from ..models.utils import check_outputs_equal
from ..utils import models_path_prefix
import os
MODELS = [ MODELS = [
"facebook/opt-125m", os.path.join(models_path_prefix, "facebook/opt-125m"),
] ]
@pytest.fixture(scope="module", autouse=True) @pytest.fixture(scope="module", autouse=True)
......
...@@ -6,64 +6,66 @@ from tests.quantization.utils import is_quant_method_supported ...@@ -6,64 +6,66 @@ from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.plugins import set_torch_compile_backend from vllm.plugins import set_torch_compile_backend
from vllm.utils import is_hip from vllm.utils import is_hip
import os
from ..utils import models_path_prefix
TEST_MODELS_SMOKE = [ TEST_MODELS_SMOKE = [
("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", { (os.path.join(models_path_prefix, "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples"), {
"quantization": "compressed-tensors" "quantization": "compressed-tensors"
}), }),
("meta-llama/Meta-Llama-3-8B", {}), (os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), {}),
] ]
TEST_MODELS = [ TEST_MODELS = [
("facebook/opt-125m", {}), (os.path.join(models_path_prefix, "facebook/opt-125m"), {}),
("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", { (os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"), {
"dtype": torch.float16, "dtype": torch.float16,
"quantization": "compressed-tensors" "quantization": "compressed-tensors"
}), }),
("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", { (os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"), {
"dtype": torch.float16, "dtype": torch.float16,
"quantization": "fp8" "quantization": "fp8"
}), }),
("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", { (os.path.join(models_path_prefix, "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples"), {
"quantization": "compressed-tensors" "quantization": "compressed-tensors"
}), }),
("meta-llama/Meta-Llama-3-8B", {}), (os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), {}),
] ]
# TODO: enable in pytorch 2.5 # TODO: enable in pytorch 2.5
if False and is_quant_method_supported("aqlm"): # noqa: SIM223 if False and is_quant_method_supported("aqlm"): # noqa: SIM223
TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", { TEST_MODELS.append((os.path.join(models_path_prefix, "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"), {
"quantization": "aqlm" "quantization": "aqlm"
})) }))
# TODO: enable in pytorch 2.5 # TODO: enable in pytorch 2.5
if False and is_quant_method_supported("gguf"): # noqa: SIM223 if False and is_quant_method_supported("gguf"): # noqa: SIM223
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", { TEST_MODELS.append((os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"), {
"quantization": "gguf" "quantization": "gguf"
})) }))
if is_quant_method_supported("gptq"): if is_quant_method_supported("gptq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", { TEST_MODELS.append((os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"), {
"quantization": "gptq" "quantization": "gptq"
})) }))
if is_quant_method_supported("gptq_marlin"): if is_quant_method_supported("gptq_marlin"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", { TEST_MODELS.append((os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), {
"quantization": "gptq_marlin" "quantization": "gptq_marlin"
})) }))
if is_quant_method_supported("gptq_marlin_24"): if is_quant_method_supported("gptq_marlin_24"):
TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", { TEST_MODELS.append((os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-marlin24-4bit-g128"), {
"quantization": "gptq_marlin_24" "quantization": "gptq_marlin_24"
})) }))
if is_quant_method_supported("marlin"): if is_quant_method_supported("marlin"):
TEST_MODELS.append(("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", { TEST_MODELS.append((os.path.join(models_path_prefix, "robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin"), {
"quantization": "marlin" "quantization": "marlin"
})) }))
if not is_hip() and is_quant_method_supported("awq"): if not is_hip() and is_quant_method_supported("awq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", { TEST_MODELS.append((os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ"), {
"quantization": "AWQ" "quantization": "AWQ"
})) }))
......
...@@ -37,6 +37,7 @@ from vllm.logger import init_logger ...@@ -37,6 +37,7 @@ from vllm.logger import init_logger
from vllm.outputs import RequestOutput from vllm.outputs import RequestOutput
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless, from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
identity, is_cpu) identity, is_cpu)
from utils import models_path_prefix
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -875,8 +876,9 @@ def num_gpus_available(): ...@@ -875,8 +876,9 @@ def num_gpus_available():
return cuda_device_count_stateless() return cuda_device_count_stateless()
temp_dir = tempfile.gettempdir() # temp_dir = tempfile.gettempdir()
_dummy_path = os.path.join(temp_dir, "dummy_opt") # _dummy_path = os.path.join(temp_dir, "dummy_opt")
_dummy_path = os.path.join(models_path_prefix, "facebook/opt-125m")
@pytest.fixture @pytest.fixture
......
...@@ -5,13 +5,15 @@ import pytest ...@@ -5,13 +5,15 @@ import pytest
from vllm import SamplingParams from vllm import SamplingParams
from .conftest import get_token_ids_from_llm_generator from .conftest import get_token_ids_from_llm_generator
import os
from ....utils import models_path_prefix
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
# Use a small model for a fast test. # Use a small model for a fast test.
"model": "facebook/opt-125m", "model": os.path.join(models_path_prefix, "facebook/opt-125m"),
# skip cuda graph creation for fast test. # skip cuda graph creation for fast test.
"enforce_eager": True, "enforce_eager": True,
...@@ -89,7 +91,7 @@ def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator, ...@@ -89,7 +91,7 @@ def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
# Use a small model for a fast test. # Use a small model for a fast test.
"model": "facebook/opt-125m", "model": os.path.join(models_path_prefix, "facebook/opt-125m"),
# skip cuda graph creation for fast test. # skip cuda graph creation for fast test.
"enforce_eager": True, "enforce_eager": True,
...@@ -156,7 +158,7 @@ def test_v1_v2_greedy_equality_with_cow(baseline_llm_generator, ...@@ -156,7 +158,7 @@ def test_v1_v2_greedy_equality_with_cow(baseline_llm_generator,
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
# Use a small model for a fast test. # Use a small model for a fast test.
"model": "facebook/opt-125m", "model": os.path.join(models_path_prefix, "facebook/opt-125m"),
# Our prompts will generate 128 tokens; since the prompts themselves are # Our prompts will generate 128 tokens; since the prompts themselves are
# small, we don't need much KV space beyond 128. # small, we don't need much KV space beyond 128.
...@@ -256,7 +258,7 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, ...@@ -256,7 +258,7 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
[ [
{ {
# Use a small model for a fast test. # Use a small model for a fast test.
"model": "facebook/opt-125m", "model": os.path.join(models_path_prefix, "facebook/opt-125m"),
# skip cuda graph creation for fast test. # skip cuda graph creation for fast test.
"enforce_eager": True, "enforce_eager": True,
...@@ -337,7 +339,7 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator, ...@@ -337,7 +339,7 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
# Use a small model for a fast test. # Use a small model for a fast test.
"model": "facebook/opt-125m", "model": os.path.join(models_path_prefix, "facebook/opt-125m"),
# skip cuda graph creation for fast test. # skip cuda graph creation for fast test.
"enforce_eager": True, "enforce_eager": True,
...@@ -418,7 +420,7 @@ def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption( ...@@ -418,7 +420,7 @@ def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
# Use a small model for a fast test. # Use a small model for a fast test.
"model": "facebook/opt-125m", "model": os.path.join(models_path_prefix, "facebook/opt-125m"),
# skip cuda graph creation for fast test. # skip cuda graph creation for fast test.
"enforce_eager": True, "enforce_eager": True,
...@@ -495,7 +497,7 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator, ...@@ -495,7 +497,7 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
# Use a small model for a fast test. # Use a small model for a fast test.
"model": "facebook/opt-125m", "model": os.path.join(models_path_prefix, "facebook/opt-125m"),
# skip cuda graph creation for fast test. # skip cuda graph creation for fast test.
"enforce_eager": True, "enforce_eager": True,
......
...@@ -2,13 +2,15 @@ import random ...@@ -2,13 +2,15 @@ import random
from typing import List from typing import List
import pytest import pytest
import os
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from .conftest import get_text_from_llm_generator from .conftest import get_text_from_llm_generator
from ...utils import models_path_prefix
# relatively small model with 4k sliding window # relatively small model with 4k sliding window
MODEL = "bigcode/starcoder2-3b" MODEL = os.path.join(models_path_prefix, "bigcode/starcoder2-3b")
BLOCK_SIZE = 16 BLOCK_SIZE = 16
......
...@@ -13,7 +13,7 @@ from transformers import __version__ as transformers_version ...@@ -13,7 +13,7 @@ from transformers import __version__ as transformers_version
from vllm.logger import init_logger from vllm.logger import init_logger
from ..utils import compare_two_settings, fork_new_process_for_each_test from ..utils import compare_two_settings, fork_new_process_for_each_test, models_path_prefix
logger = init_logger("test_pipeline_parallel") logger = init_logger("test_pipeline_parallel")
...@@ -24,22 +24,22 @@ VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1" ...@@ -24,22 +24,22 @@ VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, TRUST_REMOTE_CODE, " ("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, TRUST_REMOTE_CODE, "
"MODEL_NAME, DIST_BACKEND"), "MODEL_NAME, DIST_BACKEND"),
[ [
(2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"), (2, 2, 0, 1, 0, os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "mp"),
(2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"), (2, 2, 1, 0, 0, os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "mp"),
(1, 3, 0, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"), (1, 3, 0, 0, 0, os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "mp"),
(1, 4, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"), (1, 4, 0, 1, 0, os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "mp"),
(1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"), (1, 4, 1, 0, 0, os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "mp"),
(1, 3, 0, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"), (1, 3, 0, 0, 0, os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "ray"),
(1, 4, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"), (1, 4, 0, 1, 0, os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "ray"),
(1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"), (1, 4, 1, 0, 0, os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "ray"),
(2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"), (2, 2, 1, 0, 0, os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "ray"),
(2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"), (2, 2, 0, 1, 0, os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "ray"),
# NOTE: InternVL2 multi-node tests are flaky, # NOTE: InternVL2 multi-node tests are flaky,
# use mp backend to skip the multi-node tests # use mp backend to skip the multi-node tests
(1, 2, 1, 1, 1, "OpenGVLab/InternVL2-1B", "mp"), (1, 2, 1, 1, 1, os.path.join(models_path_prefix, "OpenGVLab/InternVL2-1B"), "mp"),
(1, 2, 1, 1, 1, "OpenGVLab/InternVL2-2B", "mp"), (1, 2, 1, 1, 1, os.path.join(models_path_prefix, "OpenGVLab/InternVL2-2B"), "mp"),
(1, 2, 1, 0, 1, "OpenGVLab/InternVL2-4B", "mp"), (1, 2, 1, 0, 1, os.path.join(models_path_prefix, "OpenGVLab/InternVL2-4B"), "mp"),
(1, 2, 0, 1, 0, "Qwen/Qwen2-VL-2B-Instruct", "mp") (1, 2, 0, 1, 0, os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct"), "mp")
], ],
) )
@fork_new_process_for_each_test @fork_new_process_for_each_test
......
...@@ -2,15 +2,15 @@ import os ...@@ -2,15 +2,15 @@ import os
import pytest import pytest
from ..utils import compare_two_settings, fork_new_process_for_each_test from ..utils import compare_two_settings, fork_new_process_for_each_test, models_path_prefix
@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [ @pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
(2, "JackFram/llama-160m"), (2, os.path.join(models_path_prefix, "JackFram/llama-160m")),
]) ])
@pytest.mark.parametrize("ATTN_BACKEND", [ @pytest.mark.parametrize("ATTN_BACKEND", [
"FLASH_ATTN", "FLASH_ATTN",
"FLASHINFER", # "FLASHINFER",
]) ])
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND): def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
......
...@@ -5,6 +5,7 @@ Run `pytest tests/encoder_decoder/test_e2e_correctness.py`. ...@@ -5,6 +5,7 @@ Run `pytest tests/encoder_decoder/test_e2e_correctness.py`.
from typing import List, Optional, Tuple from typing import List, Optional, Tuple
import pytest import pytest
import os
from transformers import AutoModelForSeq2SeqLM from transformers import AutoModelForSeq2SeqLM
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
...@@ -12,6 +13,7 @@ from vllm.utils import is_cpu ...@@ -12,6 +13,7 @@ from vllm.utils import is_cpu
from ..conftest import DecoderPromptType from ..conftest import DecoderPromptType
from ..models.utils import check_logprobs_close from ..models.utils import check_logprobs_close
from ..utils import models_path_prefix
def vllm_to_hf_output( def vllm_to_hf_output(
...@@ -28,7 +30,7 @@ def vllm_to_hf_output( ...@@ -28,7 +30,7 @@ def vllm_to_hf_output(
return output_ids, hf_output_str, out_logprobs return output_ids, hf_output_str, out_logprobs
@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"]) @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/bart-large-cnn")])
@pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("num_logprobs", [5])
......
...@@ -3,9 +3,11 @@ import pytest ...@@ -3,9 +3,11 @@ import pytest
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine from vllm.engine.llm_engine import LLMEngine
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
import os
from ..utils import models_path_prefix
@pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
@pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("block_size", [16])
def test_computed_prefix_blocks(model: str, block_size: int): def test_computed_prefix_blocks(model: str, block_size: int):
# This test checks if we are able to run the engine to completion # This test checks if we are able to run the engine to completion
......
...@@ -8,6 +8,8 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine ...@@ -8,6 +8,8 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.llm_engine import LLMEngine from vllm.engine.llm_engine import LLMEngine
from vllm.executor.gpu_executor import GPUExecutor, GPUExecutorAsync from vllm.executor.gpu_executor import GPUExecutor, GPUExecutorAsync
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
import os
from ..utils import models_path_prefix
class Mock: class Mock:
...@@ -31,7 +33,7 @@ class CustomGPUExecutorAsync(GPUExecutorAsync): ...@@ -31,7 +33,7 @@ class CustomGPUExecutorAsync(GPUExecutorAsync):
return await super().execute_model_async(*args, **kwargs) return await super().execute_model_async(*args, **kwargs)
@pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
def test_custom_executor_type_checking(model): def test_custom_executor_type_checking(model):
with pytest.raises(ValueError): with pytest.raises(ValueError):
engine_args = EngineArgs(model=model, engine_args = EngineArgs(model=model,
...@@ -47,7 +49,7 @@ def test_custom_executor_type_checking(model): ...@@ -47,7 +49,7 @@ def test_custom_executor_type_checking(model):
AsyncLLMEngine.from_engine_args(engine_args) AsyncLLMEngine.from_engine_args(engine_args)
@pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
def test_custom_executor(model, tmpdir): def test_custom_executor(model, tmpdir):
cwd = os.path.abspath(".") cwd = os.path.abspath(".")
os.chdir(tmpdir) os.chdir(tmpdir)
...@@ -67,7 +69,7 @@ def test_custom_executor(model, tmpdir): ...@@ -67,7 +69,7 @@ def test_custom_executor(model, tmpdir):
os.chdir(cwd) os.chdir(cwd)
@pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
def test_custom_executor_async(model, tmpdir): def test_custom_executor_async(model, tmpdir):
cwd = os.path.abspath(".") cwd = os.path.abspath(".")
os.chdir(tmpdir) os.chdir(tmpdir)
......
...@@ -2,9 +2,11 @@ import pytest ...@@ -2,9 +2,11 @@ import pytest
from vllm.entrypoints.llm import LLM from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
import os
from ..utils import models_path_prefix
@pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
def test_computed_prefix_blocks(model: str): def test_computed_prefix_blocks(model: str):
# This test checks if the engine generates completions both with and # This test checks if the engine generates completions both with and
# without optional detokenization, that detokenization includes text # without optional detokenization, that detokenization includes text
......
...@@ -2,9 +2,11 @@ import pytest ...@@ -2,9 +2,11 @@ import pytest
from vllm.entrypoints.llm import LLM from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
import os
from ..utils import models_path_prefix
@pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
def test_skip_tokenizer_initialization(model: str): def test_skip_tokenizer_initialization(model: str):
# This test checks if the flag skip_tokenizer_init skips the initialization # This test checks if the flag skip_tokenizer_init skips the initialization
# of tokenizer and detokenizer. The generated output is expected to contain # of tokenizer and detokenizer. The generated output is expected to contain
......
...@@ -7,11 +7,13 @@ Run `pytest tests/engine/test_stop_reason.py`. ...@@ -7,11 +7,13 @@ Run `pytest tests/engine/test_stop_reason.py`.
""" """
import pytest import pytest
import os
import transformers import transformers
from vllm import SamplingParams from vllm import SamplingParams
from ..utils import models_path_prefix
MODEL = "facebook/opt-350m" MODEL = os.path.join(models_path_prefix, "facebook/opt-350m")
STOP_STR = "." STOP_STR = "."
SEED = 42 SEED = 42
MAX_TOKENS = 1024 MAX_TOKENS = 1024
......
from typing import Any, List, Optional from typing import Any, List, Optional
import pytest import pytest
import os
from vllm import CompletionOutput, LLMEngine, SamplingParams from vllm import CompletionOutput, LLMEngine, SamplingParams
from ..utils import models_path_prefix
MODEL = "meta-llama/llama-2-7b-hf" MODEL = os.path.join(models_path_prefix, "meta-llama/llama-2-7b-hf")
MAX_TOKENS = 200 MAX_TOKENS = 200
IS_ASYNC = False IS_ASYNC = False
......
...@@ -2,12 +2,14 @@ import weakref ...@@ -2,12 +2,14 @@ import weakref
from typing import List from typing import List
import pytest import pytest
import os
from vllm import LLM, EmbeddingRequestOutput, PoolingParams from vllm import LLM, EmbeddingRequestOutput, PoolingParams
from ...conftest import cleanup from ...conftest import cleanup
from ...utils import models_path_prefix
MODEL_NAME = "intfloat/e5-mistral-7b-instruct" MODEL_NAME = os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct")
PROMPTS = [ PROMPTS = [
"Hello, my name is", "Hello, my name is",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment