Commit 2e3bfb1e authored by zhuwenwen
Browse files

[tests] update v1 tests

parent 87d06573
......@@ -6,10 +6,10 @@ Run `pytest tests/samplers/test_beam_search.py`.
import pytest
import os
from ..utils import models_path_prefix
from transformers import AutoModelForSeq2SeqLM
from vllm.assets.audio import AudioAsset
from ..utils import models_path_prefix
@pytest.fixture(autouse=True)
......@@ -83,7 +83,7 @@ def test_beam_search_passes_multimodal_data(
# correctly. As such, we just need to check one extra modality to make
# sure things pass through properly.
audios = [AudioAsset("mary_had_lamb").audio_and_sample_rate]
model = "Qwen/Qwen2-Audio-7B-Instruct"
model = os.path.join(models_path_prefix, "Qwen/Qwen2-Audio-7B-Instruct")
audio_seq = "<|audio_bos|><|AUDIO|><|audio_eos|>"
prompts = [
f"<|im_start|>user\n{audio_seq}Can you transcribe this?<|im_end|>\n<|im_start|>assistant\n" #noqa: E501
......
......@@ -69,7 +69,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
# Explicitly specify draft model quantization
{
"speculative_config": {
"model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
"model": os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
"num_speculative_tokens": 5,
"quantization": "gptq",
},
......@@ -77,7 +77,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
# Explicitly specify GPTQ-based draft model to use marlin quantization
{
"speculative_config": {
"model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
"model": os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
"num_speculative_tokens": 5,
"quantization": "marlin",
},
......@@ -85,7 +85,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
# Not explicitly specify draft model quantization
{
"speculative_config": {
"model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
"model": os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
"num_speculative_tokens": 5,
"quantization": None,
},
......@@ -124,7 +124,7 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{
"speculative_config": {
"model": "JackFram/llama-68m",
"model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 3,
"disable_mqa_scorer": True,
},
......
# SPDX-License-Identifier: Apache-2.0
import os
import pytest
import torch
......@@ -22,6 +23,7 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
KVCacheGroupSpec, KVCacheTensor)
from vllm.v1.metrics.stats import PrefixCacheStats
from vllm.v1.request import Request
from ...utils import models_path_prefix
# yapf: enable
......@@ -432,8 +434,8 @@ def test_unify_kv_cache_configs():
@pytest.mark.parametrize(
("model_id", "max_model_len", "want_estimated_max_len"), [
("Qwen/Qwen1.5-7B", 16385, 16384),
("Qwen/Qwen1.5-7B", 16383, 16383),
(os.path.join(models_path_prefix, "Qwen/Qwen1.5-7B"), 16385, 16384),
(os.path.join(models_path_prefix, "Qwen/Qwen1.5-7B"), 16383, 16383),
])
def test_estimate_max_model_len(model_id, max_model_len,
want_estimated_max_len):
......
......@@ -2,6 +2,7 @@
from typing import Optional
from unittest.mock import Mock
import os
import pytest
import torch
......@@ -16,12 +17,13 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager
from ...utils import models_path_prefix
EOS_TOKEN_ID = 50256
def create_scheduler(
model: str = "facebook/opt-125m",
model: str = os.path.join(models_path_prefix, "facebook/opt-125m"),
max_num_seqs: int = 16,
max_num_batched_tokens: int = 8192,
enable_prefix_caching: Optional[bool] = None,
......@@ -211,7 +213,7 @@ def test_schedule(enable_prefix_caching: Optional[bool],
def test_schedule_multimodal_requests():
scheduler = create_scheduler(model="llava-hf/llava-1.5-7b-hf")
scheduler = create_scheduler(model=os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"))
mm_positions = [[PlaceholderRange(offset=i, length=100)]
for i in range(10)]
requests = create_requests(
......@@ -243,7 +245,7 @@ def test_schedule_partial_requests():
there is insufficient encoder budget.
"""
scheduler = create_scheduler(
model="llava-hf/llava-1.5-7b-hf",
model=os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"),
max_num_batched_tokens=1024,
)
mm_positions = [[PlaceholderRange(offset=100, length=600)]
......@@ -303,7 +305,7 @@ def test_schedule_partial_requests():
def test_no_mm_input_chunking():
# Disable multimodal input chunking.
scheduler = create_scheduler(
model="llava-hf/llava-1.5-7b-hf",
model=os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"),
max_num_batched_tokens=1024,
disable_chunked_mm_input=True,
max_model_len=2048,
......@@ -347,7 +349,7 @@ def test_no_mm_input_chunking():
# of a max_num_batched_tokens for the mm input.
with pytest.raises(ValueError):
_ = create_scheduler(
model="llava-hf/llava-1.5-7b-hf",
model=os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"),
max_num_batched_tokens=100,
disable_chunked_mm_input=True,
)
......@@ -362,7 +364,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
"""
scheduler = create_scheduler(
model="facebook/opt-125m",
model=os.path.join(models_path_prefix, "facebook/opt-125m"),
max_num_batched_tokens=1024,
long_prefill_token_threshold=400,
enable_prefix_caching=enable_prefix_caching,
......
......@@ -4,11 +4,12 @@ import os
import pytest
from vllm import LLM
from ...utils import models_path_prefix
if os.getenv("VLLM_USE_V1", "0") != "1":
pytest.skip("Test package requires V1", allow_module_level=True)
MODEL = "meta-llama/Llama-3.2-1B"
MODEL = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")
PROMPT = "Hello my name is Robert and I"
......
# SPDX-License-Identifier: Apache-2.0
"""Test that we handle a startup Error and shutdown."""
import os
import pytest
from tests.utils import wait_for_gpu_memory_to_clear
......@@ -11,8 +12,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.sampling_params import RequestOutputKind
from vllm.utils import cuda_device_count_stateless
from vllm.v1.engine.async_llm import AsyncLLM
from ...utils import models_path_prefix
MODELS = ["meta-llama/Llama-3.2-1B"]
MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]
@pytest.mark.asyncio
......
......@@ -3,6 +3,7 @@
import asyncio
import os
import pytest
from tests.utils import wait_for_gpu_memory_to_clear
......@@ -14,8 +15,9 @@ from vllm.model_executor.models.llama import LlamaForCausalLM
from vllm.utils import cuda_device_count_stateless
from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.engine.exceptions import EngineDeadError
from ...utils import models_path_prefix
MODELS = ["meta-llama/Llama-3.2-1B"]
MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]
def evil_forward(self, *args, **kwargs):
......
......@@ -3,6 +3,7 @@
import asyncio
import os
import pytest
from tests.v1.shutdown.utils import SHUTDOWN_TEST_TIMEOUT_SEC
......@@ -12,8 +13,9 @@ from vllm.inputs.data import TokensPrompt
from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.engine.exceptions import EngineGenerateError
from ...utils import models_path_prefix
MODELS = ["meta-llama/Llama-3.2-1B"]
MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]
@pytest.mark.asyncio
......
# SPDX-License-Identifier: Apache-2.0
"""Test that we handle a startup Error and shutdown."""
import os
import pytest
from tests.utils import wait_for_gpu_memory_to_clear
......@@ -12,8 +13,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.model_executor.models.llama import LlamaForCausalLM
from vllm.utils import cuda_device_count_stateless
from vllm.v1.engine.async_llm import AsyncLLM
from ...utils import models_path_prefix
MODELS = ["meta-llama/Llama-3.2-1B"]
MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]
def evil_method(self, *args, **kwargs):
......@@ -69,7 +71,7 @@ def test_llm_startup_error(monkeypatch, model: str, tensor_parallel_size: int,
Test profiling (forward()) and load weights failures.
TODO(andy) - LLM without multiprocessing.
"""
if model != "meta-llama/Llama-3.2-1B":
if model != os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B"):
pytest.skip(reason="Only test meta-llama/Llama-3.2-1B")
if cuda_device_count_stateless() < tensor_parallel_size:
pytest.skip(reason="Not enough CUDA devices")
......
# SPDX-License-Identifier: Apache-2.0
"""Test whether spec decoding handles the max model length properly."""
import os
import pytest
from vllm import LLM, SamplingParams
from ...utils import models_path_prefix
_PROMPTS = [
"1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1",
......@@ -21,7 +23,7 @@ def test_ngram_max_len(
m.setenv("VLLM_USE_V1", "1")
llm = LLM(
model="facebook/opt-125m",
model=os.path.join(models_path_prefix, "facebook/opt-125m"),
max_model_len=100,
enforce_eager=True, # For faster initialization.
speculative_config={
......@@ -44,11 +46,11 @@ def test_eagle_max_len(
m.setenv("VLLM_USE_V1", "1")
llm = LLM(
model="meta-llama/Meta-Llama-3-8B-Instruct",
model=os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
enforce_eager=True, # For faster initialization.
speculative_config={
"method": "eagle",
"model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
"model": os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3-Instruct-8B"),
"num_speculative_tokens": num_speculative_tokens,
},
max_model_len=100,
......
# SPDX-License-Identifier: Apache-2.0
import os
import numpy as np
from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
from vllm.v1.spec_decode.ngram_proposer import (NgramProposer,
_find_subarray_kmp,
_kmp_lps_array)
from ...utils import models_path_prefix
def test_kmp_lps_array():
......@@ -43,10 +45,10 @@ def test_ngram_proposer():
def ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer:
# Dummy model config. Just to set max_model_len.
model_config = ModelConfig(model="facebook/opt-125m",
model_config = ModelConfig(model=os.path.join(models_path_prefix, "facebook/opt-125m"),
task="generate",
max_model_len=100,
tokenizer="facebook/opt-125m",
tokenizer=os.path.join(models_path_prefix, "facebook/opt-125m"),
tokenizer_mode="auto",
dtype="auto",
seed=None,
......
......@@ -14,9 +14,10 @@ from vllm.platforms import current_platform
from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.engine.core_client import DPAsyncMPClient
from ..utils import models_path_prefix
engine_args = AsyncEngineArgs(
model="ibm-research/PowerMoE-3b",
model=os.path.join(models_path_prefix, "ibm-research/PowerMoE-3b"),
enforce_eager=True,
disable_log_requests=True,
tensor_parallel_size=int(os.getenv("TP_SIZE", 1)),
......
......@@ -7,16 +7,17 @@ import vllm.envs as envs
from vllm import LLM
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from ..utils import models_path_prefix
UNSUPPORTED_MODELS_V1 = [
"openai/whisper-large-v3", # transcription
"facebook/bart-large-cnn", # encoder decoder
"mistralai/Mamba-Codestral-7B-v0.1", # mamba
"ibm-ai-platform/Bamba-9B", # hybrid
"BAAI/bge-m3", # embedding
os.path.join(models_path_prefix, "openai/whisper-large-v3"), # transcription
os.path.join(models_path_prefix, "facebook/bart-large-cnn"), # encoder decoder
os.path.join(models_path_prefix, "mistralai/Mamba-Codestral-7B-v0.1"), # mamba
os.path.join(models_path_prefix, "ibm-ai-platform/Bamba-9B"), # hybrid
os.path.join(models_path_prefix, "BAAI/bge-m3"), # embedding
]
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
MODEL = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
@pytest.mark.parametrize("model", UNSUPPORTED_MODELS_V1)
......
# SPDX-License-Identifier: Apache-2.0
import os
import pytest
from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
......@@ -7,6 +8,7 @@ from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
SchedulerOutput)
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.worker.gpu_model_runner import GPUModelRunner
from ...utils import models_path_prefix
@pytest.fixture
......@@ -17,9 +19,9 @@ def model_runner():
max_model_len=512,
)
model_config = ModelConfig(
model="facebook/opt-125m",
model=os.path.join(models_path_prefix, "facebook/opt-125m"),
task="generate",
tokenizer="facebook/opt-125m",
tokenizer=os.path.join(models_path_prefix, "facebook/opt-125m"),
tokenizer_mode="auto",
trust_remote_code=True,
dtype="float16",
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment