Commit 2e3bfb1e authored by zhuwenwen
Browse files

[tests] update v1 tests

parent 87d06573
...@@ -6,10 +6,10 @@ Run `pytest tests/samplers/test_beam_search.py`. ...@@ -6,10 +6,10 @@ Run `pytest tests/samplers/test_beam_search.py`.
import pytest import pytest
import os import os
from ..utils import models_path_prefix
from transformers import AutoModelForSeq2SeqLM from transformers import AutoModelForSeq2SeqLM
from vllm.assets.audio import AudioAsset from vllm.assets.audio import AudioAsset
from ..utils import models_path_prefix
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
...@@ -83,7 +83,7 @@ def test_beam_search_passes_multimodal_data( ...@@ -83,7 +83,7 @@ def test_beam_search_passes_multimodal_data(
# correctly. As such, we just need to check one extra modality to make # correctly. As such, we just need to check one extra modality to make
# sure things pass through properly. # sure things pass through properly.
audios = [AudioAsset("mary_had_lamb").audio_and_sample_rate] audios = [AudioAsset("mary_had_lamb").audio_and_sample_rate]
model = "Qwen/Qwen2-Audio-7B-Instruct" model = os.path.join(models_path_prefix, "Qwen/Qwen2-Audio-7B-Instruct")
audio_seq = "<|audio_bos|><|AUDIO|><|audio_eos|>" audio_seq = "<|audio_bos|><|AUDIO|><|audio_eos|>"
prompts = [ prompts = [
f"<|im_start|>user\n{audio_seq}Can you transcribe this?<|im_end|>\n<|im_start|>assistant\n" #noqa: E501 f"<|im_start|>user\n{audio_seq}Can you transcribe this?<|im_end|>\n<|im_start|>assistant\n" #noqa: E501
...@@ -140,4 +140,4 @@ def test_beam_search_passes_multimodal_data( ...@@ -140,4 +140,4 @@ def test_beam_search_passes_multimodal_data(
assert filtered_hf_output_ids[-1] == eos_token_id assert filtered_hf_output_ids[-1] == eos_token_id
filtered_hf_output_ids = filtered_hf_output_ids[:-1] filtered_hf_output_ids = filtered_hf_output_ids[:-1]
assert filtered_hf_output_ids == filtered_vllm_output_ids assert filtered_hf_output_ids == filtered_vllm_output_ids
\ No newline at end of file
...@@ -69,7 +69,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs, ...@@ -69,7 +69,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
# Explicitly specify draft model quantization # Explicitly specify draft model quantization
{ {
"speculative_config": { "speculative_config": {
"model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "model": os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
"quantization": "gptq", "quantization": "gptq",
}, },
...@@ -77,7 +77,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs, ...@@ -77,7 +77,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
# Explicitly specify GPTQ-based draft model to use marlin quantization # Explicitly specify GPTQ-based draft model to use marlin quantization
{ {
"speculative_config": { "speculative_config": {
"model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "model": os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
"quantization": "marlin", "quantization": "marlin",
}, },
...@@ -85,7 +85,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs, ...@@ -85,7 +85,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
# Not explicitly specify draft model quantization # Not explicitly specify draft model quantization
{ {
"speculative_config": { "speculative_config": {
"model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "model": os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
"quantization": None, "quantization": None,
}, },
...@@ -124,7 +124,7 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs, ...@@ -124,7 +124,7 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{ @pytest.mark.parametrize("test_llm_kwargs", [{
"speculative_config": { "speculative_config": {
"model": "JackFram/llama-68m", "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 3, "num_speculative_tokens": 3,
"disable_mqa_scorer": True, "disable_mqa_scorer": True,
}, },
...@@ -151,4 +151,4 @@ def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, ...@@ -151,4 +151,4 @@ def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
batch_size, batch_size,
max_output_len=output_len, max_output_len=output_len,
seed=seed, seed=seed,
temperature=0.0) temperature=0.0)
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import pytest import pytest
import torch import torch
...@@ -22,6 +23,7 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, ...@@ -22,6 +23,7 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
KVCacheGroupSpec, KVCacheTensor) KVCacheGroupSpec, KVCacheTensor)
from vllm.v1.metrics.stats import PrefixCacheStats from vllm.v1.metrics.stats import PrefixCacheStats
from vllm.v1.request import Request from vllm.v1.request import Request
from ...utils import models_path_prefix
# yapf: enable # yapf: enable
...@@ -432,8 +434,8 @@ def test_unify_kv_cache_configs(): ...@@ -432,8 +434,8 @@ def test_unify_kv_cache_configs():
@pytest.mark.parametrize( @pytest.mark.parametrize(
("model_id", "max_model_len", "want_estimated_max_len"), [ ("model_id", "max_model_len", "want_estimated_max_len"), [
("Qwen/Qwen1.5-7B", 16385, 16384), (os.path.join(models_path_prefix, "Qwen/Qwen1.5-7B"), 16385, 16384),
("Qwen/Qwen1.5-7B", 16383, 16383), (os.path.join(models_path_prefix, "Qwen/Qwen1.5-7B"), 16383, 16383),
]) ])
def test_estimate_max_model_len(model_id, max_model_len, def test_estimate_max_model_len(model_id, max_model_len,
want_estimated_max_len): want_estimated_max_len):
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
from typing import Optional from typing import Optional
from unittest.mock import Mock from unittest.mock import Mock
import os
import pytest import pytest
import torch import torch
...@@ -16,12 +17,13 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, ...@@ -16,12 +17,13 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager from vllm.v1.structured_output import StructuredOutputManager
from ...utils import models_path_prefix
EOS_TOKEN_ID = 50256 EOS_TOKEN_ID = 50256
def create_scheduler( def create_scheduler(
model: str = "facebook/opt-125m", model: str = os.path.join(models_path_prefix, "facebook/opt-125m"),
max_num_seqs: int = 16, max_num_seqs: int = 16,
max_num_batched_tokens: int = 8192, max_num_batched_tokens: int = 8192,
enable_prefix_caching: Optional[bool] = None, enable_prefix_caching: Optional[bool] = None,
...@@ -211,7 +213,7 @@ def test_schedule(enable_prefix_caching: Optional[bool], ...@@ -211,7 +213,7 @@ def test_schedule(enable_prefix_caching: Optional[bool],
def test_schedule_multimodal_requests(): def test_schedule_multimodal_requests():
scheduler = create_scheduler(model="llava-hf/llava-1.5-7b-hf") scheduler = create_scheduler(model=os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"))
mm_positions = [[PlaceholderRange(offset=i, length=100)] mm_positions = [[PlaceholderRange(offset=i, length=100)]
for i in range(10)] for i in range(10)]
requests = create_requests( requests = create_requests(
...@@ -243,7 +245,7 @@ def test_schedule_partial_requests(): ...@@ -243,7 +245,7 @@ def test_schedule_partial_requests():
there is insufficient encoder budget. there is insufficient encoder budget.
""" """
scheduler = create_scheduler( scheduler = create_scheduler(
model="llava-hf/llava-1.5-7b-hf", model=os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"),
max_num_batched_tokens=1024, max_num_batched_tokens=1024,
) )
mm_positions = [[PlaceholderRange(offset=100, length=600)] mm_positions = [[PlaceholderRange(offset=100, length=600)]
...@@ -303,7 +305,7 @@ def test_schedule_partial_requests(): ...@@ -303,7 +305,7 @@ def test_schedule_partial_requests():
def test_no_mm_input_chunking(): def test_no_mm_input_chunking():
# Disable multimodal input chunking. # Disable multimodal input chunking.
scheduler = create_scheduler( scheduler = create_scheduler(
model="llava-hf/llava-1.5-7b-hf", model=os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"),
max_num_batched_tokens=1024, max_num_batched_tokens=1024,
disable_chunked_mm_input=True, disable_chunked_mm_input=True,
max_model_len=2048, max_model_len=2048,
...@@ -347,7 +349,7 @@ def test_no_mm_input_chunking(): ...@@ -347,7 +349,7 @@ def test_no_mm_input_chunking():
# of a max_num_batched_tokens for the mm input. # of a max_num_batched_tokens for the mm input.
with pytest.raises(ValueError): with pytest.raises(ValueError):
_ = create_scheduler( _ = create_scheduler(
model="llava-hf/llava-1.5-7b-hf", model=os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"),
max_num_batched_tokens=100, max_num_batched_tokens=100,
disable_chunked_mm_input=True, disable_chunked_mm_input=True,
) )
...@@ -362,7 +364,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool): ...@@ -362,7 +364,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
""" """
scheduler = create_scheduler( scheduler = create_scheduler(
model="facebook/opt-125m", model=os.path.join(models_path_prefix, "facebook/opt-125m"),
max_num_batched_tokens=1024, max_num_batched_tokens=1024,
long_prefill_token_threshold=400, long_prefill_token_threshold=400,
enable_prefix_caching=enable_prefix_caching, enable_prefix_caching=enable_prefix_caching,
...@@ -1241,4 +1243,4 @@ def test_memory_leak(): ...@@ -1241,4 +1243,4 @@ def test_memory_leak():
scheduler.update_from_output(scheduler_output, model_runner_output) scheduler.update_from_output(scheduler_output, model_runner_output)
# Confirm no memory leak. # Confirm no memory leak.
assert_scheduler_empty(scheduler) assert_scheduler_empty(scheduler)
\ No newline at end of file
...@@ -4,11 +4,12 @@ import os ...@@ -4,11 +4,12 @@ import os
import pytest import pytest
from vllm import LLM from vllm import LLM
from ...utils import models_path_prefix
if os.getenv("VLLM_USE_V1", "0") != "1": if os.getenv("VLLM_USE_V1", "0") != "1":
pytest.skip("Test package requires V1", allow_module_level=True) pytest.skip("Test package requires V1", allow_module_level=True)
MODEL = "meta-llama/Llama-3.2-1B" MODEL = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")
PROMPT = "Hello my name is Robert and I" PROMPT = "Hello my name is Robert and I"
...@@ -26,4 +27,4 @@ def test_concurrent_partial_prefill(model): ...@@ -26,4 +27,4 @@ def test_concurrent_partial_prefill(model):
outputs = model.generate([PROMPT] * 3) outputs = model.generate([PROMPT] * 3)
assert len(outputs) == 3 assert len(outputs) == 3
for output in outputs: for output in outputs:
assert len(output.outputs) == 1 assert len(output.outputs) == 1
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Test that we handle a startup Error and shutdown.""" """Test that we handle a startup Error and shutdown."""
import os
import pytest import pytest
from tests.utils import wait_for_gpu_memory_to_clear from tests.utils import wait_for_gpu_memory_to_clear
...@@ -11,8 +12,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs ...@@ -11,8 +12,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.sampling_params import RequestOutputKind from vllm.sampling_params import RequestOutputKind
from vllm.utils import cuda_device_count_stateless from vllm.utils import cuda_device_count_stateless
from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.async_llm import AsyncLLM
from ...utils import models_path_prefix
MODELS = ["meta-llama/Llama-3.2-1B"] MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -94,4 +96,4 @@ def test_llm_delete(monkeypatch, model: str, tensor_parallel_size: int, ...@@ -94,4 +96,4 @@ def test_llm_delete(monkeypatch, model: str, tensor_parallel_size: int,
wait_for_gpu_memory_to_clear( wait_for_gpu_memory_to_clear(
devices=list(range(tensor_parallel_size)), devices=list(range(tensor_parallel_size)),
threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES, threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES,
) )
\ No newline at end of file
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
import asyncio import asyncio
import os
import pytest import pytest
from tests.utils import wait_for_gpu_memory_to_clear from tests.utils import wait_for_gpu_memory_to_clear
...@@ -14,8 +15,9 @@ from vllm.model_executor.models.llama import LlamaForCausalLM ...@@ -14,8 +15,9 @@ from vllm.model_executor.models.llama import LlamaForCausalLM
from vllm.utils import cuda_device_count_stateless from vllm.utils import cuda_device_count_stateless
from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.engine.exceptions import EngineDeadError from vllm.v1.engine.exceptions import EngineDeadError
from ...utils import models_path_prefix
MODELS = ["meta-llama/Llama-3.2-1B"] MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]
def evil_forward(self, *args, **kwargs): def evil_forward(self, *args, **kwargs):
...@@ -126,4 +128,4 @@ def test_llm_model_error(monkeypatch, tensor_parallel_size: int, ...@@ -126,4 +128,4 @@ def test_llm_model_error(monkeypatch, tensor_parallel_size: int,
wait_for_gpu_memory_to_clear( wait_for_gpu_memory_to_clear(
devices=list(range(tensor_parallel_size)), devices=list(range(tensor_parallel_size)),
threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES, threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES,
) )
\ No newline at end of file
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
import asyncio import asyncio
import os
import pytest import pytest
from tests.v1.shutdown.utils import SHUTDOWN_TEST_TIMEOUT_SEC from tests.v1.shutdown.utils import SHUTDOWN_TEST_TIMEOUT_SEC
...@@ -12,8 +13,9 @@ from vllm.inputs.data import TokensPrompt ...@@ -12,8 +13,9 @@ from vllm.inputs.data import TokensPrompt
from vllm.sampling_params import RequestOutputKind from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.engine.exceptions import EngineGenerateError from vllm.v1.engine.exceptions import EngineGenerateError
from ...utils import models_path_prefix
MODELS = ["meta-llama/Llama-3.2-1B"] MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -66,4 +68,4 @@ async def test_async_llm_processor_error(model: str) -> None: ...@@ -66,4 +68,4 @@ async def test_async_llm_processor_error(model: str) -> None:
generated_tokens.extend(out.outputs[0].token_ids) generated_tokens.extend(out.outputs[0].token_ids)
assert len(generated_tokens) == EXPECTED_TOKENS assert len(generated_tokens) == EXPECTED_TOKENS
async_llm.shutdown() async_llm.shutdown()
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Test that we handle a startup Error and shutdown.""" """Test that we handle a startup Error and shutdown."""
import os
import pytest import pytest
from tests.utils import wait_for_gpu_memory_to_clear from tests.utils import wait_for_gpu_memory_to_clear
...@@ -12,8 +13,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs ...@@ -12,8 +13,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.model_executor.models.llama import LlamaForCausalLM from vllm.model_executor.models.llama import LlamaForCausalLM
from vllm.utils import cuda_device_count_stateless from vllm.utils import cuda_device_count_stateless
from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.async_llm import AsyncLLM
from ...utils import models_path_prefix
MODELS = ["meta-llama/Llama-3.2-1B"] MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]
def evil_method(self, *args, **kwargs): def evil_method(self, *args, **kwargs):
...@@ -69,7 +71,7 @@ def test_llm_startup_error(monkeypatch, model: str, tensor_parallel_size: int, ...@@ -69,7 +71,7 @@ def test_llm_startup_error(monkeypatch, model: str, tensor_parallel_size: int,
Test profiling (forward()) and load weights failures. Test profiling (forward()) and load weights failures.
TODO(andy) - LLM without multiprocessing. TODO(andy) - LLM without multiprocessing.
""" """
if model != "meta-llama/Llama-3.2-1B": if model != os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B"):
pytest.skip(reason="Only test meta-llama/Llama-3.2-1B") pytest.skip(reason="Only test meta-llama/Llama-3.2-1B")
if cuda_device_count_stateless() < tensor_parallel_size: if cuda_device_count_stateless() < tensor_parallel_size:
pytest.skip(reason="Not enough CUDA devices") pytest.skip(reason="Not enough CUDA devices")
...@@ -94,4 +96,4 @@ def test_llm_startup_error(monkeypatch, model: str, tensor_parallel_size: int, ...@@ -94,4 +96,4 @@ def test_llm_startup_error(monkeypatch, model: str, tensor_parallel_size: int,
wait_for_gpu_memory_to_clear( wait_for_gpu_memory_to_clear(
devices=list(range(tensor_parallel_size)), devices=list(range(tensor_parallel_size)),
threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES, threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES,
) )
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Test whether spec decoding handles the max model length properly.""" """Test whether spec decoding handles the max model length properly."""
import os
import pytest import pytest
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from ...utils import models_path_prefix
_PROMPTS = [ _PROMPTS = [
"1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1", "1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1",
...@@ -21,7 +23,7 @@ def test_ngram_max_len( ...@@ -21,7 +23,7 @@ def test_ngram_max_len(
m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_USE_V1", "1")
llm = LLM( llm = LLM(
model="facebook/opt-125m", model=os.path.join(models_path_prefix, "facebook/opt-125m"),
max_model_len=100, max_model_len=100,
enforce_eager=True, # For faster initialization. enforce_eager=True, # For faster initialization.
speculative_config={ speculative_config={
...@@ -44,11 +46,11 @@ def test_eagle_max_len( ...@@ -44,11 +46,11 @@ def test_eagle_max_len(
m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_USE_V1", "1")
llm = LLM( llm = LLM(
model="meta-llama/Meta-Llama-3-8B-Instruct", model=os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
enforce_eager=True, # For faster initialization. enforce_eager=True, # For faster initialization.
speculative_config={ speculative_config={
"method": "eagle", "method": "eagle",
"model": "yuhuili/EAGLE-LLaMA3-Instruct-8B", "model": os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3-Instruct-8B"),
"num_speculative_tokens": num_speculative_tokens, "num_speculative_tokens": num_speculative_tokens,
}, },
max_model_len=100, max_model_len=100,
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import numpy as np import numpy as np
from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
from vllm.v1.spec_decode.ngram_proposer import (NgramProposer, from vllm.v1.spec_decode.ngram_proposer import (NgramProposer,
_find_subarray_kmp, _find_subarray_kmp,
_kmp_lps_array) _kmp_lps_array)
from ...utils import models_path_prefix
def test_kmp_lps_array(): def test_kmp_lps_array():
...@@ -43,10 +45,10 @@ def test_ngram_proposer(): ...@@ -43,10 +45,10 @@ def test_ngram_proposer():
def ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: def ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer:
# Dummy model config. Just to set max_model_len. # Dummy model config. Just to set max_model_len.
model_config = ModelConfig(model="facebook/opt-125m", model_config = ModelConfig(model=os.path.join(models_path_prefix, "facebook/opt-125m"),
task="generate", task="generate",
max_model_len=100, max_model_len=100,
tokenizer="facebook/opt-125m", tokenizer=os.path.join(models_path_prefix, "facebook/opt-125m"),
tokenizer_mode="auto", tokenizer_mode="auto",
dtype="auto", dtype="auto",
seed=None, seed=None,
...@@ -86,4 +88,4 @@ def test_ngram_proposer(): ...@@ -86,4 +88,4 @@ def test_ngram_proposer():
result = ngram_proposer( result = ngram_proposer(
2, 4, 2, 4,
2).propose(context_token_ids=np.array([3, 4, 5, 2, 3, 4, 1, 2, 3, 4])) 2).propose(context_token_ids=np.array([3, 4, 5, 2, 3, 4, 1, 2, 3, 4]))
assert np.array_equal(result, np.array([1, 2])) # Not [5, 2] assert np.array_equal(result, np.array([1, 2])) # Not [5, 2]
\ No newline at end of file
...@@ -14,9 +14,10 @@ from vllm.platforms import current_platform ...@@ -14,9 +14,10 @@ from vllm.platforms import current_platform
from vllm.sampling_params import RequestOutputKind from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.engine.core_client import DPAsyncMPClient from vllm.v1.engine.core_client import DPAsyncMPClient
from ..utils import models_path_prefix
engine_args = AsyncEngineArgs( engine_args = AsyncEngineArgs(
model="ibm-research/PowerMoE-3b", model=os.path.join(models_path_prefix, "ibm-research/PowerMoE-3b"),
enforce_eager=True, enforce_eager=True,
disable_log_requests=True, disable_log_requests=True,
tensor_parallel_size=int(os.getenv("TP_SIZE", 1)), tensor_parallel_size=int(os.getenv("TP_SIZE", 1)),
...@@ -106,4 +107,4 @@ async def test_load(output_kind: RequestOutputKind): ...@@ -106,4 +107,4 @@ async def test_load(output_kind: RequestOutputKind):
await asyncio.sleep(0.5) await asyncio.sleep(0.5)
assert not core_client.engines_running assert not core_client.engines_running
assert not core_client.reqs_in_flight assert not core_client.reqs_in_flight
\ No newline at end of file
...@@ -7,16 +7,17 @@ import vllm.envs as envs ...@@ -7,16 +7,17 @@ import vllm.envs as envs
from vllm import LLM from vllm import LLM
from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.async_llm_engine import AsyncLLMEngine
from ..utils import models_path_prefix
UNSUPPORTED_MODELS_V1 = [ UNSUPPORTED_MODELS_V1 = [
"openai/whisper-large-v3", # transcription os.path.join(models_path_prefix, "openai/whisper-large-v3"), # transcription
"facebook/bart-large-cnn", # encoder decoder os.path.join(models_path_prefix, "facebook/bart-large-cnn"), # encoder decoder
"mistralai/Mamba-Codestral-7B-v0.1", # mamba os.path.join(models_path_prefix, "mistralai/Mamba-Codestral-7B-v0.1"), # mamba
"ibm-ai-platform/Bamba-9B", # hybrid os.path.join(models_path_prefix, "ibm-ai-platform/Bamba-9B"), # hybrid
"BAAI/bge-m3", # embedding os.path.join(models_path_prefix, "BAAI/bge-m3"), # embedding
] ]
MODEL = "meta-llama/Llama-3.2-1B-Instruct" MODEL = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
@pytest.mark.parametrize("model", UNSUPPORTED_MODELS_V1) @pytest.mark.parametrize("model", UNSUPPORTED_MODELS_V1)
...@@ -160,4 +161,4 @@ def test_reject_using_constructor_directly(monkeypatch): ...@@ -160,4 +161,4 @@ def test_reject_using_constructor_directly(monkeypatch):
AsyncLLMEngine._get_executor_cls(vllm_config), AsyncLLMEngine._get_executor_cls(vllm_config),
log_stats=True) log_stats=True)
m.delenv("VLLM_USE_V1") m.delenv("VLLM_USE_V1")
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import pytest import pytest
from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
...@@ -7,6 +8,7 @@ from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData, ...@@ -7,6 +8,7 @@ from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
SchedulerOutput) SchedulerOutput)
from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.worker.gpu_model_runner import GPUModelRunner from vllm.v1.worker.gpu_model_runner import GPUModelRunner
from ...utils import models_path_prefix
@pytest.fixture @pytest.fixture
...@@ -17,9 +19,9 @@ def model_runner(): ...@@ -17,9 +19,9 @@ def model_runner():
max_model_len=512, max_model_len=512,
) )
model_config = ModelConfig( model_config = ModelConfig(
model="facebook/opt-125m", model=os.path.join(models_path_prefix, "facebook/opt-125m"),
task="generate", task="generate",
tokenizer="facebook/opt-125m", tokenizer=os.path.join(models_path_prefix, "facebook/opt-125m"),
tokenizer_mode="auto", tokenizer_mode="auto",
trust_remote_code=True, trust_remote_code=True,
dtype="float16", dtype="float16",
...@@ -276,4 +278,4 @@ def test_update_states_request_unscheduled(model_runner): ...@@ -276,4 +278,4 @@ def test_update_states_request_unscheduled(model_runner):
assert _is_req_scheduled(model_runner, req_ids[0]) assert _is_req_scheduled(model_runner, req_ids[0])
assert _is_req_added(model_runner, req_ids[1]) assert _is_req_added(model_runner, req_ids[1])
assert not _is_req_scheduled(model_runner, req_ids[1]) assert not _is_req_scheduled(model_runner, req_ids[1])
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment