Commit 6d2051cc authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.6.3.post1' into v0.6.3.post1-dev

parents 2c7f740a a2c71c54
...@@ -27,19 +27,16 @@ def schedule_and_update_computed_tokens(scheduler): ...@@ -27,19 +27,16 @@ def schedule_and_update_computed_tokens(scheduler):
return metas, out return metas, out
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_simple():
def test_simple(use_v2_block_manager: bool):
"""Verify basic scheduling works.""" """Verify basic scheduling works."""
block_size = 4 block_size = 4
num_seq_group = 4 num_seq_group = 4
max_model_len = 16 max_model_len = 16
max_num_batched_tokens = 64 max_num_batched_tokens = 64
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(max_num_batched_tokens,
max_num_batched_tokens, num_seq_group,
num_seq_group, max_model_len,
max_model_len, enable_chunked_prefill=True)
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8 cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8 cache_config.num_gpu_blocks = 8
...@@ -74,8 +71,7 @@ def test_simple(use_v2_block_manager: bool): ...@@ -74,8 +71,7 @@ def test_simple(use_v2_block_manager: bool):
assert len(seq_group_meta) == num_seq_group assert len(seq_group_meta) == num_seq_group
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_chunk():
def test_chunk(use_v2_block_manager: bool):
"""Verify prefills are chunked properly.""" """Verify prefills are chunked properly."""
block_size = 4 block_size = 4
max_seqs = 60 max_seqs = 60
...@@ -86,7 +82,7 @@ def test_chunk(use_v2_block_manager: bool): ...@@ -86,7 +82,7 @@ def test_chunk(use_v2_block_manager: bool):
max_seqs, max_seqs,
max_model_len, max_model_len,
enable_chunked_prefill=True, enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager) )
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 32 cache_config.num_cpu_blocks = 32
cache_config.num_gpu_blocks = 32 cache_config.num_gpu_blocks = 32
...@@ -124,8 +120,7 @@ def test_chunk(use_v2_block_manager: bool): ...@@ -124,8 +120,7 @@ def test_chunk(use_v2_block_manager: bool):
assert out.num_batched_tokens == 57 assert out.num_batched_tokens == 57
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_complex():
def test_complex(use_v2_block_manager: bool):
block_size = 4 block_size = 4
max_seqs = 60 max_seqs = 60
max_model_len = 80 max_model_len = 80
...@@ -135,7 +130,7 @@ def test_complex(use_v2_block_manager: bool): ...@@ -135,7 +130,7 @@ def test_complex(use_v2_block_manager: bool):
max_seqs, max_seqs,
max_model_len, max_model_len,
enable_chunked_prefill=True, enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager) )
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 64 cache_config.num_cpu_blocks = 64
cache_config.num_gpu_blocks = 64 cache_config.num_gpu_blocks = 64
...@@ -194,8 +189,7 @@ def test_complex(use_v2_block_manager: bool): ...@@ -194,8 +189,7 @@ def test_complex(use_v2_block_manager: bool):
assert running[2].is_prefill() assert running[2].is_prefill()
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_maximal_decoding():
def test_maximal_decoding(use_v2_block_manager: bool):
"""Verify decoding requests are prioritized.""" """Verify decoding requests are prioritized."""
block_size = 4 block_size = 4
max_seqs = 2 max_seqs = 2
...@@ -206,7 +200,7 @@ def test_maximal_decoding(use_v2_block_manager: bool): ...@@ -206,7 +200,7 @@ def test_maximal_decoding(use_v2_block_manager: bool):
max_seqs, max_seqs,
max_model_len, max_model_len,
enable_chunked_prefill=True, enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager) )
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8 cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8 cache_config.num_gpu_blocks = 8
...@@ -288,8 +282,7 @@ def test_maximal_decoding(use_v2_block_manager: bool): ...@@ -288,8 +282,7 @@ def test_maximal_decoding(use_v2_block_manager: bool):
assert out.num_batched_tokens == 2 assert out.num_batched_tokens == 2
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_prompt_limit():
def test_prompt_limit(use_v2_block_manager: bool):
"""Verify max_num_batched_tokens < max_model_len is possible.""" """Verify max_num_batched_tokens < max_model_len is possible."""
block_size = 4 block_size = 4
max_seqs = 32 max_seqs = 32
...@@ -300,7 +293,7 @@ def test_prompt_limit(use_v2_block_manager: bool): ...@@ -300,7 +293,7 @@ def test_prompt_limit(use_v2_block_manager: bool):
max_seqs, max_seqs,
max_model_len, max_model_len,
enable_chunked_prefill=True, enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager) )
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 16 cache_config.num_cpu_blocks = 16
cache_config.num_gpu_blocks = 16 cache_config.num_gpu_blocks = 16
...@@ -323,8 +316,7 @@ def test_prompt_limit(use_v2_block_manager: bool): ...@@ -323,8 +316,7 @@ def test_prompt_limit(use_v2_block_manager: bool):
assert out.num_batched_tokens == 32 assert out.num_batched_tokens == 32
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_prompt_limit_exceed():
def test_prompt_limit_exceed(use_v2_block_manager: bool):
block_size = 4 block_size = 4
max_seqs = 64 max_seqs = 64
max_model_len = 32 max_model_len = 32
...@@ -349,8 +341,7 @@ def test_prompt_limit_exceed(use_v2_block_manager: bool): ...@@ -349,8 +341,7 @@ def test_prompt_limit_exceed(use_v2_block_manager: bool):
assert out.ignored_seq_groups[0] == seq_group assert out.ignored_seq_groups[0] == seq_group
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_swap():
def test_swap(use_v2_block_manager: bool):
"""Verify swapping works with chunked prefill requests""" """Verify swapping works with chunked prefill requests"""
block_size = 4 block_size = 4
max_seqs = 30 max_seqs = 30
...@@ -361,7 +352,7 @@ def test_swap(use_v2_block_manager: bool): ...@@ -361,7 +352,7 @@ def test_swap(use_v2_block_manager: bool):
max_seqs, max_seqs,
max_model_len, max_model_len,
enable_chunked_prefill=True, enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager) )
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 16 cache_config.num_cpu_blocks = 16
cache_config.num_gpu_blocks = 16 cache_config.num_gpu_blocks = 16
...@@ -407,8 +398,7 @@ def test_swap(use_v2_block_manager: bool): ...@@ -407,8 +398,7 @@ def test_swap(use_v2_block_manager: bool):
assert out.blocks_to_swap_out == [] assert out.blocks_to_swap_out == []
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_running_prefill_prioritized_over_swap():
def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool):
block_size = 4 block_size = 4
max_seqs = 30 max_seqs = 30
max_model_len = 200 max_model_len = 200
...@@ -418,7 +408,7 @@ def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool): ...@@ -418,7 +408,7 @@ def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool):
max_seqs, max_seqs,
max_model_len, max_model_len,
enable_chunked_prefill=True, enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager) )
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 32 cache_config.num_cpu_blocks = 32
cache_config.num_gpu_blocks = 32 cache_config.num_gpu_blocks = 32
...@@ -501,8 +491,7 @@ def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool): ...@@ -501,8 +491,7 @@ def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool):
assert out.blocks_to_swap_out == [] assert out.blocks_to_swap_out == []
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_chunked_prefill_preempt():
def test_chunked_prefill_preempt(use_v2_block_manager: bool):
"""Verify preempt works with chunked prefill requests""" """Verify preempt works with chunked prefill requests"""
block_size = 4 block_size = 4
max_seqs = 30 max_seqs = 30
...@@ -513,7 +502,7 @@ def test_chunked_prefill_preempt(use_v2_block_manager: bool): ...@@ -513,7 +502,7 @@ def test_chunked_prefill_preempt(use_v2_block_manager: bool):
max_seqs, max_seqs,
max_model_len, max_model_len,
enable_chunked_prefill=True, enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager) )
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 16 cache_config.num_cpu_blocks = 16
cache_config.num_gpu_blocks = 16 cache_config.num_gpu_blocks = 16
...@@ -568,8 +557,7 @@ def test_chunked_prefill_preempt(use_v2_block_manager: bool): ...@@ -568,8 +557,7 @@ def test_chunked_prefill_preempt(use_v2_block_manager: bool):
assert out.num_batched_tokens == max_num_batched_tokens assert out.num_batched_tokens == max_num_batched_tokens
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_chunked_prefill_max_seqs():
def test_chunked_prefill_max_seqs(use_v2_block_manager: bool):
block_size = 4 block_size = 4
max_seqs = 2 max_seqs = 2
max_model_len = 80 max_model_len = 80
...@@ -579,7 +567,7 @@ def test_chunked_prefill_max_seqs(use_v2_block_manager: bool): ...@@ -579,7 +567,7 @@ def test_chunked_prefill_max_seqs(use_v2_block_manager: bool):
max_seqs, max_seqs,
max_model_len, max_model_len,
enable_chunked_prefill=True, enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager) )
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 128 cache_config.num_cpu_blocks = 128
cache_config.num_gpu_blocks = 128 cache_config.num_gpu_blocks = 128
...@@ -622,8 +610,7 @@ def test_chunked_prefill_max_seqs(use_v2_block_manager: bool): ...@@ -622,8 +610,7 @@ def test_chunked_prefill_max_seqs(use_v2_block_manager: bool):
assert not running[1].is_prefill() assert not running[1].is_prefill()
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_perfix_caching():
def test_perfix_caching(use_v2_block_manager: bool):
"""Verify allocating full blocks when prefix caching is enabled.""" """Verify allocating full blocks when prefix caching is enabled."""
block_size = 4 block_size = 4
max_seqs = 10 max_seqs = 10
...@@ -634,7 +621,7 @@ def test_perfix_caching(use_v2_block_manager: bool): ...@@ -634,7 +621,7 @@ def test_perfix_caching(use_v2_block_manager: bool):
max_seqs, max_seqs,
max_model_len, max_model_len,
enable_chunked_prefill=True, enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager) )
cache_config = CacheConfig(block_size, cache_config = CacheConfig(block_size,
1.0, 1.0,
1, 1,
......
import pytest
from tests.conftest import VllmRunner
from tests.core.utils import create_dummy_prompt
from vllm.engine.llm_engine import LLMEngine
from vllm.platforms import current_platform
from vllm.sequence import SequenceGroup
# Model identifier handed to VllmRunner as `model_name` by the test below.
MODEL = "JackFram/llama-160m"
def add_seq_group_to_engine(engine: LLMEngine, seq_group: SequenceGroup):
    """Pass ``seq_group`` to the engine's first scheduler.

    Calls ``add_seq_group`` on ``engine.scheduler[0]``; nothing is returned.
    """
    engine.scheduler[0].add_seq_group(seq_group)
@pytest.mark.parametrize("num_scheduler_steps", [1, 8])
@pytest.mark.parametrize("enable_chunked_prefill", [False, True])
@pytest.mark.parametrize("enforce_eager", [False, True])
def test_num_computed_tokens_update(num_scheduler_steps: int,
                                    enable_chunked_prefill: bool,
                                    enforce_eager: bool):
    """Check ``num_computed_tokens`` after prompt, decode and final steps.

    For each request in turn, a dummy prompt of fixed length is added to the
    engine and stepped until its sequence reports finished. The computed
    token count of the sequence is asserted:

    * to be 0 right after the request is added,
    * to equal ``prompt_len + num_prompt_steps - 1`` after the prompt steps
      (only checked when the sequence is not already finished),
    * to grow by one per ``engine.step()`` during decoding, checked once per
      batch of ``num_scheduler_steps`` steps,
    * to equal ``prompt_len + num_output_tokens - 1`` once finished.

    The combination of multi-step scheduling and chunked prefill is skipped
    when ``current_platform.is_rocm()`` is true.
    """
    multi_step_with_chunking = (num_scheduler_steps > 1
                                and enable_chunked_prefill)

    if multi_step_with_chunking:
        # Only query the platform when the combination is actually requested.
        if current_platform.is_rocm():
            pytest.skip("Multi-step with Chunked-Prefill does not support "
                        "rocm_flash_attn backend")

    # Build the vLLM engine under test.
    vllm_runner = VllmRunner(
        model_name=MODEL,
        gpu_memory_utilization=0.7,
        num_scheduler_steps=num_scheduler_steps,
        enable_chunked_prefill=enable_chunked_prefill,
        enforce_eager=enforce_eager,
    )
    engine: LLMEngine = vllm_runner.model.llm_engine

    # With multi-step + chunked-prefill there is no separate single prompt
    # step: whatever is scheduled always runs for num_scheduler_steps.
    if multi_step_with_chunking:
        prompt_steps = num_scheduler_steps
    else:
        prompt_steps = 1

    prompt_length = 10
    output_lengths = [4, 8, 12, 15, 16, 17]

    for request_index, num_output_tokens in enumerate(output_lengths):
        # Create the sequence and register it with the engine.
        seq, seq_group = create_dummy_prompt(
            request_id=str(request_index),
            prompt_length=prompt_length,
            min_tokens=num_output_tokens,
            max_tokens=num_output_tokens,
        )
        add_seq_group_to_engine(engine, seq_group)

        # Nothing has been computed before the first step.
        assert seq.data.get_num_computed_tokens() == 0

        # Prompt steps.
        for _ in range(prompt_steps):
            engine.step()

        if not seq.is_finished():
            computed_after_prompt = seq.data.get_num_computed_tokens()
            # num_computed_tokens must be correct after the prompt steps.
            assert computed_after_prompt == prompt_length + prompt_steps - 1

        decode_steps_done = 0
        while not seq.is_finished():
            # num_computed_tokens must be correct after the decode steps
            # taken so far.
            expected_computed = computed_after_prompt + decode_steps_done
            assert seq.data.get_num_computed_tokens() == expected_computed

            # Decode steps: one scheduling round of num_scheduler_steps.
            for _ in range(num_scheduler_steps):
                engine.step()
                decode_steps_done += 1

        # num_computed_tokens must be correct once the sequence has finished.
        expected_final = prompt_length + num_output_tokens - 1
        assert seq.data.get_num_computed_tokens() == expected_final
...@@ -3,7 +3,7 @@ from collections import deque ...@@ -3,7 +3,7 @@ from collections import deque
from typing import List, Set, Tuple from typing import List, Set, Tuple
from unittest.mock import MagicMock from unittest.mock import MagicMock
import pytest import pytest # noqa
from torch import Use # noqa from torch import Use # noqa
from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
...@@ -17,11 +17,13 @@ from .utils import (append_new_token, append_new_token_seq_group, ...@@ -17,11 +17,13 @@ from .utils import (append_new_token, append_new_token_seq_group,
schedule_and_update_computed_tokens) schedule_and_update_computed_tokens)
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_scheduler_add_seq_group():
def test_scheduler_add_seq_group(use_v2_block_manager: bool):
block_size = 4 block_size = 4
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
100, 64, 1, use_v2_block_manager=use_v2_block_manager) 100,
64,
1,
)
cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto") cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto")
cache_config.num_cpu_blocks = 4 cache_config.num_cpu_blocks = 4
cache_config.num_gpu_blocks = 4 cache_config.num_gpu_blocks = 4
...@@ -37,11 +39,13 @@ def test_scheduler_add_seq_group(use_v2_block_manager: bool): ...@@ -37,11 +39,13 @@ def test_scheduler_add_seq_group(use_v2_block_manager: bool):
assert scheduler.get_num_unfinished_seq_groups() == i + 1 assert scheduler.get_num_unfinished_seq_groups() == i + 1
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_scheduler_abort_seq_group():
def test_scheduler_abort_seq_group(use_v2_block_manager: bool):
block_size = 4 block_size = 4
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
100, 64, 1, use_v2_block_manager=use_v2_block_manager) 100,
64,
1,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 4 cache_config.num_cpu_blocks = 4
cache_config.num_gpu_blocks = 4 cache_config.num_gpu_blocks = 4
...@@ -61,8 +65,7 @@ def test_scheduler_abort_seq_group(use_v2_block_manager: bool): ...@@ -61,8 +65,7 @@ def test_scheduler_abort_seq_group(use_v2_block_manager: bool):
assert scheduler.get_num_unfinished_seq_groups() == 0 assert scheduler.get_num_unfinished_seq_groups() == 0
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_scheduler_schedule_simple():
def test_scheduler_schedule_simple(use_v2_block_manager: bool):
block_size = 4 block_size = 4
num_seq_group = 4 num_seq_group = 4
max_model_len = 16 max_model_len = 16
...@@ -70,7 +73,7 @@ def test_scheduler_schedule_simple(use_v2_block_manager: bool): ...@@ -70,7 +73,7 @@ def test_scheduler_schedule_simple(use_v2_block_manager: bool):
64, 64,
num_seq_group, num_seq_group,
max_model_len, max_model_len,
use_v2_block_manager=use_v2_block_manager) )
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8 cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8 cache_config.num_gpu_blocks = 8
...@@ -105,8 +108,7 @@ def test_scheduler_schedule_simple(use_v2_block_manager: bool): ...@@ -105,8 +108,7 @@ def test_scheduler_schedule_simple(use_v2_block_manager: bool):
append_new_token(out, 1) append_new_token(out, 1)
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_scheduler_prefill_prioritized():
def test_scheduler_prefill_prioritized(use_v2_block_manager: bool):
"""Verify running batched tokens are not applied to prefill requests.""" """Verify running batched tokens are not applied to prefill requests."""
block_size = 4 block_size = 4
max_model_len = 30 max_model_len = 30
...@@ -115,7 +117,7 @@ def test_scheduler_prefill_prioritized(use_v2_block_manager: bool): ...@@ -115,7 +117,7 @@ def test_scheduler_prefill_prioritized(use_v2_block_manager: bool):
max_batched_num_tokens, max_batched_num_tokens,
2, 2,
max_model_len, max_model_len,
use_v2_block_manager=use_v2_block_manager) )
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 16 cache_config.num_cpu_blocks = 16
cache_config.num_gpu_blocks = 16 cache_config.num_gpu_blocks = 16
...@@ -139,12 +141,14 @@ def test_scheduler_prefill_prioritized(use_v2_block_manager: bool): ...@@ -139,12 +141,14 @@ def test_scheduler_prefill_prioritized(use_v2_block_manager: bool):
assert get_sequence_groups(out) == [seq_group_b] assert get_sequence_groups(out) == [seq_group_b]
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_scheduler_schedule_preempt_abort():
def test_scheduler_schedule_preempt_abort(use_v2_block_manager: bool):
block_size = 4 block_size = 4
max_model_len = 16 max_model_len = 16
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
64, 2, max_model_len, use_v2_block_manager=use_v2_block_manager) 64,
2,
max_model_len,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 2 cache_config.num_cpu_blocks = 2
cache_config.num_gpu_blocks = 2 cache_config.num_gpu_blocks = 2
...@@ -194,8 +198,7 @@ def test_scheduler_schedule_preempt_abort(use_v2_block_manager: bool): ...@@ -194,8 +198,7 @@ def test_scheduler_schedule_preempt_abort(use_v2_block_manager: bool):
assert scheduler.get_num_unfinished_seq_groups() == 1 assert scheduler.get_num_unfinished_seq_groups() == 1
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_scheduler_max_seqs():
def test_scheduler_max_seqs(use_v2_block_manager: bool):
block_size = 4 block_size = 4
num_seq_group = 4 num_seq_group = 4
max_seq_group = 2 max_seq_group = 2
...@@ -204,7 +207,7 @@ def test_scheduler_max_seqs(use_v2_block_manager: bool): ...@@ -204,7 +207,7 @@ def test_scheduler_max_seqs(use_v2_block_manager: bool):
64, 64,
max_seq_group, max_seq_group,
max_model_len, max_model_len,
use_v2_block_manager=use_v2_block_manager) )
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8 cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8 cache_config.num_gpu_blocks = 8
...@@ -242,15 +245,14 @@ def test_scheduler_max_seqs(use_v2_block_manager: bool): ...@@ -242,15 +245,14 @@ def test_scheduler_max_seqs(use_v2_block_manager: bool):
assert set(get_sequence_groups(out)) == set([all_seq_groups[1]]) assert set(get_sequence_groups(out)) == set([all_seq_groups[1]])
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_scheduler_delay_factor():
def test_scheduler_delay_factor(use_v2_block_manager: bool):
block_size = 4 block_size = 4
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
100, 100,
64, 64,
16, 16,
delay_factor=0.5, delay_factor=0.5,
use_v2_block_manager=use_v2_block_manager) )
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8 cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8 cache_config.num_gpu_blocks = 8
...@@ -287,12 +289,10 @@ def test_scheduler_delay_factor(use_v2_block_manager: bool): ...@@ -287,12 +289,10 @@ def test_scheduler_delay_factor(use_v2_block_manager: bool):
append_new_token(out, 1) append_new_token(out, 1)
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_swapped_out_prioritized():
def test_swapped_out_prioritized(use_v2_block_manager: bool):
block_size = 4 block_size = 4
scheduler = initialize_scheduler(max_num_seqs=6, scheduler = initialize_scheduler(max_num_seqs=6,
block_size=block_size, block_size=block_size,
use_v2_block_manager=use_v2_block_manager,
num_cpu_blocks=64, num_cpu_blocks=64,
num_gpu_blocks=64) num_gpu_blocks=64)
# best_of=2 * 3 == 6 sequences. # best_of=2 * 3 == 6 sequences.
...@@ -344,7 +344,6 @@ def initialize_scheduler( ...@@ -344,7 +344,6 @@ def initialize_scheduler(
max_token_budget=1000, max_token_budget=1000,
max_model_len=1000, max_model_len=1000,
lora_config=None, lora_config=None,
use_v2_block_manager=False,
block_size=4, block_size=4,
num_cpu_blocks=8, num_cpu_blocks=8,
num_gpu_blocks=8, num_gpu_blocks=8,
...@@ -354,7 +353,7 @@ def initialize_scheduler( ...@@ -354,7 +353,7 @@ def initialize_scheduler(
max_token_budget, max_token_budget,
max_num_seqs, max_num_seqs,
max_model_len, max_model_len,
use_v2_block_manager=use_v2_block_manager) )
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = num_cpu_blocks cache_config.num_cpu_blocks = num_cpu_blocks
cache_config.num_gpu_blocks = num_gpu_blocks cache_config.num_gpu_blocks = num_gpu_blocks
...@@ -379,15 +378,12 @@ def add_token_budget(budget: SchedulingBudget, ...@@ -379,15 +378,12 @@ def add_token_budget(budget: SchedulingBudget,
budget.add_num_seqs(mock_seq_group.request_id, num_curr_seqs) budget.add_num_seqs(mock_seq_group.request_id, num_curr_seqs)
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_prefill_schedule_max_prompt_len():
def test_prefill_schedule_max_prompt_len(use_v2_block_manager: bool):
""" """
Test prompt longer than max_prompt_len is aborted. Test prompt longer than max_prompt_len is aborted.
""" """
block_size = 4 block_size = 4
scheduler = initialize_scheduler(max_model_len=30, scheduler = initialize_scheduler(max_model_len=30, block_size=block_size)
use_v2_block_manager=use_v2_block_manager,
block_size=block_size)
_, seq_group = create_dummy_prompt("0", _, seq_group = create_dummy_prompt("0",
prompt_length=60, prompt_length=60,
block_size=block_size) block_size=block_size)
...@@ -402,14 +398,12 @@ def test_prefill_schedule_max_prompt_len(use_v2_block_manager: bool): ...@@ -402,14 +398,12 @@ def test_prefill_schedule_max_prompt_len(use_v2_block_manager: bool):
assert len(remaining_waiting) == 0 assert len(remaining_waiting) == 0
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_prefill_schedule_token_budget():
def test_prefill_schedule_token_budget(use_v2_block_manager: bool):
""" """
Test token budget respected. Test token budget respected.
""" """
block_size = 4 block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager, scheduler = initialize_scheduler(block_size=block_size,
block_size=block_size,
num_cpu_blocks=64, num_cpu_blocks=64,
num_gpu_blocks=64) num_gpu_blocks=64)
budget = create_token_budget(token_budget=0) budget = create_token_budget(token_budget=0)
...@@ -439,8 +433,7 @@ def test_prefill_schedule_token_budget(use_v2_block_manager: bool): ...@@ -439,8 +433,7 @@ def test_prefill_schedule_token_budget(use_v2_block_manager: bool):
assert len(remaining_waiting) == 1 assert len(remaining_waiting) == 1
# Test when current_batched_tokens respected. # Test when current_batched_tokens respected.
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager, scheduler = initialize_scheduler(block_size=block_size,
block_size=block_size,
num_cpu_blocks=16, num_cpu_blocks=16,
num_gpu_blocks=16) num_gpu_blocks=16)
budget = create_token_budget(token_budget=60) budget = create_token_budget(token_budget=60)
...@@ -467,14 +460,12 @@ def test_prefill_schedule_token_budget(use_v2_block_manager: bool): ...@@ -467,14 +460,12 @@ def test_prefill_schedule_token_budget(use_v2_block_manager: bool):
assert len(remaining_waiting) == 0 assert len(remaining_waiting) == 0
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_prefill_schedule_max_seqs():
def test_prefill_schedule_max_seqs(use_v2_block_manager: bool):
""" """
Test max seq respected. Test max seq respected.
""" """
block_size = 4 block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager, scheduler = initialize_scheduler(block_size=block_size,
block_size=block_size,
num_cpu_blocks=64, num_cpu_blocks=64,
num_gpu_blocks=64) num_gpu_blocks=64)
budget = create_token_budget(max_num_seqs=2) budget = create_token_budget(max_num_seqs=2)
...@@ -508,15 +499,13 @@ def test_prefill_schedule_max_seqs(use_v2_block_manager: bool): ...@@ -508,15 +499,13 @@ def test_prefill_schedule_max_seqs(use_v2_block_manager: bool):
assert len(remaining_waiting) == 1 assert len(remaining_waiting) == 1
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_prefill_schedule_max_lora():
def test_prefill_schedule_max_lora(use_v2_block_manager: bool):
""" """
Test max lora is respected and prioritized. Test max lora is respected and prioritized.
""" """
block_size = 4 block_size = 4
lora_config = LoRAConfig(max_lora_rank=8, max_loras=1) lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
scheduler = initialize_scheduler(lora_config=lora_config, scheduler = initialize_scheduler(lora_config=lora_config,
use_v2_block_manager=use_v2_block_manager,
block_size=block_size, block_size=block_size,
num_cpu_blocks=64, num_cpu_blocks=64,
num_gpu_blocks=64) num_gpu_blocks=64)
...@@ -563,14 +552,12 @@ def test_prefill_schedule_max_lora(use_v2_block_manager: bool): ...@@ -563,14 +552,12 @@ def test_prefill_schedule_max_lora(use_v2_block_manager: bool):
assert budget.num_batched_tokens == 60 assert budget.num_batched_tokens == 60
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_prefill_schedule_no_block_manager_capacity():
def test_prefill_schedule_no_block_manager_capacity(use_v2_block_manager):
""" """
Test sequence cannot be scheduled due to block manager has no capacity. Test sequence cannot be scheduled due to block manager has no capacity.
""" """
block_size = 4 block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager, scheduler = initialize_scheduler(block_size=block_size,
block_size=block_size,
num_gpu_blocks=128, num_gpu_blocks=128,
num_cpu_blocks=128) num_cpu_blocks=128)
budget = create_token_budget() budget = create_token_budget()
...@@ -607,14 +594,12 @@ def test_prefill_schedule_no_block_manager_capacity(use_v2_block_manager): ...@@ -607,14 +594,12 @@ def test_prefill_schedule_no_block_manager_capacity(use_v2_block_manager):
assert len(remaining_waiting) == 0 assert len(remaining_waiting) == 0
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_decode_schedule_preempted():
def test_decode_schedule_preempted(use_v2_block_manager: bool):
""" """
Test decodes cannot be scheduled and preempted. Test decodes cannot be scheduled and preempted.
""" """
block_size = 4 block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager, scheduler = initialize_scheduler(block_size=block_size,
block_size=block_size,
num_cpu_blocks=64, num_cpu_blocks=64,
num_gpu_blocks=64) num_gpu_blocks=64)
curr_loras = None curr_loras = None
...@@ -653,14 +638,12 @@ def test_decode_schedule_preempted(use_v2_block_manager: bool): ...@@ -653,14 +638,12 @@ def test_decode_schedule_preempted(use_v2_block_manager: bool):
assert output.blocks_to_copy == [] assert output.blocks_to_copy == []
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_decode_swap_beam_search():
def test_decode_swap_beam_search(use_v2_block_manager: bool):
""" """
Test best_of > 1 swap out blocks Test best_of > 1 swap out blocks
""" """
block_size = 4 block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager, scheduler = initialize_scheduler(block_size=block_size,
block_size=block_size,
num_gpu_blocks=64, num_gpu_blocks=64,
num_cpu_blocks=64) num_cpu_blocks=64)
curr_loras = None curr_loras = None
...@@ -709,14 +692,12 @@ def test_decode_swap_beam_search(use_v2_block_manager: bool): ...@@ -709,14 +692,12 @@ def test_decode_swap_beam_search(use_v2_block_manager: bool):
assert output.blocks_to_copy == [] assert output.blocks_to_copy == []
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_schedule_decode_blocks_to_copy_update():
def test_schedule_decode_blocks_to_copy_update(use_v2_block_manager: bool):
""" """
Verify blocks_to_copy is updated. Verify blocks_to_copy is updated.
""" """
block_size = 4 block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager, scheduler = initialize_scheduler(block_size=4,
block_size=4,
num_cpu_blocks=16, num_cpu_blocks=16,
num_gpu_blocks=16) num_gpu_blocks=16)
_, seq_group = create_dummy_prompt("1", _, seq_group = create_dummy_prompt("1",
...@@ -747,11 +728,9 @@ def test_schedule_decode_blocks_to_copy_update(use_v2_block_manager: bool): ...@@ -747,11 +728,9 @@ def test_schedule_decode_blocks_to_copy_update(use_v2_block_manager: bool):
assert output.blocks_to_copy == [(2, 3)] assert output.blocks_to_copy == [(2, 3)]
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_schedule_swapped_simple():
def test_schedule_swapped_simple(use_v2_block_manager: bool):
block_size = 4 block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager, scheduler = initialize_scheduler(block_size=block_size)
block_size=block_size)
curr_loras = None curr_loras = None
blocks_to_swap_out: List[Tuple[int, int]] = [] blocks_to_swap_out: List[Tuple[int, int]] = []
_, seq_group = create_dummy_prompt("1", _, seq_group = create_dummy_prompt("1",
...@@ -778,11 +757,9 @@ def test_schedule_swapped_simple(use_v2_block_manager: bool): ...@@ -778,11 +757,9 @@ def test_schedule_swapped_simple(use_v2_block_manager: bool):
assert blocks_to_swap_out == blocks_to_swap_in_reverse assert blocks_to_swap_out == blocks_to_swap_in_reverse
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_schedule_swapped_max_token_budget():
def test_schedule_swapped_max_token_budget(use_v2_block_manager: bool):
block_size = 4 block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager, scheduler = initialize_scheduler(block_size=block_size,
block_size=block_size,
num_cpu_blocks=32, num_cpu_blocks=32,
num_gpu_blocks=32) num_gpu_blocks=32)
curr_loras = None curr_loras = None
...@@ -815,11 +792,9 @@ def test_schedule_swapped_max_token_budget(use_v2_block_manager: bool): ...@@ -815,11 +792,9 @@ def test_schedule_swapped_max_token_budget(use_v2_block_manager: bool):
assert len(output.prefill_seq_groups) == 0 assert len(output.prefill_seq_groups) == 0
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_schedule_swapped_max_seqs():
def test_schedule_swapped_max_seqs(use_v2_block_manager: bool):
block_size = 4 block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager, scheduler = initialize_scheduler(block_size=block_size,
block_size=block_size,
num_cpu_blocks=64, num_cpu_blocks=64,
num_gpu_blocks=64) num_gpu_blocks=64)
curr_loras = None curr_loras = None
...@@ -852,12 +827,10 @@ def test_schedule_swapped_max_seqs(use_v2_block_manager: bool): ...@@ -852,12 +827,10 @@ def test_schedule_swapped_max_seqs(use_v2_block_manager: bool):
assert len(output.prefill_seq_groups) == 0 assert len(output.prefill_seq_groups) == 0
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_schedule_swapped_max_loras():
def test_schedule_swapped_max_loras(use_v2_block_manager: bool):
block_size = 4 block_size = 4
lora_config = LoRAConfig(max_lora_rank=8, max_loras=1) lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
scheduler = initialize_scheduler(lora_config=lora_config, scheduler = initialize_scheduler(lora_config=lora_config,
use_v2_block_manager=use_v2_block_manager,
block_size=block_size, block_size=block_size,
num_cpu_blocks=32, num_cpu_blocks=32,
num_gpu_blocks=32) num_gpu_blocks=32)
...@@ -887,11 +860,9 @@ def test_schedule_swapped_max_loras(use_v2_block_manager: bool): ...@@ -887,11 +860,9 @@ def test_schedule_swapped_max_loras(use_v2_block_manager: bool):
assert len(curr_loras) == 1 assert len(curr_loras) == 1
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_schedule_swapped_cannot_swap_in():
def test_schedule_swapped_cannot_swap_in(use_v2_block_manager: bool):
block_size = 4 block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager, scheduler = initialize_scheduler(block_size=block_size,
block_size=block_size,
num_cpu_blocks=32, num_cpu_blocks=32,
num_gpu_blocks=32) num_gpu_blocks=32)
curr_loras = None curr_loras = None
...@@ -920,11 +891,9 @@ def test_schedule_swapped_cannot_swap_in(use_v2_block_manager: bool): ...@@ -920,11 +891,9 @@ def test_schedule_swapped_cannot_swap_in(use_v2_block_manager: bool):
assert len(output.prefill_seq_groups) == 0 assert len(output.prefill_seq_groups) == 0
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_infeasible_swap():
def test_infeasible_swap(use_v2_block_manager: bool):
block_size = 4 block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager, scheduler = initialize_scheduler(block_size=block_size,
block_size=block_size,
num_cpu_blocks=32, num_cpu_blocks=32,
num_gpu_blocks=32) num_gpu_blocks=32)
curr_loras = None curr_loras = None
...@@ -954,11 +923,9 @@ def test_infeasible_swap(use_v2_block_manager: bool): ...@@ -954,11 +923,9 @@ def test_infeasible_swap(use_v2_block_manager: bool):
assert len(output.prefill_seq_groups) == 0 assert len(output.prefill_seq_groups) == 0
@pytest.mark.parametrize('use_v2_block_manager', [True, False]) def test_schedule_swapped_blocks_to_copy():
def test_schedule_swapped_blocks_to_copy(use_v2_block_manager: bool):
block_size = 4 block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager, scheduler = initialize_scheduler(block_size=block_size,
block_size=block_size,
num_cpu_blocks=32, num_cpu_blocks=32,
num_gpu_blocks=32) num_gpu_blocks=32)
curr_loras = None curr_loras = None
......
...@@ -13,9 +13,10 @@ def create_dummy_prompt( ...@@ -13,9 +13,10 @@ def create_dummy_prompt(
prompt_length: int, prompt_length: int,
block_size: Optional[int] = None, block_size: Optional[int] = None,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
use_beam_search: bool = False,
best_of: int = 1, best_of: int = 1,
prompt_tokens: Optional[List[int]] = None, prompt_tokens: Optional[List[int]] = None,
min_tokens: int = 0,
max_tokens: int = 16,
) -> Tuple[Sequence, SequenceGroup]: ) -> Tuple[Sequence, SequenceGroup]:
if not block_size: if not block_size:
block_size = prompt_length block_size = prompt_length
...@@ -35,8 +36,9 @@ def create_dummy_prompt( ...@@ -35,8 +36,9 @@ def create_dummy_prompt(
seqs=[prompt], seqs=[prompt],
arrival_time=time.time(), arrival_time=time.time(),
sampling_params=SamplingParams( sampling_params=SamplingParams(
use_beam_search=use_beam_search, best_of=best_of,
best_of=best_of), max_tokens=max_tokens,
min_tokens=min_tokens),
lora_request=lora_request) lora_request=lora_request)
return prompt, seq_group return prompt, seq_group
...@@ -48,7 +50,6 @@ def create_dummy_prompt_encoder_decoder( ...@@ -48,7 +50,6 @@ def create_dummy_prompt_encoder_decoder(
encoder_prompt_length: int, encoder_prompt_length: int,
block_size: Optional[int] = None, block_size: Optional[int] = None,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
use_beam_search: bool = False,
best_of: int = 1, best_of: int = 1,
) -> Tuple[Sequence, Sequence, SequenceGroup]: ) -> Tuple[Sequence, Sequence, SequenceGroup]:
if not block_size: if not block_size:
...@@ -81,9 +82,7 @@ def create_dummy_prompt_encoder_decoder( ...@@ -81,9 +82,7 @@ def create_dummy_prompt_encoder_decoder(
from_decoder_prompt=False) from_decoder_prompt=False)
seq_group = SequenceGroup(request_id=request_id, seq_group = SequenceGroup(request_id=request_id,
seqs=[decoder_prompt], seqs=[decoder_prompt],
sampling_params=SamplingParams( sampling_params=SamplingParams(best_of=best_of),
use_beam_search=use_beam_search,
best_of=best_of),
arrival_time=time.time(), arrival_time=time.time(),
lora_request=lora_request, lora_request=lora_request,
encoder_seq=encoder_prompt) encoder_seq=encoder_prompt)
......
port: 12312 port: 12312
served_model_name: mymodel
tensor_parallel_size: 2 tensor_parallel_size: 2
...@@ -6,10 +6,10 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node ...@@ -6,10 +6,10 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
to fail. to fail.
""" """
import os import os
from dataclasses import dataclass
from typing import List, Literal, NamedTuple, Optional
import pytest import pytest
from packaging import version
from transformers import __version__ as transformers_version
from vllm.logger import init_logger from vllm.logger import init_logger
...@@ -20,52 +20,253 @@ logger = init_logger("test_pipeline_parallel") ...@@ -20,52 +20,253 @@ logger = init_logger("test_pipeline_parallel")
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1" VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
@pytest.mark.parametrize( class ParallelSetup(NamedTuple):
("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, TRUST_REMOTE_CODE, " tp_size: int
"MODEL_NAME, DIST_BACKEND"), pp_size: int
[ eager_mode: bool
(2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"), chunked_prefill: bool
(2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
(1, 3, 0, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
(1, 4, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"), @dataclass
(1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"), class PPTestSettings:
(1, 3, 0, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"), parallel_setups: List[ParallelSetup]
(1, 4, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"), distributed_backends: List[str]
(1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"), trust_remote_code: bool
(2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"), tokenizer_mode: Optional[str]
(2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
# NOTE: InternVL2 multi-node tests are flaky, @staticmethod
# use mp backend to skip the multi-node tests def detailed(
(1, 2, 1, 1, 1, "OpenGVLab/InternVL2-1B", "mp"), *,
(1, 2, 1, 1, 1, "OpenGVLab/InternVL2-2B", "mp"), tp_base: int = 1,
(1, 2, 1, 0, 1, "OpenGVLab/InternVL2-4B", "mp"), pp_base: int = 2,
(1, 2, 0, 1, 0, "Qwen/Qwen2-VL-2B-Instruct", "mp") trust_remote_code: bool = False,
], tokenizer_mode: Optional[str] = None,
) ):
@fork_new_process_for_each_test return PPTestSettings(
def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, parallel_setups=[
TRUST_REMOTE_CODE, MODEL_NAME, DIST_BACKEND): ParallelSetup(tp_size=tp_base,
if VLLM_MULTI_NODE and DIST_BACKEND == "mp": pp_size=pp_base,
eager_mode=False,
chunked_prefill=False),
ParallelSetup(tp_size=tp_base,
pp_size=2 * pp_base,
eager_mode=False,
chunked_prefill=True),
ParallelSetup(tp_size=tp_base,
pp_size=2 * pp_base,
eager_mode=True,
chunked_prefill=False),
ParallelSetup(tp_size=2 * tp_base,
pp_size=pp_base,
eager_mode=False,
chunked_prefill=True),
ParallelSetup(tp_size=2 * tp_base,
pp_size=pp_base,
eager_mode=True,
chunked_prefill=False),
],
distributed_backends=["mp", "ray"],
trust_remote_code=trust_remote_code,
tokenizer_mode=tokenizer_mode,
)
@staticmethod
def fast(
    *,
    tp_base: int = 1,
    pp_base: int = 2,
    trust_remote_code: bool = False,
    tokenizer_mode: Optional[str] = None,
):
    """Build the minimal settings: a single parallel setup on the "mp" backend.

    The one setup uses ``tp_base`` x ``pp_base`` with eager mode enabled and
    chunked prefill disabled. ``trust_remote_code`` and ``tokenizer_mode``
    are stored on the returned settings unchanged.
    """
    only_setup = ParallelSetup(
        tp_size=tp_base,
        pp_size=pp_base,
        eager_mode=True,
        chunked_prefill=False,
    )
    return PPTestSettings(
        parallel_setups=[only_setup],
        distributed_backends=["mp"],
        trust_remote_code=trust_remote_code,
        tokenizer_mode=tokenizer_mode,
    )
def iter_params(self, model_name: str):
    """Yield one parametrize tuple per (parallel setup, backend) pair.

    Each tuple is ``(model_name, parallel_setup, distributed_backend,
    trust_remote_code, tokenizer_mode)``. Parallel setups are the outer
    loop, so all backends for one setup are produced before moving on to
    the next setup.
    """
    for parallel_setup in self.parallel_setups:
        for distributed_backend in self.distributed_backends:
            yield (model_name, parallel_setup, distributed_backend,
                   self.trust_remote_code, self.tokenizer_mode)
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
# The values displayed here are only a rough indicator of the size of the model
# yapf: disable
# Maps a model name to the PPTestSettings it is tested with.
# test_tp_language_generation is parametrized from this table, restricted to
# the models that are also listed in TEST_MODELS.
GENERATION_MODEL_SETTINGS = {
# [DETAILED TESTS]
"meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(),
# [FAST TESTS]
# Uses Llama
# "BAAI/AquilaChat-7B": PPTestSettings.fast(),
"Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(tp_base=8, trust_remote_code=True), # noqa: E501
"baichuan-inc/Baichuan-7B": PPTestSettings.fast(trust_remote_code=True),
"baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"bigscience/bloomz-1b1": PPTestSettings.fast(),
"THUDM/chatglm3-6b": PPTestSettings.fast(trust_remote_code=True),
"CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(tp_base=2, trust_remote_code=True), # noqa: E501
"databricks/dbrx-instruct": PPTestSettings.fast(tp_base=8),
"Deci/DeciLM-7B-instruct": PPTestSettings.fast(trust_remote_code=True),
"deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
"deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
"tiiuae/falcon-7b": PPTestSettings.fast(),
"google/gemma-2b": PPTestSettings.fast(),
"google/gemma-2-9b": PPTestSettings.fast(),
"gpt2": PPTestSettings.fast(),
"bigcode/starcoder": PPTestSettings.fast(),
"EleutherAI/gpt-j-6b": PPTestSettings.fast(),
"EleutherAI/pythia-12b": PPTestSettings.fast(),
"ibm/PowerLM-3b": PPTestSettings.fast(),
"ibm/PowerMoE-3b": PPTestSettings.fast(),
# Uses Llama
# "internlm/internlm-chat-7b": PPTestSettings.fast(),
"internlm/internlm2-chat-7b": PPTestSettings.fast(trust_remote_code=True),
"core42/jais-13b-chat": PPTestSettings.fast(),
# TODO: Implement PP
# "ai21labs/AI21-Jamba-1.5-Mini": PPTestSettings.fast(),
"openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(trust_remote_code=True),
"openbmb/MiniCPM3-4B": PPTestSettings.fast(trust_remote_code=True),
# Uses Llama
# "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
"mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(tp_base=4),
"mosaicml/mpt-7b": PPTestSettings.fast(),
"nvidia/Minitron-8B-Base": PPTestSettings.fast(),
"allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(),
"allenai/OLMo-1B-hf": PPTestSettings.fast(),
"facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
"OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True),
"microsoft/phi-2": PPTestSettings.fast(),
"microsoft/Phi-3-mini-4k-instruct": PPTestSettings.fast(),
"microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
# FIXME: https://github.com/vllm-project/vllm/issues/8553
# "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"adept/persimmon-8b-chat": PPTestSettings.fast(),
"Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
"Qwen/Qwen2-beta-7B-Chat": PPTestSettings.fast(),
"Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
"stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
"bigcode/starcoder2-3b": PPTestSettings.fast(),
"upstage/solar-pro-preview-instruct": PPTestSettings.fast(tp_base=2),
# FIXME: Cannot load tokenizer in latest transformers version
# "xverse/XVERSE-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
}
# Maps a model name to the PPTestSettings it is tested with.
# test_tp_language_embedding is parametrized from this table, restricted to
# the models that are also listed in TEST_MODELS.
EMBEDDING_MODEL_SETTINGS = { # type: ignore[var-annotated]
# [FAST TESTS]
"intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(),
"Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(tp_base=4, trust_remote_code=True), # noqa: E501
}
# Maps a model name to the PPTestSettings it is tested with.
# test_tp_multimodal_generation is parametrized from this table, restricted
# to the models that are also listed in TEST_MODELS.
MULTIMODAL_MODEL_SETTINGS = {
# [FAST TESTS]
"Salesforce/blip2-opt-2.7b": PPTestSettings.fast(),
"facebook/chameleon-7b": PPTestSettings.fast(),
"adept/fuyu-8b": PPTestSettings.fast(),
"OpenGVLab/InternVL2-1B": PPTestSettings.fast(trust_remote_code=True),
"llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(),
"llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(),
"llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(),
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
"openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(trust_remote_code=True),
# TODO: Implement PP
# "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
"microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"mistralai/Pixtral-12B-2409": PPTestSettings.fast(tp_base=2, tokenizer_mode="mistral"), # noqa: E501
"Qwen/Qwen-VL-Chat": PPTestSettings.fast(trust_remote_code=True),
"Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
"fixie-ai/ultravox-v0_3": PPTestSettings.fast(),
}
# Same model-name -> PPTestSettings layout as the other *_MODEL_SETTINGS
# tables. Currently empty: its only entry is commented out until pipeline
# parallelism is implemented for it (see the TODO below).
CONDITIONAL_GENERATION_MODEL_SETTINGS = { # type: ignore[var-annotated]
# [FAST TESTS]
# TODO: Implement PP
# "facebook/bart-base": PPTestSettings.fast(),
}
# yapf: enable
# NOTE: You can update this on your local machine to run specific tests
# Allow-list of model names: each test_tp_* parametrization keeps an entry
# from its *_MODEL_SETTINGS table only when the model name appears here.
TEST_MODELS = [
# [LANGUAGE GENERATION]
"meta-llama/Meta-Llama-3-8B",
"ibm/PowerLM-3b",
# [LANGUAGE EMBEDDING]
"intfloat/e5-mistral-7b-instruct",
"BAAI/bge-multilingual-gemma2",
# [MULTIMODAL GENERATION]
"OpenGVLab/InternVL2-1B",
"microsoft/Phi-3-vision-128k-instruct",
"fixie-ai/ultravox-v0_3",
]
def _compare_tp(
model_name: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
trust_remote_code: bool,
tokenizer_mode: Optional[str],
num_gpus_available: int,
*,
method: Literal["generate", "encode"] = "encode",
):
tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup
if num_gpus_available < tp_size * pp_size:
pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
if VLLM_MULTI_NODE and distributed_backend == "mp":
pytest.skip("Skipping multi-node pipeline parallel test for " pytest.skip("Skipping multi-node pipeline parallel test for "
"multiprocessing distributed backend") "multiprocessing distributed backend")
# Skip tests that require transformers>=4.45.0 common_args = [
if "Qwen2-VL" in MODEL_NAME and version.parse(
transformers_version) < version.parse("4.45.0.dev0"):
pytest.skip("This test requires transformers>=4.45.0")
pp_args = [
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
"--dtype", "--dtype",
"float16", "float16",
"--max-model-len", "--max-model-len",
"8192", "2048",
"--max-num-seqs",
"8",
]
if chunked_prefill:
common_args.append("--enable-chunked-prefill")
if eager_mode:
common_args.append("--enforce-eager")
if trust_remote_code:
common_args.append("--trust-remote-code")
if tokenizer_mode:
common_args.extend(["--tokenizer-mode", tokenizer_mode])
if (distributed_backend == "ray" and tp_size == 2 and pp_size == 2
and chunked_prefill):
# Test Ray ADAG for a subset of the tests
pp_env = {
"VLLM_USE_RAY_COMPILED_DAG": "1",
"VLLM_USE_RAY_SPMD_WORKER": "1",
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
}
# Temporary. Currently when zeromq + SPMD is used, it does not properly
# terminate because of aDAG issue.
common_args.append("--disable-frontend-multiprocessing")
else:
pp_env = None
pp_args = [
*common_args,
"--pipeline-parallel-size", "--pipeline-parallel-size",
str(PP_SIZE), str(pp_size),
"--tensor-parallel-size", "--tensor-parallel-size",
str(TP_SIZE), str(tp_size),
"--distributed-executor-backend", "--distributed-executor-backend",
DIST_BACKEND, distributed_backend,
] ]
# compare without pipeline parallelism # compare without pipeline parallelism
...@@ -74,44 +275,103 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, ...@@ -74,44 +275,103 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL,
# schedule all workers in a node other than the head node, # schedule all workers in a node other than the head node,
# which can cause the test to fail. # which can cause the test to fail.
tp_args = [ tp_args = [
# use half precision for speed and memory savings in CI environment *common_args,
"--dtype",
"float16",
"--max-model-len",
"8192",
"--tensor-parallel-size", "--tensor-parallel-size",
str(max(TP_SIZE, 2)), # We only use 2 GPUs in the CI. str(tp_size),
"--distributed-executor-backend", "--distributed-executor-backend",
"mp", "mp",
] ]
if CHUNKED_PREFILL:
pp_args.append("--enable-chunked-prefill")
tp_args.append("--enable-chunked-prefill")
if EAGER_MODE:
pp_args.append("--enforce-eager")
tp_args.append("--enforce-eager")
if TRUST_REMOTE_CODE:
pp_args.append("--trust-remote-code")
tp_args.append("--trust-remote-code")
pp_env = None
if (DIST_BACKEND == "ray" and TP_SIZE == 2 and PP_SIZE == 2
and CHUNKED_PREFILL):
# Test Ray ADAG for a subset of the tests
pp_env = {
"VLLM_USE_RAY_COMPILED_DAG": "1",
"VLLM_USE_RAY_SPMD_WORKER": "1",
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
}
# Temporary. Currently when zeromq + SPMD is used, it does not properly
# terminate because of aDAG issue.
pp_args.append("--disable-frontend-multiprocessing")
tp_args.append("--disable-frontend-multiprocessing")
try: try:
compare_two_settings(MODEL_NAME, pp_args, tp_args, pp_env) compare_two_settings(model_name,
pp_args,
tp_args,
pp_env,
method=method)
except Exception: except Exception:
if pp_env is None: if pp_env is None:
raise raise
else: else:
# Ray ADAG tests are flaky, so we don't want to fail the test # Ray ADAG tests are flaky, so we don't want to fail the test
logger.exception("Ray ADAG tests failed") logger.exception("Ray ADAG tests failed")
@pytest.mark.parametrize(
    ("model_name", "parallel_setup", "distributed_backend",
     "trust_remote_code", "tokenizer_mode"),
    [
        case
        for name, settings in GENERATION_MODEL_SETTINGS.items()
        if name in TEST_MODELS
        for case in settings.iter_params(name)
    ],
)
@fork_new_process_for_each_test
def test_tp_language_generation(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    trust_remote_code: bool,
    tokenizer_mode: Optional[str],
    num_gpus_available,
):
    """Delegate to ``_compare_tp`` with ``method="generate"``.

    Parametrized over every GENERATION_MODEL_SETTINGS entry whose model name
    is also listed in TEST_MODELS.
    """
    _compare_tp(
        model_name=model_name,
        parallel_setup=parallel_setup,
        distributed_backend=distributed_backend,
        trust_remote_code=trust_remote_code,
        tokenizer_mode=tokenizer_mode,
        num_gpus_available=num_gpus_available,
        method="generate",
    )
@pytest.mark.parametrize(
    ("model_name", "parallel_setup", "distributed_backend",
     "trust_remote_code", "tokenizer_mode"),
    [
        case
        for name, settings in EMBEDDING_MODEL_SETTINGS.items()
        if name in TEST_MODELS
        for case in settings.iter_params(name)
    ],
)
@fork_new_process_for_each_test
def test_tp_language_embedding(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    trust_remote_code: bool,
    tokenizer_mode: Optional[str],
    num_gpus_available,
):
    """Delegate to ``_compare_tp`` with ``method="encode"``.

    Parametrized over every EMBEDDING_MODEL_SETTINGS entry whose model name
    is also listed in TEST_MODELS.
    """
    _compare_tp(
        model_name=model_name,
        parallel_setup=parallel_setup,
        distributed_backend=distributed_backend,
        trust_remote_code=trust_remote_code,
        tokenizer_mode=tokenizer_mode,
        num_gpus_available=num_gpus_available,
        method="encode",
    )
@pytest.mark.parametrize(
    ("model_name", "parallel_setup", "distributed_backend",
     "trust_remote_code", "tokenizer_mode"),
    [
        case
        for name, settings in MULTIMODAL_MODEL_SETTINGS.items()
        if name in TEST_MODELS
        for case in settings.iter_params(name)
    ],
)
@fork_new_process_for_each_test
def test_tp_multimodal_generation(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    trust_remote_code: bool,
    tokenizer_mode: Optional[str],
    num_gpus_available,
):
    """Delegate to ``_compare_tp`` with ``method="generate"``.

    Parametrized over every MULTIMODAL_MODEL_SETTINGS entry whose model name
    is also listed in TEST_MODELS.
    """
    _compare_tp(
        model_name=model_name,
        parallel_setup=parallel_setup,
        distributed_backend=distributed_backend,
        trust_remote_code=trust_remote_code,
        tokenizer_mode=tokenizer_mode,
        num_gpus_available=num_gpus_available,
        method="generate",
    )
...@@ -42,22 +42,42 @@ def test_bad_nullable_kvs(arg): ...@@ -42,22 +42,42 @@ def test_bad_nullable_kvs(arg):
nullable_kvs(arg) nullable_kvs(arg)
@pytest.mark.parametrize(("arg", "expected"), [ # yapf: disable
(None, None), @pytest.mark.parametrize(("arg", "expected", "option"), [
("{}", {}), (None, None, "mm-processor-kwargs"),
('{"num_crops": 4}', { ("{}", {}, "mm-processor-kwargs"),
"num_crops": 4 (
}), '{"num_crops": 4}',
('{"foo": {"bar": "baz"}}', { {
"foo": { "num_crops": 4
"bar": "baz" },
} "mm-processor-kwargs"
}), ),
(
'{"foo": {"bar": "baz"}}',
{
"foo":
{
"bar": "baz"
}
},
"mm-processor-kwargs"
),
(
'{"cast_logits_dtype":"bfloat16","sequence_parallel_norm":true,"sequence_parallel_norm_threshold":2048}',
{
"cast_logits_dtype": "bfloat16",
"sequence_parallel_norm": True,
"sequence_parallel_norm_threshold": 2048,
},
"override-neuron-config"
),
]) ])
def test_mm_processor_kwargs_prompt_parser(arg, expected): # yapf: enable
def test_composite_arg_parser(arg, expected, option):
parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
if arg is None: if arg is None:
args = parser.parse_args([]) args = parser.parse_args([])
else: else:
args = parser.parse_args(["--mm-processor-kwargs", arg]) args = parser.parse_args([f"--{option}", arg])
assert args.mm_processor_kwargs == expected assert getattr(args, option.replace("-", "_")) == expected
...@@ -48,9 +48,9 @@ def test_custom_executor_type_checking(model): ...@@ -48,9 +48,9 @@ def test_custom_executor_type_checking(model):
@pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("model", ["facebook/opt-125m"])
def test_custom_executor(model, tmpdir): def test_custom_executor(model, tmp_path):
cwd = os.path.abspath(".") cwd = os.path.abspath(".")
os.chdir(tmpdir) os.chdir(tmp_path)
try: try:
assert not os.path.exists(".marker") assert not os.path.exists(".marker")
...@@ -68,9 +68,9 @@ def test_custom_executor(model, tmpdir): ...@@ -68,9 +68,9 @@ def test_custom_executor(model, tmpdir):
@pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("model", ["facebook/opt-125m"])
def test_custom_executor_async(model, tmpdir): def test_custom_executor_async(model, tmp_path):
cwd = os.path.abspath(".") cwd = os.path.abspath(".")
os.chdir(tmpdir) os.chdir(tmp_path)
try: try:
assert not os.path.exists(".marker") assert not os.path.exists(".marker")
......
...@@ -49,21 +49,6 @@ def assert_outputs_equal(o1: List[EmbeddingRequestOutput], ...@@ -49,21 +49,6 @@ def assert_outputs_equal(o1: List[EmbeddingRequestOutput],
assert [o.outputs for o in o1] == [o.outputs for o in o2] assert [o.outputs for o in o1] == [o.outputs for o in o2]
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize('prompt', PROMPTS)
def test_v1_v2_api_consistency_single_prompt_string(llm: LLM, prompt):
pooling_params = PoolingParams()
with pytest.warns(DeprecationWarning, match="'prompts'"):
v1_output = llm.encode(prompts=prompt, pooling_params=pooling_params)
v2_output = llm.encode(prompt, pooling_params=pooling_params)
assert_outputs_equal(v1_output, v2_output)
v2_output = llm.encode({"prompt": prompt}, pooling_params=pooling_params)
assert_outputs_equal(v1_output, v2_output)
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
@pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS) @pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS)
def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM, def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
...@@ -79,25 +64,6 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM, ...@@ -79,25 +64,6 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
assert_outputs_equal(v1_output, v2_output) assert_outputs_equal(v1_output, v2_output)
@pytest.mark.skip_global_cleanup
def test_v1_v2_api_consistency_multi_prompt_string(llm: LLM):
pooling_params = PoolingParams()
with pytest.warns(DeprecationWarning, match="'prompts'"):
v1_output = llm.encode(prompts=PROMPTS, pooling_params=pooling_params)
v2_output = llm.encode(PROMPTS, pooling_params=pooling_params)
assert_outputs_equal(v1_output, v2_output)
v2_output = llm.encode(
[{
"prompt": p
} for p in PROMPTS],
pooling_params=pooling_params,
)
assert_outputs_equal(v1_output, v2_output)
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM): def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
pooling_params = PoolingParams() pooling_params = PoolingParams()
......
...@@ -47,23 +47,6 @@ def assert_outputs_equal(o1: List[RequestOutput], o2: List[RequestOutput]): ...@@ -47,23 +47,6 @@ def assert_outputs_equal(o1: List[RequestOutput], o2: List[RequestOutput]):
assert [o.outputs for o in o1] == [o.outputs for o in o2] assert [o.outputs for o in o1] == [o.outputs for o in o2]
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize('prompt', PROMPTS)
def test_v1_v2_api_consistency_single_prompt_string(llm: LLM, prompt):
sampling_params = SamplingParams(temperature=0.0, top_p=1.0)
with pytest.warns(DeprecationWarning, match="'prompts'"):
v1_output = llm.generate(prompts=prompt,
sampling_params=sampling_params)
v2_output = llm.generate(prompt, sampling_params=sampling_params)
assert_outputs_equal(v1_output, v2_output)
v2_output = llm.generate({"prompt": prompt},
sampling_params=sampling_params)
assert_outputs_equal(v1_output, v2_output)
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
@pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS) @pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS)
def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM, def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
...@@ -79,26 +62,6 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM, ...@@ -79,26 +62,6 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
assert_outputs_equal(v1_output, v2_output) assert_outputs_equal(v1_output, v2_output)
@pytest.mark.skip_global_cleanup
def test_v1_v2_api_consistency_multi_prompt_string(llm: LLM):
sampling_params = SamplingParams(temperature=0.0, top_p=1.0)
with pytest.warns(DeprecationWarning, match="'prompts'"):
v1_output = llm.generate(prompts=PROMPTS,
sampling_params=sampling_params)
v2_output = llm.generate(PROMPTS, sampling_params=sampling_params)
assert_outputs_equal(v1_output, v2_output)
v2_output = llm.generate(
[{
"prompt": p
} for p in PROMPTS],
sampling_params=sampling_params,
)
assert_outputs_equal(v1_output, v2_output)
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM): def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
sampling_params = SamplingParams(temperature=0.0, top_p=1.0) sampling_params = SamplingParams(temperature=0.0, top_p=1.0)
......
...@@ -7,7 +7,7 @@ import pytest ...@@ -7,7 +7,7 @@ import pytest
from vllm.entrypoints.llm import LLM from vllm.entrypoints.llm import LLM
from vllm.outputs import RequestOutput from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams from vllm.sampling_params import GuidedDecodingParams, SamplingParams
from ...conftest import cleanup from ...conftest import cleanup
...@@ -31,14 +31,12 @@ def test_guided_regex(sample_regex, llm): ...@@ -31,14 +31,12 @@ def test_guided_regex(sample_regex, llm):
sampling_params = SamplingParams( sampling_params = SamplingParams(
temperature=0.8, temperature=0.8,
top_p=0.95, top_p=0.95,
) guided_decoding=GuidedDecodingParams(regex=sample_regex))
outputs = llm.generate( outputs = llm.generate(prompts=[
prompts=[ f"Give an example IPv4 address with this regex: {sample_regex}"
f"Give an example IPv4 address with this regex: {sample_regex}" ] * 2,
] * 2, sampling_params=sampling_params,
sampling_params=sampling_params, use_tqdm=True)
use_tqdm=True,
guided_options_request=dict(guided_regex=sample_regex))
assert outputs is not None assert outputs is not None
for output in outputs: for output in outputs:
...@@ -57,15 +55,13 @@ def test_guided_json_completion(sample_json_schema, llm): ...@@ -57,15 +55,13 @@ def test_guided_json_completion(sample_json_schema, llm):
sampling_params = SamplingParams( sampling_params = SamplingParams(
temperature=1.0, temperature=1.0,
max_tokens=1000, max_tokens=1000,
) guided_decoding=GuidedDecodingParams(json=sample_json_schema))
outputs = llm.generate( outputs = llm.generate(prompts=[
prompts=[ f"Give an example JSON for an employee profile "
f"Give an example JSON for an employee profile " f"that fits this schema: {sample_json_schema}"
f"that fits this schema: {sample_json_schema}" ] * 2,
] * 2, sampling_params=sampling_params,
sampling_params=sampling_params, use_tqdm=True)
use_tqdm=True,
guided_options_request=dict(guided_json=sample_json_schema))
assert outputs is not None assert outputs is not None
...@@ -86,12 +82,11 @@ def test_guided_choice_completion(sample_guided_choice, llm): ...@@ -86,12 +82,11 @@ def test_guided_choice_completion(sample_guided_choice, llm):
sampling_params = SamplingParams( sampling_params = SamplingParams(
temperature=0.8, temperature=0.8,
top_p=0.95, top_p=0.95,
) guided_decoding=GuidedDecodingParams(choice=sample_guided_choice))
outputs = llm.generate( outputs = llm.generate(
prompts="The best language for type-safe systems programming is ", prompts="The best language for type-safe systems programming is ",
sampling_params=sampling_params, sampling_params=sampling_params,
use_tqdm=True, use_tqdm=True)
guided_options_request=dict(guided_choice=sample_guided_choice))
assert outputs is not None assert outputs is not None
for output in outputs: for output in outputs:
...@@ -112,13 +107,13 @@ def test_guided_grammar(sample_sql_statements, llm): ...@@ -112,13 +107,13 @@ def test_guided_grammar(sample_sql_statements, llm):
temperature=0.8, temperature=0.8,
top_p=0.95, top_p=0.95,
max_tokens=1000, max_tokens=1000,
) guided_decoding=GuidedDecodingParams(grammar=sample_sql_statements))
outputs = llm.generate( outputs = llm.generate(
prompts=("Generate a sql state that select col_1 from " prompts=("Generate a sql state that select col_1 from "
"table_1 where it is equals to 1"), "table_1 where it is equals to 1"),
sampling_params=sampling_params, sampling_params=sampling_params,
use_tqdm=True, use_tqdm=True,
guided_options_request=dict(guided_grammar=sample_sql_statements)) )
assert outputs is not None assert outputs is not None
for output in outputs: for output in outputs:
...@@ -140,3 +135,28 @@ def test_guided_grammar(sample_sql_statements, llm): ...@@ -140,3 +135,28 @@ def test_guided_grammar(sample_sql_statements, llm):
assert generated_text.strip() == ground_truth assert generated_text.strip() == ground_truth
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
@pytest.mark.skip_global_cleanup
def test_guided_options_request_deprecation_warning(sample_regex, llm):
    """``llm.generate`` must warn when ``guided_options_request`` is passed.

    The call is expected to emit a ``DeprecationWarning`` whose message
    matches "guided_options_request".
    """
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
    legacy_options = dict(guided_regex=sample_regex)

    with pytest.warns(DeprecationWarning, match="guided_options_request"):
        llm.generate(
            prompts="This should fail",
            sampling_params=sampling_params,
            use_tqdm=True,
            guided_options_request=legacy_options,
        )
@pytest.mark.skip_global_cleanup
def test_validation_against_both_guided_decoding_options(sample_regex, llm):
    """Supplying guided decoding through both channels must be rejected.

    The same regex is set on ``SamplingParams.guided_decoding`` and passed as
    ``guided_options_request``; ``llm.generate`` is expected to raise a
    ``ValueError`` whose message matches "Cannot set both".
    """
    guided = GuidedDecodingParams(regex=sample_regex)
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
        guided_decoding=guided,
    )
    legacy_options = dict(guided_regex=sample_regex)

    with pytest.raises(ValueError, match="Cannot set both"):
        llm.generate(
            prompts="This should fail",
            sampling_params=sampling_params,
            use_tqdm=True,
            guided_options_request=legacy_options,
        )
...@@ -21,7 +21,9 @@ def server(): ...@@ -21,7 +21,9 @@ def server():
"--dtype", "--dtype",
"bfloat16", "bfloat16",
"--max-model-len", "--max-model-len",
"4096", "2048",
"--max-num-seqs",
"5",
"--enforce-eager", "--enforce-eager",
] ]
......
from http import HTTPStatus from http import HTTPStatus
from typing import List
import openai import openai
import pytest import pytest
...@@ -12,8 +13,44 @@ from ...utils import RemoteOpenAIServer ...@@ -12,8 +13,44 @@ from ...utils import RemoteOpenAIServer
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@pytest.fixture(scope='module')
def server_args(request: pytest.FixtureRequest) -> List[str]:
    """Supply extra server CLI arguments through indirect parametrization.

    When the fixture is not parametrized, no extra arguments are returned.
    A single string parameter is wrapped in a one-element list; any other
    parameter value is returned unchanged.

    Usage::

        @pytest.mark.parametrize(
            "server_args",
            [
                ["--disable-frontend-multiprocessing"],
                [
                    "--model=NousResearch/Hermes-3-Llama-3.1-70B",
                    "--enable-auto-tool-choice",
                ],
            ],
            indirect=True,
        )
        def test_foo(server, client):
            ...

    This will run ``test_foo`` twice, with servers started with:

    - ``--disable-frontend-multiprocessing``
    - ``--model=NousResearch/Hermes-3-Llama-3.1-70B
      --enable-auto-tool-choice``
    """
    # ``request.param`` only exists when the fixture is parametrized.
    if hasattr(request, "param"):
        extra = request.param
        return [extra] if isinstance(extra, str) else extra

    return []
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server(server_args):
args = [ args = [
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
"--dtype", "--dtype",
...@@ -23,6 +60,7 @@ def server(): ...@@ -23,6 +60,7 @@ def server():
"--enforce-eager", "--enforce-eager",
"--max-num-seqs", "--max-num-seqs",
"128", "128",
*server_args,
] ]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
...@@ -35,6 +73,15 @@ async def client(server): ...@@ -35,6 +73,15 @@ async def client(server):
yield async_client yield async_client
@pytest.mark.parametrize(
"server_args",
[
pytest.param([], id="default-frontend-multiprocessing"),
pytest.param(["--disable-frontend-multiprocessing"],
id="disable-frontend-multiprocessing")
],
indirect=True,
)
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_show_version(client: openai.AsyncOpenAI): async def test_show_version(client: openai.AsyncOpenAI):
base_url = str(client.base_url)[:-3].strip("/") base_url = str(client.base_url)[:-3].strip("/")
...@@ -45,6 +92,15 @@ async def test_show_version(client: openai.AsyncOpenAI): ...@@ -45,6 +92,15 @@ async def test_show_version(client: openai.AsyncOpenAI):
assert response.json() == {"version": VLLM_VERSION} assert response.json() == {"version": VLLM_VERSION}
@pytest.mark.parametrize(
"server_args",
[
pytest.param([], id="default-frontend-multiprocessing"),
pytest.param(["--disable-frontend-multiprocessing"],
id="disable-frontend-multiprocessing")
],
indirect=True,
)
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_check_health(client: openai.AsyncOpenAI): async def test_check_health(client: openai.AsyncOpenAI):
base_url = str(client.base_url)[:-3].strip("/") base_url = str(client.base_url)[:-3].strip("/")
......
...@@ -433,18 +433,28 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, ...@@ -433,18 +433,28 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
model=model_name, model=model_name,
messages=messages, messages=messages,
max_tokens=10, max_tokens=10,
extra_body=dict(min_tokens=10),
temperature=0.0, temperature=0.0,
stream=True, stream=True,
stream_options={ stream_options={
"include_usage": True, "include_usage": True,
"continuous_usage_stats": True "continuous_usage_stats": True,
}, },
) )
last_completion_tokens = 0
async for chunk in stream: async for chunk in stream:
assert chunk.usage.prompt_tokens >= 0 assert chunk.usage.prompt_tokens >= 0
assert chunk.usage.completion_tokens >= 0 assert last_completion_tokens == 0 or \
chunk.usage.completion_tokens > last_completion_tokens or \
(
not chunk.choices and
chunk.usage.completion_tokens == last_completion_tokens
)
assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens + assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
chunk.usage.completion_tokens) chunk.usage.completion_tokens)
last_completion_tokens = chunk.usage.completion_tokens
assert last_completion_tokens == 10
# NOTE: Not sure why, but when I place this after `test_guided_regex_chat` # NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
......
...@@ -12,7 +12,7 @@ assert chatml_jinja_path.exists() ...@@ -12,7 +12,7 @@ assert chatml_jinja_path.exists()
# Define models, templates, and their corresponding expected outputs # Define models, templates, and their corresponding expected outputs
MODEL_TEMPLATE_GENERATON_OUTPUT = [ MODEL_TEMPLATE_GENERATON_OUTPUT = [
("facebook/opt-125m", chatml_jinja_path, True, """<|im_start|>user ("facebook/opt-125m", chatml_jinja_path, True, False, """<|im_start|>user
Hello<|im_end|> Hello<|im_end|>
<|im_start|>assistant <|im_start|>assistant
Hi there!<|im_end|> Hi there!<|im_end|>
...@@ -20,12 +20,20 @@ Hi there!<|im_end|> ...@@ -20,12 +20,20 @@ Hi there!<|im_end|>
What is the capital of<|im_end|> What is the capital of<|im_end|>
<|im_start|>assistant <|im_start|>assistant
"""), """),
("facebook/opt-125m", chatml_jinja_path, False, """<|im_start|>user ("facebook/opt-125m", chatml_jinja_path, False, False, """<|im_start|>user
Hello<|im_end|> Hello<|im_end|>
<|im_start|>assistant <|im_start|>assistant
Hi there!<|im_end|> Hi there!<|im_end|>
<|im_start|>user <|im_start|>user
What is the capital of""") What is the capital of"""),
("facebook/opt-125m", chatml_jinja_path, False, True, """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of<|im_end|>
<|im_start|>assistant
The capital of"""),
] ]
TEST_MESSAGES = [ TEST_MESSAGES = [
...@@ -42,6 +50,10 @@ TEST_MESSAGES = [ ...@@ -42,6 +50,10 @@ TEST_MESSAGES = [
'content': 'What is the capital of' 'content': 'What is the capital of'
}, },
] ]
ASSISTANT_MESSAGE_TO_CONTINUE = {
'role': 'assistant',
'content': 'The capital of'
}
def test_load_chat_template(): def test_load_chat_template():
...@@ -73,10 +85,10 @@ def test_no_load_chat_template_literallike(): ...@@ -73,10 +85,10 @@ def test_no_load_chat_template_literallike():
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model,template,add_generation_prompt,expected_output", "model,template,add_generation_prompt,continue_final_message,expected_output",
MODEL_TEMPLATE_GENERATON_OUTPUT) MODEL_TEMPLATE_GENERATON_OUTPUT)
def test_get_gen_prompt(model, template, add_generation_prompt, def test_get_gen_prompt(model, template, add_generation_prompt,
expected_output): continue_final_message, expected_output):
# Initialize the tokenizer # Initialize the tokenizer
tokenizer = get_tokenizer(tokenizer_name=model) tokenizer = get_tokenizer(tokenizer_name=model)
template_content = load_chat_template(chat_template=template) template_content = load_chat_template(chat_template=template)
...@@ -84,8 +96,11 @@ def test_get_gen_prompt(model, template, add_generation_prompt, ...@@ -84,8 +96,11 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
# Create a mock request object using keyword arguments # Create a mock request object using keyword arguments
mock_request = ChatCompletionRequest( mock_request = ChatCompletionRequest(
model=model, model=model,
messages=TEST_MESSAGES, messages=TEST_MESSAGES + [ASSISTANT_MESSAGE_TO_CONTINUE]
add_generation_prompt=add_generation_prompt) if continue_final_message else TEST_MESSAGES,
add_generation_prompt=add_generation_prompt,
continue_final_message=continue_final_message,
)
# Call the function and get the result # Call the function and get the result
result = apply_hf_chat_template( result = apply_hf_chat_template(
...@@ -93,6 +108,7 @@ def test_get_gen_prompt(model, template, add_generation_prompt, ...@@ -93,6 +108,7 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
conversation=mock_request.messages, conversation=mock_request.messages,
chat_template=mock_request.chat_template or template_content, chat_template=mock_request.chat_template or template_content,
add_generation_prompt=mock_request.add_generation_prompt, add_generation_prompt=mock_request.add_generation_prompt,
continue_final_message=mock_request.continue_final_message,
) )
# Test assertion # Test assertion
......
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@pytest.fixture(scope="module")
def server():
    """Start one ``RemoteOpenAIServer`` for ``MODEL_NAME`` per test module.

    The server is entered as a context manager and yielded to the tests,
    so it is shut down when the module's tests have finished.
    """
    # use half precision for speed and memory savings in CI environment
    precision_flags = ["--dtype", "bfloat16"]
    capacity_flags = [
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
    ]
    # Chunked prefill with a small per-step token budget; the tests in this
    # module send prompts that are presumably much longer than this budget.
    chunked_prefill_flags = [
        "--enable-chunked-prefill",
        "--max-num-batched-tokens",
        "1000",
    ]
    # large prompts create a lot of output
    logging_flags = ["--disable-log-requests"]

    args = [
        *precision_flags,
        *capacity_flags,
        *chunked_prefill_flags,
        *logging_flags,
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from the running ``server`` fixture.

    The client is used as an async context manager, so it is released
    once the requesting test has finished.
    """
    async with server.get_async_client() as openai_client:
        yield openai_client
@pytest.mark.asyncio
async def test_completion_stream_options_and_logprobs_with_long_prompts(
        client: openai.AsyncOpenAI):
    """Stream a completion for a very long prompt and audit usage stats.

    Every streamed chunk must carry consistent usage counters, every chunk
    up to and including the one with a finish reason must contain text,
    and from that chunk onwards the reported completion token count must
    equal the number of text chunks received.
    """
    # Test stream with long prompt
    long_prompt = "What is the capital of France?" * 400
    stream = await client.completions.create(
        model=MODEL_NAME,
        prompt=long_prompt,
        max_tokens=5,
        temperature=0.0,
        stream=True,
        stream_options={
            "include_usage": True,
            "continuous_usage_stats": True,
        },
        logprobs=5,
    )

    text_chunks = 0
    done = False
    async for chunk in stream:
        usage = chunk.usage
        assert usage.prompt_tokens >= 0
        assert usage.completion_tokens >= 0
        assert usage.total_tokens == (usage.prompt_tokens +
                                      usage.completion_tokens)

        # Choices are only inspected until a finish reason has been seen;
        # later chunks are checked for usage alone.
        if not done:
            text_chunks += 1
            choice = chunk.choices[0]
            assert choice.text
            if choice.finish_reason is not None:
                done = True

        if done:
            assert usage.completion_tokens == text_chunks
@pytest.mark.asyncio
async def test_chat_completion_stream_options_and_logprobs_with_long_prompts(
        client: openai.AsyncOpenAI):
    """Stream a chat completion for a very long prompt and audit usage.

    Every streamed chunk must carry consistent usage counters. Chunks with
    empty delta content must report zero completion tokens and no
    logprobs, and at most one such chunk may appear. From the chunk with a
    finish reason onwards, the reported completion token count must equal
    the number of non-empty chunks received.
    """
    # Test stream with long prompt
    conversation = [
        {
            "role": "system",
            "content": "You are a helpful assistant."
        },
        {
            "role": "user",
            "content": "What is the capital of France?" * 400
        },
    ]
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=conversation,
        max_tokens=5,
        temperature=0.0,
        stream=True,
        stream_options={
            "include_usage": True,
            "continuous_usage_stats": True,
        },
        logprobs=True,
        top_logprobs=5,
    )

    content_chunks = 0
    empty_chunks = 0
    done = False
    async for chunk in stream:
        usage = chunk.usage
        assert usage.prompt_tokens >= 0
        assert usage.completion_tokens >= 0
        assert usage.total_tokens == (usage.prompt_tokens +
                                      usage.completion_tokens)

        # Choices are only inspected until a finish reason has been seen;
        # later chunks are checked for usage alone.
        if not done:
            choice = chunk.choices[0]
            if choice.delta.content == "":
                # when there is no tokens generated
                assert usage.completion_tokens == 0
                assert choice.logprobs is None
                empty_chunks += 1
            else:
                content_chunks += 1

            if choice.finish_reason is not None:
                done = True

        if done:
            assert usage.completion_tokens == content_chunks

    assert empty_chunks <= 1
import json import json
import unittest
from vllm.entrypoints.openai.cli_args import make_arg_parser import pytest
from vllm.entrypoints.openai.cli_args import (make_arg_parser,
validate_parsed_serve_args)
from vllm.entrypoints.openai.serving_engine import LoRAModulePath from vllm.entrypoints.openai.serving_engine import LoRAModulePath
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
from ...utils import VLLM_PATH
LORA_MODULE = { LORA_MODULE = {
"name": "module2", "name": "module2",
"path": "/path/to/module2", "path": "/path/to/module2",
"base_model_name": "llama" "base_model_name": "llama"
} }
CHATML_JINJA_PATH = VLLM_PATH / "examples/template_chatml.jinja"
assert CHATML_JINJA_PATH.exists()
class TestLoraParserAction(unittest.TestCase): @pytest.fixture
def serve_parser():
parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.")
return make_arg_parser(parser)
def setUp(self):
# Setting up argparse parser for tests
parser = FlexibleArgumentParser(
description="vLLM's remote OpenAI server.")
self.parser = make_arg_parser(parser)
def test_valid_key_value_format(self): ### Tests for Lora module parsing
# Test old format: name=path def test_valid_key_value_format(serve_parser):
args = self.parser.parse_args([ # Test old format: name=path
'--lora-modules', args = serve_parser.parse_args([
'module1=/path/to/module1', '--lora-modules',
'module1=/path/to/module1',
])
expected = [LoRAModulePath(name='module1', path='/path/to/module1')]
assert args.lora_modules == expected
def test_valid_json_format(serve_parser):
# Test valid JSON format input
args = serve_parser.parse_args([
'--lora-modules',
json.dumps(LORA_MODULE),
])
expected = [
LoRAModulePath(name='module2',
path='/path/to/module2',
base_model_name='llama')
]
assert args.lora_modules == expected
def test_invalid_json_format(serve_parser):
# Test invalid JSON format input, missing closing brace
with pytest.raises(SystemExit):
serve_parser.parse_args([
'--lora-modules', '{"name": "module3", "path": "/path/to/module3"'
]) ])
expected = [LoRAModulePath(name='module1', path='/path/to/module1')]
self.assertEqual(args.lora_modules, expected)
def test_valid_json_format(self):
# Test valid JSON format input def test_invalid_type_error(serve_parser):
args = self.parser.parse_args([ # Test type error when values are not JSON or key=value
with pytest.raises(SystemExit):
serve_parser.parse_args([
'--lora-modules', '--lora-modules',
json.dumps(LORA_MODULE), 'invalid_format' # This is not JSON or key=value format
]) ])
expected = [
LoRAModulePath(name='module2',
path='/path/to/module2', def test_invalid_json_field(serve_parser):
base_model_name='llama') # Test valid JSON format but missing required fields
] with pytest.raises(SystemExit):
self.assertEqual(args.lora_modules, expected) serve_parser.parse_args([
def test_invalid_json_format(self):
# Test invalid JSON format input, missing closing brace
with self.assertRaises(SystemExit):
self.parser.parse_args([
'--lora-modules',
'{"name": "module3", "path": "/path/to/module3"'
])
def test_invalid_type_error(self):
# Test type error when values are not JSON or key=value
with self.assertRaises(SystemExit):
self.parser.parse_args([
'--lora-modules',
'invalid_format' # This is not JSON or key=value format
])
def test_invalid_json_field(self):
# Test valid JSON format but missing required fields
with self.assertRaises(SystemExit):
self.parser.parse_args([
'--lora-modules',
'{"name": "module4"}' # Missing required 'path' field
])
def test_empty_values(self):
# Test when no LoRA modules are provided
args = self.parser.parse_args(['--lora-modules', ''])
self.assertEqual(args.lora_modules, [])
def test_multiple_valid_inputs(self):
# Test multiple valid inputs (both old and JSON format)
args = self.parser.parse_args([
'--lora-modules', '--lora-modules',
'module1=/path/to/module1', '{"name": "module4"}' # Missing required 'path' field
json.dumps(LORA_MODULE),
]) ])
expected = [
LoRAModulePath(name='module1', path='/path/to/module1'),
LoRAModulePath(name='module2',
path='/path/to/module2',
base_model_name='llama')
]
self.assertEqual(args.lora_modules, expected)
if __name__ == '__main__': def test_empty_values(serve_parser):
unittest.main() # Test when no LoRA modules are provided
args = serve_parser.parse_args(['--lora-modules', ''])
assert args.lora_modules == []
def test_multiple_valid_inputs(serve_parser):
    """Parse one ``name=path`` spec and one JSON spec in the same option."""
    # Test multiple valid inputs (both old and JSON format)
    cli = [
        '--lora-modules',
        'module1=/path/to/module1',
        json.dumps(LORA_MODULE),
    ]
    parsed = serve_parser.parse_args(cli)

    from_key_value = LoRAModulePath(name='module1', path='/path/to/module1')
    from_json = LoRAModulePath(name='module2',
                               path='/path/to/module2',
                               base_model_name='llama')
    assert parsed.lora_modules == [from_key_value, from_json]
### Tests for serve argument validation that run prior to loading
def test_enable_auto_choice_passes_without_tool_call_parser(serve_parser):
    """Validation must fail when auto tool choice has no tool-call parser.

    NOTE: despite "passes" in the test name, the expected outcome is a
    ``TypeError`` from ``validate_parsed_serve_args``.
    """
    # Only --enable-auto-tool-choice is given; no --tool-call-parser.
    parsed = serve_parser.parse_args(args=["--enable-auto-tool-choice"])

    with pytest.raises(TypeError):
        validate_parsed_serve_args(parsed)
def test_enable_auto_choice_passes_with_tool_call_parser(serve_parser):
    """Validation must pass when auto tool choice comes with a parser."""
    cli = ["--enable-auto-tool-choice", "--tool-call-parser", "mistral"]
    parsed = serve_parser.parse_args(args=cli)

    # The test succeeds as long as validation does not raise.
    validate_parsed_serve_args(parsed)
def test_chat_template_validation_for_happy_paths(serve_parser):
    """Validation must pass when ``--chat-template`` names an existing file."""
    template_path = CHATML_JINJA_PATH.absolute().as_posix()
    parsed = serve_parser.parse_args(args=["--chat-template", template_path])

    # The test succeeds as long as validation does not raise.
    validate_parsed_serve_args(parsed)
def test_chat_template_validation_for_sad_paths(serve_parser):
    """Validation must raise ``ValueError`` for a missing chat template."""
    cli = ["--chat-template", "does/not/exist"]
    parsed = serve_parser.parse_args(args=cli)

    with pytest.raises(ValueError):
        validate_parsed_serve_args(parsed)
...@@ -503,8 +503,8 @@ async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str): ...@@ -503,8 +503,8 @@ async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
max_tokens=5, max_tokens=5,
temperature=0.0, temperature=0.0,
extra_body=dict( extra_body=dict(
# NOTE: this has to be true for n > 1 in vLLM, but not necessary # NOTE: this has to be true for n > 1 in vLLM, but
# for official client. # not necessary for official client.
use_beam_search=True), use_beam_search=True),
) )
assert len(batch.choices) == 4 assert len(batch.choices) == 4
......
...@@ -144,3 +144,64 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI, ...@@ -144,3 +144,64 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI,
0].embedding 0].embedding
assert responses_float.data[1].embedding == responses_default.data[ assert responses_float.data[1].embedding == responses_default.data[
1].embedding 1].embedding
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [EMBEDDING_MODEL_NAME],
)
async def test_single_embedding_truncation(
        embedding_client: openai.AsyncOpenAI, model_name: str):
    """Request one embedding with ``truncate_prompt_tokens`` set to 10.

    The request is made twice, once with text input and once with token-id
    input. Both responses must contain a single 4096-dimensional embedding
    and report exactly 10 prompt tokens and no completion tokens.
    """

    def check_truncated(response):
        # Shared expectations for both the text and the token-id request.
        assert response.id is not None
        assert len(response.data) == 1
        assert len(response.data[0].embedding) == 4096
        assert response.usage.completion_tokens == 0
        assert response.usage.prompt_tokens == 10
        assert response.usage.total_tokens == 10

    truncation = {"truncate_prompt_tokens": 10}

    # test single embedding
    text_input = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]
    text_response = await embedding_client.embeddings.create(
        model=model_name, input=text_input, extra_body=truncation)
    check_truncated(text_response)

    # Same request with token ids; the list is longer than 10 entries.
    token_input = [
        1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728,
        9901, 340, 2229, 385, 340, 315, 28741, 28804, 2
    ]
    token_response = await embedding_client.embeddings.create(
        model=model_name, input=token_input, extra_body=truncation)
    check_truncated(token_response)
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [EMBEDDING_MODEL_NAME],
)
async def test_single_embedding_truncation_invalid(
        embedding_client: openai.AsyncOpenAI, model_name: str):
    """An oversized ``truncate_prompt_tokens`` must be rejected.

    The request is expected to raise ``openai.BadRequestError`` whose text
    explains that the truncation size exceeds ``max_model_len``.
    """
    input_texts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]

    # NOTE(review): 8193 is presumably one more than the embedding server's
    # max_model_len -- confirm against the embedding server fixture.
    with pytest.raises(openai.BadRequestError) as exc_info:
        await embedding_client.embeddings.create(
            model=model_name,
            input=input_texts,
            extra_body={"truncate_prompt_tokens": 8193})

    # The message check has to run after the ``with`` block and against the
    # captured exception: once ``create`` raises, nothing else inside the
    # block executes and no response object is ever assigned.
    assert ("truncate_prompt_tokens value is greater than max_model_len. "
            "Please, select a smaller truncation size."
            ) in str(exc_info.value)
...@@ -70,7 +70,6 @@ EXPECTED_VALUES = { ...@@ -70,7 +70,6 @@ EXPECTED_VALUES = {
[("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST), [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS)], ("_count", _NUM_REQUESTS)],
"vllm:request_params_n": [("_count", _NUM_REQUESTS)], "vllm:request_params_n": [("_count", _NUM_REQUESTS)],
"vllm:request_params_best_of": [("_count", _NUM_REQUESTS)],
"vllm:prompt_tokens": [("_total", "vllm:prompt_tokens": [("_total",
_NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)], _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
"vllm:generation_tokens": "vllm:generation_tokens":
...@@ -151,9 +150,6 @@ EXPECTED_METRICS = [ ...@@ -151,9 +150,6 @@ EXPECTED_METRICS = [
"vllm:request_params_n_sum", "vllm:request_params_n_sum",
"vllm:request_params_n_bucket", "vllm:request_params_n_bucket",
"vllm:request_params_n_count", "vllm:request_params_n_count",
"vllm:request_params_best_of_sum",
"vllm:request_params_best_of_bucket",
"vllm:request_params_best_of_count",
"vllm:num_preemptions_total", "vllm:num_preemptions_total",
"vllm:prompt_tokens_total", "vllm:prompt_tokens_total",
"vllm:generation_tokens_total", "vllm:generation_tokens_total",
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment