Commit 6d2051cc authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.6.3.post1' into v0.6.3.post1-dev

parents 2c7f740a a2c71c54
......@@ -27,19 +27,16 @@ def schedule_and_update_computed_tokens(scheduler):
return metas, out
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_simple(use_v2_block_manager: bool):
def test_simple():
"""Verify basic scheduling works."""
block_size = 4
num_seq_group = 4
max_model_len = 16
max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(
max_num_batched_tokens,
num_seq_group,
max_model_len,
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
scheduler_config = SchedulerConfig(max_num_batched_tokens,
num_seq_group,
max_model_len,
enable_chunked_prefill=True)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8
......@@ -74,8 +71,7 @@ def test_simple(use_v2_block_manager: bool):
assert len(seq_group_meta) == num_seq_group
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_chunk(use_v2_block_manager: bool):
def test_chunk():
"""Verify prefills are chunked properly."""
block_size = 4
max_seqs = 60
......@@ -86,7 +82,7 @@ def test_chunk(use_v2_block_manager: bool):
max_seqs,
max_model_len,
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 32
cache_config.num_gpu_blocks = 32
......@@ -124,8 +120,7 @@ def test_chunk(use_v2_block_manager: bool):
assert out.num_batched_tokens == 57
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_complex(use_v2_block_manager: bool):
def test_complex():
block_size = 4
max_seqs = 60
max_model_len = 80
......@@ -135,7 +130,7 @@ def test_complex(use_v2_block_manager: bool):
max_seqs,
max_model_len,
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 64
cache_config.num_gpu_blocks = 64
......@@ -194,8 +189,7 @@ def test_complex(use_v2_block_manager: bool):
assert running[2].is_prefill()
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_maximal_decoding(use_v2_block_manager: bool):
def test_maximal_decoding():
"""Verify decoding requests are prioritized."""
block_size = 4
max_seqs = 2
......@@ -206,7 +200,7 @@ def test_maximal_decoding(use_v2_block_manager: bool):
max_seqs,
max_model_len,
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8
......@@ -288,8 +282,7 @@ def test_maximal_decoding(use_v2_block_manager: bool):
assert out.num_batched_tokens == 2
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_prompt_limit(use_v2_block_manager: bool):
def test_prompt_limit():
"""Verify max_num_batched_tokens < max_model_len is possible."""
block_size = 4
max_seqs = 32
......@@ -300,7 +293,7 @@ def test_prompt_limit(use_v2_block_manager: bool):
max_seqs,
max_model_len,
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 16
cache_config.num_gpu_blocks = 16
......@@ -323,8 +316,7 @@ def test_prompt_limit(use_v2_block_manager: bool):
assert out.num_batched_tokens == 32
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_prompt_limit_exceed(use_v2_block_manager: bool):
def test_prompt_limit_exceed():
block_size = 4
max_seqs = 64
max_model_len = 32
......@@ -349,8 +341,7 @@ def test_prompt_limit_exceed(use_v2_block_manager: bool):
assert out.ignored_seq_groups[0] == seq_group
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_swap(use_v2_block_manager: bool):
def test_swap():
"""Verify swapping works with chunked prefill requests"""
block_size = 4
max_seqs = 30
......@@ -361,7 +352,7 @@ def test_swap(use_v2_block_manager: bool):
max_seqs,
max_model_len,
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 16
cache_config.num_gpu_blocks = 16
......@@ -407,8 +398,7 @@ def test_swap(use_v2_block_manager: bool):
assert out.blocks_to_swap_out == []
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool):
def test_running_prefill_prioritized_over_swap():
block_size = 4
max_seqs = 30
max_model_len = 200
......@@ -418,7 +408,7 @@ def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool):
max_seqs,
max_model_len,
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 32
cache_config.num_gpu_blocks = 32
......@@ -501,8 +491,7 @@ def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool):
assert out.blocks_to_swap_out == []
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_chunked_prefill_preempt(use_v2_block_manager: bool):
def test_chunked_prefill_preempt():
"""Verify preempt works with chunked prefill requests"""
block_size = 4
max_seqs = 30
......@@ -513,7 +502,7 @@ def test_chunked_prefill_preempt(use_v2_block_manager: bool):
max_seqs,
max_model_len,
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 16
cache_config.num_gpu_blocks = 16
......@@ -568,8 +557,7 @@ def test_chunked_prefill_preempt(use_v2_block_manager: bool):
assert out.num_batched_tokens == max_num_batched_tokens
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_chunked_prefill_max_seqs(use_v2_block_manager: bool):
def test_chunked_prefill_max_seqs():
block_size = 4
max_seqs = 2
max_model_len = 80
......@@ -579,7 +567,7 @@ def test_chunked_prefill_max_seqs(use_v2_block_manager: bool):
max_seqs,
max_model_len,
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 128
cache_config.num_gpu_blocks = 128
......@@ -622,8 +610,7 @@ def test_chunked_prefill_max_seqs(use_v2_block_manager: bool):
assert not running[1].is_prefill()
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_perfix_caching(use_v2_block_manager: bool):
def test_perfix_caching():
"""Verify allocating full blocks when prefix caching is enabled."""
block_size = 4
max_seqs = 10
......@@ -634,7 +621,7 @@ def test_perfix_caching(use_v2_block_manager: bool):
max_seqs,
max_model_len,
enable_chunked_prefill=True,
use_v2_block_manager=use_v2_block_manager)
)
cache_config = CacheConfig(block_size,
1.0,
1,
......
import pytest
from tests.conftest import VllmRunner
from tests.core.utils import create_dummy_prompt
from vllm.engine.llm_engine import LLMEngine
from vllm.platforms import current_platform
from vllm.sequence import SequenceGroup
MODEL = "JackFram/llama-160m"
def add_seq_group_to_engine(engine: LLMEngine, seq_group: SequenceGroup):
scheduler = engine.scheduler[0]
scheduler.add_seq_group(seq_group)
@pytest.mark.parametrize("num_scheduler_steps", [1, 8])
@pytest.mark.parametrize("enable_chunked_prefill", [False, True])
@pytest.mark.parametrize("enforce_eager", [False, True])
def test_num_computed_tokens_update(num_scheduler_steps: int,
enable_chunked_prefill: bool,
enforce_eager: bool):
is_multi_step = num_scheduler_steps > 1
is_multi_step_chunked_prefill = is_multi_step and enable_chunked_prefill
if is_multi_step_chunked_prefill and current_platform.is_rocm():
pytest.skip("Multi-step with Chunked-Prefill does not support "
"rocm_flash_attn backend")
# Make a vllm engine
runner = VllmRunner(model_name=MODEL,
gpu_memory_utilization=0.7,
num_scheduler_steps=num_scheduler_steps,
enable_chunked_prefill=enable_chunked_prefill,
enforce_eager=enforce_eager)
engine: LLMEngine = runner.model.llm_engine
# In multi-step + chunked-prefill there is no separate single prompt step.
# What is scheduled will run for num_scheduler_steps always.
num_prompt_steps = num_scheduler_steps \
if is_multi_step_chunked_prefill else 1
num_output_tokens_list = [4, 8, 12, 15, 16, 17]
# Create sequence and add to engine
prompt_len = 10
for req_idx, num_output_tokens in enumerate(num_output_tokens_list):
seq, seq_group = create_dummy_prompt(request_id=str(req_idx),
prompt_length=prompt_len,
min_tokens=num_output_tokens,
max_tokens=num_output_tokens)
add_seq_group_to_engine(engine, seq_group)
assert seq.data.get_num_computed_tokens() == 0
for _ in range(num_prompt_steps):
# prompt steps
engine.step()
if not seq.is_finished():
prompt_num_computed_tokens = seq.data.get_num_computed_tokens()
# Test correctness of num_computed_tokens after the prompt steps
assert prompt_num_computed_tokens == \
prompt_len + num_prompt_steps - 1
decode_step_counter = 0
while not seq.is_finished():
# Test correctness of num_computed_tokens after the decode steps
assert seq.data.get_num_computed_tokens(
) == prompt_num_computed_tokens + decode_step_counter
for _ in range(num_scheduler_steps):
# decode step
engine.step()
decode_step_counter += 1
# Test correctness of num_computed_tokens after the sequence finish.
assert seq.data.get_num_computed_tokens(
) == prompt_len + num_output_tokens - 1
......@@ -3,7 +3,7 @@ from collections import deque
from typing import List, Set, Tuple
from unittest.mock import MagicMock
import pytest
import pytest # noqa
from torch import Use # noqa
from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
......@@ -17,11 +17,13 @@ from .utils import (append_new_token, append_new_token_seq_group,
schedule_and_update_computed_tokens)
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_scheduler_add_seq_group(use_v2_block_manager: bool):
def test_scheduler_add_seq_group():
block_size = 4
scheduler_config = SchedulerConfig(
100, 64, 1, use_v2_block_manager=use_v2_block_manager)
100,
64,
1,
)
cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto")
cache_config.num_cpu_blocks = 4
cache_config.num_gpu_blocks = 4
......@@ -37,11 +39,13 @@ def test_scheduler_add_seq_group(use_v2_block_manager: bool):
assert scheduler.get_num_unfinished_seq_groups() == i + 1
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_scheduler_abort_seq_group(use_v2_block_manager: bool):
def test_scheduler_abort_seq_group():
block_size = 4
scheduler_config = SchedulerConfig(
100, 64, 1, use_v2_block_manager=use_v2_block_manager)
100,
64,
1,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 4
cache_config.num_gpu_blocks = 4
......@@ -61,8 +65,7 @@ def test_scheduler_abort_seq_group(use_v2_block_manager: bool):
assert scheduler.get_num_unfinished_seq_groups() == 0
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_scheduler_schedule_simple(use_v2_block_manager: bool):
def test_scheduler_schedule_simple():
block_size = 4
num_seq_group = 4
max_model_len = 16
......@@ -70,7 +73,7 @@ def test_scheduler_schedule_simple(use_v2_block_manager: bool):
64,
num_seq_group,
max_model_len,
use_v2_block_manager=use_v2_block_manager)
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8
......@@ -105,8 +108,7 @@ def test_scheduler_schedule_simple(use_v2_block_manager: bool):
append_new_token(out, 1)
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_scheduler_prefill_prioritized(use_v2_block_manager: bool):
def test_scheduler_prefill_prioritized():
"""Verify running batched tokens are not applied to prefill requests."""
block_size = 4
max_model_len = 30
......@@ -115,7 +117,7 @@ def test_scheduler_prefill_prioritized(use_v2_block_manager: bool):
max_batched_num_tokens,
2,
max_model_len,
use_v2_block_manager=use_v2_block_manager)
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 16
cache_config.num_gpu_blocks = 16
......@@ -139,12 +141,14 @@ def test_scheduler_prefill_prioritized(use_v2_block_manager: bool):
assert get_sequence_groups(out) == [seq_group_b]
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_scheduler_schedule_preempt_abort(use_v2_block_manager: bool):
def test_scheduler_schedule_preempt_abort():
block_size = 4
max_model_len = 16
scheduler_config = SchedulerConfig(
64, 2, max_model_len, use_v2_block_manager=use_v2_block_manager)
64,
2,
max_model_len,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 2
cache_config.num_gpu_blocks = 2
......@@ -194,8 +198,7 @@ def test_scheduler_schedule_preempt_abort(use_v2_block_manager: bool):
assert scheduler.get_num_unfinished_seq_groups() == 1
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_scheduler_max_seqs(use_v2_block_manager: bool):
def test_scheduler_max_seqs():
block_size = 4
num_seq_group = 4
max_seq_group = 2
......@@ -204,7 +207,7 @@ def test_scheduler_max_seqs(use_v2_block_manager: bool):
64,
max_seq_group,
max_model_len,
use_v2_block_manager=use_v2_block_manager)
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8
......@@ -242,15 +245,14 @@ def test_scheduler_max_seqs(use_v2_block_manager: bool):
assert set(get_sequence_groups(out)) == set([all_seq_groups[1]])
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_scheduler_delay_factor(use_v2_block_manager: bool):
def test_scheduler_delay_factor():
block_size = 4
scheduler_config = SchedulerConfig(
100,
64,
16,
delay_factor=0.5,
use_v2_block_manager=use_v2_block_manager)
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8
......@@ -287,12 +289,10 @@ def test_scheduler_delay_factor(use_v2_block_manager: bool):
append_new_token(out, 1)
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_swapped_out_prioritized(use_v2_block_manager: bool):
def test_swapped_out_prioritized():
block_size = 4
scheduler = initialize_scheduler(max_num_seqs=6,
block_size=block_size,
use_v2_block_manager=use_v2_block_manager,
num_cpu_blocks=64,
num_gpu_blocks=64)
# best_of=2 * 3 == 6 sequences.
......@@ -344,7 +344,6 @@ def initialize_scheduler(
max_token_budget=1000,
max_model_len=1000,
lora_config=None,
use_v2_block_manager=False,
block_size=4,
num_cpu_blocks=8,
num_gpu_blocks=8,
......@@ -354,7 +353,7 @@ def initialize_scheduler(
max_token_budget,
max_num_seqs,
max_model_len,
use_v2_block_manager=use_v2_block_manager)
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = num_cpu_blocks
cache_config.num_gpu_blocks = num_gpu_blocks
......@@ -379,15 +378,12 @@ def add_token_budget(budget: SchedulingBudget,
budget.add_num_seqs(mock_seq_group.request_id, num_curr_seqs)
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_prefill_schedule_max_prompt_len(use_v2_block_manager: bool):
def test_prefill_schedule_max_prompt_len():
"""
Test prompt longer than max_prompt_len is aborted.
"""
block_size = 4
scheduler = initialize_scheduler(max_model_len=30,
use_v2_block_manager=use_v2_block_manager,
block_size=block_size)
scheduler = initialize_scheduler(max_model_len=30, block_size=block_size)
_, seq_group = create_dummy_prompt("0",
prompt_length=60,
block_size=block_size)
......@@ -402,14 +398,12 @@ def test_prefill_schedule_max_prompt_len(use_v2_block_manager: bool):
assert len(remaining_waiting) == 0
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_prefill_schedule_token_budget(use_v2_block_manager: bool):
def test_prefill_schedule_token_budget():
"""
Test token budget respected.
"""
block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=64,
num_gpu_blocks=64)
budget = create_token_budget(token_budget=0)
......@@ -439,8 +433,7 @@ def test_prefill_schedule_token_budget(use_v2_block_manager: bool):
assert len(remaining_waiting) == 1
# Test when current_batched_tokens respected.
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=16,
num_gpu_blocks=16)
budget = create_token_budget(token_budget=60)
......@@ -467,14 +460,12 @@ def test_prefill_schedule_token_budget(use_v2_block_manager: bool):
assert len(remaining_waiting) == 0
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_prefill_schedule_max_seqs(use_v2_block_manager: bool):
def test_prefill_schedule_max_seqs():
"""
Test max seq respected.
"""
block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=64,
num_gpu_blocks=64)
budget = create_token_budget(max_num_seqs=2)
......@@ -508,15 +499,13 @@ def test_prefill_schedule_max_seqs(use_v2_block_manager: bool):
assert len(remaining_waiting) == 1
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_prefill_schedule_max_lora(use_v2_block_manager: bool):
def test_prefill_schedule_max_lora():
"""
Test max lora is respected and prioritized.
"""
block_size = 4
lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
scheduler = initialize_scheduler(lora_config=lora_config,
use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
num_cpu_blocks=64,
num_gpu_blocks=64)
......@@ -563,14 +552,12 @@ def test_prefill_schedule_max_lora(use_v2_block_manager: bool):
assert budget.num_batched_tokens == 60
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_prefill_schedule_no_block_manager_capacity(use_v2_block_manager):
def test_prefill_schedule_no_block_manager_capacity():
"""
Test sequence cannot be scheduled due to block manager has no capacity.
"""
block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
scheduler = initialize_scheduler(block_size=block_size,
num_gpu_blocks=128,
num_cpu_blocks=128)
budget = create_token_budget()
......@@ -607,14 +594,12 @@ def test_prefill_schedule_no_block_manager_capacity(use_v2_block_manager):
assert len(remaining_waiting) == 0
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_decode_schedule_preempted(use_v2_block_manager: bool):
def test_decode_schedule_preempted():
"""
Test decodes cannot be scheduled and preempted.
"""
block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=64,
num_gpu_blocks=64)
curr_loras = None
......@@ -653,14 +638,12 @@ def test_decode_schedule_preempted(use_v2_block_manager: bool):
assert output.blocks_to_copy == []
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_decode_swap_beam_search(use_v2_block_manager: bool):
def test_decode_swap_beam_search():
"""
Test best_of > 1 swap out blocks
"""
block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
scheduler = initialize_scheduler(block_size=block_size,
num_gpu_blocks=64,
num_cpu_blocks=64)
curr_loras = None
......@@ -709,14 +692,12 @@ def test_decode_swap_beam_search(use_v2_block_manager: bool):
assert output.blocks_to_copy == []
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_schedule_decode_blocks_to_copy_update(use_v2_block_manager: bool):
def test_schedule_decode_blocks_to_copy_update():
"""
Verify blocks_to_copy is updated.
"""
block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=4,
scheduler = initialize_scheduler(block_size=4,
num_cpu_blocks=16,
num_gpu_blocks=16)
_, seq_group = create_dummy_prompt("1",
......@@ -747,11 +728,9 @@ def test_schedule_decode_blocks_to_copy_update(use_v2_block_manager: bool):
assert output.blocks_to_copy == [(2, 3)]
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_schedule_swapped_simple(use_v2_block_manager: bool):
def test_schedule_swapped_simple():
block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size)
scheduler = initialize_scheduler(block_size=block_size)
curr_loras = None
blocks_to_swap_out: List[Tuple[int, int]] = []
_, seq_group = create_dummy_prompt("1",
......@@ -778,11 +757,9 @@ def test_schedule_swapped_simple(use_v2_block_manager: bool):
assert blocks_to_swap_out == blocks_to_swap_in_reverse
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_schedule_swapped_max_token_budget(use_v2_block_manager: bool):
def test_schedule_swapped_max_token_budget():
block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=32,
num_gpu_blocks=32)
curr_loras = None
......@@ -815,11 +792,9 @@ def test_schedule_swapped_max_token_budget(use_v2_block_manager: bool):
assert len(output.prefill_seq_groups) == 0
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_schedule_swapped_max_seqs(use_v2_block_manager: bool):
def test_schedule_swapped_max_seqs():
block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=64,
num_gpu_blocks=64)
curr_loras = None
......@@ -852,12 +827,10 @@ def test_schedule_swapped_max_seqs(use_v2_block_manager: bool):
assert len(output.prefill_seq_groups) == 0
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_schedule_swapped_max_loras(use_v2_block_manager: bool):
def test_schedule_swapped_max_loras():
block_size = 4
lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
scheduler = initialize_scheduler(lora_config=lora_config,
use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
num_cpu_blocks=32,
num_gpu_blocks=32)
......@@ -887,11 +860,9 @@ def test_schedule_swapped_max_loras(use_v2_block_manager: bool):
assert len(curr_loras) == 1
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_schedule_swapped_cannot_swap_in(use_v2_block_manager: bool):
def test_schedule_swapped_cannot_swap_in():
block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=32,
num_gpu_blocks=32)
curr_loras = None
......@@ -920,11 +891,9 @@ def test_schedule_swapped_cannot_swap_in(use_v2_block_manager: bool):
assert len(output.prefill_seq_groups) == 0
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_infeasible_swap(use_v2_block_manager: bool):
def test_infeasible_swap():
block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=32,
num_gpu_blocks=32)
curr_loras = None
......@@ -954,11 +923,9 @@ def test_infeasible_swap(use_v2_block_manager: bool):
assert len(output.prefill_seq_groups) == 0
@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_schedule_swapped_blocks_to_copy(use_v2_block_manager: bool):
def test_schedule_swapped_blocks_to_copy():
block_size = 4
scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager,
block_size=block_size,
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=32,
num_gpu_blocks=32)
curr_loras = None
......
......@@ -13,9 +13,10 @@ def create_dummy_prompt(
prompt_length: int,
block_size: Optional[int] = None,
lora_request: Optional[LoRARequest] = None,
use_beam_search: bool = False,
best_of: int = 1,
prompt_tokens: Optional[List[int]] = None,
min_tokens: int = 0,
max_tokens: int = 16,
) -> Tuple[Sequence, SequenceGroup]:
if not block_size:
block_size = prompt_length
......@@ -35,8 +36,9 @@ def create_dummy_prompt(
seqs=[prompt],
arrival_time=time.time(),
sampling_params=SamplingParams(
use_beam_search=use_beam_search,
best_of=best_of),
best_of=best_of,
max_tokens=max_tokens,
min_tokens=min_tokens),
lora_request=lora_request)
return prompt, seq_group
......@@ -48,7 +50,6 @@ def create_dummy_prompt_encoder_decoder(
encoder_prompt_length: int,
block_size: Optional[int] = None,
lora_request: Optional[LoRARequest] = None,
use_beam_search: bool = False,
best_of: int = 1,
) -> Tuple[Sequence, Sequence, SequenceGroup]:
if not block_size:
......@@ -81,9 +82,7 @@ def create_dummy_prompt_encoder_decoder(
from_decoder_prompt=False)
seq_group = SequenceGroup(request_id=request_id,
seqs=[decoder_prompt],
sampling_params=SamplingParams(
use_beam_search=use_beam_search,
best_of=best_of),
sampling_params=SamplingParams(best_of=best_of),
arrival_time=time.time(),
lora_request=lora_request,
encoder_seq=encoder_prompt)
......
port: 12312
served_model_name: mymodel
tensor_parallel_size: 2
......@@ -6,10 +6,10 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
to fail.
"""
import os
from dataclasses import dataclass
from typing import List, Literal, NamedTuple, Optional
import pytest
from packaging import version
from transformers import __version__ as transformers_version
from vllm.logger import init_logger
......@@ -20,52 +20,253 @@ logger = init_logger("test_pipeline_parallel")
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
@pytest.mark.parametrize(
("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, TRUST_REMOTE_CODE, "
"MODEL_NAME, DIST_BACKEND"),
[
(2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
(2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
(1, 3, 0, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
(1, 4, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
(1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
(1, 3, 0, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
(1, 4, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
(1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
(2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
(2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
# NOTE: InternVL2 multi-node tests are flaky,
# use mp backend to skip the multi-node tests
(1, 2, 1, 1, 1, "OpenGVLab/InternVL2-1B", "mp"),
(1, 2, 1, 1, 1, "OpenGVLab/InternVL2-2B", "mp"),
(1, 2, 1, 0, 1, "OpenGVLab/InternVL2-4B", "mp"),
(1, 2, 0, 1, 0, "Qwen/Qwen2-VL-2B-Instruct", "mp")
],
)
@fork_new_process_for_each_test
def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL,
TRUST_REMOTE_CODE, MODEL_NAME, DIST_BACKEND):
if VLLM_MULTI_NODE and DIST_BACKEND == "mp":
class ParallelSetup(NamedTuple):
tp_size: int
pp_size: int
eager_mode: bool
chunked_prefill: bool
@dataclass
class PPTestSettings:
parallel_setups: List[ParallelSetup]
distributed_backends: List[str]
trust_remote_code: bool
tokenizer_mode: Optional[str]
@staticmethod
def detailed(
*,
tp_base: int = 1,
pp_base: int = 2,
trust_remote_code: bool = False,
tokenizer_mode: Optional[str] = None,
):
return PPTestSettings(
parallel_setups=[
ParallelSetup(tp_size=tp_base,
pp_size=pp_base,
eager_mode=False,
chunked_prefill=False),
ParallelSetup(tp_size=tp_base,
pp_size=2 * pp_base,
eager_mode=False,
chunked_prefill=True),
ParallelSetup(tp_size=tp_base,
pp_size=2 * pp_base,
eager_mode=True,
chunked_prefill=False),
ParallelSetup(tp_size=2 * tp_base,
pp_size=pp_base,
eager_mode=False,
chunked_prefill=True),
ParallelSetup(tp_size=2 * tp_base,
pp_size=pp_base,
eager_mode=True,
chunked_prefill=False),
],
distributed_backends=["mp", "ray"],
trust_remote_code=trust_remote_code,
tokenizer_mode=tokenizer_mode,
)
@staticmethod
def fast(
*,
tp_base: int = 1,
pp_base: int = 2,
trust_remote_code: bool = False,
tokenizer_mode: Optional[str] = None,
):
return PPTestSettings(
parallel_setups=[
ParallelSetup(tp_size=tp_base,
pp_size=pp_base,
eager_mode=True,
chunked_prefill=False),
],
distributed_backends=["mp"],
trust_remote_code=trust_remote_code,
tokenizer_mode=tokenizer_mode,
)
def iter_params(self, model_name: str):
for parallel_setup in self.parallel_setups:
for distributed_backend in self.distributed_backends:
yield (model_name, parallel_setup, distributed_backend,
self.trust_remote_code, self.tokenizer_mode)
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
# The values displayed here are only a rough indicator of the size of the model
# yapf: disable
GENERATION_MODEL_SETTINGS = {
# [DETAILED TESTS]
"meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(),
# [FAST TESTS]
# Uses Llama
# "BAAI/AquilaChat-7B": PPTestSettings.fast(),
"Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(tp_base=8, trust_remote_code=True), # noqa: E501
"baichuan-inc/Baichuan-7B": PPTestSettings.fast(trust_remote_code=True),
"baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"bigscience/bloomz-1b1": PPTestSettings.fast(),
"THUDM/chatglm3-6b": PPTestSettings.fast(trust_remote_code=True),
"CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(tp_base=2, trust_remote_code=True), # noqa: E501
"databricks/dbrx-instruct": PPTestSettings.fast(tp_base=8),
"Deci/DeciLM-7B-instruct": PPTestSettings.fast(trust_remote_code=True),
"deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
"deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
"tiiuae/falcon-7b": PPTestSettings.fast(),
"google/gemma-2b": PPTestSettings.fast(),
"google/gemma-2-9b": PPTestSettings.fast(),
"gpt2": PPTestSettings.fast(),
"bigcode/starcoder": PPTestSettings.fast(),
"EleutherAI/gpt-j-6b": PPTestSettings.fast(),
"EleutherAI/pythia-12b": PPTestSettings.fast(),
"ibm/PowerLM-3b": PPTestSettings.fast(),
"ibm/PowerMoE-3b": PPTestSettings.fast(),
# Uses Llama
# "internlm/internlm-chat-7b": PPTestSettings.fast(),
"internlm/internlm2-chat-7b": PPTestSettings.fast(trust_remote_code=True),
"core42/jais-13b-chat": PPTestSettings.fast(),
# TODO: Implement PP
# "ai21labs/AI21-Jamba-1.5-Mini": PPTestSettings.fast(),
"openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(trust_remote_code=True),
"openbmb/MiniCPM3-4B": PPTestSettings.fast(trust_remote_code=True),
# Uses Llama
# "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
"mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(tp_base=4),
"mosaicml/mpt-7b": PPTestSettings.fast(),
"nvidia/Minitron-8B-Base": PPTestSettings.fast(),
"allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(),
"allenai/OLMo-1B-hf": PPTestSettings.fast(),
"facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
"OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True),
"microsoft/phi-2": PPTestSettings.fast(),
"microsoft/Phi-3-mini-4k-instruct": PPTestSettings.fast(),
"microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
# FIXME: https://github.com/vllm-project/vllm/issues/8553
# "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"adept/persimmon-8b-chat": PPTestSettings.fast(),
"Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
"Qwen/Qwen2-beta-7B-Chat": PPTestSettings.fast(),
"Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
"stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
"bigcode/starcoder2-3b": PPTestSettings.fast(),
"upstage/solar-pro-preview-instruct": PPTestSettings.fast(tp_base=2),
# FIXME: Cannot load tokenizer in latest transformers version
# "xverse/XVERSE-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
}
EMBEDDING_MODEL_SETTINGS = { # type: ignore[var-annotated]
# [FAST TESTS]
"intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(),
"Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(tp_base=4, trust_remote_code=True), # noqa: E501
}
MULTIMODAL_MODEL_SETTINGS = {
# [FAST TESTS]
"Salesforce/blip2-opt-2.7b": PPTestSettings.fast(),
"facebook/chameleon-7b": PPTestSettings.fast(),
"adept/fuyu-8b": PPTestSettings.fast(),
"OpenGVLab/InternVL2-1B": PPTestSettings.fast(trust_remote_code=True),
"llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(),
"llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(),
"llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(),
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
"openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(trust_remote_code=True),
# TODO: Implement PP
# "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
"microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"mistralai/Pixtral-12B-2409": PPTestSettings.fast(tp_base=2, tokenizer_mode="mistral"), # noqa: E501
"Qwen/Qwen-VL-Chat": PPTestSettings.fast(trust_remote_code=True),
"Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
"fixie-ai/ultravox-v0_3": PPTestSettings.fast(),
}
CONDITIONAL_GENERATION_MODEL_SETTINGS = { # type: ignore[var-annotated]
# [FAST TESTS]
# TODO: Implement PP
# "facebook/bart-base": PPTestSettings.fast(),
}
# yapf: enable
# NOTE: You can update this on your local machine to run specific tests
TEST_MODELS = [
# [LANGUAGE GENERATION]
"meta-llama/Meta-Llama-3-8B",
"ibm/PowerLM-3b",
# [LANGUAGE EMBEDDING]
"intfloat/e5-mistral-7b-instruct",
"BAAI/bge-multilingual-gemma2",
# [MULTIMODAL GENERATION]
"OpenGVLab/InternVL2-1B",
"microsoft/Phi-3-vision-128k-instruct",
"fixie-ai/ultravox-v0_3",
]
def _compare_tp(
model_name: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
trust_remote_code: bool,
tokenizer_mode: Optional[str],
num_gpus_available: int,
*,
method: Literal["generate", "encode"] = "encode",
):
tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup
if num_gpus_available < tp_size * pp_size:
pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
if VLLM_MULTI_NODE and distributed_backend == "mp":
pytest.skip("Skipping multi-node pipeline parallel test for "
"multiprocessing distributed backend")
# Skip tests that require transformers>=4.45.0
if "Qwen2-VL" in MODEL_NAME and version.parse(
transformers_version) < version.parse("4.45.0.dev0"):
pytest.skip("This test requires transformers>=4.45.0")
pp_args = [
common_args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"float16",
"--max-model-len",
"8192",
"2048",
"--max-num-seqs",
"8",
]
if chunked_prefill:
common_args.append("--enable-chunked-prefill")
if eager_mode:
common_args.append("--enforce-eager")
if trust_remote_code:
common_args.append("--trust-remote-code")
if tokenizer_mode:
common_args.extend(["--tokenizer-mode", tokenizer_mode])
if (distributed_backend == "ray" and tp_size == 2 and pp_size == 2
and chunked_prefill):
# Test Ray ADAG for a subset of the tests
pp_env = {
"VLLM_USE_RAY_COMPILED_DAG": "1",
"VLLM_USE_RAY_SPMD_WORKER": "1",
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
}
# Temporary. Currently when zeromq + SPMD is used, it does not properly
# terminate because of aDAG issue.
common_args.append("--disable-frontend-multiprocessing")
else:
pp_env = None
pp_args = [
*common_args,
"--pipeline-parallel-size",
str(PP_SIZE),
str(pp_size),
"--tensor-parallel-size",
str(TP_SIZE),
str(tp_size),
"--distributed-executor-backend",
DIST_BACKEND,
distributed_backend,
]
# compare without pipeline parallelism
......@@ -74,44 +275,103 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL,
# schedule all workers in a node other than the head node,
# which can cause the test to fail.
tp_args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"float16",
"--max-model-len",
"8192",
*common_args,
"--tensor-parallel-size",
str(max(TP_SIZE, 2)), # We only use 2 GPUs in the CI.
str(tp_size),
"--distributed-executor-backend",
"mp",
]
if CHUNKED_PREFILL:
pp_args.append("--enable-chunked-prefill")
tp_args.append("--enable-chunked-prefill")
if EAGER_MODE:
pp_args.append("--enforce-eager")
tp_args.append("--enforce-eager")
if TRUST_REMOTE_CODE:
pp_args.append("--trust-remote-code")
tp_args.append("--trust-remote-code")
pp_env = None
if (DIST_BACKEND == "ray" and TP_SIZE == 2 and PP_SIZE == 2
and CHUNKED_PREFILL):
# Test Ray ADAG for a subset of the tests
pp_env = {
"VLLM_USE_RAY_COMPILED_DAG": "1",
"VLLM_USE_RAY_SPMD_WORKER": "1",
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
}
# Temporary. Currently when zeromq + SPMD is used, it does not properly
# terminate because of aDAG issue.
pp_args.append("--disable-frontend-multiprocessing")
tp_args.append("--disable-frontend-multiprocessing")
try:
compare_two_settings(MODEL_NAME, pp_args, tp_args, pp_env)
compare_two_settings(model_name,
pp_args,
tp_args,
pp_env,
method=method)
except Exception:
if pp_env is None:
raise
else:
# Ray ADAG tests are flaky, so we don't want to fail the test
logger.exception("Ray ADAG tests failed")
@pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend",
"trust_remote_code", "tokenizer_mode"),
[
params for model_name, settings in GENERATION_MODEL_SETTINGS.items()
for params in settings.iter_params(model_name)
if model_name in TEST_MODELS
],
)
@fork_new_process_for_each_test
def test_tp_language_generation(
model_name: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
trust_remote_code: bool,
tokenizer_mode: Optional[str],
num_gpus_available,
):
_compare_tp(model_name,
parallel_setup,
distributed_backend,
trust_remote_code,
tokenizer_mode,
num_gpus_available,
method="generate")
@pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend",
"trust_remote_code", "tokenizer_mode"),
[
params for model_name, settings in EMBEDDING_MODEL_SETTINGS.items()
for params in settings.iter_params(model_name)
if model_name in TEST_MODELS
],
)
@fork_new_process_for_each_test
def test_tp_language_embedding(
model_name: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
trust_remote_code: bool,
tokenizer_mode: Optional[str],
num_gpus_available,
):
_compare_tp(model_name,
parallel_setup,
distributed_backend,
trust_remote_code,
tokenizer_mode,
num_gpus_available,
method="encode")
@pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend",
"trust_remote_code", "tokenizer_mode"),
[
params for model_name, settings in MULTIMODAL_MODEL_SETTINGS.items()
for params in settings.iter_params(model_name)
if model_name in TEST_MODELS
],
)
@fork_new_process_for_each_test
def test_tp_multimodal_generation(
model_name: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
trust_remote_code: bool,
tokenizer_mode: Optional[str],
num_gpus_available,
):
_compare_tp(model_name,
parallel_setup,
distributed_backend,
trust_remote_code,
tokenizer_mode,
num_gpus_available,
method="generate")
......@@ -42,22 +42,42 @@ def test_bad_nullable_kvs(arg):
nullable_kvs(arg)
@pytest.mark.parametrize(("arg", "expected"), [
(None, None),
("{}", {}),
('{"num_crops": 4}', {
"num_crops": 4
}),
('{"foo": {"bar": "baz"}}', {
"foo": {
"bar": "baz"
}
}),
# yapf: disable
@pytest.mark.parametrize(("arg", "expected", "option"), [
(None, None, "mm-processor-kwargs"),
("{}", {}, "mm-processor-kwargs"),
(
'{"num_crops": 4}',
{
"num_crops": 4
},
"mm-processor-kwargs"
),
(
'{"foo": {"bar": "baz"}}',
{
"foo":
{
"bar": "baz"
}
},
"mm-processor-kwargs"
),
(
'{"cast_logits_dtype":"bfloat16","sequence_parallel_norm":true,"sequence_parallel_norm_threshold":2048}',
{
"cast_logits_dtype": "bfloat16",
"sequence_parallel_norm": True,
"sequence_parallel_norm_threshold": 2048,
},
"override-neuron-config"
),
])
def test_mm_processor_kwargs_prompt_parser(arg, expected):
# yapf: enable
def test_composite_arg_parser(arg, expected, option):
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
if arg is None:
args = parser.parse_args([])
else:
args = parser.parse_args(["--mm-processor-kwargs", arg])
assert args.mm_processor_kwargs == expected
args = parser.parse_args([f"--{option}", arg])
assert getattr(args, option.replace("-", "_")) == expected
......@@ -48,9 +48,9 @@ def test_custom_executor_type_checking(model):
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
def test_custom_executor(model, tmpdir):
def test_custom_executor(model, tmp_path):
cwd = os.path.abspath(".")
os.chdir(tmpdir)
os.chdir(tmp_path)
try:
assert not os.path.exists(".marker")
......@@ -68,9 +68,9 @@ def test_custom_executor(model, tmpdir):
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
def test_custom_executor_async(model, tmpdir):
def test_custom_executor_async(model, tmp_path):
cwd = os.path.abspath(".")
os.chdir(tmpdir)
os.chdir(tmp_path)
try:
assert not os.path.exists(".marker")
......
......@@ -49,21 +49,6 @@ def assert_outputs_equal(o1: List[EmbeddingRequestOutput],
assert [o.outputs for o in o1] == [o.outputs for o in o2]
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize('prompt', PROMPTS)
def test_v1_v2_api_consistency_single_prompt_string(llm: LLM, prompt):
pooling_params = PoolingParams()
with pytest.warns(DeprecationWarning, match="'prompts'"):
v1_output = llm.encode(prompts=prompt, pooling_params=pooling_params)
v2_output = llm.encode(prompt, pooling_params=pooling_params)
assert_outputs_equal(v1_output, v2_output)
v2_output = llm.encode({"prompt": prompt}, pooling_params=pooling_params)
assert_outputs_equal(v1_output, v2_output)
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS)
def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
......@@ -79,25 +64,6 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
assert_outputs_equal(v1_output, v2_output)
@pytest.mark.skip_global_cleanup
def test_v1_v2_api_consistency_multi_prompt_string(llm: LLM):
pooling_params = PoolingParams()
with pytest.warns(DeprecationWarning, match="'prompts'"):
v1_output = llm.encode(prompts=PROMPTS, pooling_params=pooling_params)
v2_output = llm.encode(PROMPTS, pooling_params=pooling_params)
assert_outputs_equal(v1_output, v2_output)
v2_output = llm.encode(
[{
"prompt": p
} for p in PROMPTS],
pooling_params=pooling_params,
)
assert_outputs_equal(v1_output, v2_output)
@pytest.mark.skip_global_cleanup
def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
pooling_params = PoolingParams()
......
......@@ -47,23 +47,6 @@ def assert_outputs_equal(o1: List[RequestOutput], o2: List[RequestOutput]):
assert [o.outputs for o in o1] == [o.outputs for o in o2]
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize('prompt', PROMPTS)
def test_v1_v2_api_consistency_single_prompt_string(llm: LLM, prompt):
sampling_params = SamplingParams(temperature=0.0, top_p=1.0)
with pytest.warns(DeprecationWarning, match="'prompts'"):
v1_output = llm.generate(prompts=prompt,
sampling_params=sampling_params)
v2_output = llm.generate(prompt, sampling_params=sampling_params)
assert_outputs_equal(v1_output, v2_output)
v2_output = llm.generate({"prompt": prompt},
sampling_params=sampling_params)
assert_outputs_equal(v1_output, v2_output)
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS)
def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
......@@ -79,26 +62,6 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
assert_outputs_equal(v1_output, v2_output)
@pytest.mark.skip_global_cleanup
def test_v1_v2_api_consistency_multi_prompt_string(llm: LLM):
sampling_params = SamplingParams(temperature=0.0, top_p=1.0)
with pytest.warns(DeprecationWarning, match="'prompts'"):
v1_output = llm.generate(prompts=PROMPTS,
sampling_params=sampling_params)
v2_output = llm.generate(PROMPTS, sampling_params=sampling_params)
assert_outputs_equal(v1_output, v2_output)
v2_output = llm.generate(
[{
"prompt": p
} for p in PROMPTS],
sampling_params=sampling_params,
)
assert_outputs_equal(v1_output, v2_output)
@pytest.mark.skip_global_cleanup
def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
sampling_params = SamplingParams(temperature=0.0, top_p=1.0)
......
......@@ -7,7 +7,7 @@ import pytest
from vllm.entrypoints.llm import LLM
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
from ...conftest import cleanup
......@@ -31,14 +31,12 @@ def test_guided_regex(sample_regex, llm):
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
)
outputs = llm.generate(
prompts=[
f"Give an example IPv4 address with this regex: {sample_regex}"
] * 2,
sampling_params=sampling_params,
use_tqdm=True,
guided_options_request=dict(guided_regex=sample_regex))
guided_decoding=GuidedDecodingParams(regex=sample_regex))
outputs = llm.generate(prompts=[
f"Give an example IPv4 address with this regex: {sample_regex}"
] * 2,
sampling_params=sampling_params,
use_tqdm=True)
assert outputs is not None
for output in outputs:
......@@ -57,15 +55,13 @@ def test_guided_json_completion(sample_json_schema, llm):
sampling_params = SamplingParams(
temperature=1.0,
max_tokens=1000,
)
outputs = llm.generate(
prompts=[
f"Give an example JSON for an employee profile "
f"that fits this schema: {sample_json_schema}"
] * 2,
sampling_params=sampling_params,
use_tqdm=True,
guided_options_request=dict(guided_json=sample_json_schema))
guided_decoding=GuidedDecodingParams(json=sample_json_schema))
outputs = llm.generate(prompts=[
f"Give an example JSON for an employee profile "
f"that fits this schema: {sample_json_schema}"
] * 2,
sampling_params=sampling_params,
use_tqdm=True)
assert outputs is not None
......@@ -86,12 +82,11 @@ def test_guided_choice_completion(sample_guided_choice, llm):
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
)
guided_decoding=GuidedDecodingParams(choice=sample_guided_choice))
outputs = llm.generate(
prompts="The best language for type-safe systems programming is ",
sampling_params=sampling_params,
use_tqdm=True,
guided_options_request=dict(guided_choice=sample_guided_choice))
use_tqdm=True)
assert outputs is not None
for output in outputs:
......@@ -112,13 +107,13 @@ def test_guided_grammar(sample_sql_statements, llm):
temperature=0.8,
top_p=0.95,
max_tokens=1000,
)
guided_decoding=GuidedDecodingParams(grammar=sample_sql_statements))
outputs = llm.generate(
prompts=("Generate a sql state that select col_1 from "
"table_1 where it is equals to 1"),
sampling_params=sampling_params,
use_tqdm=True,
guided_options_request=dict(guided_grammar=sample_sql_statements))
)
assert outputs is not None
for output in outputs:
......@@ -140,3 +135,28 @@ def test_guided_grammar(sample_sql_statements, llm):
assert generated_text.strip() == ground_truth
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
@pytest.mark.skip_global_cleanup
def test_guided_options_request_deprecation_warning(sample_regex, llm):
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
with pytest.warns(DeprecationWarning, match="guided_options_request"):
llm.generate(prompts="This should fail",
sampling_params=sampling_params,
use_tqdm=True,
guided_options_request=dict(guided_regex=sample_regex))
@pytest.mark.skip_global_cleanup
def test_validation_against_both_guided_decoding_options(sample_regex, llm):
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
guided_decoding=GuidedDecodingParams(regex=sample_regex))
with pytest.raises(ValueError, match="Cannot set both"):
llm.generate(prompts="This should fail",
sampling_params=sampling_params,
use_tqdm=True,
guided_options_request=dict(guided_regex=sample_regex))
......@@ -21,7 +21,9 @@ def server():
"--dtype",
"bfloat16",
"--max-model-len",
"4096",
"2048",
"--max-num-seqs",
"5",
"--enforce-eager",
]
......
from http import HTTPStatus
from typing import List
import openai
import pytest
......@@ -12,8 +13,44 @@ from ...utils import RemoteOpenAIServer
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@pytest.fixture(scope='module')
def server_args(request: pytest.FixtureRequest) -> List[str]:
""" Provide extra arguments to the server via indirect parametrization
Usage:
>>> @pytest.mark.parametrize(
>>> "server_args",
>>> [
>>> ["--disable-frontend-multiprocessing"],
>>> [
>>> "--model=NousResearch/Hermes-3-Llama-3.1-70B",
>>> "--enable-auto-tool-choice",
>>> ],
>>> ],
>>> indirect=True,
>>> )
>>> def test_foo(server, client):
>>> ...
This will run `test_foo` twice with servers with:
- `--disable-frontend-multiprocessing`
- `--model=NousResearch/Hermes-3-Llama-3.1-70B --enable-auto-tool-choice`.
"""
if not hasattr(request, "param"):
return []
val = request.param
if isinstance(val, str):
return [val]
return request.param
@pytest.fixture(scope="module")
def server():
def server(server_args):
args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
......@@ -23,6 +60,7 @@ def server():
"--enforce-eager",
"--max-num-seqs",
"128",
*server_args,
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
......@@ -35,6 +73,15 @@ async def client(server):
yield async_client
@pytest.mark.parametrize(
"server_args",
[
pytest.param([], id="default-frontend-multiprocessing"),
pytest.param(["--disable-frontend-multiprocessing"],
id="disable-frontend-multiprocessing")
],
indirect=True,
)
@pytest.mark.asyncio
async def test_show_version(client: openai.AsyncOpenAI):
base_url = str(client.base_url)[:-3].strip("/")
......@@ -45,6 +92,15 @@ async def test_show_version(client: openai.AsyncOpenAI):
assert response.json() == {"version": VLLM_VERSION}
@pytest.mark.parametrize(
"server_args",
[
pytest.param([], id="default-frontend-multiprocessing"),
pytest.param(["--disable-frontend-multiprocessing"],
id="disable-frontend-multiprocessing")
],
indirect=True,
)
@pytest.mark.asyncio
async def test_check_health(client: openai.AsyncOpenAI):
base_url = str(client.base_url)[:-3].strip("/")
......
......@@ -433,18 +433,28 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
model=model_name,
messages=messages,
max_tokens=10,
extra_body=dict(min_tokens=10),
temperature=0.0,
stream=True,
stream_options={
"include_usage": True,
"continuous_usage_stats": True
"continuous_usage_stats": True,
},
)
last_completion_tokens = 0
async for chunk in stream:
assert chunk.usage.prompt_tokens >= 0
assert chunk.usage.completion_tokens >= 0
assert last_completion_tokens == 0 or \
chunk.usage.completion_tokens > last_completion_tokens or \
(
not chunk.choices and
chunk.usage.completion_tokens == last_completion_tokens
)
assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
chunk.usage.completion_tokens)
last_completion_tokens = chunk.usage.completion_tokens
assert last_completion_tokens == 10
# NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
......
......@@ -12,7 +12,7 @@ assert chatml_jinja_path.exists()
# Define models, templates, and their corresponding expected outputs
MODEL_TEMPLATE_GENERATON_OUTPUT = [
("facebook/opt-125m", chatml_jinja_path, True, """<|im_start|>user
("facebook/opt-125m", chatml_jinja_path, True, False, """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
......@@ -20,12 +20,20 @@ Hi there!<|im_end|>
What is the capital of<|im_end|>
<|im_start|>assistant
"""),
("facebook/opt-125m", chatml_jinja_path, False, """<|im_start|>user
("facebook/opt-125m", chatml_jinja_path, False, False, """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of""")
What is the capital of"""),
("facebook/opt-125m", chatml_jinja_path, False, True, """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of<|im_end|>
<|im_start|>assistant
The capital of"""),
]
TEST_MESSAGES = [
......@@ -42,6 +50,10 @@ TEST_MESSAGES = [
'content': 'What is the capital of'
},
]
ASSISTANT_MESSAGE_TO_CONTINUE = {
'role': 'assistant',
'content': 'The capital of'
}
def test_load_chat_template():
......@@ -73,10 +85,10 @@ def test_no_load_chat_template_literallike():
@pytest.mark.parametrize(
"model,template,add_generation_prompt,expected_output",
"model,template,add_generation_prompt,continue_final_message,expected_output",
MODEL_TEMPLATE_GENERATON_OUTPUT)
def test_get_gen_prompt(model, template, add_generation_prompt,
expected_output):
continue_final_message, expected_output):
# Initialize the tokenizer
tokenizer = get_tokenizer(tokenizer_name=model)
template_content = load_chat_template(chat_template=template)
......@@ -84,8 +96,11 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
# Create a mock request object using keyword arguments
mock_request = ChatCompletionRequest(
model=model,
messages=TEST_MESSAGES,
add_generation_prompt=add_generation_prompt)
messages=TEST_MESSAGES + [ASSISTANT_MESSAGE_TO_CONTINUE]
if continue_final_message else TEST_MESSAGES,
add_generation_prompt=add_generation_prompt,
continue_final_message=continue_final_message,
)
# Call the function and get the result
result = apply_hf_chat_template(
......@@ -93,6 +108,7 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
conversation=mock_request.messages,
chat_template=mock_request.chat_template or template_content,
add_generation_prompt=mock_request.add_generation_prompt,
continue_final_message=mock_request.continue_final_message,
)
# Test assertion
......
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@pytest.fixture(scope="module")
def server():
args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--max-model-len",
"8192",
"--enforce-eager",
# lora config below
"--max-num-seqs",
"128",
"--enable-chunked-prefill",
"--max-num-batched-tokens",
"1000",
# large prompts create a lot of output
"--disable-log-requests",
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client(server):
async with server.get_async_client() as async_client:
yield async_client
@pytest.mark.asyncio
async def test_completion_stream_options_and_logprobs_with_long_prompts(
client: openai.AsyncOpenAI):
# Test stream with long prompt
prompt = "What is the capital of France?" * 400
stream = await client.completions.create(
model=MODEL_NAME,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=True,
stream_options={
"include_usage": True,
"continuous_usage_stats": True,
},
logprobs=5,
)
tokens_received = 0
finished = False
async for chunk in stream:
assert chunk.usage.prompt_tokens >= 0
assert chunk.usage.completion_tokens >= 0
assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
chunk.usage.completion_tokens)
if not finished:
tokens_received += 1
assert chunk.choices[0].text
if chunk.choices[0].finish_reason is not None:
finished = True
if finished:
assert chunk.usage.completion_tokens == tokens_received
@pytest.mark.asyncio
async def test_chat_completion_stream_options_and_logprobs_with_long_prompts(
client: openai.AsyncOpenAI):
# Test stream with long prompt
messages = [{
"role": "system",
"content": "You are a helpful assistant."
}, {
"role": "user",
"content": "What is the capital of France?" * 400
}]
stream = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=5,
temperature=0.0,
stream=True,
stream_options={
"include_usage": True,
"continuous_usage_stats": True,
},
logprobs=True,
top_logprobs=5,
)
tokens_received = 0
empty_chunks_received = 0
finished = False
async for chunk in stream:
assert chunk.usage.prompt_tokens >= 0
assert chunk.usage.completion_tokens >= 0
assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
chunk.usage.completion_tokens)
if not finished:
if chunk.choices[0].delta.content == "":
# when there is no tokens generated
assert chunk.usage.completion_tokens == 0
assert chunk.choices[0].logprobs is None
empty_chunks_received += 1
else:
tokens_received += 1
if chunk.choices[0].finish_reason is not None:
finished = True
if finished:
assert chunk.usage.completion_tokens == tokens_received
assert empty_chunks_received <= 1
import json
import unittest
from vllm.entrypoints.openai.cli_args import make_arg_parser
import pytest
from vllm.entrypoints.openai.cli_args import (make_arg_parser,
validate_parsed_serve_args)
from vllm.entrypoints.openai.serving_engine import LoRAModulePath
from vllm.utils import FlexibleArgumentParser
from ...utils import VLLM_PATH
LORA_MODULE = {
"name": "module2",
"path": "/path/to/module2",
"base_model_name": "llama"
}
CHATML_JINJA_PATH = VLLM_PATH / "examples/template_chatml.jinja"
assert CHATML_JINJA_PATH.exists()
class TestLoraParserAction(unittest.TestCase):
@pytest.fixture
def serve_parser():
parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.")
return make_arg_parser(parser)
def setUp(self):
# Setting up argparse parser for tests
parser = FlexibleArgumentParser(
description="vLLM's remote OpenAI server.")
self.parser = make_arg_parser(parser)
def test_valid_key_value_format(self):
# Test old format: name=path
args = self.parser.parse_args([
'--lora-modules',
'module1=/path/to/module1',
### Tests for Lora module parsing
def test_valid_key_value_format(serve_parser):
# Test old format: name=path
args = serve_parser.parse_args([
'--lora-modules',
'module1=/path/to/module1',
])
expected = [LoRAModulePath(name='module1', path='/path/to/module1')]
assert args.lora_modules == expected
def test_valid_json_format(serve_parser):
# Test valid JSON format input
args = serve_parser.parse_args([
'--lora-modules',
json.dumps(LORA_MODULE),
])
expected = [
LoRAModulePath(name='module2',
path='/path/to/module2',
base_model_name='llama')
]
assert args.lora_modules == expected
def test_invalid_json_format(serve_parser):
# Test invalid JSON format input, missing closing brace
with pytest.raises(SystemExit):
serve_parser.parse_args([
'--lora-modules', '{"name": "module3", "path": "/path/to/module3"'
])
expected = [LoRAModulePath(name='module1', path='/path/to/module1')]
self.assertEqual(args.lora_modules, expected)
def test_valid_json_format(self):
# Test valid JSON format input
args = self.parser.parse_args([
def test_invalid_type_error(serve_parser):
# Test type error when values are not JSON or key=value
with pytest.raises(SystemExit):
serve_parser.parse_args([
'--lora-modules',
json.dumps(LORA_MODULE),
'invalid_format' # This is not JSON or key=value format
])
expected = [
LoRAModulePath(name='module2',
path='/path/to/module2',
base_model_name='llama')
]
self.assertEqual(args.lora_modules, expected)
def test_invalid_json_format(self):
# Test invalid JSON format input, missing closing brace
with self.assertRaises(SystemExit):
self.parser.parse_args([
'--lora-modules',
'{"name": "module3", "path": "/path/to/module3"'
])
def test_invalid_type_error(self):
# Test type error when values are not JSON or key=value
with self.assertRaises(SystemExit):
self.parser.parse_args([
'--lora-modules',
'invalid_format' # This is not JSON or key=value format
])
def test_invalid_json_field(self):
# Test valid JSON format but missing required fields
with self.assertRaises(SystemExit):
self.parser.parse_args([
'--lora-modules',
'{"name": "module4"}' # Missing required 'path' field
])
def test_empty_values(self):
# Test when no LoRA modules are provided
args = self.parser.parse_args(['--lora-modules', ''])
self.assertEqual(args.lora_modules, [])
def test_multiple_valid_inputs(self):
# Test multiple valid inputs (both old and JSON format)
args = self.parser.parse_args([
def test_invalid_json_field(serve_parser):
# Test valid JSON format but missing required fields
with pytest.raises(SystemExit):
serve_parser.parse_args([
'--lora-modules',
'module1=/path/to/module1',
json.dumps(LORA_MODULE),
'{"name": "module4"}' # Missing required 'path' field
])
expected = [
LoRAModulePath(name='module1', path='/path/to/module1'),
LoRAModulePath(name='module2',
path='/path/to/module2',
base_model_name='llama')
]
self.assertEqual(args.lora_modules, expected)
if __name__ == '__main__':
unittest.main()
def test_empty_values(serve_parser):
# Test when no LoRA modules are provided
args = serve_parser.parse_args(['--lora-modules', ''])
assert args.lora_modules == []
def test_multiple_valid_inputs(serve_parser):
# Test multiple valid inputs (both old and JSON format)
args = serve_parser.parse_args([
'--lora-modules',
'module1=/path/to/module1',
json.dumps(LORA_MODULE),
])
expected = [
LoRAModulePath(name='module1', path='/path/to/module1'),
LoRAModulePath(name='module2',
path='/path/to/module2',
base_model_name='llama')
]
assert args.lora_modules == expected
### Tests for serve argument validation that run prior to loading
def test_enable_auto_choice_passes_without_tool_call_parser(serve_parser):
"""Ensure validation fails if tool choice is enabled with no call parser"""
# If we enable-auto-tool-choice, explode with no tool-call-parser
args = serve_parser.parse_args(args=["--enable-auto-tool-choice"])
with pytest.raises(TypeError):
validate_parsed_serve_args(args)
def test_enable_auto_choice_passes_with_tool_call_parser(serve_parser):
"""Ensure validation passes with tool choice enabled with a call parser"""
args = serve_parser.parse_args(args=[
"--enable-auto-tool-choice",
"--tool-call-parser",
"mistral",
])
validate_parsed_serve_args(args)
def test_chat_template_validation_for_happy_paths(serve_parser):
"""Ensure validation passes if the chat template exists"""
args = serve_parser.parse_args(
args=["--chat-template",
CHATML_JINJA_PATH.absolute().as_posix()])
validate_parsed_serve_args(args)
def test_chat_template_validation_for_sad_paths(serve_parser):
"""Ensure validation fails if the chat template doesn't exist"""
args = serve_parser.parse_args(args=["--chat-template", "does/not/exist"])
with pytest.raises(ValueError):
validate_parsed_serve_args(args)
......@@ -503,8 +503,8 @@ async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
max_tokens=5,
temperature=0.0,
extra_body=dict(
# NOTE: this has to be true for n > 1 in vLLM, but not necessary
# for official client.
# NOTE: this has to be true for n > 1 in vLLM, but
# not necessary for official client.
use_beam_search=True),
)
assert len(batch.choices) == 4
......
......@@ -144,3 +144,64 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI,
0].embedding
assert responses_float.data[1].embedding == responses_default.data[
1].embedding
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[EMBEDDING_MODEL_NAME],
)
async def test_single_embedding_truncation(
embedding_client: openai.AsyncOpenAI, model_name: str):
input_texts = [
"Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
]
# test single embedding
embeddings = await embedding_client.embeddings.create(
model=model_name,
input=input_texts,
extra_body={"truncate_prompt_tokens": 10})
assert embeddings.id is not None
assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 4096
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 10
assert embeddings.usage.total_tokens == 10
input_tokens = [
1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728,
9901, 340, 2229, 385, 340, 315, 28741, 28804, 2
]
embeddings = await embedding_client.embeddings.create(
model=model_name,
input=input_tokens,
extra_body={"truncate_prompt_tokens": 10})
assert embeddings.id is not None
assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 4096
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 10
assert embeddings.usage.total_tokens == 10
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[EMBEDDING_MODEL_NAME],
)
async def test_single_embedding_truncation_invalid(
embedding_client: openai.AsyncOpenAI, model_name: str):
input_texts = [
"Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
]
with pytest.raises(openai.BadRequestError):
embeddings = await embedding_client.embeddings.create(
model=model_name,
input=input_texts,
extra_body={"truncate_prompt_tokens": 8193})
assert "error" in embeddings.object
assert "truncate_prompt_tokens value is greater than max_model_len. "\
"Please, select a smaller truncation size." in embeddings.message
......@@ -70,7 +70,6 @@ EXPECTED_VALUES = {
[("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS)],
"vllm:request_params_n": [("_count", _NUM_REQUESTS)],
"vllm:request_params_best_of": [("_count", _NUM_REQUESTS)],
"vllm:prompt_tokens": [("_total",
_NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
"vllm:generation_tokens":
......@@ -151,9 +150,6 @@ EXPECTED_METRICS = [
"vllm:request_params_n_sum",
"vllm:request_params_n_bucket",
"vllm:request_params_n_count",
"vllm:request_params_best_of_sum",
"vllm:request_params_best_of_bucket",
"vllm:request_params_best_of_count",
"vllm:num_preemptions_total",
"vllm:prompt_tokens_total",
"vllm:generation_tokens_total",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment