Commit ec5e299c authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.7.3' into v0.7.3-dev

parents 47bd229c ed6e9075
# SPDX-License-Identifier: Apache-2.0
import pytest
from mistral_common.protocol.instruct.messages import UserMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.protocol.instruct.tool_calls import Function, Tool
from vllm.transformers_utils.tokenizers.mistral import (
make_mistral_chat_completion_request)
# yapf: enable
@pytest.mark.parametrize(
    "openai_request,expected_mistral_request",
    [
        (
            # OpenAI-style request: the tool's function omits "parameters".
            {
                "messages": [
                    {
                        "role": "user",
                        "content": "What is the current local date and time?",
                    },
                ],
                "tools": [
                    {
                        "type": "function",
                        "function": {
                            "description":
                            "Fetch the current local date and time.",
                            "name": "get_current_time",
                        },
                    },
                ],
            },
            # Expected Mistral request: "parameters" is an empty dict.
            ChatCompletionRequest(
                messages=[
                    UserMessage(
                        content="What is the current local date and time?"),
                ],
                tools=[
                    Tool(
                        type="function",
                        function=Function(
                            name="get_current_time",
                            description=
                            "Fetch the current local date and time.",
                            parameters={},
                        ),
                    ),
                ],
            ),
        ),
    ],
)
def test_make_mistral_chat_completion_request(openai_request,
                                              expected_mistral_request):
    """Check the OpenAI -> Mistral chat completion request conversion.

    The messages and tools of ``openai_request`` are passed to
    ``make_mistral_chat_completion_request`` and the result must compare
    equal to ``expected_mistral_request``.
    """
    converted = make_mistral_chat_completion_request(
        openai_request["messages"],
        openai_request["tools"],
    )
    assert converted == expected_mistral_request
# SPDX-License-Identifier: Apache-2.0
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.transformers_utils.tokenizer_base import (TokenizerBase,
TokenizerRegistry)
if TYPE_CHECKING:
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
class TestTokenizer(TokenizerBase):
    """Stub tokenizer used to exercise the tokenizer registry.

    Only ``from_pretrained``, ``bos_token_id`` and ``eos_token_id`` are
    functional; every other member raises ``NotImplementedError``.
    """

    @classmethod
    def from_pretrained(cls, *args, **kwargs) -> "TestTokenizer":
        """Return a fresh ``TestTokenizer``; all arguments are ignored."""
        return TestTokenizer()

    # ------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------
    @property
    def all_special_tokens_extended(self) -> List[str]:
        raise NotImplementedError

    @property
    def all_special_tokens(self) -> List[str]:
        raise NotImplementedError

    @property
    def all_special_ids(self) -> List[int]:
        raise NotImplementedError

    @property
    def bos_token_id(self) -> int:
        """Fixed BOS id checked by the registry test."""
        return 0

    @property
    def eos_token_id(self) -> int:
        """Fixed EOS id checked by the registry test."""
        return 1

    @property
    def sep_token(self) -> str:
        raise NotImplementedError

    @property
    def pad_token(self) -> str:
        raise NotImplementedError

    @property
    def is_fast(self) -> bool:
        raise NotImplementedError

    @property
    def vocab_size(self) -> int:
        raise NotImplementedError

    @property
    def max_token_id(self) -> int:
        raise NotImplementedError

    # ------------------------------------------------------------------
    # Methods (all unimplemented)
    # ------------------------------------------------------------------
    def __call__(
        self,
        text: Union[str, List[str], List[int]],
        text_pair: Optional[str] = None,
        add_special_tokens: bool = False,
        truncation: bool = False,
        max_length: Optional[int] = None,
    ):
        raise NotImplementedError

    def get_vocab(self) -> Dict[str, int]:
        raise NotImplementedError

    def get_added_vocab(self) -> Dict[str, int]:
        raise NotImplementedError

    def encode_one(
        self,
        text: str,
        truncation: bool = False,
        max_length: Optional[int] = None,
    ) -> List[int]:
        raise NotImplementedError

    def encode(
        self,
        text: str,
        add_special_tokens: Optional[bool] = None,
    ) -> List[int]:
        raise NotImplementedError

    def apply_chat_template(
        self,
        messages: List["ChatCompletionMessageParam"],
        tools: Optional[List[Dict[str, Any]]] = None,
        **kwargs,
    ) -> List[int]:
        raise NotImplementedError

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        raise NotImplementedError

    def decode(
        self,
        ids: Union[List[int], int],
        skip_special_tokens: bool = True,
    ) -> str:
        raise NotImplementedError

    def convert_ids_to_tokens(
        self,
        ids: List[int],
        skip_special_tokens: bool = True,
    ) -> List[str]:
        raise NotImplementedError
def test_customized_tokenizer():
    """Register ``TestTokenizer`` and fetch it through both entry points.

    The tokenizer is looked up first via ``TokenizerRegistry.get_tokenizer``
    and then via ``get_tokenizer`` with ``tokenizer_mode="custom"``; both
    must yield a ``TestTokenizer`` exposing the stub BOS/EOS ids.
    """

    def check(candidate):
        assert isinstance(candidate, TestTokenizer)
        assert candidate.bos_token_id == 0
        assert candidate.eos_token_id == 1

    TokenizerRegistry.register(
        "test_tokenizer",
        "tests.tokenization.test_tokenizer_registry",
        "TestTokenizer",
    )

    check(TokenizerRegistry.get_tokenizer("test_tokenizer"))
    check(get_tokenizer("test_tokenizer", tokenizer_mode="custom"))
......@@ -52,8 +52,9 @@ if current_platform.is_rocm():
finally:
amdsmi_shut_down()
elif current_platform.is_cuda():
from pynvml import (nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo,
nvmlInit, nvmlShutdown)
from vllm.third_party.pynvml import (nvmlDeviceGetHandleByIndex,
nvmlDeviceGetMemoryInfo, nvmlInit,
nvmlShutdown)
@contextmanager
def _nvml():
......
......@@ -5,10 +5,11 @@ import pytest
from vllm.multimodal.inputs import MultiModalKwargs
from vllm.sampling_params import SamplingParams
from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue,
KVCacheBlock,
KVCacheBlock, PrefixCachingMetrics,
generate_block_hash_extra_keys,
hash_block_tokens,
hash_request_tokens)
from vllm.v1.metrics.stats import PrefixCacheStats
from vllm.v1.request import Request
......@@ -163,7 +164,7 @@ def test_generate_block_hash_extra_keys():
# Test with no overlap
extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 6, 10, 0)
assert extra_keys == ()
assert extra_keys is None
assert next_mm_idx == 1
# Test with multiple extra keys
......@@ -277,3 +278,39 @@ def test_hash_request_tokens_no_mm_inputs():
assert block_hashes[0].extra_keys is None
assert block_hashes[1].token_ids == (3, 4, 5)
assert block_hashes[1].extra_keys is None
def test_metrics():
    """Exercise ``PrefixCachingMetrics`` with an interval of 5.

    Observations are fed in one at a time and the reported hit rate (and,
    once the oldest observation has been dropped, the aggregated counters)
    are checked, followed by a ``reset()`` that must clear everything.
    """

    def make_stats(requests, queries, hits):
        return PrefixCacheStats(requests=requests,
                                queries=queries,
                                hits=hits)

    metrics = PrefixCachingMetrics(interval=5)
    # Nothing observed yet.
    assert metrics.hit_rate == 0.0

    # Single observation: 9 / 20 = 0.45
    metrics.observe(make_stats(1, 20, 9))
    assert metrics.hit_rate == 0.45

    # Two observations: (9 + 16) / (20 + 80) = 25 / 100 = 0.25
    metrics.observe(make_stats(4, 80, 16))
    assert metrics.hit_rate == 0.25

    # Third observation: (20, 9) is removed and (10, 2) is added,
    # leaving 18 / 90 = 0.2 over 5 requests.
    metrics.observe(make_stats(1, 10, 2))
    assert metrics.aggregated_requests == 5
    assert metrics.aggregated_query_total == 90
    assert metrics.aggregated_query_hit == 18
    assert metrics.hit_rate == 0.2

    # After a reset every counter and the queue are empty again.
    metrics.reset()
    assert metrics.hit_rate == 0.0
    assert metrics.aggregated_requests == 0
    assert metrics.aggregated_query_total == 0
    assert metrics.aggregated_query_hit == 0
    assert not metrics.query_queue
......@@ -51,7 +51,7 @@ def test_prefill():
all_token_ids = common_token_ids + unique_token_ids
req0 = make_request("0", all_token_ids)
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
assert len(req0.kv_block_hashes) == 3
assert len(manager.req_to_block_hashes[req0.request_id]) == 3
assert not computed_blocks
assert num_computed_tokens == 0
blocks = manager.allocate_slots(req0, 55, computed_blocks)
......@@ -76,7 +76,7 @@ def test_prefill():
unique_token_ids = [3] * 5
req1 = make_request("1", common_token_ids + unique_token_ids)
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
assert len(req1.kv_block_hashes) == 3
assert len(manager.req_to_block_hashes[req1.request_id]) == 3
assert [b.block_id for b in computed_blocks] == [0, 1, 2]
assert num_computed_tokens == 3 * 16
num_new_tokens = 53 - 3 * 16
......@@ -107,7 +107,7 @@ def test_prefill():
unique_token_ids = [3] * 6
req2 = make_request("2", common_token_ids + unique_token_ids)
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
assert len(req2.kv_block_hashes) == 3
assert len(manager.req_to_block_hashes[req2.request_id]) == 3
assert [b.block_id for b in computed_blocks] == [0, 1, 2]
assert num_computed_tokens == 3 * 16
num_new_tokens = 53 - 3 * 16
......@@ -494,10 +494,11 @@ def test_mm_prefix_caching():
# Completed block should have hashes with extra keys.
assert not computed_blocks
assert num_computed_tokens == 0
assert len(req0.kv_block_hashes) == 3
assert req0.kv_block_hashes[0].extra_keys == ("aaa", )
assert req0.kv_block_hashes[1].extra_keys == ("aaa", "bbb")
assert req0.kv_block_hashes[2].extra_keys == ("bbb", )
block_hashes = manager.req_to_block_hashes[req0.request_id]
assert len(block_hashes) == 3
assert block_hashes[0].extra_keys == ("aaa", )
assert block_hashes[1].extra_keys == ("aaa", "bbb")
assert block_hashes[2].extra_keys == ("bbb", )
blocks = manager.allocate_slots(req0, 59, computed_blocks)
assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]
......@@ -510,8 +511,8 @@ def test_mm_prefix_caching():
assert new_blocks is not None and len(new_blocks) == 0
# The just completed block should have hashes with extra keys.
assert len(req0.kv_block_hashes) == 4
assert req0.kv_block_hashes[3].extra_keys == ("ccc", )
assert len(block_hashes) == 4
assert block_hashes[3].extra_keys == ("ccc", )
# Cache hit.
unique_token_ids = [-1] * 7 + [200] * 5
......@@ -613,7 +614,7 @@ def test_reset_prefix_cache():
all_token_ids = full_block_token_ids + unique_token_ids
req1 = make_request("1", all_token_ids)
computed_blocks, _ = manager.get_computed_blocks(req1)
assert len(req1.kv_block_hashes) == 3
assert len(manager.req_to_block_hashes[req1.request_id]) == 3
assert len(computed_blocks) == 3
blocks = manager.allocate_slots(req1, 7, computed_blocks)
assert [b.block_id for b in blocks] == [4]
......
......@@ -4,10 +4,12 @@ from typing import List, Optional
from vllm.config import CacheConfig, ModelConfig, SchedulerConfig
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
from vllm.sampling_params import SamplingParams
from vllm.v1.core.scheduler import Scheduler
from vllm.v1.core.scheduler import Scheduler, SchedulerOutput
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus
EOS_TOKEN_ID = 50256
def create_scheduler(
model: str = "facebook/opt-125m",
......@@ -38,15 +40,21 @@ def create_scheduler(
return Scheduler(scheduler_config,
model_config,
cache_config,
lora_config=None)
speculative_config=None,
lora_config=None,
log_stats=True)
def create_requests(
num_requests: int,
num_tokens: int = 10,
mm_positions: Optional[List[PlaceholderRange]] = None,
max_tokens: int = 16,
stop_token_ids: Optional[List[int]] = None,
):
sampling_params = SamplingParams()
sampling_params = SamplingParams(ignore_eos=False,
max_tokens=max_tokens,
stop_token_ids=stop_token_ids)
requests = []
for i in range(num_requests):
if mm_positions is not None:
......@@ -63,7 +71,7 @@ def create_requests(
multi_modal_inputs=mm_inputs,
multi_modal_placeholders=mm_position,
multi_modal_hashes=None,
eos_token_id=None,
eos_token_id=EOS_TOKEN_ID,
arrival_time=0,
)
requests.append(request)
......@@ -194,9 +202,10 @@ def test_schedule_partial_requests():
model_runner_output = ModelRunnerOutput(
req_ids=[request.request_id for request in requests],
req_id_to_index=req_to_index,
sampled_token_ids=[0] * len(requests),
logprob_token_ids_cpu=None,
logprobs_cpu=None,
sampled_token_ids=[[0] for _ in range(len(requests))],
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
)
scheduler.update_from_output(output, model_runner_output)
......@@ -212,3 +221,243 @@ def test_schedule_partial_requests():
assert output.num_scheduled_tokens[requests[0].request_id] == 1
assert output.num_scheduled_tokens[requests[1].request_id] == 700
assert requests[2].request_id not in output.num_scheduled_tokens
def test_stop_via_update_from_output():
    """Test stopping behavior through update_from_output.

    Four scenarios, each on a fresh scheduler: stop on the EOS token, stop
    on a custom stop token, stop on ``max_tokens``, and continue past EOS
    when ``ignore_eos`` is set.
    """

    def mark_running(sched, reqs):
        # Put the requests straight into the running state with their
        # existing tokens counted as already computed.
        for r in reqs:
            r.num_computed_tokens = r.num_tokens
            sched.requests[r.request_id] = r
            sched.running.append(r)
            sched.scheduled_req_ids.add(r.request_id)

    def run_step(sched, reqs, num_scheduled, spec_tokens, sampled):
        # Build one synthetic scheduler/model output pair (entries are
        # given per request, in request order) and feed it to the
        # scheduler.
        req_ids = [r.request_id for r in reqs]
        scheduler_output = SchedulerOutput(
            scheduled_new_reqs=[],
            scheduled_cached_reqs=[],
            num_scheduled_tokens=dict(zip(req_ids, num_scheduled)),
            total_num_scheduled_tokens=sum(num_scheduled),
            scheduled_encoder_inputs={},
            scheduled_spec_decode_tokens=dict(zip(req_ids, spec_tokens)),
            num_common_prefix_blocks=0,
            finished_req_ids=set(),
            free_encoder_input_ids=[],
        )
        model_output = ModelRunnerOutput(
            req_ids=req_ids,
            req_id_to_index={rid: i
                             for i, rid in enumerate(req_ids)},
            sampled_token_ids=sampled,
            spec_token_ids=None,
            logprobs=None,
            prompt_logprobs_dict={},
        )
        sched.update_from_output(scheduler_output, model_output)

    # Test case 1: Stop on EOS token
    scheduler = create_scheduler()
    requests = create_requests(num_requests=2, max_tokens=10)
    mark_running(scheduler, requests)
    run_step(
        scheduler,
        requests,
        num_scheduled=[1, 2],
        spec_tokens=[[], [10]],
        # First request hits EOS, second continues
        sampled=[[EOS_TOKEN_ID], [10, 11]],
    )

    # Verify first request stopped, second continues
    assert len(scheduler.running) == 1
    assert scheduler.running[0].request_id == requests[1].request_id
    assert requests[0].status == RequestStatus.FINISHED_STOPPED
    assert requests[0].request_id in scheduler.finished_req_ids
    assert list(requests[0].output_token_ids) == [EOS_TOKEN_ID]
    assert list(requests[1].output_token_ids) == [10, 11]

    # Test case 2: Stop on custom stop token
    scheduler = create_scheduler()
    requests = create_requests(num_requests=2,
                               max_tokens=10,
                               stop_token_ids=[42, 43])
    mark_running(scheduler, requests)
    run_step(
        scheduler,
        requests,
        num_scheduled=[3, 2],
        spec_tokens=[[10, 42], [13]],
        # First request hits stop token
        sampled=[[10, 42, 12], [13, 14]],
    )

    # Verify first request stopped on custom token
    assert len(scheduler.running) == 1
    assert scheduler.running[0].request_id == requests[1].request_id
    assert requests[0].status == RequestStatus.FINISHED_STOPPED
    assert requests[0].stop_reason == 42
    assert requests[0].request_id in scheduler.finished_req_ids
    assert list(requests[0].output_token_ids) == [10, 42]
    assert list(requests[1].output_token_ids) == [13, 14]

    # Test case 3: Stop on max tokens
    scheduler = create_scheduler()
    requests = create_requests(num_requests=2, max_tokens=2)
    mark_running(scheduler, requests)
    run_step(
        scheduler,
        requests,
        num_scheduled=[3, 1],
        spec_tokens=[[10, 11], []],
        # First request exceeds max_tokens
        sampled=[[10, 11, 12], [13]],
    )

    # Verify first request stopped due to length
    assert len(scheduler.running) == 1
    assert scheduler.running[0].request_id == requests[1].request_id
    assert requests[0].status == RequestStatus.FINISHED_LENGTH_CAPPED
    assert requests[0].request_id in scheduler.finished_req_ids
    # Truncated to max_tokens
    assert list(requests[0].output_token_ids) == [10, 11]
    assert list(requests[1].output_token_ids) == [13]

    # Test case 4: Ignore EOS flag
    scheduler = create_scheduler()
    requests = create_requests(num_requests=1, max_tokens=10)
    requests[0].sampling_params.ignore_eos = True
    mark_running(scheduler, requests)
    run_step(
        scheduler,
        requests,
        num_scheduled=[3],
        spec_tokens=[[EOS_TOKEN_ID, 10]],
        sampled=[[EOS_TOKEN_ID, 10, 11]],
    )

    # Verify request continues past EOS
    assert len(scheduler.running) == 1
    assert not requests[0].is_finished()
    assert list(requests[0].output_token_ids) == [EOS_TOKEN_ID, 10, 11]
def test_schedule_concurrent_batches():
    """Schedule two requests as two separate in-flight batches.

    Each request is added and scheduled on its own, then the model outputs
    are applied one batch at a time with a further ``schedule()`` call in
    between.
    """

    def single_token_output(req):
        # Model output for a batch holding only ``req``, sampling token 0.
        return ModelRunnerOutput(
            req_ids=[req.request_id],
            req_id_to_index={req.request_id: 0},
            sampled_token_ids=[[0]],
            spec_token_ids=None,
            logprobs=None,
            prompt_logprobs_dict={},
        )

    scheduler = create_scheduler(max_num_batched_tokens=1024, max_num_seqs=2)
    requests = create_requests(num_requests=2, num_tokens=512)
    first_req, second_req = requests[0], requests[1]

    # Schedule the first request.
    scheduler.add_request(first_req)
    scheduler_output0 = scheduler.schedule()
    assert len(scheduler_output0.scheduled_new_reqs) == 1
    assert scheduler_output0.num_scheduled_tokens[first_req.request_id] == 512

    # The first request is still running, so only schedule the second request.
    scheduler.add_request(second_req)
    scheduler_output1 = scheduler.schedule()
    assert len(scheduler_output1.scheduled_new_reqs) == 1
    assert scheduler_output1.num_scheduled_tokens[second_req.request_id] == 512

    # Model output of the first request.
    scheduler.update_from_output(scheduler_output0,
                                 single_token_output(first_req))

    # Schedule the next step.
    # The first request can be scheduled again while the second
    # request is still running.
    scheduler_output2 = scheduler.schedule()
    assert scheduler_output2.num_scheduled_tokens[first_req.request_id] == 1

    # Model output of the second request.
    scheduler.update_from_output(scheduler_output1,
                                 single_token_output(second_req))
# SPDX-License-Identifier: Apache-2.0
import pytest
from vllm import LLM, SamplingParams
@pytest.fixture
def test_prompts():
    """Provide the two prompts used by the n-gram correctness test."""
    prompts = [
        "Can you repeat the sentence ten times, this is a sentence.",
        "Can you repeat the sentence ten times, this is a test.",
    ]
    return prompts
@pytest.fixture
def sampling_config():
    """Provide greedy sampling parameters (temperature 0, 30 tokens max)."""
    # Only support greedy for now
    greedy_params = SamplingParams(
        temperature=0,
        max_tokens=30,
        ignore_eos=False,
    )
    return greedy_params
@pytest.fixture
def model_name():
    """Provide the model identifier used by the n-gram correctness test."""
    name = "meta-llama/Meta-Llama-3-8B-Instruct"
    return name
def test_ngram_correctness(monkeypatch, test_prompts, sampling_config,
                           model_name):
    """Compare a plain LLM against an n-gram speculative LLM.

    Both engines generate for the same prompts with the same sampling
    parameters, and the generated text of each pair of outputs must match.
    """
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")

        # Reference run without speculative decoding.
        ref_llm = LLM(model=model_name)
        ref_outputs = ref_llm.generate(test_prompts, sampling_config)
        del ref_llm

        # Same model with n-gram prompt-lookup speculation enabled.
        spec_llm = LLM(
            model=model_name,
            speculative_model='[ngram]',
            ngram_prompt_lookup_max=5,
            ngram_prompt_lookup_min=3,
            num_speculative_tokens=3,
        )
        spec_outputs = spec_llm.generate(test_prompts, sampling_config)

        for ref_output, spec_output in zip(ref_outputs, spec_outputs):
            ref_text = ref_output.outputs[0].text
            spec_text = spec_output.outputs[0].text
            assert ref_text == spec_text, (f"ref_output: {ref_text},"
                                           f"spec_output: {spec_text}")
        del spec_llm
# SPDX-License-Identifier: Apache-2.0
from typing import List, Tuple
import pytest
import torch
from transformers import AutoTokenizer
from tests.v1.engine.utils import (NUM_PROMPT_LOGPROBS_UNDER_TEST,
NUM_SAMPLE_LOGPROBS_UNDER_TEST, PROMPT_LEN,
TOKENIZER_NAME,
DummyOutputProcessorTestVectors,
generate_dummy_prompt_logprobs_tensors,
generate_dummy_sample_logprobs)
from vllm.engine.arg_utils import EngineArgs
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
from tests.v1.engine.utils import FULL_STRINGS # isort: skip
EngineCoreSampleLogprobsType = List[Tuple[torch.Tensor, torch.Tensor]]
EngineCorePromptLogprobsType = Tuple[torch.Tensor, torch.Tensor]
def _build_test_vectors_no_logprobs() -> DummyOutputProcessorTestVectors:
    """Generate output processor dummy test vectors, without logprobs

    Every string in ``FULL_STRINGS`` is tokenized once; the first
    ``PROMPT_LEN`` token ids form the prompt and the remainder form the
    dummy generation.

    Returns:
      DummyOutputProcessorTestVectors instance with no logprobs
    """

    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
    vllm_config = EngineArgs(model=TOKENIZER_NAME).create_engine_config()

    # Tokenize each string under test a single time, then split the ids
    # into prompt and dummy generated tokens.
    full_tokens = [tokenizer(text).input_ids for text in FULL_STRINGS]
    prompt_tokens = [token_ids[:PROMPT_LEN] for token_ids in full_tokens]
    generation_tokens = [token_ids[PROMPT_LEN:] for token_ids in full_tokens]

    # Generate prompt strings
    prompt_strings = [
        tokenizer.decode(token_ids, skip_special_tokens=True)
        for token_ids in prompt_tokens
    ]
    prompt_strings_len = [
        len(prompt_string) for prompt_string in prompt_strings
    ]

    # The generation string is whatever follows the decoded prompt in the
    # original full string.
    generation_strings = [
        text[prompt_len:]
        for text, prompt_len in zip(FULL_STRINGS, prompt_strings_len)
    ]

    return DummyOutputProcessorTestVectors(
        tokenizer=tokenizer,
        tokenizer_group=init_tokenizer_from_configs(
            vllm_config.model_config, vllm_config.scheduler_config,
            vllm_config.parallel_config, vllm_config.lora_config),
        vllm_config=vllm_config,
        full_tokens=full_tokens,
        prompt_tokens=prompt_tokens,
        generation_tokens=generation_tokens,
        prompt_strings=prompt_strings,
        prompt_strings_len=prompt_strings_len,
        generation_strings=generation_strings,
        prompt_logprobs=[],
        generation_logprobs=[])
@pytest.fixture
def dummy_test_vectors() -> DummyOutputProcessorTestVectors:
    """Generate output processor dummy test vectors, with logprobs

    Starts from the logprob-free vectors and fills in dummy sample
    logprobs (one entry per generation token list) and dummy prompt
    logprobs (one entry per prompt token list).

    Returns:
      DummyOutputProcessorTestVectors instance with logprobs
    """
    # Build dummy test vectors without logprobs
    vectors = _build_test_vectors_no_logprobs()

    # Inject logprobs into dummy test vectors data structure
    sample_logprobs = []
    for generated in vectors.generation_tokens:
        sample_logprobs.append(
            generate_dummy_sample_logprobs(
                sampled_tokens_list=generated,
                num_logprobs=NUM_SAMPLE_LOGPROBS_UNDER_TEST,
                tokenizer=vectors.tokenizer))
    vectors.generation_logprobs = sample_logprobs

    prompt_logprobs = []
    for prompt in vectors.prompt_tokens:
        prompt_logprobs.append(
            generate_dummy_prompt_logprobs_tensors(
                prompt_tokens_list=prompt,
                num_logprobs=NUM_PROMPT_LOGPROBS_UNDER_TEST,
                tokenizer=vectors.tokenizer))
    vectors.prompt_logprobs = prompt_logprobs

    return vectors
......@@ -2,13 +2,16 @@
import asyncio
from contextlib import ExitStack
from typing import List, Tuple
from typing import List, Optional, Tuple
import os
import pytest
from tests.v1.engine.utils import PLP_APC_UNSUPPORTED_MSG
from vllm import SamplingParams
from vllm.assets.image import ImageAsset
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.inputs import PromptType
from vllm.platforms import current_platform
from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine.async_llm import AsyncLLM
......@@ -18,20 +21,46 @@ if not current_platform.is_cuda():
pytest.skip(reason="V1 currently only supported on CUDA.",
allow_module_level=True)
ENGINE_ARGS = AsyncEngineArgs(model=os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B"),
TEXT_ENGINE_ARGS = AsyncEngineArgs(model=os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B"),
enforce_eager=True,
disable_log_requests=True)
VISION_ENGINE_ARGS = AsyncEngineArgs(model="Qwen/Qwen2-VL-2B-Instruct",
enforce_eager=True,
disable_log_requests=True)
async def generate(engine: AsyncLLM, request_id: str,
TEXT_PROMPT = "Hello my name is Robert and"
VISION_PROMPT_TEMPLATE = (
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
"\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
"What is in the image?<|im_end|>\n"
"<|im_start|>assistant\n")
VISION_PROMPT = {
"prompt": VISION_PROMPT_TEMPLATE,
"multi_modal_data": {
"image": ImageAsset("stop_sign").pil_image
}
}
async def generate(engine: AsyncLLM,
request_id: str,
prompt: PromptType,
output_kind: RequestOutputKind,
max_tokens: int) -> Tuple[int, str]:
max_tokens: int,
prompt_logprobs: Optional[int] = None) -> Tuple[int, str]:
# Ensure generate doesn't complete too fast for cancellation test.
await asyncio.sleep(0.2)
count = 0
sampling_params = SamplingParams(max_tokens=max_tokens,
ignore_eos=True,
output_kind=output_kind,
temperature=0)
temperature=0,
prompt_logprobs=prompt_logprobs)
async for out in engine.generate(request_id=request_id,
prompt="Hello my name is Robert and",
prompt=prompt,
sampling_params=sampling_params):
num_tokens = len(out.outputs[0].token_ids)
......@@ -48,17 +77,58 @@ async def generate(engine: AsyncLLM, request_id: str,
@pytest.mark.parametrize(
"output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
@pytest.mark.asyncio
async def test_load(monkeypatch, output_kind: RequestOutputKind):
async def test_async_llm_refuses_prompt_logprobs_with_apc(
        monkeypatch, output_kind: RequestOutputKind):
    """Expect a ``ValueError`` when prompt logprobs meet prefix caching.

    An ``AsyncLLM`` is created with automatic prefix caching enabled and a
    request with ``prompt_logprobs`` set is issued; the raised error message
    must equal ``PLP_APC_UNSUPPORTED_MSG``. The engine is always shut down.
    """
    # TODO(rickyx): Remove monkeypatch VLLM_USE_V1 setting once we have a
    # better way to test V1 so that in the future when we switch, we don't
    # have to change all the tests.
    monkeypatch.setenv("VLLM_USE_V1", "1")

    # Create AsyncLLM engine with APC
    engine = AsyncLLM.from_engine_args(
        AsyncEngineArgs(
            model="facebook/opt-125m",
            enable_prefix_caching=True,
            gpu_memory_utilization=0.8,
            disable_log_requests=True,
        ))

    try:
        with pytest.raises(ValueError) as excinfo:
            # Issue a request with prompt logprobs enabled, which should fail
            failing_request = generate(engine,
                                       "request-0",
                                       TEXT_PROMPT,
                                       output_kind,
                                       10,
                                       prompt_logprobs=5)
            await asyncio.create_task(failing_request)
        # Validate exception string is correct
        assert str(excinfo.value) == PLP_APC_UNSUPPORTED_MSG
    finally:
        # Shut down engine
        engine.shutdown()
@pytest.mark.parametrize(
"output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
@pytest.mark.parametrize("engine_args_and_prompt",
[(TEXT_ENGINE_ARGS, TEXT_PROMPT),
(VISION_ENGINE_ARGS, VISION_PROMPT)])
@pytest.mark.asyncio
async def test_load(monkeypatch, output_kind: RequestOutputKind,
engine_args_and_prompt: Tuple[AsyncEngineArgs,
PromptType]):
# TODO(rickyx): Remove monkeypatch once we have a better way to test V1
# so that in the future when we switch, we don't have to change all the
# tests.
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
engine_args, prompt = engine_args_and_prompt
engine = AsyncLLM.from_engine_args(ENGINE_ARGS)
engine = AsyncLLM.from_engine_args(engine_args)
after.callback(engine.shutdown)
NUM_REQUESTS = 10000
NUM_REQUESTS = 100
NUM_EXPECTED_TOKENS = 10
request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)]
......@@ -68,7 +138,7 @@ async def test_load(monkeypatch, output_kind: RequestOutputKind):
for request_id in request_ids:
tasks.append(
asyncio.create_task(
generate(engine, request_id, output_kind,
generate(engine, request_id, prompt, output_kind,
NUM_EXPECTED_TOKENS)))
# Confirm that we got all the EXPECTED tokens from the requests.
......@@ -87,13 +157,19 @@ async def test_load(monkeypatch, output_kind: RequestOutputKind):
@pytest.mark.parametrize(
"output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
@pytest.mark.parametrize("engine_args_and_prompt",
[(TEXT_ENGINE_ARGS, TEXT_PROMPT),
(VISION_ENGINE_ARGS, VISION_PROMPT)])
@pytest.mark.asyncio
async def test_abort(monkeypatch, output_kind: RequestOutputKind):
async def test_abort(monkeypatch, output_kind: RequestOutputKind,
engine_args_and_prompt: Tuple[AsyncEngineArgs,
PromptType]):
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
engine_args, prompt = engine_args_and_prompt
engine = AsyncLLM.from_engine_args(ENGINE_ARGS)
engine = AsyncLLM.from_engine_args(engine_args)
after.callback(engine.shutdown)
NUM_REQUESTS = 100
......@@ -107,7 +183,7 @@ async def test_abort(monkeypatch, output_kind: RequestOutputKind):
for request_id in request_ids:
tasks.append(
asyncio.create_task(
generate(engine, request_id, output_kind,
generate(engine, request_id, prompt, output_kind,
NUM_EXPECTED_TOKENS)))
# API server cancels requests when they disconnect.
......@@ -133,7 +209,8 @@ async def test_abort(monkeypatch, output_kind: RequestOutputKind):
# Confirm we can do another generation.
request_id = f"request-{REQUEST_IDS_TO_ABORT[0]}"
task = asyncio.create_task(
generate(engine, request_id, output_kind, NUM_EXPECTED_TOKENS))
generate(engine, request_id, prompt, output_kind,
NUM_EXPECTED_TOKENS))
num_generated_tokens, request_id = await task
assert num_generated_tokens == NUM_EXPECTED_TOKENS
assert not engine.output_processor.has_unfinished_requests()
# SPDX-License-Identifier: Apache-2.0
import copy
import threading
import time
import uuid
from concurrent.futures import Future
import os
import pytest
......@@ -13,7 +16,9 @@ from vllm.engine.arg_utils import EngineArgs
from vllm.platforms import current_platform
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.core import EngineCore
from vllm.v1.executor.abstract import Executor
from vllm.v1.executor.abstract import Executor, UniProcExecutor
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.outputs import ModelRunnerOutput
from ...utils import models_path_prefix
if not current_platform.is_cuda():
......@@ -52,7 +57,8 @@ def test_engine_core(monkeypatch):
executor_class = Executor.get_class(vllm_config)
engine_core = EngineCore(vllm_config=vllm_config,
executor_class=executor_class)
executor_class=executor_class,
log_stats=True)
"""Test basic request lifecycle."""
# First request.
......@@ -159,7 +165,8 @@ def test_engine_core_advanced_sampling(monkeypatch):
executor_class = Executor.get_class(vllm_config)
engine_core = EngineCore(vllm_config=vllm_config,
executor_class=executor_class)
executor_class=executor_class,
log_stats=True)
"""Test basic request lifecycle."""
# First request.
request: EngineCoreRequest = make_request()
......@@ -191,3 +198,85 @@ def test_engine_core_advanced_sampling(monkeypatch):
)
engine_core.add_request(request2)
_check_engine_state()
@fork_new_process_for_each_test
def test_engine_core_concurrent_batches(monkeypatch):
    """
    Test that the engine can handle multiple concurrent batches.

    A dummy executor reports ``max_concurrent_batches == 2`` and makes
    ``execute_model`` return a ``Future`` completed by a background thread.
    Two requests are added, the batch queue is filled, and the engine is
    stepped until both requests have finished.
    """

    def make_request_with_max_tokens(max_tokens: int) -> EngineCoreRequest:
        request = make_request()
        request.sampling_params.max_tokens = max_tokens
        return request

    class DummyExecutor(UniProcExecutor):

        def initialize(self, kv_cache_config: KVCacheConfig) -> None:
            super().initialize(kv_cache_config)

            # This executor actually can only run 1 batch at a time
            self.semaphore = threading.Semaphore(1)

        def execute_model(
            self,
            scheduler_output,
        ) -> Future[ModelRunnerOutput]:
            """Make execute_model non-blocking."""
            future: Future[ModelRunnerOutput] = Future()

            def _thread_wrapper(scheduler_output, future):
                try:
                    with self.semaphore:
                        output = self.collective_rpc(
                            "execute_model", args=(scheduler_output, ))
                        # Make a copy because output[0] may be reused
                        # by the next batch.
                        output = copy.deepcopy(output[0])
                        future.set_result(output)
                except Exception as exc:
                    # Forward the failure to whoever waits on the future;
                    # otherwise the future would never complete and the
                    # test would hang where it should fail.
                    future.set_exception(exc)

            threading.Thread(target=_thread_wrapper,
                             args=(scheduler_output, future)).start()
            return future

        @property
        def max_concurrent_batches(self) -> int:
            return 2

    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")

        engine_args = EngineArgs(
            model=MODEL_NAME,
            # To test concurrent batches.
            max_num_seqs=2,
            # Avoid all requests being scheduled once.
            enable_prefix_caching=False,
            max_num_batched_tokens=10,
        )
        vllm_config = engine_args.create_engine_config()
        engine_core = EngineCore(vllm_config=vllm_config,
                                 log_stats=False,
                                 executor_class=DummyExecutor)
        assert engine_core.batch_queue is not None

        # Add two requests in a row.
        req = make_request_with_max_tokens(5)
        engine_core.add_request(req)
        req = make_request_with_max_tokens(5)
        engine_core.add_request(req)

        # First saturate the batch queue.
        assert engine_core.step_with_batch_queue() is None
        assert engine_core.batch_queue.qsize() == 1
        assert engine_core.step_with_batch_queue() is None
        assert engine_core.batch_queue.qsize() == 2
        assert engine_core.scheduler.get_num_unfinished_requests() == 2

        # Loop through both requests.
        while engine_core.scheduler.get_num_unfinished_requests() == 2:
            engine_core.step_with_batch_queue()

        # Reaching here when got the result of the first request.
        while engine_core.scheduler.get_num_unfinished_requests() == 1:
            engine_core.step_with_batch_queue()
......@@ -3,7 +3,8 @@
import asyncio
import time
import uuid
from typing import Dict, List
from contextlib import ExitStack
from typing import Dict, List, Optional
import os
import pytest
......@@ -15,7 +16,9 @@ from vllm.engine.arg_utils import EngineArgs
from vllm.platforms import current_platform
from vllm.usage.usage_lib import UsageContext
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.core_client import EngineCoreClient
from vllm.v1.engine.core import EngineCore
from vllm.v1.engine.core_client import (AsyncMPClient, EngineCoreClient,
SyncMPClient)
from vllm.v1.executor.abstract import Executor
from ...utils import models_path_prefix
......@@ -65,7 +68,7 @@ def loop_until_done(client: EngineCoreClient, outputs: Dict):
async def loop_until_done_async(client: EngineCoreClient, outputs: Dict):
while True:
engine_core_outputs = await client.get_output_async().outputs
engine_core_outputs = (await client.get_output_async()).outputs
if len(engine_core_outputs) == 0:
break
......@@ -80,6 +83,14 @@ async def loop_until_done_async(client: EngineCoreClient, outputs: Dict):
break
# Dummy utility function to monkey-patch into engine core.
def echo(self, msg: str, err_msg: Optional[str] = None) -> str:
    """Log the call, then return ``msg`` or fail with ``err_msg``.

    Args:
        msg: value echoed back to the caller.
        err_msg: when not None, a ValueError carrying this text is raised
            instead of returning.

    Returns:
        ``msg`` unchanged.

    Raises:
        ValueError: if ``err_msg`` is not None.
    """
    print(f"echo util function called: {msg}, {err_msg}")
    if err_msg is None:
        return msg
    raise ValueError(err_msg)
@fork_new_process_for_each_test
@pytest.mark.parametrize("multiprocessing_mode", [True, False])
def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
......@@ -87,7 +98,10 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
engine_args = EngineArgs(model=MODEL_NAME, compilation_config=3)
# Monkey-patch core engine utility function to test.
m.setattr(EngineCore, "echo", echo, raising=False)
engine_args = EngineArgs(model=MODEL_NAME, enforce_eager=True)
vllm_config = engine_args.create_engine_config(
UsageContext.UNKNOWN_CONTEXT)
executor_class = Executor.get_class(vllm_config)
......@@ -96,6 +110,7 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
asyncio_mode=False,
vllm_config=vllm_config,
executor_class=executor_class,
log_stats=False,
)
MAX_TOKENS = 20
......@@ -148,15 +163,30 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
client.abort_requests([request.request_id])
if multiprocessing_mode:
"""Utility method invocation"""
@fork_new_process_for_each_test
@pytest.mark.asyncio
core_client: SyncMPClient = client
result = core_client._call_utility("echo", "testarg")
assert result == "testarg"
with pytest.raises(Exception) as e_info:
core_client._call_utility("echo", None, "help!")
assert str(e_info.value) == "Call to echo method failed: help!"
@pytest.mark.asyncio(loop_scope="function")
async def test_engine_core_client_asyncio(monkeypatch):
with monkeypatch.context() as m:
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
engine_args = EngineArgs(model=MODEL_NAME)
# Monkey-patch core engine utility function to test.
m.setattr(EngineCore, "echo", echo, raising=False)
engine_args = EngineArgs(model=MODEL_NAME, enforce_eager=True)
vllm_config = engine_args.create_engine_config(
usage_context=UsageContext.UNKNOWN_CONTEXT)
executor_class = Executor.get_class(vllm_config)
......@@ -165,7 +195,9 @@ async def test_engine_core_client_asyncio(monkeypatch):
asyncio_mode=True,
vllm_config=vllm_config,
executor_class=executor_class,
log_stats=True,
)
after.callback(client.shutdown)
MAX_TOKENS = 20
params = SamplingParams(max_tokens=MAX_TOKENS)
......@@ -204,3 +236,14 @@ async def test_engine_core_client_asyncio(monkeypatch):
else:
assert len(outputs[req_id]) == MAX_TOKENS, (
f"{len(outputs[req_id])=}, {MAX_TOKENS=}")
"""Utility method invocation"""
core_client: AsyncMPClient = client
result = await core_client._call_utility_async("echo", "testarg")
assert result == "testarg"
with pytest.raises(Exception) as e_info:
await core_client._call_utility_async("echo", None, "help!")
assert str(e_info.value) == "Call to echo method failed: help!"
# SPDX-License-Identifier: Apache-2.0
import pytest
from tests.v1.engine.utils import PLP_APC_UNSUPPORTED_MSG
from vllm import LLM, SamplingParams
def test_llm_engine_refuses_prompt_logprobs_with_apc(monkeypatch):
    """Prompt logprobs combined with prefix caching must be rejected.

    The test passes when building an LLM with automatic prefix caching
    enabled and generating with ``prompt_logprobs`` set raises a ValueError
    whose text equals ``PLP_APC_UNSUPPORTED_MSG``.
    """
    env_overrides = (
        ("VLLM_USE_V1", "1"),
        # TODO(nick): Single-proc to work around a ZMQ shutdown hang for now.
        ("VLLM_ENABLE_V1_MULTIPROCESSING", "0"),
    )
    for env_name, env_value in env_overrides:
        monkeypatch.setenv(env_name, env_value)

    with pytest.raises(ValueError) as excinfo:
        # Both construction and generation stay inside the context so that
        # a ValueError from either step is captured.
        llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)
        llm.generate(
            "Hello, my name is",
            SamplingParams(temperature=0.8, top_p=0.95, prompt_logprobs=5))

    # Validate exception string is correct
    raised_message = str(excinfo.value)
    assert raised_message == PLP_APC_UNSUPPORTED_MSG
# SPDX-License-Identifier: Apache-2.0
from typing import List
import math
import time
from typing import Dict, List, Optional
import os
import pytest
from transformers import AutoTokenizer
from vllm.engine.arg_utils import EngineArgs
from tests.v1.engine.utils import (NUM_PROMPT_LOGPROBS_UNDER_TEST,
NUM_SAMPLE_LOGPROBS_UNDER_TEST,
STOP_STRINGS,
DummyOutputProcessorTestVectors,
MockEngineCore)
from vllm.sampling_params import RequestOutputKind, SamplingParams
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
from vllm.sequence import PromptLogprobs, SampleLogprobs
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.output_processor import OutputProcessor
from ...utils import models_path_prefix
# Module-level fixtures shared by the tests in this file. Everything below
# runs at import time, in this order.

# Tokenizer/model location, resolved below the shared models path prefix.
TOKENIZER_NAME = os.path.join(models_path_prefix, "mistralai/Mistral-7B-Instruct-v0.3")
# Engine configuration derived from EngineArgs for that model.
VLLM_CONFIG = EngineArgs(model=TOKENIZER_NAME).create_engine_config()
# Tokenizer group built from the model/scheduler/parallel/LoRA sub-configs.
TOKENIZER_GROUP = init_tokenizer_from_configs(VLLM_CONFIG.model_config,
                                              VLLM_CONFIG.scheduler_config,
                                              VLLM_CONFIG.parallel_config,
                                              VLLM_CONFIG.lora_config)
# Plain tokenizer used to derive the reference token ids and strings below.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
# Full reference texts; each one is split into a prompt part and a
# generation part further down.
FULL_STRINGS = [
    "My name is Robert from Neural Magic and I love working on vLLM so much!",
    "Red Hat is the best open source company by far across Linux, K8s, and AI.",
    "Nick is the name of my brother in addition to my colleague from Red Hat.",
]
# One stop string per entry of FULL_STRINGS (same order); each is a
# substring of its corresponding full string.
STOP_STRINGS = ["I love working on", "company by far", "brother in"]
# Token ids of every full string.
FULL_TOKENS = [tokenizer(text).input_ids for text in FULL_STRINGS]
# Number of leading token ids that are treated as the prompt.
PROMPT_LEN = 5
# First PROMPT_LEN token ids of each full string.
PROMPT_TOKENS = [
    tokenizer(text).input_ids[:PROMPT_LEN] for text in FULL_STRINGS
]
# Remaining token ids of each full string (everything after the prompt).
GENERATION_TOKENS = [
    tokenizer(text).input_ids[PROMPT_LEN:] for text in FULL_STRINGS
]
# Prompt token ids decoded back to text, with special tokens skipped.
PROMPT_STRINGS = [
    tokenizer.decode(prompt_tokens, skip_special_tokens=True)
    for prompt_tokens in PROMPT_TOKENS
]
# Character length of each decoded prompt.
PROMPT_STRINGS_LEN = [len(prompt_string) for prompt_string in PROMPT_STRINGS]
# Expected generated text: each full string with as many leading characters
# removed as its decoded prompt is long.
GENERATION_STRINGS = [
    text[prompt_len:]
    for text, prompt_len in zip(FULL_STRINGS, PROMPT_STRINGS_LEN)
]
class MockEngineCore:
    """Mock outputs from premade token lists, one token per request per step."""

    def __init__(self, tokens_list: List[List[int]]):
        # Index of the token that the next get_outputs() call will emit.
        self.current_idx = 0
        self.tokens_list = tokens_list

    def get_outputs(self) -> List[EngineCoreOutput]:
        """Return the outputs of the current step and advance by one step.

        Request ``i`` (id ``request-{i}``) contributes one output while its
        token list still has a token at the current step. The output that
        carries the last token of a list is marked finished with finish
        reason ``"stopped"``. An empty list is returned once every token
        list is exhausted.
        """
        step = self.current_idx
        self.current_idx = step + 1

        step_outputs = []
        for req_idx, token_ids in enumerate(self.tokens_list):
            num_tokens = len(token_ids)
            if num_tokens <= step:
                # This request has already emitted all of its tokens.
                continue
            engine_output = EngineCoreOutput(
                request_id=f"request-{req_idx}",
                new_token_ids=[token_ids[step]],
                finished=False)
            if step == num_tokens - 1:
                engine_output.finished = True
                engine_output.finish_reason = "stopped"
            step_outputs.append(engine_output)
        return step_outputs
from vllm.v1.metrics.stats import IterationStats
def _ref_convert_id_to_token(
    tokenizer: AnyTokenizer,
    token_id: int,
) -> str:
    """Reference implementation of logprob token detokenization.

    Args:
      tokenizer: tokenizer used by the model under test
      token_id: the token id to convert

    Returns:
      The tokenizer's token for `token_id`, or the empty string when the
      tokenizer returns a falsy value (for example None).
    """
    token = tokenizer.convert_ids_to_tokens(token_id)
    if token:
        return token
    return ""
@pytest.mark.parametrize(
"request_output_kind",
[RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
def test_incremental_detokenization(request_output_kind: RequestOutputKind):
output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False)
engine_core = MockEngineCore(GENERATION_TOKENS)
def test_incremental_detokenization(request_output_kind: RequestOutputKind,
dummy_test_vectors):
output_processor = OutputProcessor(dummy_test_vectors.tokenizer_group,
log_stats=False)
engine_core = MockEngineCore(
tokens_list=dummy_test_vectors.generation_tokens)
# Make N requests.
requests = [
......@@ -97,10 +62,10 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind):
spaces_between_special_tokens=False,
output_kind=request_output_kind,
stop=[],
include_stop_str_in_output=False))
for idx, (
prompt,
prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS))
include_stop_str_in_output=False,
)) for idx, (prompt, prompt_tokens) in enumerate(
zip(dummy_test_vectors.prompt_strings,
dummy_test_vectors.prompt_tokens))
]
# Add requests to the detokenizer.
......@@ -116,7 +81,7 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind):
break
# Step the Detokenizer.
processed_outputs = output_processor.process_outputs(outputs, )
processed_outputs = output_processor.process_outputs(outputs)
request_outputs = processed_outputs.request_outputs
requests_to_abort = processed_outputs.reqs_to_abort
assert len(requests_to_abort) == 0
......@@ -135,7 +100,8 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind):
# Confirmed tracked values matches what we expected.
for idx, (ref_gen_str, ref_gen_toks) in enumerate(
zip(GENERATION_STRINGS, GENERATION_TOKENS)):
zip(dummy_test_vectors.generation_strings,
dummy_test_vectors.generation_tokens)):
gen_str = gen_strings[f"request-{idx}"]
gen_toks = gen_tokens[f"request-{idx}"]
......@@ -146,15 +112,390 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind):
assert not output_processor.has_unfinished_requests()
def _validate_logprobs(
    gen_tokens: Dict[str, List[int]],
    gen_logprobs: Dict[str, Optional[SampleLogprobs]],
    gen_prompt_logprobs: Dict[str, Optional[PromptLogprobs]],
    gen_cumulative_logprob: Dict[str, float],
    dtv: DummyOutputProcessorTestVectors,
    request_id_list: List[str],
    num_sample_logprobs: Optional[int],
    num_prompt_logprobs: Optional[int],
) -> None:
    """Check tracked sample/prompt logprobs against the reference vectors.

    For every request the function asserts, when the corresponding kind of
    logprobs was requested, that the number of logprob entries matches the
    number of tokens, that each position contains the actual token with the
    reference rank, that the reference log probabilities are reproduced,
    that a rank-1 entry is present and that decoded tokens match the
    reference detokenization. When a kind of logprobs was not requested it
    asserts that none were produced.

    Args:
      gen_tokens: request id -> generated token ids collected so far
      gen_logprobs: request id -> collected sample logprobs (or None)
      gen_prompt_logprobs: request id -> collected prompt logprobs (or None)
      gen_cumulative_logprob: request id -> last reported cumulative logprob
      dtv: test vectors providing `prompt_tokens`, `generation_logprobs`,
           `prompt_logprobs` and `tokenizer`
      request_id_list: request ids; the position in this list is used to
                       index the per-request reference data in `dtv`
      num_sample_logprobs: requested number of sample logprobs, or None
                           if sample logprobs are disabled
      num_prompt_logprobs: requested number of prompt logprobs, or None
                           if prompt logprobs are disabled

    Raises:
      AssertionError: if any of the checks fails.
    """
    for req_idx, req_id in enumerate(request_id_list):
        # Values produced by the code under test for this request.
        new_tokens = gen_tokens[req_id]
        logprobs = gen_logprobs[req_id]
        prompt_logprobs = gen_prompt_logprobs[req_id]
        cumulative_logprob = gen_cumulative_logprob[req_id]
        # Reference data for this request.
        prompt_token_ids = dtv.prompt_tokens[req_idx]
        ref_logprobs = dtv.generation_logprobs[req_idx]
        ref_prompt_logprobs = dtv.prompt_logprobs[req_idx]
        if num_sample_logprobs is not None:
            # Validate sample logprobs
            assert logprobs is not None, (f"Request {req_id} requires sample"
                                          " logprobs but sample logprobs are"
                                          " None.")
            # Require num sampled tokens to match num
            # sampled logprobs - especially important
            # to check since the detokenizer can cause
            # a request to finish early due to a stop
            # string being hit
            num_new_tokens = len(new_tokens)
            len_sample_logprobs = len(logprobs)
            assert num_new_tokens == len_sample_logprobs, (
                f"Request {req_id} has {num_new_tokens}"
                " completion tokens but has"
                f" {len_sample_logprobs} sample logprobs.")
            # Recomputed here as the sum of the sampled tokens' logprobs.
            ref_cumulative_logprob = 0.0
            for idx, (sampled_token,
                      pos_logprob_dict) in enumerate(zip(new_tokens,
                                                         logprobs)):
                # Break out the reference log probability value &
                # logprob token id tensors associated with this
                # position in the completion. Also break out the
                # sampled token ranks
                (ref_pos_logprob_toks, ref_pos_logprob_vals,
                 ref_sampled_token_rank) = ref_logprobs[idx]
                # For each position in the completion sequence,
                # ensure the actual sampled token is among the
                # logprobs
                assert sampled_token in pos_logprob_dict, (
                    f"Sampled token {sampled_token} not"
                    f" present in logprob at index {idx}")

                # Validate number of sample logprobs
                # (one extra entry is accepted in addition to the
                # requested number)
                num_lp_toks = len(pos_logprob_dict)
                assert (num_lp_toks == num_sample_logprobs
                        or num_lp_toks == num_sample_logprobs +
                        1), ("Valid numbers of sample logprobs are"
                             f" {num_sample_logprobs} or"
                             f" {num_sample_logprobs+1} but"
                             f" {num_lp_toks} logprobs found at"
                             f" position {idx}. Logprobs dict:"
                             f" {pos_logprob_dict}")

                # Validate sampled token logprob rank
                smp_lp = pos_logprob_dict[sampled_token]
                smp_lp_rank = smp_lp.rank
                assert (ref_sampled_token_rank == smp_lp_rank), (
                    "Sampled token logprob rank"
                    f" {smp_lp_rank} does not match"
                    " correct value"
                    f" {ref_sampled_token_rank}"
                    f" in Logprob {smp_lp}")

                # Validate that the logprob processor yields
                # the correct log probabilities and valid
                # rankings
                rank_one_appears = False
                # NOTE(review): iteration starts at 1; presumably index 0
                # of the reference vectors is the sampled token, which is
                # checked separately above - confirm against the fixture.
                for jdx in range(1, len(ref_pos_logprob_toks)):
                    # Iterate over the (logprob val,logprob tok id)
                    # pairs expected by the test fixture at this
                    # position in the completion.
                    ref_lp_val = ref_pos_logprob_vals[jdx]
                    ref_tok_id = ref_pos_logprob_toks[jdx]
                    assert ref_tok_id in pos_logprob_dict, (
                        f"Expected token {ref_tok_id} to be"
                        f" in logprob dict but it is not.")

                    # Extract actually-generated logprob
                    # info
                    lp = pos_logprob_dict[ref_tok_id]
                    lp_val = lp.logprob
                    lp_rank = lp.rank

                    # A "top" (rank 1) logprob must be
                    # present
                    rank_one_appears = (True
                                        if lp_rank == 1 else rank_one_appears)

                    # Rank must be >= 1
                    assert lp_rank >= 1, (f"Logprob {lp} has invalid"
                                          f" rank {lp_rank} < 1."
                                          f" Logprob dict: {pos_logprob_dict}")

                    # Validate log probability
                    assert math.isclose(lp_val, ref_lp_val), (
                        f"Token id {ref_tok_id} appears in logprobs dict"
                        f" at position {idx} in completion with log"
                        f" probability {lp_val} but {ref_lp_val} was"
                        f" expected. Logprob: {lp}")

                assert rank_one_appears, (f"No Logprob has rank 1"
                                          " in the following Logprob"
                                          f" dict: {pos_logprob_dict}")

                # Validate logprobs detokenization
                for lp_tok in pos_logprob_dict:
                    # Confirm that sample logprob decoded token matches
                    # the logprob token id at this sequence position
                    decoded_token = pos_logprob_dict[lp_tok].decoded_token
                    ref_decoded_token = _ref_convert_id_to_token(
                        dtv.tokenizer, lp_tok)
                    assert decoded_token == ref_decoded_token, (
                        f"Sampled logprob token id {lp_tok} decodes to"
                        f" {ref_decoded_token} but Logprob decoded"
                        f" token is {decoded_token} instead"
                        f" (at position {idx})")

                ref_cumulative_logprob += pos_logprob_dict[
                    sampled_token].logprob
            # Assert that cumulative logprobs are correct
            assert math.isclose(cumulative_logprob, ref_cumulative_logprob)
        else:
            # Sample logprobs disabled for this request
            assert logprobs is None
            assert cumulative_logprob is None

        if num_prompt_logprobs is not None:
            # Validate prompt logprobs
            assert prompt_logprobs is not None, (
                f"Request {req_id} requires prompt"
                " logprobs but prompt logprobs are"
                " None.")
            # Require num prompt tokens to match num
            # prompt logprobs
            num_prompt_tokens = len(prompt_token_ids)
            len_prompt_logprobs = len(prompt_logprobs)
            assert num_prompt_tokens == len_prompt_logprobs, (
                f"Request {req_id} has {num_prompt_tokens}"
                " prompt tokens but has"
                f" {len_prompt_logprobs} prompt logprobs.")
            # First prompt logprob is None
            first_plp_dict = prompt_logprobs[0]
            assert first_plp_dict is None, (
                f"Request {req_id} first prompt logprob"
                f" should be None but has following value"
                f" instead: {first_plp_dict}")
            # Break out the reference prompt log prob value &
            # logprob token id matrices for the whole prompt.
            # Also break out the prompt token rank vector
            (ref_prompt_logprob_toks, ref_prompt_logprob_vals,
             ref_prompt_token_ranks) = ref_prompt_logprobs
            # The first prompt position is skipped, so reference row `idx`
            # is compared with prompt position `idx + 1`.
            for idx, (prompt_token, pos_logprob_dict) in enumerate(
                    zip(prompt_token_ids[1:], prompt_logprobs[1:])):

                # Break out the reference prompt log prob value
                # vector, prompt logprob token id vector, and
                # prompt token rank at the current position.
                (ref_pos_prompt_logprob_toks, ref_pos_prompt_logprob_vals,
                 ref_pos_prompt_token_rank) = (ref_prompt_logprob_toks[idx, :],
                                               ref_prompt_logprob_vals[idx, :],
                                               ref_prompt_token_ranks[idx])

                # For each position in the prompt sequence,
                # ensure the actual prompt token is among the
                # logprobs
                assert prompt_token in pos_logprob_dict, (
                    f"Prompt token {prompt_token} not"
                    f" present in logprob at index {idx}")
                # Validate number of prompt logprobs
                # (one extra entry is accepted in addition to the
                # requested number)
                num_plp_toks = len(pos_logprob_dict)
                assert (num_plp_toks == num_prompt_logprobs
                        or num_plp_toks == num_prompt_logprobs +
                        1), ("Valid numbers of prompt logprobs are"
                             f" {num_prompt_logprobs} or"
                             f" {num_prompt_logprobs+1} but"
                             f" {num_plp_toks} logprobs found at"
                             f" position {idx}. Logprobs dict:"
                             f" {pos_logprob_dict}")

                # Validate prompt token logprob rank
                prmpt_tok_lp = pos_logprob_dict[prompt_token]
                prmpt_tok_lp_rank = prmpt_tok_lp.rank
                ref_prmpt_tok_lp_rank = ref_pos_prompt_token_rank
                assert (ref_prmpt_tok_lp_rank == prmpt_tok_lp_rank), (
                    "Prompt token logprob rank"
                    f" {prmpt_tok_lp_rank} does not match"
                    " correct value"
                    f" {ref_prmpt_tok_lp_rank}"
                    f" in Logprob {prmpt_tok_lp}")

                # Validate that the logprob processor yields
                # the correct prompt log probs and valid
                # rankings
                rank_one_appears = False
                # NOTE(review): iteration starts at 1; presumably column 0
                # of the reference row is the prompt token itself, which
                # is checked separately above - confirm against the fixture.
                for jdx in range(1, len(ref_pos_prompt_logprob_toks)):
                    # Iterate over the (logprob val,logprob tok id)
                    # pairs expected by the test fixture at this
                    # position in the prompt.
                    ref_plp_val = float(ref_pos_prompt_logprob_vals[jdx])
                    ref_tok_id = int(ref_pos_prompt_logprob_toks[jdx])
                    assert ref_tok_id in pos_logprob_dict, (
                        f"Expected token {ref_tok_id} to be"
                        f" in logprob dict but it is not.")

                    # Extract actually-generated logprob
                    # info
                    plp = pos_logprob_dict[ref_tok_id]
                    plp_val = plp.logprob
                    plp_rank = plp.rank

                    # A "top" (rank 1) logprob must be
                    # present
                    rank_one_appears = (True
                                        if plp_rank == 1 else rank_one_appears)

                    # Rank must be >= 1
                    assert plp_rank >= 1, (
                        f"Logprob {plp} has invalid"
                        f" rank {plp_rank} < 1."
                        f" Logprob dict: {pos_logprob_dict}")

                    # Validate log probability
                    assert math.isclose(plp_val, ref_plp_val), (
                        f"Token id {ref_tok_id} appears in logprobs dict"
                        f" at position {idx} in completion with log"
                        f" probability {plp_val} but {ref_plp_val} was"
                        f" expected. Logprob: {plp}")

                assert rank_one_appears, (f"No Logprob has rank 1"
                                          " in the following Logprob"
                                          f" dict: {pos_logprob_dict}")

                # Validate prompt logprob detokenization
                for plp_tok in pos_logprob_dict:
                    # Confirm that prompt logprob decoded token matches
                    # the logprob token id at this sequence position
                    decoded_token = pos_logprob_dict[plp_tok].decoded_token
                    ref_decoded_token = _ref_convert_id_to_token(
                        dtv.tokenizer, plp_tok)
                    assert decoded_token == ref_decoded_token, (
                        f"Prompt logprob token id {plp_tok} decodes to"
                        f" {ref_decoded_token} but Logprob decoded"
                        f" token is {decoded_token} instead"
                        f" (at position {idx})")
        else:
            # Prompt logprobs disabled for this request
            assert prompt_logprobs is None
@pytest.mark.parametrize(
    "request_output_kind",
    [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
@pytest.mark.parametrize("num_sample_logprobs",
                         [None, NUM_SAMPLE_LOGPROBS_UNDER_TEST])
@pytest.mark.parametrize("num_prompt_logprobs",
                         [None, NUM_PROMPT_LOGPROBS_UNDER_TEST])
def test_logprobs_processor(request_output_kind: RequestOutputKind,
                            num_sample_logprobs: Optional[int],
                            num_prompt_logprobs: Optional[int],
                            dummy_test_vectors):
    """Drive the OutputProcessor with mocked engine outputs and check logprobs.

    Requests are created with the parametrized numbers of sample and prompt
    logprobs, mocked engine-core outputs are processed step by step, and the
    collected tokens and logprobs are compared with the reference vectors.
    """
    output_processor = OutputProcessor(dummy_test_vectors.tokenizer_group,
                                       log_stats=False)

    # The mock only receives reference logprobs of a kind that is requested.
    if num_sample_logprobs is None:
        sample_logprobs_raw = None
    else:
        sample_logprobs_raw = dummy_test_vectors.generation_logprobs
    if num_prompt_logprobs is None:
        prompt_logprobs_raw = None
    else:
        prompt_logprobs_raw = dummy_test_vectors.prompt_logprobs
    engine_core = MockEngineCore(
        tokens_list=dummy_test_vectors.generation_tokens,
        generated_logprobs_raw=sample_logprobs_raw,
        prompt_logprobs_raw=prompt_logprobs_raw)

    # Make N requests.
    request_id_list = [
        f"request-{idx}"
        for idx, _ in enumerate(dummy_test_vectors.prompt_strings)
    ]
    requests = []
    for req_id, prompt, prompt_tokens in zip(
            request_id_list, dummy_test_vectors.prompt_strings,
            dummy_test_vectors.prompt_tokens):
        params = SamplingParams(
            skip_special_tokens=False,
            spaces_between_special_tokens=False,
            output_kind=request_output_kind,
            stop=[],
            include_stop_str_in_output=False,
            logprobs=num_sample_logprobs,
            prompt_logprobs=num_prompt_logprobs,
        )
        requests.append(
            EngineCoreRequest(request_id=req_id,
                              prompt=prompt,
                              prompt_token_ids=prompt_tokens,
                              arrival_time=0,
                              mm_inputs=None,
                              mm_hashes=None,
                              mm_placeholders=None,
                              eos_token_id=None,
                              lora_request=None,
                              sampling_params=params))

    # Add requests to the detokenizer.
    for engine_request in requests:
        output_processor.add_request(engine_request)

    gen_tokens = {}
    gen_logprobs = {}
    gen_prompt_logprobs = {}
    gen_cumulative_logprobs = {}
    while True:
        # Mock output from the EngineCore.
        outputs = engine_core.get_outputs()
        if not outputs:
            break

        # Step the logprobs processor.
        processed = output_processor.process_outputs(outputs)
        assert len(processed.reqs_to_abort) == 0

        # Update tracking.
        for request_output in processed.request_outputs:
            request_id = request_output.request_id
            completion = request_output.outputs[0]
            step_tokens = completion.token_ids
            step_prompt_logprobs = request_output.prompt_logprobs
            step_logprobs = completion.logprobs
            gen_cumulative_logprobs[request_id] = (
                completion.cumulative_logprob)

            if request_id in gen_logprobs:
                # Extend logprobs tracker
                gen_tokens[request_id].extend(step_tokens)
                tracked_lp = gen_logprobs[request_id]
                tracked_plp = gen_prompt_logprobs[request_id]
                if tracked_lp:
                    tracked_lp.extend(step_logprobs)
                if tracked_plp:
                    tracked_plp.extend(step_prompt_logprobs)
                continue

            # Start tracking sample and prompt logprobs for this request
            gen_tokens[request_id] = step_tokens
            gen_logprobs[request_id] = step_logprobs
            gen_prompt_logprobs[request_id] = step_prompt_logprobs

    # Confirmed tracked logprobs match what we expect
    _validate_logprobs(gen_tokens, gen_logprobs, gen_prompt_logprobs,
                       gen_cumulative_logprobs, dummy_test_vectors,
                       request_id_list, num_sample_logprobs,
                       num_prompt_logprobs)

    assert output_processor.get_num_unfinished_requests() == 0
    assert not output_processor.has_unfinished_requests()
@pytest.mark.parametrize("include_stop_str_in_output", [True, False])
def test_stop_string(include_stop_str_in_output: bool):
output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False)
engine_core = MockEngineCore(GENERATION_TOKENS)
@pytest.mark.parametrize("num_sample_logprobs",
[None, NUM_SAMPLE_LOGPROBS_UNDER_TEST])
@pytest.mark.parametrize("num_prompt_logprobs",
[None, NUM_PROMPT_LOGPROBS_UNDER_TEST])
def test_stop_string(include_stop_str_in_output: bool,
num_sample_logprobs: Optional[int],
num_prompt_logprobs: Optional[int], dummy_test_vectors):
output_processor = OutputProcessor(dummy_test_vectors.tokenizer_group,
log_stats=False)
engine_core = MockEngineCore(
tokens_list=dummy_test_vectors.generation_tokens,
generated_logprobs_raw=dummy_test_vectors.generation_logprobs
if num_sample_logprobs else None,
prompt_logprobs_raw=dummy_test_vectors.prompt_logprobs
if num_prompt_logprobs else None)
# Make N requests.
request_id_list = [
f"request-{idx}"
for idx in range(len(dummy_test_vectors.prompt_strings))
]
requests = [
EngineCoreRequest(
request_id=f"request-{idx}",
request_id=request_id_list[idx],
prompt=prompt,
prompt_token_ids=prompt_tokens,
arrival_time=0,
......@@ -169,9 +510,11 @@ def test_stop_string(include_stop_str_in_output: bool):
output_kind=RequestOutputKind.DELTA,
stop=STOP_STRINGS,
include_stop_str_in_output=include_stop_str_in_output,
)) for idx, (
prompt,
prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS))
logprobs=num_sample_logprobs,
prompt_logprobs=num_prompt_logprobs,
)) for idx, (prompt, prompt_tokens) in enumerate(
zip(dummy_test_vectors.prompt_strings,
dummy_test_vectors.prompt_tokens))
]
# Add requests to the detokenizer.
......@@ -179,6 +522,10 @@ def test_stop_string(include_stop_str_in_output: bool):
output_processor.add_request(request)
gen_strings = {}
gen_tokens = {}
gen_logprobs = {}
gen_prompt_logprobs = {}
gen_cumulative_logprobs = {}
aborted = []
while True:
# Mock output from the EngineCore.
......@@ -202,14 +549,29 @@ def test_stop_string(include_stop_str_in_output: bool):
request_id = request_output.request_id
new_text = request_output.outputs[0].text
new_tokens = request_output.outputs[0].token_ids
prompt_logprobs = request_output.prompt_logprobs
logprobs = request_output.outputs[0].logprobs
gen_cumulative_logprobs[request_id] = request_output.outputs[
0].cumulative_logprob
if request_id not in gen_strings:
gen_strings[request_id] = new_text
gen_tokens[request_id] = new_tokens
gen_logprobs[request_id] = logprobs
gen_prompt_logprobs[request_id] = prompt_logprobs
else:
gen_strings[request_id] += new_text
gen_tokens[request_id].extend(new_tokens)
lp = gen_logprobs[request_id]
plp = gen_prompt_logprobs[request_id]
if lp:
lp.extend(logprobs)
if plp:
plp.extend(prompt_logprobs)
# Confirmed tracked values matches what we expected.
for idx, (ref_gen_str,
stop_str) in enumerate(zip(GENERATION_STRINGS, STOP_STRINGS)):
for idx, (ref_gen_str, stop_str) in enumerate(
zip(dummy_test_vectors.generation_strings, STOP_STRINGS)):
# Request should be aborted.
request_id = f"request-{idx}"
......@@ -230,13 +592,21 @@ def test_stop_string(include_stop_str_in_output: bool):
assert gen_str == ref_str_exc_stop, (
f"{gen_str=}, {ref_str_exc_stop=}")
# Confirmed tracked logprobs match what we expect
_validate_logprobs(gen_tokens, gen_logprobs, gen_prompt_logprobs,
gen_cumulative_logprobs, dummy_test_vectors,
request_id_list, num_sample_logprobs,
num_prompt_logprobs)
assert output_processor.get_num_unfinished_requests() == 0
assert not output_processor.has_unfinished_requests()
def test_iteration_stats():
output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=True)
engine_core = MockEngineCore(GENERATION_TOKENS)
def test_iteration_stats(dummy_test_vectors):
output_processor = OutputProcessor(dummy_test_vectors.tokenizer_group,
log_stats=True)
engine_core = MockEngineCore(dummy_test_vectors.generation_tokens)
engine_core_timestamp = time.monotonic()
# Make N requests.
requests = [
......@@ -251,31 +621,35 @@ def test_iteration_stats():
eos_token_id=None,
lora_request=None,
sampling_params=SamplingParams(),
) for idx, (
prompt,
prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS))
) for idx, (prompt, prompt_tokens) in enumerate(
zip(dummy_test_vectors.prompt_strings,
dummy_test_vectors.prompt_tokens))
]
# Add all requests except one to the OutputProcessor.
num_active = len(GENERATION_TOKENS) - 1
num_active = len(dummy_test_vectors.generation_tokens) - 1
for request in requests[:num_active]:
output_processor.add_request(request)
inactive_request = requests[num_active]
# First iteration has 2 prefills.
outputs = engine_core.get_outputs()[:num_active]
processed_outputs = output_processor.process_outputs(outputs)
iteration_stats = processed_outputs.iteration_stats
total_prompt_tokens = sum(
[len(prompt_tokens) for prompt_tokens in PROMPT_TOKENS[:num_active]])
iteration_stats = IterationStats()
output_processor.process_outputs(outputs, engine_core_timestamp,
iteration_stats)
total_prompt_tokens = sum([
len(prompt_tokens)
for prompt_tokens in dummy_test_vectors.prompt_tokens[:num_active]
])
assert iteration_stats.num_prompt_tokens == total_prompt_tokens
assert iteration_stats.num_generation_tokens == num_active
# Just decodes in this step.
outputs = engine_core.get_outputs()[:num_active]
processed_outputs = output_processor.process_outputs(outputs)
iteration_stats = processed_outputs.iteration_stats
iteration_stats = IterationStats()
output_processor.process_outputs(outputs, engine_core_timestamp,
iteration_stats)
assert iteration_stats.num_prompt_tokens == 0
assert iteration_stats.num_generation_tokens == num_active
......@@ -284,17 +658,19 @@ def test_iteration_stats():
output_processor.add_request(inactive_request)
num_active += 1
outputs = engine_core.get_outputs()[:num_active]
processed_outputs = output_processor.process_outputs(outputs)
iteration_stats = processed_outputs.iteration_stats
total_prompt_tokens = len(PROMPT_TOKENS[num_active - 1])
iteration_stats = IterationStats()
output_processor.process_outputs(outputs, engine_core_timestamp,
iteration_stats)
total_prompt_tokens = len(dummy_test_vectors.prompt_tokens[num_active - 1])
assert iteration_stats.num_prompt_tokens == total_prompt_tokens
assert iteration_stats.num_generation_tokens == num_active
# Just decodes in this step.
outputs = engine_core.get_outputs()[:num_active]
processed_outputs = output_processor.process_outputs(outputs)
iteration_stats = processed_outputs.iteration_stats
iteration_stats = IterationStats()
output_processor.process_outputs(outputs, engine_core_timestamp,
iteration_stats)
assert iteration_stats.num_prompt_tokens == 0
assert iteration_stats.num_generation_tokens == num_active
# SPDX-License-Identifier: Apache-2.0
import random
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
import torch
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
from vllm.engine.arg_utils import EngineArgs
from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
BaseTokenizerGroup)
from vllm.v1.engine import EngineCoreOutput, FinishReason
from vllm.v1.outputs import LogprobsLists, LogprobsTensors
# Alias covering both tokenizer classes imported from transformers.
GeneralTokenizerType = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]

# Number of sample logprobs to request when testing sample logprobs
NUM_SAMPLE_LOGPROBS_UNDER_TEST = 5
# Number of prompt logprobs to request when testing prompt logprobs
NUM_PROMPT_LOGPROBS_UNDER_TEST = 7

# Name of the tokenizer/model (presumably a Hugging Face model id).
TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3"

# Full reference texts used as the source of test data.
FULL_STRINGS = [
    "My name is Robert from Neural Magic and I love working on vLLM so much!",
    "Red Hat is the best open source company by far across Linux, K8s, and AI.",
    "Nick is the name of my brother in addition to my colleague from Red Hat.",
]
# One stop string per entry of FULL_STRINGS (same order); each is a
# substring of its corresponding full string.
STOP_STRINGS = ["I love working on", "company by far", "brother in"]
# NOTE(review): presumably the number of leading tokens treated as the
# prompt - confirm where this constant is consumed.
PROMPT_LEN = 5

# Error text expected when prompt logprobs are requested together with
# prefix caching.
PLP_APC_UNSUPPORTED_MSG = ("Prefix caching with prompt logprobs not yet "
                           "supported on VLLM V1.")

# Seed Python's global `random` generator at import time so that values
# drawn from it by the helpers below are reproducible.
random.seed(42)
def _create_random_top_logprob_test_vector(
    num_logprobs: int,
    lower: float,
    upper: float,
) -> torch.Tensor:
    """Build a random vector of fake top-logprob values.

    Intended for generating dummy sample logprobs in tests. Unlike real
    top logprobs, the values are NOT sorted in descending order.

    Args:
      num_logprobs: number of top logprobs
      lower: lower end of the value range
      upper: upper end of the value range

    Returns:
      1D length-`num_logprobs` torch Tensor of float logprob values
    """
    value_span = upper - lower
    unit_samples = torch.rand(num_logprobs)
    # Scale the uniform [0, 1) samples into the requested range.
    return lower + value_span * unit_samples
def _create_random_top_logprob_test_matrix(
    shape: Tuple,
    lower: float,
    upper: float,
) -> torch.Tensor:
    """Build a random matrix of fake top-logprob values.

    Intended for generating dummy prompt logprobs in tests. Unlike real
    top logprobs, the rows are NOT sorted in descending order.

    Args:
      shape: (num_tokens,num_logprobs) tuple representing
             matrix shape
      lower: lower end of the value range
      upper: upper end of the value range

    Returns:
      2D num_tokens x num_logprobs torch Tensor of float logprob values
    """
    value_span = upper - lower
    unit_samples = torch.rand(*shape)
    # Scale the uniform [0, 1) samples into the requested range.
    return lower + value_span * unit_samples
def _create_random_top_token_test_vector(
        num_logprobs: int,
        lower: int,
        upper: int,
        sampled_token_id: int,
        adjust_num_logprobs: bool = True) -> Tuple[torch.Tensor, int]:
    """Build a 1D tensor of fake top-logprob token ids.

    Intended for fabricating sample logprobs in tests. Token ids are
    drawn without replacement from `[lower, upper)`, then the first slot
    is overwritten with `sampled_token_id` so the sampled token is always
    present. OpenAI-API-compatible engines may return one extra logprob
    for the sampled token when it is not among the top logprobs;
    `adjust_num_logprobs` emulates that by making the vector one longer.

    Args:
      num_logprobs: number of top logprobs
      lower: lower bound (inclusive) of the random token ids
      upper: upper bound (exclusive) of the random token ids
      sampled_token_id: the token actually sampled
      adjust_num_logprobs: if True, the vector has `num_logprobs+1`
                           entries instead of `num_logprobs`

    Returns:
      Tuple of
      - 1D torch Tensor of token ids, of length `num_logprobs+1` if
        `adjust_num_logprobs` else `num_logprobs`
      - sampled token rank: the position of `sampled_token_id` within
        the vector after the first slot if it happens to occur there,
        otherwise a random int in `[num_logprobs, 50700]`
    """
    vector_len = num_logprobs + 1 if adjust_num_logprobs else num_logprobs

    # Distinct random token ids, shifted into the requested range.
    token_ids = torch.randperm(upper - lower)[:vector_len] + lower

    # Force the sampled token into the first slot.
    token_ids[0] = sampled_token_id

    # Look for the sampled token among the remaining slots.
    matches = (token_ids[1:] == sampled_token_id).nonzero(as_tuple=True)[0]
    if matches.numel() > 0:
        sampled_token_rank = matches.item()
    else:
        # Not present elsewhere: make up a rank outside the top logprobs.
        sampled_token_rank = random.randint(num_logprobs, 50700)

    return token_ids, sampled_token_rank
def _create_random_top_token_test_matrix(
    shape: Tuple[int, int],
    lower: int,
    upper: int,
    tokens_list: List[int],
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Build a matrix of fake top-logprob token ids for prompt logprobs.

    Random token ids are drawn without replacement from `[lower, upper)`
    and reshaped to `shape`; `tokens_list` is then prepended as the
    first column.

    Args:
      shape: (num_tokens, num_logprobs) tuple giving the shape of the
             random part of the matrix
      lower: lower bound (inclusive) of the random token ids
      upper: upper bound (exclusive) of the random token ids
      tokens_list: one token id per row, placed in the first column

    Returns:
      Tuple containing:
      - 2D num_tokens x num_logprobs+1 torch Tensor of token ids
      - 1D int tensor with, per row, the position of that row's
        `tokens_list` entry among the random columns if it occurs
        there, otherwise a random int in `[num_logprobs, 50700]`
    """
    num_tokens, num_logprobs = shape[0], shape[1]

    random_ids = torch.randperm(upper - lower)[:num_tokens *
                                               num_logprobs] + lower
    first_column = torch.tensor(tokens_list, dtype=torch.int).unsqueeze(-1)
    matrix = torch.cat((first_column, random_ids.view(shape)), dim=1)

    prompt_token_ranks = torch.empty(num_tokens, dtype=torch.int)

    # For each row, search the random columns (everything after the first
    # column) for that row's token from `tokens_list`.
    for rdx, random_row in enumerate(matrix[:, 1:]):
        hits = (random_row == tokens_list[rdx]).nonzero(as_tuple=True)[0]
        if hits.numel() > 0:
            prompt_token_ranks[rdx] = hits.item()
        else:
            # Token absent from the row: make up a rank.
            prompt_token_ranks[rdx] = random.randint(num_logprobs, 50700)

    return matrix, prompt_token_ranks
def decode_token(
    tok_id: int,
    tokenizer: PreTrainedTokenizer,
) -> str:
    """Detokenize a single token id the way the tests expect.

    Args:
      tok_id: token id to detokenize
      tokenizer: tokenizer whose `convert_ids_to_tokens` is used

    Returns:
      string representation of the token
    """
    token_str = tokenizer.convert_ids_to_tokens(tok_id)
    return token_str
def generate_dummy_sample_logprobs(
    sampled_tokens_list: List,
    num_logprobs: int,
    tokenizer: PreTrainedTokenizer,
) -> List[Tuple[List[int], List[float], int]]:
    """Generate dummy sample logprobs.

    Builds a test data structure imitating the list of sample logprobs
    that the engine core would assemble during the decode phase.

    Args:
      sampled_tokens_list: list of sampled tokens
      num_logprobs: number of top logprobs; each entry holds
                    `num_logprobs+1` values because the token-id helper
                    is called with its default `adjust_num_logprobs`
      tokenizer: model tokenizer; only `len(tokenizer.vocab)` is used,
                 to bound the random token ids

    Returns:
      One tuple per sampled token, made of plain Python values:
      (top token ids list, logprobs list, sampled token rank). Both
      lists have `num_logprobs+1` entries.
    """
    dummy_logprobs = []
    for sampled_token_id in sampled_tokens_list:
        top_token_ids, sampled_token_rank = (
            _create_random_top_token_test_vector(num_logprobs, 0,
                                                 len(tokenizer.vocab) - 1,
                                                 sampled_token_id))
        top_logprobs = _create_random_top_logprob_test_vector(
            num_logprobs + 1, -100, 0)
        # Store Python lists rather than tensors.
        dummy_logprobs.append(
            (top_token_ids.tolist(), top_logprobs.tolist(),
             sampled_token_rank))
    return dummy_logprobs
def generate_dummy_prompt_logprobs_tensors(
    prompt_tokens_list: List,
    num_logprobs: int,
    tokenizer: PreTrainedTokenizer,
) -> LogprobsTensors:
    """Generate dummy prompt logprobs tensors.

    Builds a test data structure imitating the torch Tensors of prompt
    logprobs that the engine core would assemble during chunked prefill.

    Args:
      prompt_tokens_list: list of prompt tokens
      num_logprobs: number of top logprobs per prompt token
      tokenizer: model tokenizer; only `len(tokenizer.vocab)` is used,
                 to bound the random token ids

    Returns:
      `LogprobsTensors` built positionally from (token id matrix,
      logprob matrix, prompt token ranks). Both matrices have
      `len(prompt_tokens_list)-1` rows and `num_logprobs+1` columns.
    """
    # The whole prompt is assumed to be processed in a single chunk, so
    # there are `len(prompt_tokens_list)-1` non-`None` prompt logprobs.
    # The leading `None` is injected later (in the detokenizer, not
    # here). Until then, the logprobs at position i describe the
    # distribution of the prompt token at position i+1, which is why
    # `prompt_tokens_list[1:]` is the column joined to the dummy token
    # ids, just as the engine would do.
    num_prompt_logprobs = len(prompt_tokens_list) - 1
    next_prompt_tokens = prompt_tokens_list[1:]

    token_id_matrix, prompt_token_ranks = (
        _create_random_top_token_test_matrix(
            (num_prompt_logprobs, num_logprobs), 0,
            len(tokenizer.vocab) - 1, next_prompt_tokens))
    logprob_matrix = _create_random_top_logprob_test_matrix(
        (num_prompt_logprobs, num_logprobs + 1), -100, 0)

    return LogprobsTensors(token_id_matrix, logprob_matrix,
                           prompt_token_ranks)
@dataclass
class DummyOutputProcessorTestVectors:
    """Bundle of premade inputs and expectations for output processor tests.

    The list-valued fields are per-request lists.
    """

    # Tokenizer objects and engine configuration.
    tokenizer: GeneralTokenizerType
    tokenizer_group: BaseTokenizerGroup
    vllm_config: EngineArgs

    # Token ids: prompt + generated, prompt only, generated only.
    full_tokens: List[List[int]]
    prompt_tokens: List[List[int]]
    generation_tokens: List[List[int]]

    # Per request: prompt logprobs tensors
    # (top tokens, top logprobs, ranks).
    prompt_logprobs: List[LogprobsTensors]

    # Per request: one (top tokens, top logprobs, rank) tuple for each
    # generated sequence position.
    generation_logprobs: List[List[Tuple[List[int], List[float], int]]]

    # Text forms of the prompts and generations.
    prompt_strings: List[str]
    prompt_strings_len: List[int]
    generation_strings: List[str]
class MockEngineCore:
    """Mock engine core outputs from premade tokens lists."""

    def __init__(
        self,
        tokens_list: List[List[int]],
        # Per request, per sampled token offset:
        # (list of topk token ids, list of sample logprob vals, rank)
        generated_logprobs_raw: Optional[List[List[Tuple[List[int],
                                                         List[float],
                                                         int]]]] = None,
        # Per request: the prompt logprobs tensors for the whole prompt
        prompt_logprobs_raw: Optional[List[LogprobsTensors]] = None,
    ) -> None:
        """Store the premade data and start at token position 0.

        Args:
          tokens_list: per request, the tokens to emit one per step
          generated_logprobs_raw: optional premade sample logprobs;
                                  sample logprobs are emitted only when
                                  this is not None
          prompt_logprobs_raw: optional premade prompt logprobs; prompt
                               logprobs are emitted only when this is
                               not None
        """
        self.tokens_list = tokens_list
        self.current_idx = 0
        self.generated_logprobs_raw = generated_logprobs_raw
        self.prompt_logprobs_raw = prompt_logprobs_raw
        self.do_logprobs = generated_logprobs_raw is not None
        self.do_prompt_logprobs = prompt_logprobs_raw is not None

    def get_outputs(self) -> List[EngineCoreOutput]:
        """Emit one step of outputs and advance to the next token position.

        Every request that still has a token at the current position
        contributes one `EngineCoreOutput` carrying that single token.
        Requests with no token left are skipped.

        Returns:
          list of outputs for the current step (possibly empty)
        """
        step = self.current_idx
        step_outputs = []

        for req_idx, token_ids in enumerate(self.tokens_list):
            if step >= len(token_ids):
                # This request has already emitted all of its tokens.
                continue

            sample_logprobs = None
            if self.do_logprobs:
                assert self.generated_logprobs_raw is not None
                (top_token_ids, top_logprob_vals, sampled_rank) = (
                    self.generated_logprobs_raw[req_idx][step])
                sample_logprobs = LogprobsLists(
                    [top_token_ids],
                    [top_logprob_vals],
                    [sampled_rank],
                )

            # Prompt logprobs accompany only the first step.
            prompt_logprobs = None
            if self.do_prompt_logprobs and step == 0:
                assert self.prompt_logprobs_raw is not None
                prompt_logprobs = self.prompt_logprobs_raw[req_idx]

            output = EngineCoreOutput(
                request_id=f"request-{req_idx}",
                new_token_ids=[token_ids[step]],
                new_logprobs=sample_logprobs,
                new_prompt_logprobs_tensors=prompt_logprobs,
            )
            # The request's last token carries the finish reason.
            if step == len(token_ids) - 1:
                output.finish_reason = FinishReason.STOP
            step_outputs.append(output)

        self.current_idx += 1
        return step_outputs
# SPDX-License-Identifier: Apache-2.0
import pytest
@pytest.fixture
def sample_prompts():
    """Return four short, open-ended text prompts."""
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    return prompts
@pytest.fixture
def sample_token_ids():
    """Return four token-id sequences of lengths 1 through 4."""
    token_id_sequences = [
        [0],
        [0, 1],
        [0, 2, 1],
        [0, 3, 1, 2],
    ]
    return token_id_sequences
@pytest.fixture
def sample_regex():
    """Return a regex made of three dot-terminated groups plus a final one.

    Each group matches a decimal number from 0 to 255, so the whole
    pattern has the shape of a dotted-quad IPv4 address.
    """
    dotted_groups = r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
    last_group = r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)"
    return dotted_groups + last_group
@pytest.fixture
def sample_json_schema():
    """Return a JSON schema for an object with nested arrays and objects.

    The object has `name`, `age`, `skills` (at least three strings of
    at most ten characters each) and `work_history` (a list of job
    objects); all four are required.
    """
    skills_schema = {
        "type": "array",
        "items": {
            "type": "string",
            "maxLength": 10
        },
        "minItems": 3
    }
    job_schema = {
        "type": "object",
        "properties": {
            "company": {
                "type": "string"
            },
            "duration": {
                "type": "number"
            },
            "position": {
                "type": "string"
            }
        },
        "required": ["company", "position"]
    }
    properties = {
        "name": {
            "type": "string"
        },
        "age": {
            "type": "integer"
        },
        "skills": skills_schema,
        "work_history": {
            "type": "array",
            "items": job_schema
        }
    }
    return {
        "type": "object",
        "properties": properties,
        "required": ["name", "age", "skills", "work_history"]
    }
@pytest.fixture
def sample_complex_json_schema():
    """Return a JSON schema that uses numeric-range and regex constraints."""
    score_schema = {
        "type": "integer",
        "minimum": 0,
        "maximum": 100  # Numeric range
    }
    grade_schema = {
        "type": "string",
        "pattern": "^[A-D]$"  # Regex pattern
    }
    email_schema = {
        "type": "string",
        "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$"
    }
    tags_schema = {
        "type": "array",
        "items": {
            "type": "string",
            # Combining length and pattern restrictions
            "pattern": "^[a-z]{1,10}$"
        }
    }
    return {
        "type": "object",
        "properties": {
            "score": score_schema,
            "grade": grade_schema,
            "email": email_schema,
            "tags": tags_schema
        },
        "required": ["score", "grade", "email", "tags"]
    }
@pytest.fixture
def sample_definition_json_schema():
    """Return a JSON schema that references a `$defs` entry via `$ref`.

    The top-level `MathReasoning` object has a `steps` array whose items
    refer to the `Step` definition, plus a `final_answer` string.
    """
    step_definition = {
        'properties': {
            'explanation': {
                'title': 'Explanation',
                'type': 'string'
            },
            'output': {
                'title': 'Output',
                'type': 'string'
            }
        },
        'required': ['explanation', 'output'],
        'title': 'Step',
        'type': 'object'
    }
    top_level_properties = {
        'steps': {
            'items': {
                '$ref': '#/$defs/Step'
            },
            'title': 'Steps',
            'type': 'array'
        },
        'final_answer': {
            'title': 'Final Answer',
            'type': 'string'
        }
    }
    return {
        '$defs': {
            'Step': step_definition
        },
        'properties': top_level_properties,
        'required': ['steps', 'final_answer'],
        'title': 'MathReasoning',
        'type': 'object'
    }
@pytest.fixture
def sample_guided_choice():
    """Return ten programming-language names to use as choices."""
    language_choices = [
        "Python",
        "Java",
        "JavaScript",
        "C++",
        "C#",
        "PHP",
        "TypeScript",
        "Ruby",
        "Swift",
        "Kotlin",
    ]
    return language_choices
@pytest.fixture
def sample_sql_statements():
    """Return a small grammar text describing a restricted SELECT statement."""
    grammar = """
start: select_statement
select_statement: "SELECT" column "from" table "where" condition
column: "col_1" | "col_2"
table: "table_1" | "table_2"
condition: column "=" number
number: "1" | "2"
"""
    return grammar
# SPDX-License-Identifier: Apache-2.0
import re
from typing import Dict, List, Optional
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
from openai import BadRequestError
from tests.utils import RemoteOpenAIServer
from vllm.transformers_utils.tokenizer import get_tokenizer
# any model with a chat template should work here
# This model is launched by the `server` fixture and is the only value of the
# `model_name` parameter in the tests below.
MODEL_NAME = "facebook/opt-125m"
@pytest.fixture(scope="module")
def default_server_args():
    """Return the base command-line arguments for the test server."""
    server_args = []
    # use half precision for speed and memory savings in CI environment
    server_args += ["--dtype", "bfloat16"]
    server_args += ["--max-model-len", "2048"]
    server_args += ["--max-num-seqs", "128"]
    server_args += ["--enforce-eager"]
    return server_args
@pytest.fixture(scope="module",
                params=[["--no-enable-prefix-caching"],
                        [
                            "--no-enable-prefix-caching",
                            "--disable-frontend-multiprocessing"
                        ]])
def server(default_server_args, request):
    """Start a `RemoteOpenAIServer` for `MODEL_NAME`, once per parameter set.

    Each parametrization appends its own extra flags to the default
    arguments, and the server stays up for the duration of the
    parametrized module run.

    Args:
      default_server_args: base argument list from the fixture of the
                           same name
      request: pytest request object; `request.param` holds the extra
               flags for this parametrization

    Yields:
      the running remote server
    """
    # `default_server_args` is module-scoped, so every parametrization
    # receives the same list object. Extending it in place would leak the
    # flags of one parametrization into the next, so work on a copy.
    server_args = list(default_server_args)
    if request.param:
        server_args.extend(request.param)
    with RemoteOpenAIServer(MODEL_NAME, server_args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from the running test server."""
    client_ctx = server.get_async_client()
    async with client_ctx as async_client:
        yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_single_completion(client: openai.AsyncOpenAI,
                                 model_name: str) -> None:
    """Request one completion from a text prompt, then from token ids."""
    text_completion = await client.completions.create(
        model=model_name,
        prompt="Hello, my name is",
        max_tokens=5,
        temperature=0.0,
    )
    assert text_completion.id is not None
    assert (text_completion.choices is not None
            and len(text_completion.choices) == 1)

    first_choice = text_completion.choices[0]
    assert len(first_choice.text) >= 5
    assert first_choice.finish_reason == "length"
    assert text_completion.usage == openai.types.CompletionUsage(
        completion_tokens=5, prompt_tokens=6, total_tokens=11)

    # test using token IDs
    token_completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    token_choice = token_completion.choices[0]
    assert len(token_choice.text) >= 1
    assert token_choice.prompt_logprobs is None
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
    """With `logprobs=None` the returned choice carries no logprobs."""
    # test using token IDs
    response = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
        logprobs=None,
    )
    first_choice = response.choices[0]
    assert first_choice.logprobs is None
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
    """With `logprobs=0` the first position still has exactly one entry."""
    # test using token IDs
    response = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
        logprobs=0,
    )
    returned_logprobs = response.choices[0].logprobs
    assert returned_logprobs is not None
    assert returned_logprobs.token_logprobs is not None
    assert returned_logprobs.top_logprobs is not None
    assert len(returned_logprobs.top_logprobs[0]) == 1
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
    """With `logprobs=5` the first position has five or six entries."""
    # test using token IDs
    response = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
        logprobs=5,
    )
    returned_logprobs = response.choices[0].logprobs
    assert returned_logprobs is not None
    assert returned_logprobs.token_logprobs is not None
    assert returned_logprobs.top_logprobs is not None
    num_first_position = len(returned_logprobs.top_logprobs[0])
    assert 5 <= num_first_position <= 6
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
                                            model_name: str) -> None:
    """Over-limit `logprobs` requests fail and the server keeps working.

    Covers the non-streaming and the streaming request paths, then sends
    an ordinary request to confirm the server still answers.
    """
    rejection_errors = (openai.BadRequestError, openai.APIError)

    # Non-streaming request, using token IDs.
    with pytest.raises(rejection_errors):
        await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
            # vLLM has higher default max_logprobs (20 instead of 5) to support
            # both Completion API and Chat Completion API
            logprobs=21,
        )

    # Streaming request, using token IDs.
    with pytest.raises(rejection_errors):
        stream = await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
            # vLLM has higher default max_logprobs (20 instead of 5) to support
            # both Completion API and Chat Completion API
            logprobs=30,
            stream=True,
        )
        # Drain the stream in case the error only surfaces while iterating.
        async for _ in stream:
            pass

    # the server should still work afterwards
    followup = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    assert len(followup.choices[0].text) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name, prompt_logprobs", [(MODEL_NAME, -1),
                                                         (MODEL_NAME, 0),
                                                         (MODEL_NAME, 1),
                                                         (MODEL_NAME, None)])
async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI,
                                          model_name: str,
                                          prompt_logprobs: Optional[int]):
    """Check `prompt_logprobs` handling for a two-prompt completion request.

    A negative value must be rejected with `BadRequestError`. A
    non-negative value must yield non-empty prompt logprobs on both
    choices. Omitting the option must yield none.
    """
    params: Dict = {
        "prompt": ["A robot may not injure another robot", "My name is"],
        "model": model_name,
    }
    requested = prompt_logprobs is not None
    if requested:
        params["extra_body"] = {"prompt_logprobs": prompt_logprobs}

    if requested and prompt_logprobs < 0:
        with pytest.raises(BadRequestError):
            await client.completions.create(**params)
        return

    completion = await client.completions.create(**params)
    if not requested:
        assert completion.choices[0].prompt_logprobs is None
        return

    assert completion.choices[0].prompt_logprobs is not None
    assert len(completion.choices[0].prompt_logprobs) > 0

    assert completion.choices[1].prompt_logprobs is not None
    assert len(completion.choices[1].prompt_logprobs) > 0
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_completion_streaming(client: openai.AsyncOpenAI,
                                    model_name: str) -> None:
    """A streamed completion must concatenate to the non-streamed text."""
    prompt = "What is an LLM?"

    reference = await client.completions.create(
        model=model_name,
        prompt=prompt,
        max_tokens=5,
        temperature=0.0,
    )
    reference_text = reference.choices[0].text

    stream = await client.completions.create(
        model=model_name,
        prompt=prompt,
        max_tokens=5,
        temperature=0.0,
        stream=True,
    )
    pieces: List[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        streamed_choice = chunk.choices[0]
        pieces.append(streamed_choice.text)
        if streamed_choice.finish_reason is not None:
            finish_reason_count += 1

    # finish reason should only return in last block
    assert finish_reason_count == 1
    # `chunk` still refers to the last chunk received.
    assert chunk.choices[0].finish_reason == "length"
    assert chunk.choices[0].text
    assert "".join(pieces) == reference_text
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_completion_stream_options(client: openai.AsyncOpenAI,
                                         model_name: str):
    """Exercise `stream_options` for streaming and non-streaming requests.

    Streaming requests are checked for all four combinations of
    `include_usage` and `continuous_usage_stats`. Non-streaming
    requests that pass `stream_options` must be rejected with
    `BadRequestError`.
    """
    prompt = "What is the capital of France?"

    async def open_stream(include_usage: bool, continuous_usage_stats: bool):
        # Start a streaming completion with the given stream options.
        return await client.completions.create(
            model=model_name,
            prompt=prompt,
            max_tokens=5,
            temperature=0.0,
            stream=True,
            stream_options={
                "include_usage": include_usage,
                "continuous_usage_stats": continuous_usage_stats,
            })

    def check_usage_only_chunk(usage_chunk) -> None:
        # The chunk after the finish chunk holds usage and no choices.
        assert usage_chunk.usage is not None
        assert usage_chunk.usage.prompt_tokens > 0
        assert usage_chunk.usage.completion_tokens > 0
        assert usage_chunk.usage.total_tokens == (
            usage_chunk.usage.prompt_tokens +
            usage_chunk.usage.completion_tokens)
        assert usage_chunk.choices == []

    # Test stream=True, stream_options=
    #     {"include_usage": False, "continuous_usage_stats": False}
    stream = await open_stream(False, False)
    async for chunk in stream:
        assert chunk.usage is None

    # Test stream=True, stream_options=
    #     {"include_usage": False, "continuous_usage_stats": True}
    stream = await open_stream(False, True)
    async for chunk in stream:
        assert chunk.usage is None

    # Test stream=True, stream_options=
    #     {"include_usage": True, "continuous_usage_stats": False}
    stream = await open_stream(True, False)
    async for chunk in stream:
        if chunk.choices[0].finish_reason is None:
            assert chunk.usage is None
        else:
            assert chunk.usage is None
            # Pull the next chunk by hand; it must be the usage-only one.
            check_usage_only_chunk(await stream.__anext__())

    # Test stream=True, stream_options=
    #     {"include_usage": True, "continuous_usage_stats": True}
    stream = await open_stream(True, True)
    async for chunk in stream:
        assert chunk.usage is not None
        assert chunk.usage.prompt_tokens > 0
        assert chunk.usage.completion_tokens > 0
        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
                                            chunk.usage.completion_tokens)
        if chunk.choices[0].finish_reason is not None:
            check_usage_only_chunk(await stream.__anext__())

    # Test stream=False with each of these stream_options, in order:
    #     {"include_usage": None}
    #     {"include_usage": True}
    #     {"continuous_usage_stats": None}
    #     {"continuous_usage_stats": True}
    rejected_option_sets = (
        {"include_usage": None},
        {"include_usage": True},
        {"continuous_usage_stats": None},
        {"continuous_usage_stats": True},
    )
    for rejected_options in rejected_option_sets:
        with pytest.raises(BadRequestError):
            await client.completions.create(model=model_name,
                                            prompt=prompt,
                                            max_tokens=5,
                                            temperature=0.0,
                                            stream=False,
                                            stream_options=rejected_options)
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
    """Send two identical prompts in one request, as text and as token ids.

    For each prompt form this checks a plain batch, a batch with `n=2`
    using beam search, and a streamed batch.
    """
    # test both text and token IDs
    prompt_batches = (["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2)
    for prompts in prompt_batches:
        # test simple list
        plain_batch = await client.completions.create(
            model=model_name,
            prompt=prompts,
            max_tokens=5,
            temperature=0.0,
        )
        assert len(plain_batch.choices) == 2
        assert plain_batch.choices[0].text == plain_batch.choices[1].text

        # test n = 2
        # NOTE: use_beam_search has to be true for n > 1 in vLLM, but
        # is not necessary for official client.
        beam_batch = await client.completions.create(
            model=model_name,
            prompt=prompts,
            n=2,
            max_tokens=5,
            temperature=0.0,
            extra_body={"use_beam_search": True},
        )
        beam_choices = beam_batch.choices
        assert len(beam_choices) == 4
        assert beam_choices[0].text != beam_choices[
            1].text, "beam search should be different"
        assert beam_choices[0].text == beam_choices[
            2].text, "two copies of the same prompt should be the same"
        assert beam_choices[1].text == beam_choices[
            3].text, "two copies of the same prompt should be the same"

        # test streaming
        streamed_batch = await client.completions.create(
            model=model_name,
            prompt=prompts,
            max_tokens=5,
            temperature=0.0,
            stream=True,
        )
        # Accumulate each prompt's text under its choice index.
        texts = [""] * 2
        async for chunk in streamed_batch:
            assert len(chunk.choices) == 1
            streamed_choice = chunk.choices[0]
            texts[streamed_choice.index] += streamed_choice.text
        assert texts[0] == texts[1]
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
@pytest.mark.parametrize("logprobs_arg", [1, 0])
async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
                                       model_name: str, logprobs_arg: int):
    """Check `echo=True` together with `logprobs` for text and token prompts.

    The returned text must start with the prompt, and the logprobs must
    cover prompt and completion tokens, with `None` in the first
    position.

    Args:
      client: async OpenAI client connected to the test server
      model_name: model to query
      logprobs_arg: value passed as the request's `logprobs`
    """
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
    # test using text and token IDs
    for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]):
        completion = await client.completions.create(model=model_name,
                                                     prompt=prompt,
                                                     max_tokens=5,
                                                     temperature=0.0,
                                                     echo=True,
                                                     logprobs=logprobs_arg)

        prompt_text = tokenizer.decode(prompt) if isinstance(prompt,
                                                             list) else prompt
        # Escape the prompt so it is matched literally: prompt text is
        # arbitrary and may contain regex metacharacters.
        assert re.search(r"^" + re.escape(prompt_text),
                         completion.choices[0].text)
        logprobs = completion.choices[0].logprobs
        assert logprobs is not None
        assert len(logprobs.text_offset) > 5
        # The first echoed token has no logprob.
        assert (len(logprobs.token_logprobs) > 5
                and logprobs.token_logprobs[0] is None)
        assert (len(logprobs.top_logprobs) > 5
                and logprobs.top_logprobs[0] is None)
        for top_logprobs in logprobs.top_logprobs[1:]:
            assert max(logprobs_arg,
                       1) <= len(top_logprobs) <= logprobs_arg + 1
        assert len(logprobs.tokens) > 5
# SPDX-License-Identifier: Apache-2.0
import itertools
from typing import List, Tuple
import pytest
import torch
from tests.kernels.utils import override_backend_env_variable
from tests.v1.sample.utils import (
assert_incr_detok_str_matches_non_incr_detok_str,
compute_correct_cumulative_logprob, get_test_batch)
from vllm import SamplingParams
from ...conftest import VllmRunner
# Model and dtype shared by the `vllm_model` and `hf_model` fixtures below, so
# both runners load the same model with the same dtype.
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
DTYPE = "half"
@pytest.fixture(scope="module")
def vllm_model(vllm_runner):
    """Yield a module-scoped vLLM runner for `MODEL`, set up for these tests."""
    runner_kwargs = dict(
        dtype=DTYPE,
        max_logprobs=7,
        # Very small number of batched tokens to ensure
        # that we test chunking.
        max_num_batched_tokens=16,
        max_num_seqs=16,
        max_model_len=128,
        enforce_eager=True,
        #TODO: enable this once we support it for
        # prompt logprobs.
        enable_prefix_caching=False,
        gpu_memory_utilization=0.5,
    )
    with vllm_runner(MODEL, **runner_kwargs) as model:
        yield model
@pytest.fixture(scope="module")
def hf_model(hf_runner):
    """Yield a module-scoped HF runner for `MODEL`, used as the reference."""
    reference_runner = hf_runner(MODEL, dtype=DTYPE)
    with reference_runner as model:
        yield model
def _repeat_logprob_config(
    test_prompts,
    logprob_prompt_logprob_list: List[Tuple],
) -> List[Tuple]:
    """Produce exactly one logprob config per test prompt.

    A logprob config is a pair of (optional number of sample logprobs,
    optional number of prompt logprobs), where "optional" means the
    value may be `None`.

    The given configs are repeated cyclically when there are more
    prompts than configs, cut short when there are fewer, and returned
    unchanged (as a new list) when the counts already match.

    Args:
      test_prompts: list of prompts under test
      logprob_prompt_logprob_list: list of
                                   (optional num sample logprob,
                                    optional num prompt logprob)
                                   tuples

    Returns:
      List of
      (optional num sample logprob, optional num prompt logprob)
      tuples with the same length as `test_prompts`
    """
    num_test_prompts = len(test_prompts)
    # Walk the configs in a cycle, stopping once every prompt has one.
    config_cycle = itertools.cycle(logprob_prompt_logprob_list)
    per_prompt_configs = [
        config for _, config in zip(range(num_test_prompts), config_cycle)
    ]
    # Each prompt must now have a config.
    assert num_test_prompts == len(per_prompt_configs)
    return per_prompt_configs
def _test_case_get_logprobs_and_prompt_logprobs(
    hf_model,
    vllm_model,
    batch_logprobs_composition: str,
    temperature: float,
    example_prompts,
) -> None:
    """Run one logprobs test case, comparing vLLM results to HF results.

    Generates `max_tokens` tokens per prompt with both runners. For
    each request this checks the structure of the returned sample
    logprobs and prompt logprobs against the request's configuration,
    and compares the logprob values against the HF reference.

    Args:
      hf_model: HF runner providing `generate_greedy` and
                `generate_greedy_logprobs`
      vllm_model: vLLM runner; `vllm_model.model.generate` is called
      batch_logprobs_composition: key passed to `get_test_batch` to
                                  select the per-request logprobs
                                  configurations
      temperature: sampling temperature for the vLLM requests
      example_prompts: prompts under test
    """
    test_prompts = example_prompts
    max_tokens = 5
    hf_outputs = hf_model.generate_greedy(
        test_prompts,
        max_tokens=max_tokens,
    )
    hf_logprobs = hf_model.generate_greedy_logprobs(
        test_prompts,
        max_tokens=max_tokens,
    )
    # Batch has mixed sample params
    # (different logprobs/prompt logprobs combos)
    logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition)
    # Ensure that each test prompt has a logprob config for testing
    logprob_prompt_logprob_list = _repeat_logprob_config(
        test_prompts, logprob_prompt_logprob_list)
    # Generate SamplingParams
    vllm_sampling_params = [
        SamplingParams(max_tokens=max_tokens,
                       logprobs=num_lp,
                       prompt_logprobs=num_plp,
                       temperature=temperature,
                       seed=1984)
        for num_lp, num_plp in logprob_prompt_logprob_list
    ]
    vllm_results = vllm_model.model.generate(
        test_prompts, sampling_params=vllm_sampling_params)
    for vllm_result, hf_logprob, hf_output, logprob_prompt_logprob in zip(
            vllm_results, hf_logprobs, hf_outputs,
            logprob_prompt_logprob_list):
        # Extract request-level (prompt)logprobs config
        num_top_logprobs, num_top_prompt_logprobs = logprob_prompt_logprob
        # Test whether sampled token output is consistent between vLLM and HF
        # vLLM prompt+completion should match HF output
        if temperature == 0.0:
            assert (vllm_result.prompt_token_ids +
                    vllm_result.outputs[0].token_ids == hf_output[0])
        else:
            # Sampled tokens won't match if not greedy
            assert (vllm_result.prompt_token_ids == hf_output[0]
                    [:len(vllm_result.prompt_token_ids)])
        # Validate sample logprobs
        if num_top_logprobs is not None:
            assert num_top_logprobs is not None
            # Confirm that the structure of the sample logprobs in the result is
            # correct
            assert vllm_result.outputs[0].logprobs is not None
            assert len(vllm_result.outputs[0].logprobs) == max_tokens
            for logprobs, token_id in zip(vllm_result.outputs[0].logprobs,
                                          vllm_result.outputs[0].token_ids):
                assert logprobs is not None
                # Confirm that the output token appears among the logprobs
                assert token_id in logprobs
                token_in_topk = logprobs[token_id].rank <= num_top_logprobs
                # If the output token is not included in the top K
                # logprob, it can return 1 more data
                if token_in_topk and num_top_logprobs != 0:
                    assert len(logprobs) == num_top_logprobs
                else:
                    assert len(logprobs) == num_top_logprobs + 1
                if num_top_logprobs > 0:
                    # We should have an entry for each of the topk ranks
                    all_ranks = {lp.rank for lp in logprobs.values()}
                    assert all(r in all_ranks
                               for r in range(1, num_top_logprobs + 1))
            output_text = vllm_result.outputs[0].text
            output_string_from_most_likely_tokens_lst: List[str] = []
            # The first entry of each position's logprobs dict supplies the
            # decoded token for that position.
            for top_logprobs in vllm_result.outputs[0].logprobs:
                top_logprob = next(iter(top_logprobs.values()))
                output_string_from_most_likely_tokens_lst.append(
                    top_logprob.decoded_token)
            output_string_from_most_likely_tokens = "".join(
                output_string_from_most_likely_tokens_lst)
            assert_incr_detok_str_matches_non_incr_detok_str(
                output_text, output_string_from_most_likely_tokens,
                "The output text from the top logprob for each token "
                "position should be the same as the output text in the "
                "result.")
            # Compare vLLM sample logprobs to HF
            # NOTE(review): `hf_logprob[i][-1]` is presumably the HF logprob
            # distribution at the last position of generation step i --
            # confirm against the HF runner.
            vllm_sample_logprobs = vllm_result.outputs[0].logprobs
            for i, top_logprobs in enumerate(vllm_sample_logprobs):
                for token_id, sample_logprob in top_logprobs.items():
                    # Values are compared at every position only for greedy
                    # sampling; otherwise only at the first position.
                    if temperature == 0.0 or i == 0:
                        logprob = sample_logprob.logprob
                        torch.testing.assert_close(
                            logprob,
                            hf_logprob[i][-1][token_id].item(),
                            atol=1e-2,
                            rtol=1e-2)
                    assert isinstance(
                        sample_logprob.decoded_token,
                        str), ("The token should be decoded by the time it is"
                               " returned to the user.")
            # At this point we know the sample logprobs are correct for this
            # request. Validate that cumulative_logprob is actually the sum.
            # For each request, assert that the returned cumulative logprob
            # matches the correct value, which is computed below.
            torch.testing.assert_close(
                vllm_result.outputs[0].cumulative_logprob,
                compute_correct_cumulative_logprob(vllm_result.outputs[0]),
                atol=1e-6,
                rtol=1e-6)
        else:
            # Logprobs disabled for this request; should be None
            assert vllm_result.outputs[0].logprobs is None
        # Validate prompt logprobs
        if num_top_prompt_logprobs is not None:
            # Confirm that structure of prompt logprobs in result is correct
            assert vllm_result.prompt_logprobs is not None
            # - The first prompt logprob is always None
            assert vllm_result.prompt_logprobs[0] is None
            # - Prompt logprobs are returned for all indices in
            # the prompt
            assert len(vllm_result.prompt_logprobs) == len(
                vllm_result.prompt_token_ids)
            for prompt_logprobs, prompt_token_id in zip(
                    vllm_result.prompt_logprobs[1:],
                    vllm_result.prompt_token_ids[1:]):
                assert prompt_logprobs is not None
                # Confirm that the prompt token appears among the logprobs
                assert prompt_token_id in prompt_logprobs
                token_in_topk = prompt_logprobs[
                    prompt_token_id].rank <= num_top_prompt_logprobs
                # If the prompt token is not included in the top K
                # logprob, it can return 1 more data
                if token_in_topk and num_top_prompt_logprobs != 0:
                    assert len(prompt_logprobs) == num_top_prompt_logprobs
                else:
                    assert len(prompt_logprobs) == num_top_prompt_logprobs + 1
                if num_top_prompt_logprobs > 0:
                    # We should have an entry for each of the topk ranks
                    all_ranks = {lp.rank for lp in prompt_logprobs.values()}
                    assert all(r in all_ranks
                               for r in range(1, num_top_prompt_logprobs + 1))
            # Compare prompt logprobs to HF
            # The first prompt logprob is always None, so we compare it from
            # 1:.
            # NOTE(review): `hf_logprob[0][i]` is presumably the HF logprob
            # distribution for prompt position i+1 -- confirm against the HF
            # runner.
            vllm_prompt_logprobs = vllm_result.prompt_logprobs[1:]
            for i, vllm_prompt_logprob_dict in enumerate(vllm_prompt_logprobs):
                for token_id, logprob in vllm_prompt_logprob_dict.items():
                    torch.testing.assert_close(
                        logprob.logprob,
                        hf_logprob[0][i][token_id].item(),
                        atol=2e-2,
                        rtol=2e-2)
        else:
            assert vllm_result.prompt_logprobs is None
#@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("batch_logprobs_composition",
                         ["NONE", "SAMPLE", "PROMPT", "SAMPLE_PROMPT"])
@pytest.mark.parametrize("temperature", [0.0, 2.0])
def test_get_logprobs_and_prompt_logprobs(
    hf_model,
    vllm_model,
    batch_logprobs_composition: str,
    temperature: float,
    example_prompts,
) -> None:
    """Test V1 Engine sample logprobs and prompt logprobs.

    Runs the shared `_test_case_get_logprobs_and_prompt_logprobs` scenario
    once for every combination of the parametrized
    `batch_logprobs_composition` and `temperature` values. All checks live
    in that helper; this function only forwards its arguments.

    Args:
      hf_model: HuggingFace model fixture, forwarded to the helper
      vllm_model: vLLM model fixture, forwarded to the helper
      batch_logprobs_composition: logprobs configuration for the test
        batch; one of "NONE", "SAMPLE", "PROMPT", "SAMPLE_PROMPT"
      temperature: sampling temperature forwarded to the helper
      example_prompts: example prompts fixture, forwarded to the helper
    """
    _test_case_get_logprobs_and_prompt_logprobs(
        hf_model=hf_model,
        vllm_model=vllm_model,
        batch_logprobs_composition=batch_logprobs_composition,
        temperature=temperature,
        example_prompts=example_prompts,
    )
def test_max_logprobs(monkeypatch):
    """vLLM v1 engine should fail a request with `logprobs > max_logprobs`.

    The engine is built with `max_logprobs=1`. A request asking for one
    logprob must go through, while a request asking for two must raise
    `ValueError`.

    NOTE(review): the same limit is expected to apply to
    `prompt_logprobs > max_logprobs`, but that case is not exercised here.

    Args:
      monkeypatch: pytest fixture, used to select the attention backend
    """
    override_backend_env_variable(monkeypatch, "FLASH_ATTN")

    engine = VllmRunner(
        "facebook/opt-125m",
        max_logprobs=1,
        enable_prefix_caching=False,
        max_model_len=256,
    )

    # Requesting exactly max_logprobs is within the limit: should pass.
    within_limit_params = SamplingParams(logprobs=1)
    engine.generate(["Hello world"], sampling_params=within_limit_params)

    # Requesting more than max_logprobs must be rejected. The params are
    # built outside the `raises` block so only `generate` is under test.
    over_limit_params = SamplingParams(logprobs=2)
    with pytest.raises(ValueError):
        engine.generate(["Hello world"], sampling_params=over_limit_params)
def test_none_logprobs(vllm_model, example_prompts, monkeypatch):
    """Engine should return `logprobs` and `prompt_logprobs` as `None`.

    Generates with both `logprobs` and `prompt_logprobs` set to `None` and
    checks that every result carries no sample logprobs, no cumulative
    logprob and no prompt logprobs.

    Args:
      vllm_model: vLLM model fixture
      example_prompts: list of example prompts (test fixture)
      monkeypatch: supports editing env vars and rolling back changes
                   after the test
    """
    no_logprobs_params = SamplingParams(
        max_tokens=5,
        logprobs=None,
        prompt_logprobs=None,
        temperature=0.0,
    )
    request_outputs = vllm_model.model.generate(
        example_prompts, sampling_params=no_logprobs_params)

    for request_output in request_outputs:
        first_completion = request_output.outputs[0]
        # Sample logprobs (and their running sum) must be absent
        assert first_completion.logprobs is None
        assert first_completion.cumulative_logprob is None
        # Prompt logprobs must be absent as well
        assert request_output.prompt_logprobs is None
def test_zero_logprobs(vllm_model, example_prompts, monkeypatch):
    """Engine should return sampled token and prompt token logprobs.

    Generates with `logprobs=0` and `prompt_logprobs=0` and checks that
    logprobs are still reported: one entry per sampled token, one entry
    per prompt token, and a non-`None` cumulative logprob.

    Args:
      vllm_model: vLLM model fixture
      example_prompts: list of example prompts (test fixture)
      monkeypatch: supports editing env vars and rolling back changes
                   after the test
    """
    zero_logprobs_params = SamplingParams(
        max_tokens=5,
        logprobs=0,
        prompt_logprobs=0,
        temperature=0.0,
    )
    request_outputs = vllm_model.model.generate(
        example_prompts, sampling_params=zero_logprobs_params)

    for request_output in request_outputs:
        first_completion = request_output.outputs[0]
        sample_logprobs = first_completion.logprobs
        prompt_logprobs = request_output.prompt_logprobs
        sampled_token_ids = first_completion.token_ids
        prompt_token_ids = request_output.prompt_token_ids

        # There is one sample logprob dict for each sampled token
        assert sample_logprobs is not None
        assert len(sampled_token_ids) == len(sample_logprobs)
        assert first_completion.cumulative_logprob is not None

        # There is one prompt logprob dict for each prompt token
        assert prompt_logprobs is not None
        assert len(prompt_token_ids) == len(prompt_logprobs)
# SPDX-License-Identifier: Apache-2.0
import lm_eval
from ...utils import RemoteOpenAIServer
# lm_eval task under test, and the key used to read its metric out of
# results["results"][TASK].
# arc-easy uses prompt_logprobs=1, logprobs=1
TASK = "arc_easy"
FILTER = "acc_norm,none"
# Acceptance band: the tests below require the measured metric to lie
# strictly within EXPECTED_VALUE +/- RTOL. Despite the name, RTOL is applied
# as an absolute offset, not a relative one.
RTOL = 0.03
EXPECTED_VALUE = 0.62
# FIXME(rob): enable prefix caching once supported.
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
# model_args string passed to lm_eval.simple_evaluate for the "vllm" model.
MODEL_ARGS = f"pretrained={MODEL},enforce_eager=True,enable_prefix_caching=False" # noqa: E501
# Arguments passed to RemoteOpenAIServer in the server-based test.
SERVER_ARGS = [
"--enforce_eager", "--no_enable_prefix_caching", "--disable-log-requests"
]
# Value of the num_concurrent model arg used in the server-based test.
NUM_CONCURRENT = 100
def test_prompt_logprobs_e2e():
    """Run the lm_eval TASK through the "vllm" model and check the score.

    The metric read from results["results"][TASK][FILTER] must lie strictly
    within EXPECTED_VALUE +/- RTOL.
    """
    eval_output = lm_eval.simple_evaluate(
        model="vllm",
        model_args=MODEL_ARGS,
        tasks=TASK,
        batch_size="auto",
    )

    measured_value = eval_output["results"][TASK][FILTER]
    # Same two-sided check as `m - RTOL < E and m + RTOL > E`.
    assert measured_value - RTOL < EXPECTED_VALUE < measured_value + RTOL, (
        f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}")
def test_promt_logprobs_e2e_server():
    """Run the lm_eval TASK against a RemoteOpenAIServer and check the score.

    A server for MODEL is started with SERVER_ARGS, lm_eval is pointed at
    its `v1/completions` endpoint through the "local-completions" model,
    and the metric read from results["results"][TASK][FILTER] must lie
    strictly within EXPECTED_VALUE +/- RTOL.
    """
    with RemoteOpenAIServer(MODEL, SERVER_ARGS) as remote_server:
        url = f"{remote_server.url_for('v1')}/completions"

        # Adjacent f-strings concatenate into one comma-separated string.
        completions_model_args = (
            f"model={MODEL},"
            f"base_url={url},"
            f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False")

        eval_output = lm_eval.simple_evaluate(
            model="local-completions",
            model_args=completions_model_args,
            tasks=TASK,
        )

        measured_value = eval_output["results"][TASK][FILTER]
        # Same two-sided check as `m - RTOL < E and m + RTOL > E`.
        assert (measured_value - RTOL < EXPECTED_VALUE <
                measured_value + RTOL), (
                    f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}")
# SPDX-License-Identifier: Apache-2.0
from typing import List
import pytest
import torch
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.sample.rejection_sampler import INVALID_TOKEN_ID, RejectionSampler
@pytest.fixture
def sampler():
    """Fixture returning a newly constructed `RejectionSampler`."""
    rejection_sampler = RejectionSampler()
    return rejection_sampler
def create_logits_tensor(token_ids: List[int],
                         vocab_size: int = 100,
                         device: str = "cuda") -> torch.Tensor:
    """Helper function to create logits tensor that
    will produce desired token ids on argmax.

    Every entry is -100.0 except `logits[i, token_ids[i]]`, which is 100.0,
    so the argmax of row `i` is `token_ids[i]`.

    Args:
      token_ids: desired argmax token id for each row
      vocab_size: size of the last dimension of the returned tensor
      device: device the tensor is allocated on; defaults to "cuda",
        which matches the previous hard-coded behaviour

    Returns:
      A tensor of shape `(len(token_ids), vocab_size)` on `device`.
    """
    # Allocate directly on the target device instead of building the
    # tensor on CPU and copying it over afterwards.
    logits = torch.full((len(token_ids), vocab_size), -100.0, device=device)
    for row, token_id in enumerate(token_ids):
        logits[row, token_id] = 100.0
    return logits
def create_sampling_metadata(spec_tokens: List[List[int]]) -> SamplingMetadata:
    """Build the `SamplingMetadata` used by the rejection sampler tests.

    The metadata is marked all-greedy and carries `spec_tokens` as
    `spec_token_ids`. The batch size is taken to be `len(spec_tokens)`
    and sizes the `min_p` tensor and the `logit_bias` list.

    Args:
      spec_tokens: speculated token ids, one list per request

    Returns:
      The populated `SamplingMetadata` instance.
    """
    num_requests = len(spec_tokens)
    metadata = SamplingMetadata(
        temperature=torch.tensor([]),
        all_greedy=True,
        all_random=False,
        spec_token_ids=spec_tokens,
        top_p=None,
        top_k=None,
        # Uninitialized values; only the length is set here.
        min_p=torch.empty(num_requests),
        generators={},
        max_num_logprobs=0,
        no_penalties=False,
        prompt_token_ids=None,
        frequency_penalties=torch.tensor([]),
        presence_penalties=torch.tensor([]),
        repetition_penalties=torch.tensor([]),
        output_token_ids=[],
        min_tokens={},
        logit_bias=[None for _ in range(num_requests)],
    )
    return metadata
def test_perfect_match(sampler):
    """All speculated tokens match, so they are returned with the bonus."""
    draft_tokens = [[1, 2, 3]]
    # The trailing 4 is the bonus token
    target_tokens = [1, 2, 3, 4]

    logits = create_logits_tensor(target_tokens)
    sampler_output = sampler(logits, create_sampling_metadata(draft_tokens))

    expected = torch.tensor(
        [[1, 2, 3, 4]],
        dtype=torch.int,
        device=logits.device,
    )
    assert torch.equal(sampler_output.sampled_token_ids, expected)
def test_early_mismatch(sampler):
    """A mismatch at position 1 invalidates every later position.

    The mismatching position carries the token taken from the logits (5),
    and the positions after it are `INVALID_TOKEN_ID`.
    """
    draft_tokens = [[1, 2, 3]]
    # Differs from the draft at position 1
    target_tokens = [1, 5, 3, 4]

    logits = create_logits_tensor(target_tokens)
    sampler_output = sampler(logits, create_sampling_metadata(draft_tokens))

    expected = torch.tensor(
        [[1, 5, INVALID_TOKEN_ID, INVALID_TOKEN_ID]],
        dtype=torch.int,
        device=logits.device,
    )
    assert torch.equal(sampler_output.sampled_token_ids, expected)
def test_multiple_sequences(sampler):
    """Two requests with different numbers of speculated tokens.

    The shorter request's row is padded with `INVALID_TOKEN_ID`.
    """
    draft_tokens = [[1, 2], [3]]
    # Flattened over both requests; bonus tokens are 5 and 4
    target_tokens = [1, 2, 5, 3, 4]

    logits = create_logits_tensor(target_tokens)
    sampler_output = sampler(logits, create_sampling_metadata(draft_tokens))

    expected = torch.tensor(
        [[1, 2, 5], [3, 4, INVALID_TOKEN_ID]],
        dtype=torch.int,
        device=logits.device,
    )
    assert torch.equal(sampler_output.sampled_token_ids, expected)
def test_single_token_sequence(sampler):
    """A single matching speculated token is returned with its bonus."""
    draft_tokens = [[1]]
    # One speculated token followed by bonus token 2
    target_tokens = [1, 2]

    logits = create_logits_tensor(target_tokens)
    sampler_output = sampler(logits, create_sampling_metadata(draft_tokens))

    expected = torch.tensor(
        [[1, 2]],
        dtype=torch.int,
        device=logits.device,
    )
    assert torch.equal(sampler_output.sampled_token_ids, expected)
def test_empty_sequence(sampler):
    """With no speculated tokens only the bonus token is returned."""
    draft_tokens: List[List[int]] = [[]]
    # Nothing speculated, so this is just the bonus token
    target_tokens = [5]

    logits = create_logits_tensor(target_tokens)
    sampler_output = sampler(logits, create_sampling_metadata(draft_tokens))

    expected = torch.tensor(
        [[5]],
        dtype=torch.int,
        device=logits.device,
    )
    assert torch.equal(sampler_output.sampled_token_ids, expected)
def test_multiple_mismatches(sampler):
    """Two requests that each diverge from their speculated tokens.

    Each row keeps tokens up to and including its first mismatching
    position and is `INVALID_TOKEN_ID` afterwards.
    """
    draft_tokens = [[1, 2, 3], [4, 5, 6]]
    # Flattened over both requests; each one contains a mismatch
    target_tokens = [1, 2, 7, 6, 4, 8, 6, 9]

    logits = create_logits_tensor(target_tokens)
    sampler_output = sampler(logits, create_sampling_metadata(draft_tokens))

    expected_rows = [
        [1, 2, 7, INVALID_TOKEN_ID],
        [4, 8, INVALID_TOKEN_ID, INVALID_TOKEN_ID],
    ]
    expected = torch.tensor(expected_rows,
                            dtype=torch.int,
                            device=logits.device)
    assert torch.equal(sampler_output.sampled_token_ids, expected)
@pytest.mark.parametrize(
    "spec_tokens,output_tokens,expected",
    [
        # Perfect match with bonus
        ([[1, 2]], [1, 2, 3], [[1, 2, 3]]),
        # First mismatch
        ([[1]], [2, 3], [[2, INVALID_TOKEN_ID]]),
        # Mixed matches
        (
            [[1, 2], [3, 4]],
            [1, 5, 6, 3, 4, 7],
            [[1, 5, INVALID_TOKEN_ID], [3, 4, 7]],
        ),
    ])
def test_parametrized_cases(sampler, spec_tokens, output_tokens, expected):
    """Table-driven check of several match/mismatch scenarios.

    Args:
      sampler: rejection sampler fixture
      spec_tokens: speculated token ids, one list per request
      output_tokens: token ids the logits are built to produce on argmax
      expected: expected `sampled_token_ids` as nested lists
    """
    logits = create_logits_tensor(output_tokens)
    sampler_output = sampler(logits, create_sampling_metadata(spec_tokens))

    expected_tensor = torch.tensor(
        expected,
        dtype=torch.int,
        device=logits.device,
    )
    assert torch.equal(sampler_output.sampled_token_ids, expected_tensor)
def test_logits_shape_handling(sampler):
    """The sampler works with a vocabulary size other than the default.

    Uses a 1000-entry vocabulary and also checks that the logits tensor
    still has that size in its last dimension after sampling.
    """
    vocab_size = 1000
    draft_tokens = [[1, 2]]
    target_tokens = [1, 2, 3]

    logits = create_logits_tensor(target_tokens, vocab_size)
    sampler_output = sampler(logits, create_sampling_metadata(draft_tokens))

    expected = torch.tensor(
        [[1, 2, 3]],
        dtype=torch.int,
        device=logits.device,
    )
    assert torch.equal(sampler_output.sampled_token_ids, expected)
    assert logits.shape[-1] == vocab_size
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment