Commit fcfc474d authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.3' into v0.8.3-dev

parents bb94d2e5 296c6572
# SPDX-License-Identifier: Apache-2.0
import json
import re
from copy import deepcopy
from unittest.mock import MagicMock
import pytest
from pydantic import TypeAdapter
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
ChatCompletionToolsParam)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
# Two function tools used as fixtures by the tests below: one that takes a
# single required string argument and one that takes a string and an integer.
EXAMPLE_TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {
                        "type": "string",
                        "description": ("The city to find the weather for"
                                        ", e.g. 'San Francisco'"),
                    },
                },
                "required": ["city"],
                "additionalProperties": False,
            },
        },
        "strict": True,
    },
    {
        "type": "function",
        "function": {
            "name": "get_forecast",
            "description": "Get the weather forecast for a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {
                        "type": "string",
                        "description": ("The city to get the forecast for, "
                                        "e.g. 'New York'"),
                    },
                    "days": {
                        "type": "integer",
                        "description": ("Number of days to get the forecast "
                                        "for (1-7)"),
                    },
                },
                "required": ["city", "days"],
                "additionalProperties": False,
            },
        },
        "strict": True,
    },
]
def _compile_and_check(tools: list[ChatCompletionToolsParam], sample_output,
                       should_match: bool):
    """Assert whether ``sample_output`` fits the schema derived from ``tools``.

    The guided-decoding JSON schema is obtained from
    ``ChatCompletionRequest._get_guided_json_from_tool`` for a stand-in
    request with ``tool_choice="required"``, turned into a regular
    expression, and fully matched against the JSON dump of
    ``sample_output``. The outcome must equal ``should_match``.
    """
    # A mock stands in for the request; it carries the two attributes set here.
    stub = MagicMock(tool_choice="required", tools=tools)
    guided_schema = ChatCompletionRequest._get_guided_json_from_tool(stub)
    assert isinstance(guided_schema, dict)

    # use build_regex_from_schema used in JSONLogitsProcessor to create Guide
    from outlines_core.fsm.json_schema import build_regex_from_schema

    pattern = re.compile(build_regex_from_schema(json.dumps(guided_schema)))
    serialized_output = json.dumps(sample_output)
    is_accepted = pattern.fullmatch(serialized_output) is not None
    assert is_accepted == should_match
# (tool-call list, should_match) pairs; every entry here is expected to be
# accepted by the schema built from EXAMPLE_TOOLS.
VALID_TOOL_OUTPUTS = [
    # single call to the one-argument tool
    (
        [
            {
                "name": "get_current_weather",
                "parameters": {"city": "Vienna"},
            },
        ],
        True,
    ),
    # the same tool called twice
    (
        [
            {
                "name": "get_current_weather",
                "parameters": {"city": "Vienna"},
            },
            {
                "name": "get_current_weather",
                "parameters": {"city": "Berlin"},
            },
        ],
        True,
    ),
    # single call to the two-argument tool
    (
        [
            {
                "name": "get_forecast",
                "parameters": {"city": "Vienna", "days": 7},
            },
        ],
        True,
    ),
    # one call to each tool
    (
        [
            {
                "name": "get_forecast",
                "parameters": {"city": "Vienna", "days": 7},
            },
            {
                "name": "get_current_weather",
                "parameters": {"city": "Vienna"},
            },
        ],
        True,
    ),
    # four alternating calls
    (
        [
            {
                "name": "get_forecast",
                "parameters": {"city": "Vienna", "days": 7},
            },
            {
                "name": "get_current_weather",
                "parameters": {"city": "Vienna"},
            },
            {
                "name": "get_forecast",
                "parameters": {"city": "Berlin", "days": 7},
            },
            {
                "name": "get_current_weather",
                "parameters": {"city": "Berlin"},
            },
        ],
        True,
    ),
]
# Only the tool-call lists, without the expected-match flag.
VALID_TOOLS = [tool_calls for tool_calls, _ in VALID_TOOL_OUTPUTS]
@pytest.mark.parametrize(
    "sample_output, should_match",
    VALID_TOOL_OUTPUTS + [
        (None, False),
        # empty list cannot be generated
        ([], False),
        # empty object cannot be generated
        ({}, False),
        # list with empty object cannot be generated
        ([{}], False),
        # function without required parameters cannot be generated
        (
            [{
                "name": "get_current_weather"
            }],
            False,
        ),
        # function without required parameters cannot be generated
        (
            [{
                "name": "get_current_weather",
                "parameters": {}
            }],
            False,
        ),
        # function without required parameters cannot be generated
        (
            [{
                "name": "get_current_weather",
                "parameters": None
            }],
            False,
        ),
        # tool call without lists cannot be generated
        (
            {
                "name": "get_current_weather",
                "parameters": {
                    "city": "Vienna"
                }
            },
            False,
        ),
        # tool call with extra parameters cannot be generated
        (
            [{
                "name": "get_current_weather",
                "parameters": {
                    "city": "Vienna",
                    "extra": "value"
                }
            }],
            False,
        ),
        # tool call where parameters are first cannot be generated
        (
            [{
                "parameters": {
                    "city": "Vienna"
                },
                "name": "get_current_weather"
            }],
            False,
        ),
        # tool call without all required parameters cannot be generated
        (
            [{
                "name": "get_forecast",
                "parameters": {
                    "city": "Vienna"
                }
            }],
            False,
        ),
        # tool call with incorrect name/parameters cannot be generated
        (
            [{
                "name": "get_weather",
                "parameters": {
                    "city": "Vienna",
                    "days": 7
                }
            }],
            False,
        ),
        # tool call with both valid and empty function cannot be generated
        (
            [
                {
                    "name": "get_current_weather",
                    "parameters": {
                        "city": "Vienna"
                    }
                },
                {},
            ],
            False,
        ),
    ])
def test_guided_json(sample_output, should_match):
    """Check each sample against the schema built from ``EXAMPLE_TOOLS``.

    The raw tool dictionaries are validated into
    ``ChatCompletionToolsParam`` objects first, then handed to
    ``_compile_and_check`` together with the sample and the expectation.
    """
    tools_adapter = TypeAdapter(list[ChatCompletionToolsParam])
    validated_tools = tools_adapter.validate_python(EXAMPLE_TOOLS)
    _compile_and_check(
        tools=validated_tools,
        sample_output=sample_output,
        should_match=should_match,
    )
def update_parameters_none(
        tool: ChatCompletionToolsParam) -> ChatCompletionToolsParam:
    """Set the parameter schema of ``tool``'s function to ``None``.

    The tool is modified in place and the same object is returned.
    """
    function_def = tool.function
    function_def.parameters = None
    return tool
def update_parameters_empty_dict(
        tool: ChatCompletionToolsParam) -> ChatCompletionToolsParam:
    """Set the parameter schema of ``tool``'s function to an empty dict.

    The tool is modified in place and the same object is returned.
    """
    function_def = tool.function
    function_def.parameters = {}
    return tool
@pytest.mark.parametrize(
    "sample_output, should_match",
    [
        (None, False),
        # empty list cannot be generated
        ([], False),
        # empty object cannot be generated
        ({}, False),
        # list with empty object cannot be generated
        ([{}], False),
        # function without required parameters cannot be generated
        (
            [{
                "name": "get_current_weather"
            }],
            False,
        ),
        # function without required parameters cannot be generated
        (
            [{
                "name": "get_current_weather",
                "parameters": None
            }],
            False,
        ),
        # function with extra parameters cannot be generated
        (
            [{
                "name": "get_current_weather",
                "parameters": {
                    "extra": "value"
                }
            }],
            False,
        ),
        # only function with empty parameters object is valid
        (
            [{
                "name": "get_current_weather",
                "parameters": {}
            }],
            True,
        ),
    ])
@pytest.mark.parametrize(
    "update_parameters",
    [update_parameters_none, update_parameters_empty_dict])
def test_guided_json_without_parameters(sample_output, should_match,
                                        update_parameters):
    """Check samples against a tool whose parameter schema was removed.

    A copy of the first example tool is validated, its parameters are
    replaced through ``update_parameters`` (``None`` or ``{}``), and the
    resulting schema is checked with ``_compile_and_check``.
    """
    # Copy so the shared EXAMPLE_TOOLS entry itself is never handed over.
    raw_tools = [deepcopy(EXAMPLE_TOOLS[0])]
    tools_adapter = TypeAdapter(list[ChatCompletionToolsParam])
    tools = [
        update_parameters(tool)
        for tool in tools_adapter.validate_python(raw_tools)
    ]

    # Sanity check: the updater really left no parameter schema behind.
    assert all(
        tool.function.parameters is None or tool.function.parameters == {}
        for tool in tools)

    _compile_and_check(
        tools=tools,
        sample_output=sample_output,
        should_match=should_match,
    )
@pytest.mark.parametrize("output", VALID_TOOLS)
@pytest.mark.parametrize("empty_params", [False, True])
@pytest.mark.parametrize("delta_len", list(range(1, 11)))
def test_streaming_output_valid(output, empty_params, delta_len):
    """Stream a tool-call JSON in fixed-size chunks and rebuild it.

    The JSON dump of ``output`` is fed to
    ``OpenAIServingChat.extract_tool_call_required_streaming`` in slices of
    ``delta_len`` characters. The name and argument fragments of the
    returned delta messages are stitched back together and must parse to
    the original tool calls. With ``empty_params`` every call's parameters
    are replaced by an empty object first.
    """
    serving_stub = MagicMock()

    expected_calls = deepcopy(output)
    if empty_params:
        expected_calls = [{
            "name": call["name"],
            "parameters": {}
        } for call in expected_calls]
    output_json = json.dumps(expected_calls)

    previous_text = ""
    function_name_returned = False
    messages = []
    for start in range(0, len(output_json), delta_len):
        delta_text = output_json[start:start + delta_len]
        current_text = previous_text + delta_text

        result = OpenAIServingChat.extract_tool_call_required_streaming(
            serving_stub,
            previous_text=previous_text,
            current_text=current_text,
            delta_text=delta_text,
            function_name_returned=function_name_returned)
        delta_message, function_name_returned = result

        if delta_message:
            messages.append(delta_message)

        previous_text = current_text

    assert messages

    # Rebuild the JSON array text from the streamed fragments.
    combined_messages = "["
    for message in messages:
        function = message.tool_calls[0].function
        if function.name:
            # A named delta starts a new call; close the previous one first
            # if anything has been written after the opening bracket.
            if len(combined_messages) > 1:
                combined_messages += "},"
            combined_messages += "".join((
                '{"name": "',
                function.name,
                '", "parameters": ',
                function.arguments,
            ))
        else:
            combined_messages += function.arguments
    combined_messages += "}]"

    reassembled = json.loads(combined_messages)
    assert reassembled == expected_calls
    assert json.dumps(reassembled) == output_json
......@@ -5,12 +5,8 @@ import os
import tempfile
import depyf
import pytest
from vllm.config import CompilationLevel
@pytest.mark.skip(reason="Not working; needs investigation.")
def test_tpu_compilation():
temp_dir = tempfile.mkdtemp()
with depyf.prepare_debug(temp_dir):
......@@ -22,27 +18,24 @@ def test_tpu_compilation():
"The greatest glory in living lies not in never falling,",
]
answers = [
" or, through inaction, allow a human being to come to harm.",
" what is essential is invisible to the eye.",
" but in rising every time we fall.",
" or, through inaction",
" what is essential ",
" but in rising ",
]
N = 1
# Currently, top-p sampling is disabled. `top_p` should be 1.0.
N = 1
sampling_params = SamplingParams(temperature=0.7,
top_p=1.0,
n=N,
max_tokens=16)
# Set `enforce_eager=True` to avoid ahead-of-time compilation.
# In real workloads, `enforce_eager` should be `False`.
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct",
max_num_batched_tokens=256,
max_model_len=256,
max_num_seqs=32,
enforce_eager=False)
# Disable the custom dispatcher and let Dynamo take over
# all the control.
llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct",
max_model_len=512,
max_num_seqs=64,
enforce_eager=True,
compilation_config={"level": CompilationLevel.DYNAMO_AS_IS})
outputs = llm.generate(prompts, sampling_params)
for output, answer in zip(outputs, answers):
prompt = output.prompt
......@@ -56,16 +49,11 @@ def test_tpu_compilation():
for i, compiled_code in enumerate(compiled_codes):
print("{} file: {}".format(i + 1, compiled_code))
# We should only trigger Dynamo compilation 4 times:
# 1. forward pass (symbolic)
# 2. compute_logits (symbolic)
# 3. forward pass (shape 16)
# 4. forward pass (shape 32)
# and later calls should not trigger Dynamo compilation again.
# NOTE: It might still trigger XLA compilation.
# We should only trigger Dynamo compilation 2 times:
# 1. Forward pass without kv_caches
# 2. Forward pass with kv_caches
# Check we have 4 compiled codes
assert len(compiled_codes) == 4
assert len(compiled_codes) == 2
kv_cache_prefix = "kv_cache"
attn_prefix = "ragged_paged_attention"
......@@ -77,24 +65,13 @@ def test_tpu_compilation():
for i, compiled_fn in enumerate(compiled_fns):
print("{} file: {}".format(i + 1, compiled_fn))
# The first compilation is symbolic, so it should not have any kv_caches
# The first compilation should not have any kv_caches
with open(compiled_fns[0]) as f:
content = f.read()
assert kv_cache_prefix not in content
# The second compilation is symbolic, so it should not have any kv_caches
with open(compiled_fns[1]) as f:
content = f.read()
assert kv_cache_prefix not in content
# The third compilation is shape 16, so it should have kv_caches and the
# The second compilation should have kv_caches and the
# ragged_paged_attention
with open(compiled_fns[2]) as f:
content = f.read()
assert (kv_cache_prefix in content and attn_prefix in content)
# The fourth compilation is shape 32, so it should have kv_caches and the
# ragged_paged_attention
with open(compiled_fns[3]) as f:
with open(compiled_fns[1]) as f:
content = f.read()
assert (kv_cache_prefix in content and attn_prefix in content)
......@@ -110,6 +110,9 @@ class RemoteOpenAIServer:
self.host = str(args.host or 'localhost')
self.port = int(args.port)
self.show_hidden_metrics = \
args.show_hidden_metrics_for_version is not None
# download the model before starting the server to avoid timeout
is_local = os.path.isdir(model)
if not is_local:
......@@ -323,6 +326,37 @@ def _test_completion_close(
return results
def _test_chat(
    client: openai.OpenAI,
    model: str,
    prompt: str,
):
    """Run one chat completion for ``prompt`` and return its summary.

    Args:
        client: client whose ``chat.completions.create`` is called.
        model: model name passed to the request.
        prompt: text sent as the only content part of a user message.

    Returns:
        A one-element list holding a dict with the generated text, the
        finish reason and the usage of the first choice.
    """
    user_message = {
        "role": "user",
        "content": [{
            "type": "text",
            "text": prompt
        }],
    }

    # test with text prompt
    chat_response = client.chat.completions.create(
        model=model,
        messages=[user_message],
        max_tokens=5,
        temperature=0.0,
    )

    first_choice = chat_response.choices[0]
    return [{
        # NOTE(review): the label says "completion_close" although this is
        # a chat request - confirm whether that is intentional.
        "test": "completion_close",
        "text": first_choice.message.content,
        "finish_reason": first_choice.finish_reason,
        "usage": chat_response.usage,
    }]
def _test_embeddings(
client: openai.OpenAI,
model: str,
......@@ -518,6 +552,8 @@ def compare_all_settings(model: str,
results += _test_completion(client, model, prompt, token_ids)
elif method == "generate_close":
results += _test_completion_close(client, model, prompt)
elif method == "generate_chat":
results += _test_chat(client, model, prompt)
elif method == "generate_with_image":
results += _test_image_text(
client, model,
......@@ -585,7 +621,6 @@ def multi_process_parallel(
# as compared to multiprocessing.
# NOTE: We need to set working_dir for distributed tests,
# otherwise we may get import errors on ray workers
# ray.init(num_gpus=tp_size, runtime_env={"working_dir": VLLM_PATH}) xiabo
# NOTE: Force ray not to use gitignore file as excluding, otherwise
# it will not move .so files to working dir.
# So we have to manually add some of large directories
......
......@@ -5,8 +5,12 @@ import torch
from vllm.multimodal.inputs import MultiModalKwargs
from vllm.sampling_params import SamplingParams
from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue,
KVCacheBlock, PrefixCachingMetrics,
from vllm.utils import sha256
# disable yapf here as it formats differently than isort such that both fail
# yapf: disable
from vllm.v1.core.kv_cache_utils import (NONE_HASH, BlockHashType,
FreeKVCacheBlockQueue, KVCacheBlock,
PrefixCachingMetrics,
generate_block_hash_extra_keys,
hash_block_tokens,
hash_request_tokens,
......@@ -16,6 +20,8 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
from vllm.v1.metrics.stats import PrefixCacheStats
from vllm.v1.request import Request
# yapf: enable
def make_request(request_id,
prompt_token_ids,
......@@ -40,6 +46,12 @@ def make_request(request_id,
)
def test_none_hash():
    """``NONE_HASH`` must be set, be an ``int`` and differ from zero."""
    # Checked in this order so a missing value is reported before a type
    # or value mismatch.
    assert NONE_HASH is not None
    assert isinstance(NONE_HASH, int)
    assert NONE_HASH != 0
def test_kv_cache_block():
# Test KVCacheBlock initialization
block = KVCacheBlock(block_id=0)
......@@ -190,21 +202,23 @@ def test_generate_block_hash_extra_keys_no_mm_inputs():
assert next_mm_idx == 0
def test_hash_block_tokens():
@pytest.mark.parametrize("hash_fn", [sha256, hash])
def test_hash_block_tokens(hash_fn):
parent_block_hash = 123
curr_block_token_ids = (1, 2, 3)
extra_keys = ("key1", "key2")
block_hash = hash_block_tokens(parent_block_hash, curr_block_token_ids,
extra_keys)
block_hash = hash_block_tokens(hash_fn, parent_block_hash,
curr_block_token_ids, extra_keys)
assert isinstance(block_hash, BlockHashType)
assert block_hash.hash_value == hash(
assert block_hash.hash_value == hash_fn(
(parent_block_hash, curr_block_token_ids, extra_keys))
assert block_hash.token_ids == curr_block_token_ids
assert block_hash.extra_keys == extra_keys
def test_hash_request_tokens():
@pytest.mark.parametrize("hash_fn", [sha256, hash])
def test_hash_request_tokens(hash_fn):
request = make_request(
request_id=0,
prompt_token_ids=[_ for _ in range(6)],
......@@ -219,7 +233,7 @@ def test_hash_request_tokens():
)
block_size = 3
block_hashes = hash_request_tokens(block_size, request)
block_hashes = hash_request_tokens(hash_fn, block_size, request)
assert len(block_hashes) == 2
assert isinstance(block_hashes[0], BlockHashType)
......@@ -234,7 +248,8 @@ def test_hash_request_tokens():
assert block_hashes[1].extra_keys == ("hash2", )
def test_hash_tokens_different_mm_input():
@pytest.mark.parametrize("hash_fn", [sha256, hash])
def test_hash_tokens_different_mm_input(hash_fn):
request1 = make_request(
request_id=0,
prompt_token_ids=[_ for _ in range(6)],
......@@ -260,13 +275,14 @@ def test_hash_tokens_different_mm_input():
mm_hashes=["hash3", "hash2"],
)
block_size = 3
block_hashes1 = hash_request_tokens(block_size, request1)
block_hashes2 = hash_request_tokens(block_size, request2)
block_hashes1 = hash_request_tokens(hash_fn, block_size, request1)
block_hashes2 = hash_request_tokens(hash_fn, block_size, request2)
assert block_hashes1[0] != block_hashes2[0]
assert block_hashes1[1] != block_hashes2[1]
def test_hash_request_tokens_no_mm_inputs():
@pytest.mark.parametrize("hash_fn", [sha256, hash])
def test_hash_request_tokens_no_mm_inputs(hash_fn):
request = make_request(
request_id=0,
prompt_token_ids=[_ for _ in range(6)],
......@@ -275,7 +291,7 @@ def test_hash_request_tokens_no_mm_inputs():
)
block_size = 3
block_hashes = hash_request_tokens(block_size, request)
block_hashes = hash_request_tokens(hash_fn, block_size, request)
assert len(block_hashes) == 2
assert block_hashes[0].token_ids == (0, 1, 2)
......
......@@ -4,14 +4,17 @@
from typing import Optional
import pytest
import torch
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
from vllm.sampling_params import SamplingParams
from vllm.utils import cdiv
from vllm.utils import cdiv, sha256
from vllm.v1.core.block_pool import BlockPool
from vllm.v1.core.kv_cache_manager import KVCacheManager, Request
from vllm.v1.core.kv_cache_utils import (BlockHashType, KVCacheBlock,
hash_block_tokens)
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
KVCacheGroupSpec)
def make_request(request_id,
......@@ -39,16 +42,31 @@ def make_request(request_id,
)
def test_prefill():
def make_kv_cache_config(block_size: int, num_blocks: int) -> KVCacheConfig:
    """Build a ``KVCacheConfig`` with one full-attention group.

    Args:
        block_size: passed as the first argument of ``FullAttentionSpec``.
        num_blocks: value for the config's ``num_blocks`` field.

    Returns:
        A config with no tensors and a single ``KVCacheGroupSpec`` for the
        layer named ``'layer'``.
    """
    attention_spec = FullAttentionSpec(block_size, 1, 1, torch.float32, False)
    layer_group = KVCacheGroupSpec(['layer'], attention_spec)
    return KVCacheConfig(
        num_blocks=num_blocks,
        tensors={},
        kv_cache_groups=[layer_group],
    )
@pytest.mark.parametrize("hash_algo", ["sha256", "hash"])
def test_prefill(hash_algo):
manager = KVCacheManager(
block_size=16,
num_gpu_blocks=10,
make_kv_cache_config(16, 11),
max_model_len=8192,
sliding_window=None,
enable_caching=True,
caching_hash_algo=hash_algo,
num_preallocate_tokens=16,
)
# choose the hash function according to the parameter
hash_fn = sha256 if hash_algo == "sha256" else hash
# Complete 3 blocks (48 tokens)
common_token_ids = [i for i in range(3) for _ in range(16)]
......@@ -62,19 +80,20 @@ def test_prefill():
assert not computed_blocks
assert num_computed_tokens == 0
blocks = manager.allocate_slots(req0, 55, computed_blocks)
assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]
assert [b.block_id for b in blocks] == [1, 2, 3, 4, 5]
# Check full block metadata
parent_block_hash = None
for block_id in (0, 1, 2):
block_tokens = tuple(all_token_ids[block_id * 16:(block_id + 1) * 16])
block_hash = hash_block_tokens(parent_block_hash, block_tokens)
for block_id in (1, 2, 3):
block_tokens = tuple(all_token_ids[(block_id - 1) * 16:block_id * 16])
block_hash = hash_block_tokens(hash_fn, parent_block_hash,
block_tokens)
assert manager.block_pool.blocks[block_id].block_hash == block_hash
assert manager.block_pool.blocks[block_id].ref_cnt == 1
parent_block_hash = block_hash.hash_value
# Check partial/preallocated block metadata
for block_id in (3, 4):
for block_id in (4, 5):
assert manager.block_pool.blocks[block_id].block_hash is None
assert manager.block_pool.blocks[block_id].ref_cnt == 1
......@@ -84,11 +103,11 @@ def test_prefill():
req1 = make_request("1", common_token_ids + unique_token_ids)
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
assert len(manager.req_to_block_hashes[req1.request_id]) == 3
assert [b.block_id for b in computed_blocks] == [0, 1, 2]
assert [b.block_id for b in computed_blocks] == [1, 2, 3]
assert num_computed_tokens == 3 * 16
num_new_tokens = 53 - 3 * 16
blocks = manager.allocate_slots(req1, num_new_tokens, computed_blocks)
assert [b.block_id for b in blocks] == [5, 6]
assert [b.block_id for b in blocks] == [6, 7]
for block in computed_blocks:
assert block.ref_cnt == 2
......@@ -101,14 +120,14 @@ def test_prefill():
# All blocks should be available.
assert manager.block_pool.free_block_queue.num_free_blocks == 10
# The order should be
# [unallocated (7, 8, 9)]
# [unique_req0 (4, 3)]
# [unique_req1 (6, 5)]
# [common (2, 1, 0)]
# [unallocated (8, 9, 10)]
# [unique_req0 (5, 4)]
# [unique_req1 (7, 6)]
# [common (3, 2, 1)]
assert [
b.block_id
for b in manager.block_pool.free_block_queue.get_all_free_blocks()
] == [7, 8, 9, 4, 3, 6, 5, 2, 1, 0]
] == [8, 9, 10, 5, 4, 7, 6, 3, 2, 1]
# Cache hit in the common prefix when the original block is already free.
# Incomplete 1 block (6 tokens)
......@@ -116,11 +135,11 @@ def test_prefill():
req2 = make_request("2", common_token_ids + unique_token_ids)
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
assert len(manager.req_to_block_hashes[req2.request_id]) == 3
assert [b.block_id for b in computed_blocks] == [0, 1, 2]
assert [b.block_id for b in computed_blocks] == [1, 2, 3]
assert num_computed_tokens == 3 * 16
num_new_tokens = 53 - 3 * 16
blocks = manager.allocate_slots(req2, num_new_tokens, computed_blocks)
assert [b.block_id for b in blocks] == [7, 8]
assert [b.block_id for b in blocks] == [8, 9]
# Although we only have 5 free blocks, we have 8 blocks in
# the free block queue due to lazy removal.
......@@ -142,7 +161,7 @@ def test_prefill():
assert num_computed_tokens == 0
blocks = manager.allocate_slots(req3, 16 * 9, computed_blocks)
# This block ID order also checks the eviction order.
assert [b.block_id for b in blocks] == [9, 4, 3, 6, 5, 8, 7, 2, 1, 0]
assert [b.block_id for b in blocks] == [10, 5, 4, 7, 6, 9, 8, 3, 2, 1]
assert manager.block_pool.free_block_queue.num_free_blocks == 0
assert manager.block_pool.free_block_queue.free_list_head is None
assert manager.block_pool.free_block_queue.free_list_tail is None
......@@ -156,13 +175,13 @@ def test_prefill_plp():
3. Schedule plp request; no hit should occur; validate blocks
'''
manager = KVCacheManager(
block_size=16,
num_gpu_blocks=10,
make_kv_cache_config(16, 11),
max_model_len=8192,
sliding_window=None,
enable_caching=True,
num_preallocate_tokens=16,
)
# the default hash function is hash
hash_fn = hash
# Complete 3 blocks (48 tokens)
common_token_ids = [i for i in range(3) for _ in range(16)]
......@@ -178,20 +197,21 @@ def test_prefill_plp():
assert not computed_blocks
assert num_computed_tokens == 0
blocks = manager.allocate_slots(req0, 55, computed_blocks)
assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]
assert [b.block_id for b in blocks] == [1, 2, 3, 4, 5]
req0_block_hashes = [b.block_hash for b in blocks]
# Check full block metadata
parent_block_hash = None
for block_id in (0, 1, 2):
block_tokens = tuple(all_token_ids[block_id * 16:(block_id + 1) * 16])
block_hash = hash_block_tokens(parent_block_hash, block_tokens)
for block_id in (1, 2, 3):
block_tokens = tuple(all_token_ids[(block_id - 1) * 16:block_id * 16])
block_hash = hash_block_tokens(hash_fn, parent_block_hash,
block_tokens)
assert manager.block_pool.blocks[block_id].block_hash == block_hash
assert manager.block_pool.blocks[block_id].ref_cnt == 1
parent_block_hash = block_hash.hash_value
# Check partial/preallocated block metadata
for block_id in (3, 4):
for block_id in (4, 5):
assert manager.block_pool.blocks[block_id].block_hash is None
assert manager.block_pool.blocks[block_id].ref_cnt == 1
......@@ -202,11 +222,11 @@ def test_prefill_plp():
req1 = make_request("1", common_token_ids + unique_token_ids)
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
assert len(manager.req_to_block_hashes[req1.request_id]) == 3
assert [b.block_id for b in computed_blocks] == [0, 1, 2]
assert [b.block_id for b in computed_blocks] == [1, 2, 3]
assert num_computed_tokens == 3 * 16
num_new_tokens = 53 - 3 * 16
blocks = manager.allocate_slots(req1, num_new_tokens, computed_blocks)
assert [b.block_id for b in blocks] == [5, 6]
assert [b.block_id for b in blocks] == [6, 7]
for block in computed_blocks:
assert block.ref_cnt == 2
......@@ -219,14 +239,14 @@ def test_prefill_plp():
# All blocks should be available.
assert manager.block_pool.free_block_queue.num_free_blocks == 10
# The order should be
# [unallocated (7, 8, 9)]
# [unique_req0 (4, 3)]
# [unique_req1 (6, 5)]
# [common (2, 1, 0)]
# [unallocated (8, 9, 10)]
# [unique_req0 (5, 4)]
# [unique_req1 (7, 6)]
# [common (3, 2, 1)]
assert [
b.block_id
for b in manager.block_pool.free_block_queue.get_all_free_blocks()
] == [7, 8, 9, 4, 3, 6, 5, 2, 1, 0]
] == [8, 9, 10, 5, 4, 7, 6, 3, 2, 1]
# Request #2 is a prompt-logprobs request:
# NO cache hit in the common prefix; duplicates request #0 cached blocks
......@@ -242,7 +262,7 @@ def test_prefill_plp():
block_ids = [b.block_id for b in blocks]
# Duplicate cached blocks have different ids but same hashes vs request #0
assert [b.block_hash for b in blocks] == req0_block_hashes
assert block_ids != [0, 1, 2, 3, 4]
assert block_ids != [1, 2, 3, 4, 5]
# Request #2 block hashes are valid since request #0 hashes are.
# Check block reference counts.
......@@ -254,10 +274,8 @@ def test_prefill_plp():
def test_decode():
manager = KVCacheManager(
block_size=16,
num_gpu_blocks=10,
make_kv_cache_config(16, 11),
max_model_len=8192,
sliding_window=None,
enable_caching=True,
num_preallocate_tokens=16,
)
......@@ -273,7 +291,7 @@ def test_decode():
assert not computed_blocks
assert num_computed_tokens == 0
blocks = manager.allocate_slots(req0, 55, computed_blocks)
assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]
assert [b.block_id for b in blocks] == [1, 2, 3, 4, 5]
# Append slots without allocating a new block.
req0.num_computed_tokens = 55
......@@ -307,10 +325,8 @@ def test_decode():
def test_evict():
manager = KVCacheManager(
block_size=16,
num_gpu_blocks=10,
make_kv_cache_config(16, 11),
max_model_len=8192,
sliding_window=None,
enable_caching=True,
num_preallocate_tokens=16,
)
......@@ -341,15 +357,15 @@ def test_evict():
assert [
b.block_id
for b in manager.block_pool.free_block_queue.get_all_free_blocks()
] == [6, 5, 4, 3, 2, 1, 0, 9, 8, 7]
] == [7, 6, 5, 4, 3, 2, 1, 10, 9, 8]
# Touch the first 2 blocks.
req2 = make_request("2", list(range(2 * 16 + 3)))
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
assert [b.block_id for b in computed_blocks] == [0, 1]
assert [b.block_id for b in computed_blocks] == [1, 2]
assert num_computed_tokens == 2 * 16
blocks = manager.allocate_slots(req2, 3, computed_blocks)
assert [b.block_id for b in blocks] == [6, 5]
assert [b.block_id for b in blocks] == [7, 6]
assert manager.block_pool.free_block_queue.num_free_blocks == 6
......@@ -360,10 +376,8 @@ def test_hash_block_correct_reuse():
"""
block_size = 16
manager = KVCacheManager(
block_size=block_size,
num_gpu_blocks=1,
make_kv_cache_config(16, 2),
max_model_len=8192,
sliding_window=None,
enable_caching=True,
num_preallocate_tokens=0,
)
......@@ -399,10 +413,8 @@ def test_computed_blocks_not_evicted():
"""
block_size = 16
manager = KVCacheManager(
block_size=block_size,
num_gpu_blocks=2,
make_kv_cache_config(block_size, 3),
max_model_len=8192,
sliding_window=None,
enable_caching=True,
num_preallocate_tokens=0,
)
......@@ -415,7 +427,7 @@ def test_computed_blocks_not_evicted():
assert num_computed_tokens == 0
blocks = manager.allocate_slots(req0, num_tokens, computed_blocks)
assert len(blocks) == 1
assert blocks[0].block_id == 0
assert blocks[0].block_id == 1
# Allocate another block.
req1 = make_request("1", list(range(num_tokens, num_tokens * 2)))
......@@ -424,7 +436,7 @@ def test_computed_blocks_not_evicted():
assert num_computed_tokens == 0
blocks = manager.allocate_slots(req1, num_tokens, computed_blocks)
assert len(blocks) == 1
assert blocks[0].block_id == 1
assert blocks[0].block_id == 2
# Free the blocks.
manager.free(req0)
......@@ -435,13 +447,13 @@ def test_computed_blocks_not_evicted():
req2 = make_request("2", list(range(num_tokens * 2)))
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
assert len(computed_blocks) == 1
assert computed_blocks[0].block_id == 0
assert computed_blocks[0].block_id == 1
assert num_computed_tokens == block_size
blocks = manager.allocate_slots(req2, num_tokens * 2 - num_tokens,
computed_blocks)
assert len(blocks) == 1
assert blocks[0].block_id == 1
assert blocks[0].block_id == 2
def test_basic_prefix_caching_disabled():
......@@ -450,10 +462,8 @@ def test_basic_prefix_caching_disabled():
"""
block_size = 4
manager = KVCacheManager(
block_size=block_size,
num_gpu_blocks=4,
make_kv_cache_config(block_size, 5),
max_model_len=8192,
sliding_window=None,
enable_caching=False,
num_preallocate_tokens=0,
)
......@@ -493,10 +503,8 @@ def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int):
This tests that the preallocated blocks are correctly added.
"""
manager = KVCacheManager(
block_size=block_size,
num_gpu_blocks=10,
make_kv_cache_config(block_size, 11),
max_model_len=8192,
sliding_window=None,
enable_caching=True,
num_preallocate_tokens=num_preallocate_tokens,
)
......@@ -522,7 +530,8 @@ def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int):
assert len(blocks) == 1 + num_preallocated_blocks
def test_cache_blocks():
@pytest.mark.parametrize("hash_fn", [sha256, hash])
def test_cache_blocks(hash_fn):
"""
This is a unit test that tests the correctness of the _cache_full_blocks
function of KVCacheManager.
......@@ -550,6 +559,7 @@ def test_cache_blocks():
num_cached_blocks=0,
num_full_blocks=2,
block_size=block_size,
hash_fn=hash_fn,
)
assert len(block_pool.cached_block_hash_to_block) == 2
......@@ -564,6 +574,7 @@ def test_cache_blocks():
num_cached_blocks=2,
num_full_blocks=3,
block_size=block_size,
hash_fn=hash_fn,
)
assert len(block_pool.cached_block_hash_to_block) == 3
assert blocks[0].block_hash is not None
......@@ -574,10 +585,8 @@ def test_mm_prefix_caching():
This tests that the multi-modal prefix caching is correct.
"""
manager = KVCacheManager(
block_size=16,
num_gpu_blocks=10,
make_kv_cache_config(16, 11),
max_model_len=8192,
sliding_window=None,
enable_caching=True,
num_preallocate_tokens=16,
)
......@@ -617,7 +626,7 @@ def test_mm_prefix_caching():
assert block_hashes[2].extra_keys == ("bbb", )
blocks = manager.allocate_slots(req0, 59, computed_blocks)
assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]
assert [b.block_id for b in blocks] == [1, 2, 3, 4, 5]
req0.num_computed_tokens = 59
# Append slots without allocating a new block.
......@@ -655,10 +664,8 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
"""
block_size = 16
manager = KVCacheManager(
block_size=block_size,
num_gpu_blocks=10,
make_kv_cache_config(block_size, 11),
max_model_len=8192,
sliding_window=None,
enable_caching=True,
num_preallocate_tokens=0,
)
......@@ -711,10 +718,8 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
def test_reset_prefix_cache():
manager = KVCacheManager(
block_size=16,
num_gpu_blocks=10,
make_kv_cache_config(16, 11),
max_model_len=8192,
sliding_window=None,
enable_caching=True,
num_preallocate_tokens=0,
)
......@@ -724,7 +729,7 @@ def test_reset_prefix_cache():
all_token_ids = full_block_token_ids + unique_token_ids
req0 = make_request("0", all_token_ids)
blocks = manager.allocate_slots(req0, 55)
assert [b.block_id for b in blocks] == [0, 1, 2, 3]
assert [b.block_id for b in blocks] == [1, 2, 3, 4]
unique_token_ids = [4] * 7
all_token_ids = full_block_token_ids + unique_token_ids
......@@ -733,7 +738,7 @@ def test_reset_prefix_cache():
assert len(manager.req_to_block_hashes[req1.request_id]) == 3
assert len(computed_blocks) == 3
blocks = manager.allocate_slots(req1, 7, computed_blocks)
assert [b.block_id for b in blocks] == [4]
assert [b.block_id for b in blocks] == [5]
# Failed to reset prefix cache because some blocks are not freed yet.
assert not manager.reset_prefix_cache()
......
......@@ -2,12 +2,15 @@
from typing import Optional
import pytest
import torch
from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
from vllm.sampling_params import SamplingParams
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.core.sched.scheduler import Scheduler
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
KVCacheGroupSpec)
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager
......@@ -20,9 +23,10 @@ def create_scheduler(
max_num_seqs: int = 16,
max_num_batched_tokens: int = 8192,
enable_prefix_caching: Optional[bool] = None,
long_prefill_token_threshold: int = 0,
) -> Scheduler:
'''Create scheduler under test.
Args:
model: model under test
max_num_seqs: max sequences to schedule
......@@ -38,6 +42,7 @@ def create_scheduler(
max_num_seqs=max_num_seqs,
max_num_batched_tokens=max_num_batched_tokens,
max_model_len=max_num_batched_tokens,
long_prefill_token_threshold=long_prefill_token_threshold,
)
model_config = ModelConfig(
model=model,
......@@ -64,13 +69,21 @@ def create_scheduler(
model_config=model_config,
cache_config=cache_config,
)
kv_cache_config = KVCacheConfig(
num_blocks=10000, # A large number of blocks to hold all requests
tensors={},
kv_cache_groups=[
KVCacheGroupSpec(['layer'],
FullAttentionSpec(16, 1, 1, torch.float32, False))
],
)
cache_config.num_gpu_blocks = 10000
return Scheduler(
scheduler_config,
model_config,
cache_config,
speculative_config=None,
lora_config=None,
kv_cache_config=kv_cache_config,
log_stats=True,
structured_output_manager=StructuredOutputManager(vllm_config),
)
......@@ -242,7 +255,9 @@ def test_schedule_partial_requests():
model_runner_output = ModelRunnerOutput(
req_ids=[request.request_id for request in requests],
req_id_to_index=req_to_index,
sampled_token_ids=[[0] for _ in range(len(requests))],
# Only the first request has a sampled token id because
# the remaining requests are still being prefilled.
sampled_token_ids=[[0], [], []],
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
......@@ -263,6 +278,86 @@ def test_schedule_partial_requests():
assert requests[2].request_id not in output.num_scheduled_tokens
@pytest.mark.parametrize("enable_prefix_caching", [True, False])
def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
    """Check that several long prefills can be chunked and run together.

    Three 800-token requests are submitted with a 1024-token batch budget
    and a 400-token long-prefill threshold. The test verifies that all of
    them are in the RUNNING state at once and are scheduled side by side.
    """
    scheduler = create_scheduler(
        model="facebook/opt-125m",
        max_num_batched_tokens=1024,
        long_prefill_token_threshold=400,
        enable_prefix_caching=enable_prefix_caching,
    )
    requests = create_requests(num_requests=3, num_tokens=800)
    for pending in requests:
        scheduler.add_request(pending)

    req_ids = [req.request_id for req in requests]
    req_to_index = {req_id: pos for pos, req_id in enumerate(req_ids)}

    # Step 1: every request is admitted and prefilled partially.
    output = scheduler.schedule()
    assert len(output.scheduled_new_reqs) == 3
    assert len(output.scheduled_cached_reqs) == 0
    assert len(output.finished_req_ids) == 0
    # The first two requests are capped at the threshold (400 each); the
    # third one receives the rest of the budget: 1024 - 400 - 400 = 224.
    for req_id, num_tokens in zip(req_ids, (400, 400, 224)):
        assert output.num_scheduled_tokens[req_id] == num_tokens

    # Nothing is sampled yet because all requests are still prefilling.
    model_runner_output = ModelRunnerOutput(
        req_ids=list(req_ids),
        req_id_to_index=req_to_index,
        sampled_token_ids=[[] for _ in requests],
        spec_token_ids=None,
        logprobs=None,
        prompt_logprobs_dict={},
    )
    scheduler.update_from_output(output, model_runner_output)

    # Step 2: all three requests are running. The remaining prefills of the
    # first and second requests are processed.
    output1 = scheduler.schedule()
    assert len(scheduler.running) == 3
    assert len(output1.scheduled_new_reqs) == 0
    assert len(output1.scheduled_cached_reqs) == 3
    assert len(output1.finished_req_ids) == 0
    for req_id, num_tokens in zip(req_ids, (400, 400, 224)):
        assert output1.num_scheduled_tokens[req_id] == num_tokens

    # Step 3: all three requests are still running. The first and second
    # requests are in the decode stage, and all the remaining tokens of the
    # third request are processed.
    model_runner_output = ModelRunnerOutput(
        req_ids=list(req_ids),
        req_id_to_index=req_to_index,
        sampled_token_ids=[[0], [0]] + [[] for _ in requests[2:]],
        spec_token_ids=None,
        logprobs=None,
        prompt_logprobs_dict={},
    )
    scheduler.update_from_output(output1, model_runner_output)
    output2 = scheduler.schedule()
    assert len(scheduler.running) == 3
    assert len(output2.scheduled_new_reqs) == 0
    assert len(output2.scheduled_cached_reqs) == 3
    assert len(output2.finished_req_ids) == 0
    for req_id, num_tokens in zip(req_ids, (1, 1, 800 - 224 - 224)):
        assert output2.num_scheduled_tokens[req_id] == num_tokens
def test_stop_via_update_from_output():
"""Test stopping behavior through update_from_output"""
scheduler = create_scheduler()
......@@ -516,3 +611,99 @@ def test_schedule_concurrent_batches(enable_prefix_caching: Optional[bool],
prompt_logprobs_dict={},
)
scheduler.update_from_output(scheduler_output1, model_runner_output)
# Note - these test cases mirror some of those in test_rejection_sampler.py
@pytest.mark.parametrize(
    "spec_tokens,output_tokens,expected",
    [
        ([[1, 2, 3]], [[1, 2, 3, 4]], (3, 3)),  # perfect match
        ([[1, 2, 3]], [[1, 5]], (3, 1)),  # early mismatch
        ([[1, 2], [3]], [[1, 2, 5], [3, 4]], (3, 3)),  # multiple sequences
        ([[1]], [[1, 2]], (1, 1)),  # single token sequence
        ([[]], [[5]], (0, 0)),  # empty sequence
        ([[1, 2, 3], [4, 5, 6]], [[1, 2, 7], [4, 8]],
         (6, 3)),  # multiple mismatches
    ])
def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
    """Test scheduling behavior with speculative decoding.

    This test verifies that:
    1. Speculated tokens get scheduled correctly
    2. Spec decoding stats properly count number of draft and accepted tokens

    ``expected`` is a ``(num_draft_tokens, num_accepted_tokens)`` pair.
    """
    expected_draft, expected_accepted = expected

    scheduler = create_scheduler()
    requests = create_requests(num_requests=len(spec_tokens), num_tokens=1)
    for request in requests:
        scheduler.add_request(request)
    req_ids = [request.request_id for request in requests]
    req_to_index = {req_id: idx for idx, req_id in enumerate(req_ids)}

    # Schedule a decode, which will also draft speculative tokens
    output = scheduler.schedule()
    assert len(output.scheduled_new_reqs) == len(requests)
    assert output.total_num_scheduled_tokens == len(requests)
    for request in requests:
        req_id = request.request_id
        assert output.num_scheduled_tokens[req_id] == 1
        assert req_id not in output.scheduled_spec_decode_tokens

    model_runner_output = ModelRunnerOutput(
        req_ids=req_ids,
        req_id_to_index=req_to_index,
        sampled_token_ids=[[0] for _ in requests],
        spec_token_ids=spec_tokens,
        logprobs=None,
        prompt_logprobs_dict={},
    )
    engine_core_outputs = scheduler.update_from_output(
        output, model_runner_output)

    for idx, _ in enumerate(requests):
        running_req = scheduler.running[idx]
        # The prompt token
        assert running_req.num_computed_tokens == 1
        # The prompt token and the sampled token
        assert running_req.num_tokens == 2
        # The prompt token, the sampled token, and the speculated tokens
        assert running_req.num_tokens_with_spec == 2 + len(spec_tokens[idx])

    # No draft or accepted tokens counted yet
    assert engine_core_outputs.scheduler_stats.spec_decoding_stats is None

    # Schedule the speculated tokens for validation
    output = scheduler.schedule()
    assert len(output.scheduled_new_reqs) == 0
    # The sampled token and speculated tokens
    total_drafted = sum(len(ids) for ids in spec_tokens)
    assert output.total_num_scheduled_tokens == len(requests) + total_drafted
    for idx, request in enumerate(requests):
        req_id = request.request_id
        drafted = spec_tokens[idx]
        assert output.num_scheduled_tokens[req_id] == 1 + len(drafted)
        if not drafted:
            assert req_id not in output.scheduled_spec_decode_tokens
        else:
            scheduled = output.scheduled_spec_decode_tokens[req_id]
            assert len(scheduled) == len(drafted)

    model_runner_output = ModelRunnerOutput(
        req_ids=req_ids,
        req_id_to_index=req_to_index,
        sampled_token_ids=output_tokens,
        spec_token_ids=None,
        logprobs=None,
        prompt_logprobs_dict={},
    )
    engine_core_outputs = scheduler.update_from_output(
        output, model_runner_output)

    scheduler_stats = engine_core_outputs.scheduler_stats
    if expected_draft == 0:
        # Nothing was drafted, so no spec decoding stats are reported.
        assert scheduler_stats.spec_decoding_stats is None
    else:
        assert scheduler_stats.spec_decoding_stats is not None
        stats = scheduler_stats.spec_decoding_stats
        assert stats.num_draft_tokens == expected_draft
        assert stats.num_accepted_tokens == expected_accepted
# SPDX-License-Identifier: Apache-2.0
import os
import pytest
from vllm import LLM
# Everything in this module targets the V1 engine, so skip the whole module
# unless VLLM_USE_V1 is explicitly set to "1".
if os.environ.get("VLLM_USE_V1", "0") != "1":
    pytest.skip("Test package requires V1", allow_module_level=True)

# Model and prompt shared by the fixture and the test below.
MODEL = "meta-llama/Llama-3.2-1B"
PROMPT = "Hello my name is Robert and I"
@pytest.fixture(scope="module")
def model() -> LLM:
    """Build one module-scoped engine configured for tiny prefill chunks.

    The long-prefill threshold (2) and batch budget (6 tokens) are kept
    small so that prompts have to be prefilled in several partial steps.
    """
    engine_kwargs = {
        "enforce_eager": True,
        "enable_prefix_caching": True,
        "long_prefill_token_threshold": 2,
        "max_num_batched_tokens": 6,
        "max_num_seqs": 3,
    }
    return LLM(MODEL, **engine_kwargs)
def test_concurrent_partial_prefill(model):
    """Generate for three identical prompts and check the result shape."""
    num_prompts = 3
    outputs = model.generate([PROMPT for _ in range(num_prompts)])
    # One result per prompt, each holding exactly one completion.
    assert len(outputs) == 3
    for request_output in outputs:
        assert len(request_output.outputs) == 1
# SPDX-License-Identifier: Apache-2.0
import torch
from vllm.v1.core.block_pool import BlockPool
from vllm.v1.core.kv_cache_utils import BlockHashType, KVCacheBlock
from vllm.v1.core.specialized_manager import SlidingWindowManager
from vllm.v1.kv_cache_interface import SlidingWindowSpec
def test_sliding_window_possible_cached_prefix():
    """Check ``find_longest_cache_hit`` against cached/uncached patterns.

    Each case marks which block hashes are present in the block pool's
    cache and states the expected length of the returned hit. Up to the
    last two returned blocks must be the pool blocks registered for their
    positions; every block before them must be the null block.
    """
    spec = SlidingWindowSpec(
        block_size=2,
        num_kv_heads=1,
        head_size=1,
        dtype=torch.float32,
        sliding_window=4,
        use_mla=False,
    )
    block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
    manager = SlidingWindowManager(spec, block_pool)

    def run_one_case(block_is_cached, expect_length):
        block_hash_list = [
            BlockHashType(idx, ()) for idx, _ in enumerate(block_is_cached)
        ]

        # Mock the block pool with the cached blocks: position ``idx`` is
        # backed by pool block ``idx + 10`` when it is flagged as cached.
        cache = block_pool.cached_block_hash_to_block
        cache.clear()
        for idx, block_hash in enumerate(block_hash_list):
            if block_is_cached[idx]:
                cache[block_hash] = {idx: block_pool.blocks[idx + 10]}

        computed_blocks = manager.find_longest_cache_hit(block_hash_list)
        assert len(computed_blocks) == expect_length

        leading_blocks = computed_blocks[:expect_length - 2]
        assert all(block == block_pool.null_block
                   for block in leading_blocks)

        # Check the trailing (at most two) blocks, last one first.
        for offset in range(min(2, expect_length)):
            block_index = expect_length - offset - 1
            assert computed_blocks[block_index].block_id == block_index + 10

    cases = [
        ([False] * 10, 0),
        ([True], 1),
        ([True, False], 1),
        ([True, True], 2),
        ([True, True, False], 2),
        ([True, True, True], 3),
        ([True, True, True, False], 3),
        ([
            True, True, False, True, False, False, True, True, False, True,
            True, True
        ], 12),
        ([
            True, True, False, True, False, False, True, True, False, False,
            False
        ], 8),
        ([
            True, True, False, True, False, False, True, True, False, False,
            False, True
        ], 8),
    ]
    for block_is_cached, expect_length in cases:
        run_one_case(block_is_cached, expect_length)
def test_sliding_window_remove_skipped_blocks():
    """Check which blocks ``remove_skipped_blocks`` frees as tokens advance.

    A block table of eleven blocks (ids 1000-1010) is passed repeatedly to
    the manager with a growing number of computed tokens. After each call
    the test checks both the returned list of removed blocks and the block
    table, where removed entries must have been replaced by the null block.
    """
    sliding_window_spec = SlidingWindowSpec(
        block_size=2,
        num_kv_heads=1,
        head_size=1,
        dtype=torch.float32,
        sliding_window=4,
        use_mla=False,
    )

    block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True)

    manager = SlidingWindowManager(sliding_window_spec, block_pool)

    null_block_id = block_pool.null_block.block_id

    def id_to_block_table(ids):
        """Build a block table from ids, mapping the null id to the null block."""
        return [
            KVCacheBlock(id_)
            if id_ != null_block_id else block_pool.null_block for id_ in ids
        ]

    def assert_block_id(block_table, ids):
        """Assert that ``block_table`` matches ``ids`` entry by entry."""
        # zip() stops at the shorter input, so without this length check
        # an expectation of ``[]`` would accept any list of blocks.
        assert len(block_table) == len(ids)
        for block, id_ in zip(block_table, ids):
            if id_ == null_block_id:
                assert block == block_pool.null_block
            else:
                assert block.block_id == id_

    original_block_ids = [
        1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010
    ]
    block_table = id_to_block_table(original_block_ids)
    removed = manager.remove_skipped_blocks(block_table, 0)
    assert_block_id(removed, [])
    assert_block_id(block_table, original_block_ids)

    # 4 tokens are computed. Only token 0 is out of the sliding window. As
    # block 1000 also contains token 1 that is in the sliding window, block
    # 1000 cannot be removed.
    removed = manager.remove_skipped_blocks(block_table, 4)
    assert_block_id(removed, [])
    assert_block_id(block_table, original_block_ids)

    # 5 tokens are computed. Token 0 & 1 are out of the sliding window.
    # Block 1000 can be removed.
    removed = manager.remove_skipped_blocks(block_table, 5)
    assert_block_id(removed, [original_block_ids[0]])
    assert_block_id(block_table, [null_block_id] + original_block_ids[1:])

    # 6 tokens are computed. Token 0-2 are out of the sliding window.
    # Cannot remove new block as the block 1001 is still used by token 3.
    removed = manager.remove_skipped_blocks(block_table, 6)
    assert_block_id(removed, [])
    assert_block_id(block_table, [null_block_id] + original_block_ids[1:])

    # 7 tokens are computed. Token 0-3 are out of the sliding window.
    # Block 1001 can be removed and block 1000 is already removed.
    removed = manager.remove_skipped_blocks(block_table, 7)
    assert_block_id(removed, [original_block_ids[1]])
    assert_block_id(block_table, [null_block_id] * 2 + original_block_ids[2:])

    # 11 tokens are computed. Token 0-7 are out of the sliding window.
    # Block 1002 & 1003 can be removed now. Block 1003 represents a longer
    # sequence, and is expected to be evicted earlier than 1002, so the order
    # of removed blocks should be [1003, 1002].
    removed = manager.remove_skipped_blocks(block_table, 11)
    assert_block_id(removed, [original_block_ids[3], original_block_ids[2]])
    assert_block_id(block_table, [null_block_id] * 4 + original_block_ids[4:])
# SPDX-License-Identifier: Apache-2.0
from dataclasses import dataclass
import pytest
from vllm import LLM, SamplingParams
from ...core.block.e2e.test_correctness_sliding_window import (check_answers,
prep_prompts)
@dataclass
class TestConfig:
    """Per-model settings for the sliding-window retrieval test."""

    # The class name starts with "Test", so pytest would otherwise try to
    # collect it and warn about its __init__. Unannotated, so this is a plain
    # class attribute and not a dataclass field.
    __test__ = False

    # Sliding window size, compared against prompt token counts.
    sliding_window: int
    # Forwarded to prep_prompts() as its ``ln_range`` argument.
    ln_range: tuple[int, int]


# Settings for every model the retrieval test is parametrized with.
model_config = {
    "bigcode/starcoder2-3b": TestConfig(4096, (800, 1100)),
    "google/gemma-2-2b-it": TestConfig(4096, (400, 800)),
}
@pytest.mark.parametrize(
    "model",
    [
        "bigcode/starcoder2-3b",  # sliding window only
        "google/gemma-2-2b-it",  # sliding window + full attention
    ])
@pytest.mark.parametrize("batch_size", [5])
@pytest.mark.parametrize("seed", [1])
def test_sliding_window_retrival(monkeypatch, model, batch_size, seed):
    """
    The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
    asks for value of one of them (which is outside the sliding window).
    If we tell it upfront which we are going to be looking for, then
    it answers correctly (mostly).
    """
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")

        settings = model_config[model]
        llm = LLM(model=model)
        greedy_params = SamplingParams(temperature=0.0, max_tokens=100)

        prompts, answer, indices = prep_prompts(batch_size,
                                                ln_range=settings.ln_range)
        check_length(prompts, llm, settings.sliding_window)

        # The first pass is a fresh generation; the second pass re-generates
        # with the same prompts to test prefix caching.
        for _ in range(2):
            responses = llm.generate(prompts, greedy_params)
            generated_texts = [
                response.outputs[0].text for response in responses
            ]
            check_answers(indices, answer, generated_texts, accept_rate=1.0)
def check_length(prompts: list[str], llm: "LLM", sliding_window: int):
    """
    Check if the prompt length is valid, i.e., longer than the sliding window
    size and shorter than the model's max length.

    Args:
        prompts: list of prompts
        llm: LLM object
        sliding_window: Sliding window size

    Raises:
        AssertionError: if no prompt is longer than ``sliding_window``, or
            if any prompt is longer than the model's max length.
    """
    tokenizer = llm.get_tokenizer()
    max_model_len = llm.llm_engine.model_config.max_model_len
    # Tokenize every prompt exactly once and reuse the lengths for both
    # checks instead of encoding the whole batch twice.
    prompt_lengths = [len(tokenizer.encode(prompt)) for prompt in prompts]
    assert any(length > sliding_window
               for length in prompt_lengths), "Prompt is too short for test"
    assert all(length <= max_model_len
               for length in prompt_lengths), "Prompt is too long for test"
# SPDX-License-Identifier: Apache-2.0
import os
from argparse import ArgumentError
import pytest
from vllm import envs
......@@ -34,6 +36,24 @@ def test_prefix_caching_from_cli():
vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config()
assert vllm_config.cache_config.enable_prefix_caching
# default hash algorithm is "builtin"
assert vllm_config.cache_config.prefix_caching_hash_algo == "builtin"
# set hash algorithm to sha256
args = parser.parse_args(["--prefix-caching-hash-algo", "sha256"])
vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config()
assert vllm_config.cache_config.prefix_caching_hash_algo == "sha256"
# set hash algorithm to builtin
args = parser.parse_args(["--prefix-caching-hash-algo", "builtin"])
vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config()
assert vllm_config.cache_config.prefix_caching_hash_algo == "builtin"
# an invalid hash algorithm raises an error
parser.exit_on_error = False
with pytest.raises(ArgumentError):
args = parser.parse_args(["--prefix-caching-hash-algo", "invalid"])
def test_defaults_with_usage_context():
engine_args = EngineArgs(model=os.path.join(models_path_prefix, "facebook/opt-125m"))
......
......@@ -233,8 +233,10 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
Test that the engine can handle multiple concurrent batches.
"""
def make_request_with_max_tokens(max_tokens: int) -> EngineCoreRequest:
def make_request_with_max_tokens(req_id: int,
max_tokens: int) -> EngineCoreRequest:
request = make_request()
request.request_id = req_id
request.sampling_params.max_tokens = max_tokens
return request
......@@ -281,6 +283,8 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
# Avoid all requests being scheduled once.
enable_prefix_caching=False,
max_num_batched_tokens=10,
# Reduce startup time.
enforce_eager=True,
)
vllm_config = engine_args.create_engine_config()
engine_core = EngineCore(vllm_config=vllm_config,
......@@ -288,13 +292,13 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
executor_class=DummyExecutor)
assert engine_core.batch_queue is not None
# Add two requests in a row.
req = make_request_with_max_tokens(5)
engine_core.add_request(req)
req = make_request_with_max_tokens(5)
engine_core.add_request(req)
# Add two requests in a row. Each request has 12 prompt tokens.
req0 = make_request_with_max_tokens(0, 5)
engine_core.add_request(req0)
req1 = make_request_with_max_tokens(1, 5)
engine_core.add_request(req1)
# First saturate the batch queue.
# Schedule Batch 1: (10, req0)
assert engine_core.step_with_batch_queue() is None
assert engine_core.batch_queue.qsize() == 1
assert engine_core.step_with_batch_queue() is None
......
......@@ -169,11 +169,11 @@ def test_engine_core_client(monkeypatch: pytest.MonkeyPatch,
core_client: SyncMPClient = client
result = core_client._call_utility("echo", "testarg")
result = core_client.call_utility("echo", "testarg")
assert result == "testarg"
with pytest.raises(Exception) as e_info:
core_client._call_utility("echo", None, "help!")
core_client.call_utility("echo", None, "help!")
assert str(e_info.value) == "Call to echo method failed: help!"
......@@ -240,10 +240,10 @@ async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
core_client: AsyncMPClient = client
result = await core_client._call_utility_async("echo", "testarg")
result = await core_client.call_utility_async("echo", "testarg")
assert result == "testarg"
with pytest.raises(Exception) as e_info:
await core_client._call_utility_async("echo", None, "help!")
await core_client.call_utility_async("echo", None, "help!")
assert str(e_info.value) == "Call to echo method failed: help!"
......@@ -4,78 +4,76 @@ from __future__ import annotations
import json
import re
from enum import Enum
from typing import Any
import jsonschema
import pytest
from pydantic import BaseModel
from vllm.entrypoints.llm import LLM
from vllm.outputs import RequestOutput
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
GUIDED_DECODING_BACKENDS_V1 = ["xgrammar", "guidance"]
MODELS_TO_TEST = [
"Qwen/Qwen2.5-1.5B-Instruct", "mistralai/Ministral-8B-Instruct-2410"
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [
("mistralai/Ministral-8B-Instruct-2410", "xgrammar:disable-any-whitespace",
"auto"),
("mistralai/Ministral-8B-Instruct-2410", "guidance:disable-any-whitespace",
"auto"),
("mistralai/Ministral-8B-Instruct-2410", "xgrammar:disable-any-whitespace",
"mistral"),
("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar:disable-any-whitespace", "auto"),
#FIXME: This test is flaky on CI thus disabled
#("Qwen/Qwen2.5-1.5B-Instruct", "guidance:disable-any-whitespace", "auto"),
]
PARAMS_MODELS_TOKENIZER_MODE = [
("mistralai/Ministral-8B-Instruct-2410", "auto"),
("Qwen/Qwen2.5-1.5B-Instruct", "auto"),
]
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("guided_decoding_backend",
GUIDED_DECODING_BACKENDS_V1)
@pytest.mark.parametrize("model_name", MODELS_TO_TEST)
def test_guided_json_completion(
monkeypatch: pytest.MonkeyPatch,
sample_json_schema: dict[str, Any],
guided_decoding_backend: str,
model_name: str,
):
monkeypatch.setenv("VLLM_USE_V1", "1")
llm = LLM(model=model_name,
max_model_len=1024,
guided_decoding_backend=guided_decoding_backend)
sampling_params = SamplingParams(
temperature=1.0,
max_tokens=1000,
guided_decoding=GuidedDecodingParams(json=sample_json_schema))
outputs = llm.generate(prompts=[
f"Give an example JSON for an employee profile "
f"that fits this schema: {sample_json_schema}"
] * 2,
sampling_params=sampling_params,
use_tqdm=True)
assert outputs is not None
# Closed set of car body styles referenced by CarDescription.car_type.
# Deliberately documented with a comment rather than a class docstring so
# that nothing extra can end up in the JSON schema derived from the model.
class CarType(str, Enum):
    sedan = "sedan"
    suv = "SUV"
    truck = "Truck"
    coupe = "Coupe"
for output in outputs:
assert output is not None
assert isinstance(output, RequestOutput)
prompt = output.prompt
generated_text = output.outputs[0].text
assert generated_text is not None
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
output_json = json.loads(generated_text)
jsonschema.validate(instance=output_json, schema=sample_json_schema)
# Pydantic model whose JSON schema (model_json_schema()) is passed as the
# guided-decoding schema in the structured-output test.
# Documented with a comment rather than a class docstring so the generated
# schema stays exactly as it is today.
class CarDescription(BaseModel):
    brand: str
    model: str
    car_type: CarType
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("guided_decoding_backend",
GUIDED_DECODING_BACKENDS_V1)
@pytest.mark.parametrize("model_name", MODELS_TO_TEST)
def test_guided_json_completion_disable_any_whitespace(
@pytest.mark.parametrize("model_name, guided_decoding_backend, tokenizer_mode",
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE)
def test_structured_output(
monkeypatch: pytest.MonkeyPatch,
sample_json_schema: dict[str, Any],
unsupported_json_schema: dict[str, Any],
sample_sql_ebnf: str,
sample_sql_lark: str,
sample_regex: str,
sample_guided_choice: str,
guided_decoding_backend: str,
tokenizer_mode: str,
model_name: str,
):
if guided_decoding_backend != "xgrammar":
pytest.skip("disable-any-whitespace is only supported for xgrammar.")
guided_decoding_backend = 'xgrammar:disable-any-whitespace'
monkeypatch.setenv("VLLM_USE_V1", "1")
# Use a single LLM instance for several scenarios to
# speed up the test suite.
llm = LLM(model=model_name,
enforce_eager=True,
max_model_len=1024,
guided_decoding_backend=guided_decoding_backend)
guided_decoding_backend=guided_decoding_backend,
tokenizer_mode=tokenizer_mode)
#
# Test 1: Generate JSON output based on a provided schema
#
sampling_params = SamplingParams(
temperature=1.0,
max_tokens=1000,
......@@ -96,25 +94,15 @@ def test_guided_json_completion_disable_any_whitespace(
generated_text = output.outputs[0].text
assert generated_text is not None
assert "\n" not in generated_text
if 'disable-any-whitespace' in guided_decoding_backend:
assert "\n" not in generated_text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
output_json = json.loads(generated_text)
jsonschema.validate(instance=output_json, schema=sample_json_schema)
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("guided_decoding_backend",
GUIDED_DECODING_BACKENDS_V1)
@pytest.mark.parametrize("model_name", MODELS_TO_TEST)
def test_guided_json_object(
monkeypatch: pytest.MonkeyPatch,
guided_decoding_backend: str,
model_name: str,
):
monkeypatch.setenv("VLLM_USE_V1", "1")
llm = LLM(model=model_name,
max_model_len=1024,
guided_decoding_backend=guided_decoding_backend)
#
# Test 2: Generate JSON object without a schema
#
sampling_params = SamplingParams(
temperature=1.0,
max_tokens=100,
......@@ -137,38 +125,18 @@ def test_guided_json_object(
print(generated_text)
assert generated_text is not None
# Parse to verify it is valid JSON
# Parse to verify it is a valid JSON object
parsed_json = json.loads(generated_text)
allowed_types: tuple[type, ...] = (dict, )
if guided_decoding_backend == "xgrammar":
# TODO - we are currently too permissive with xgrammar and
# allow # any valid json (typically comes back as a list or
# object). We can fix this by specifying a jsonschema of
# {"type": "object"}, # but we need this fix in a release
# first: https://github.com/mlc-ai/xgrammar/pull/264
allowed_types = (dict, list)
assert isinstance(parsed_json, allowed_types)
assert isinstance(parsed_json, dict)
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("guided_decoding_backend",
GUIDED_DECODING_BACKENDS_V1 + ["auto"])
@pytest.mark.parametrize("model_name", MODELS_TO_TEST)
def test_guided_json_unsupported_schema(
monkeypatch: pytest.MonkeyPatch,
unsupported_json_schema: dict[str, Any],
guided_decoding_backend: str,
model_name: str,
):
monkeypatch.setenv("VLLM_USE_V1", "1")
llm = LLM(model=model_name,
max_model_len=1024,
guided_decoding_backend=guided_decoding_backend)
#
# Test 3: test a jsonschema incompatible with xgrammar
#
sampling_params = SamplingParams(
temperature=1.0,
max_tokens=1000,
guided_decoding=GuidedDecodingParams(json=unsupported_json_schema))
if guided_decoding_backend == "xgrammar":
if guided_decoding_backend.startswith("xgrammar"):
with pytest.raises(ValueError,
match="The provided JSON schema contains features "
"not supported by xgrammar."):
......@@ -179,8 +147,6 @@ def test_guided_json_unsupported_schema(
sampling_params=sampling_params,
use_tqdm=True)
else:
# This should work for both "guidance" and "auto".
outputs = llm.generate(
prompts=("Give an example JSON object for a grade "
"that fits this schema: "
......@@ -199,21 +165,9 @@ def test_guided_json_unsupported_schema(
parsed_json = json.loads(generated_text)
assert isinstance(parsed_json, dict)
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("guided_decoding_backend",
GUIDED_DECODING_BACKENDS_V1)
@pytest.mark.parametrize("model_name", MODELS_TO_TEST)
def test_guided_grammar_ebnf(
monkeypatch: pytest.MonkeyPatch,
sample_sql_ebnf: str,
guided_decoding_backend: str,
model_name: str,
):
monkeypatch.setenv("VLLM_USE_V1", "1")
llm = LLM(model=model_name,
max_model_len=1024,
guided_decoding_backend=guided_decoding_backend)
#
# Test 4: Generate SQL statement using EBNF grammar
#
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
......@@ -243,21 +197,9 @@ def test_guided_grammar_ebnf(
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("guided_decoding_backend",
GUIDED_DECODING_BACKENDS_V1)
@pytest.mark.parametrize("model_name", MODELS_TO_TEST)
def test_guided_grammar_lark(
monkeypatch: pytest.MonkeyPatch,
sample_sql_lark: str,
guided_decoding_backend: str,
model_name: str,
):
monkeypatch.setenv("VLLM_USE_V1", "1")
llm = LLM(model=model_name,
max_model_len=1024,
guided_decoding_backend=guided_decoding_backend)
#
# Test 5: Generate SQL statement using Lark grammar
#
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
......@@ -292,20 +234,9 @@ def test_guided_grammar_lark(
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("guided_decoding_backend",
GUIDED_DECODING_BACKENDS_V1)
@pytest.mark.parametrize("model_name", MODELS_TO_TEST)
def test_guided_grammar_ebnf_invalid(
monkeypatch: pytest.MonkeyPatch,
guided_decoding_backend: str,
model_name: str,
):
monkeypatch.setenv("VLLM_USE_V1", "1")
llm = LLM(model=model_name,
max_model_len=1024,
guided_decoding_backend=guided_decoding_backend)
#
# Test 6: Test invalid grammar input
#
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
......@@ -319,21 +250,9 @@ def test_guided_grammar_ebnf_invalid(
use_tqdm=True,
)
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("guided_decoding_backend",
GUIDED_DECODING_BACKENDS_V1)
@pytest.mark.parametrize("model_name", MODELS_TO_TEST)
def test_guided_regex(
monkeypatch: pytest.MonkeyPatch,
sample_regex: str,
guided_decoding_backend: str,
model_name: str,
):
monkeypatch.setenv("VLLM_USE_V1", "1")
llm = LLM(model=model_name,
max_model_len=1024,
guided_decoding_backend=guided_decoding_backend)
#
# Test 7: Generate text based on a regex pattern
#
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
......@@ -357,21 +276,9 @@ def test_guided_regex(
assert re.fullmatch(sample_regex, generated_text) is not None
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("guided_decoding_backend",
GUIDED_DECODING_BACKENDS_V1)
@pytest.mark.parametrize("model_name", MODELS_TO_TEST)
def test_guided_choice_completion(
monkeypatch: pytest.MonkeyPatch,
sample_guided_choice: str,
guided_decoding_backend: str,
model_name: str,
):
monkeypatch.setenv("VLLM_USE_V1", "1")
llm = LLM(model=model_name,
max_model_len=1024,
guided_decoding_backend=guided_decoding_backend)
#
# Test 8: Generate text based on a list of choices
#
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
......@@ -390,3 +297,71 @@ def test_guided_choice_completion(
assert generated_text is not None
assert generated_text in sample_guided_choice
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
#
# Test 9: Generate structured output using a Pydantic model with an enum
#
json_schema = CarDescription.model_json_schema()
sampling_params = SamplingParams(
temperature=1.0,
max_tokens=1000,
guided_decoding=GuidedDecodingParams(json=json_schema))
outputs = llm.generate(
prompts="Generate a JSON with the brand, model and car_type of"
"the most iconic car from the 90's",
sampling_params=sampling_params,
use_tqdm=True)
assert outputs is not None
for output in outputs:
assert output is not None
assert isinstance(output, RequestOutput)
prompt = output.prompt
generated_text = output.outputs[0].text
assert generated_text is not None
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
output_json = json.loads(generated_text)
jsonschema.validate(instance=output_json, schema=json_schema)
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("model_name, tokenizer_mode",
                         PARAMS_MODELS_TOKENIZER_MODE)
def test_structured_output_auto_mode(
    monkeypatch: pytest.MonkeyPatch,
    unsupported_json_schema: dict[str, Any],
    model_name: str,
    tokenizer_mode: str,
):
    """Check that the "auto" backend copes with a schema xgrammar rejects.

    The request uses ``unsupported_json_schema`` for guided decoding, and
    every generated text must parse as a JSON object.
    """
    monkeypatch.setenv("VLLM_USE_V1", "1")

    llm = LLM(model=model_name,
              max_model_len=1024,
              guided_decoding_backend="auto",
              tokenizer_mode=tokenizer_mode)

    guided_params = GuidedDecodingParams(json=unsupported_json_schema)
    sampling_params = SamplingParams(temperature=1.0,
                                     max_tokens=1000,
                                     guided_decoding=guided_params)

    prompt = ("Give an example JSON object for a grade "
              "that fits this schema: "
              f"{unsupported_json_schema}")

    # This would fail with the default of "xgrammar", but in "auto"
    # we will handle fallback automatically.
    outputs = llm.generate(prompts=prompt,
                           sampling_params=sampling_params,
                           use_tqdm=True)
    assert outputs is not None

    for request_output in outputs:
        assert request_output is not None
        assert isinstance(request_output, RequestOutput)
        generated_text = request_output.outputs[0].text
        assert generated_text is not None
        print(generated_text)

        # Parse to verify it is valid JSON
        parsed_json = json.loads(generated_text)
        assert isinstance(parsed_json, dict)
# SPDX-License-Identifier: Apache-2.0
import torch
from torch import Generator
from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p
DEVICE = "cuda"
BATCH_SIZE = 1024
VOCAB_SIZE = 128 * 1024
def test_topk_impl_equivalance():
    """Top-k-only filtering must agree with top-k plus a no-op top-p.

    Random logits and per-request k values are drawn from a fixed seed.
    About half of the requests get ``k == VOCAB_SIZE`` so that top-k is
    disabled for them.  ``apply_top_k_top_p`` is run once with ``p=None``
    and once with ``p`` set to 1.0, and the two results are compared.
    """
    with torch.device(DEVICE):
        rng = Generator(device=DEVICE).manual_seed(33)

        base_logits = torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=rng)

        # One k per request, drawn from 1..9 inclusive.
        top_k = torch.randint(1, 10, (BATCH_SIZE, ), generator=rng)

        # Coin flip per request: where True, k is overwritten with the full
        # vocabulary size, i.e. top-k is disabled (~50% of the batch).
        disable_top_k = torch.randint(0,
                                      2, (BATCH_SIZE, ),
                                      generator=rng,
                                      dtype=bool)
        top_k.masked_fill_(disable_top_k, VOCAB_SIZE)

        # Path 1: top-k only.
        top_k_only = apply_top_k_top_p(logits=base_logits.clone(),
                                       k=top_k,
                                       p=None)

        # Path 2: top-k combined with a top-p that is meant to be a no-op.
        unit_top_p = torch.tensor([1.0])
        top_k_and_p = apply_top_k_top_p(logits=base_logits.clone(),
                                        k=top_k,
                                        p=unit_top_p)

        assert torch.allclose(top_k_only, top_k_and_p)
......@@ -13,10 +13,6 @@ def unsupported_string_schemas():
"type": "string",
"pattern": "^[a-zA-Z]+$"
},
{
"type": "string",
"enum": ["active", "inactive", "pending"]
},
{
"type": "string",
"minLength": 1
......@@ -164,6 +160,10 @@ def supported_schema():
"type": "number"
}
},
"car_type": {
"type": "string",
"enum": ["sedan", "suv", "truck"]
},
"address": {
"type": "object",
"properties": {
......
# SPDX-License-Identifier: Apache-2.0
import asyncio
import os
from contextlib import ExitStack
from typing import Optional
import pytest
from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.inputs import PromptType
from vllm.platforms import current_platform
from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.engine.core_client import DPAsyncMPClient
# Engine configuration shared by the tests in this module.  The tensor- and
# data-parallel degrees are read from TP_SIZE / DP_SIZE when those are set
# and otherwise default to 1 and 2.
engine_args = AsyncEngineArgs(
    model="ibm-research/PowerMoE-3b",
    enforce_eager=True,
    disable_log_requests=True,
    tensor_parallel_size=int(os.environ.get("TP_SIZE", 1)),
    data_parallel_size=int(os.environ.get("DP_SIZE", 2)),
)

# Skip the whole module when the current platform cannot run V1 with this
# model configuration.
if not current_platform.supports_v1(engine_args.create_model_config()):
    pytest.skip("Requires V1-supporting platform.", allow_module_level=True)
async def generate(engine: AsyncLLM,
                   request_id: str,
                   prompt: PromptType,
                   output_kind: RequestOutputKind,
                   max_tokens: int,
                   prompt_logprobs: Optional[int] = None) -> tuple[int, str]:
    """Drive one request through ``engine`` and count the generated tokens.

    Sampling uses ``temperature=0`` with ``ignore_eos=True`` and the given
    ``max_tokens``.  For ``RequestOutputKind.DELTA`` the token counts of
    all streamed outputs are summed; for any other kind the count of the
    most recent output is kept.

    Returns:
        A ``(token_count, request_id)`` tuple.
    """
    # Ensure generate doesn't complete too fast for cancellation test.
    await asyncio.sleep(0.2)

    params = SamplingParams(
        max_tokens=max_tokens,
        ignore_eos=True,
        output_kind=output_kind,
        temperature=0,
        prompt_logprobs=prompt_logprobs,
    )
    is_delta = output_kind == RequestOutputKind.DELTA

    total = 0
    stream = engine.generate(request_id=request_id,
                             prompt=prompt,
                             sampling_params=params)
    async for chunk in stream:
        produced = len(chunk.outputs[0].token_ids)
        total = total + produced if is_delta else produced

        # Zero-length sleep: hand control back to the event loop between
        # outputs.
        await asyncio.sleep(0.)

    return total, request_id
@pytest.mark.parametrize(
    "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
@pytest.mark.asyncio
async def test_load(output_kind: RequestOutputKind):
    """Submit 100 concurrent requests and check each yields 10 tokens.

    After all requests finish, the engine must report no unfinished
    requests, and the core client must (within a short grace period) show
    no running engines and no requests in flight.
    """
    num_requests = 100
    expected_tokens = 10
    prompt = "This is a test of data parallel"

    with ExitStack() as cleanup:
        engine = AsyncLLM.from_engine_args(engine_args)
        # Make sure the engine is shut down however the test ends.
        cleanup.callback(engine.shutdown)

        # Launch every request as its own task so they run concurrently.
        jobs = [
            asyncio.create_task(
                generate(engine, f"request-{idx}", prompt, output_kind,
                         expected_tokens)) for idx in range(num_requests)
        ]

        # Wait until all tasks finish, or stop early at the first failure.
        finished, unfinished = await asyncio.wait(
            jobs, return_when=asyncio.FIRST_EXCEPTION)
        for job in unfinished:
            job.cancel()
        for job in finished:
            # Awaiting a finished task re-raises its exception, if any.
            token_count, req_id = await job
            assert token_count == expected_tokens, (
                f"{req_id} generated {token_count} but "
                f"expected {expected_tokens}")

        assert not engine.output_processor.has_unfinished_requests()

        # testing internals here which may break
        core_client: DPAsyncMPClient = engine.engine_core

        # The engines only synchronize stopping every N steps, so poll up
        # to ten times, half a second apart, before checking for real.
        polls_left = 10
        while polls_left and core_client.num_engines_running != 0:
            await asyncio.sleep(0.5)
            polls_left -= 1

        assert core_client.num_engines_running == 0
        assert not core_client.reqs_in_flight
......@@ -49,7 +49,9 @@ def test_unsupported_configs(monkeypatch):
with pytest.raises(NotImplementedError):
AsyncEngineArgs(
model=MODEL,
speculative_model=MODEL,
speculative_config={
"model": MODEL,
},
).create_engine_config()
with pytest.raises(NotImplementedError):
......@@ -102,14 +104,6 @@ def test_enable_by_default_fallback(monkeypatch):
assert envs.VLLM_USE_V1
m.delenv("VLLM_USE_V1")
# Should fall back to V0 for experimental config.
_ = AsyncEngineArgs(
model=MODEL,
enable_lora=True,
).create_engine_config()
assert not envs.VLLM_USE_V1
m.delenv("VLLM_USE_V1")
# Should fall back to V0 for supported model.
_ = AsyncEngineArgs(
model=UNSUPPORTED_MODELS_V1[0]).create_engine_config()
......@@ -123,7 +117,7 @@ def test_v1_llm_by_default(monkeypatch):
m.delenv("VLLM_USE_V1")
# Should default to V1 for supported config.
model = LLM(MODEL, enforce_eager=True)
model = LLM(MODEL, enforce_eager=True, enable_lora=True)
print(model.generate("Hello my name is"))
assert hasattr(model.llm_engine, "engine_core")
m.delenv("VLLM_USE_V1")
......
......@@ -31,14 +31,12 @@ TENSOR_PARALLEL_SIZES = [1]
reason="This is a basic test for TPU only")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [True])
@pytest.mark.parametrize("tensor_parallel_size", TENSOR_PARALLEL_SIZES)
def test_models(
def test_basic(
vllm_runner: type[VllmRunner],
monkeypatch: pytest.MonkeyPatch,
model: str,
max_tokens: int,
enforce_eager: bool,
tensor_parallel_size: int,
) -> None:
prompt = "The next numbers of the sequence " + ", ".join(
......@@ -50,12 +48,15 @@ def test_models(
with vllm_runner(
model,
max_model_len=8192,
enforce_eager=enforce_eager,
# Note: max_num_batched_tokens == 1024 is needed here to
# actually test chunked prompt
max_num_batched_tokens=1024,
max_model_len=8196,
gpu_memory_utilization=0.7,
max_num_seqs=16,
tensor_parallel_size=tensor_parallel_size) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
output = vllm_outputs[0][1]
assert "1024" in output
assert "1024" in output or "0, 1" in output
# SPDX-License-Identifier: Apache-2.0
from unittest.mock import ANY, patch
import torch
from vllm.attention.backends.abstract import AttentionType
from vllm.v1.attention.backends.pallas import (NUM_KV_PAGES_PER_BLOCK,
NUM_QUERIES_PER_BLOCK,
PallasAttentionBackendImpl,
PallasMetadata)
def test_ragged_paged_attention():
    """Check the arguments PallasAttentionBackendImpl passes to the kernel.

    ``torch.ops.xla.ragged_paged_attention`` is patched out, so this only
    verifies that model-level settings (scale, sliding window, soft cap,
    VMEM limit and the block-size constants) reach the kernel call.  The
    correctness of the paged attention kernel is tested in the kernel
    library.
    """
    # Model-level attention settings that must reach the kernel call.
    n_heads = 4
    n_kv_heads = 4
    head_dim = 128
    sm_scale = 1.0
    window = 128
    soft_cap = 50.0
    vmem_limit = 1024

    impl = PallasAttentionBackendImpl(
        num_heads=n_heads,
        head_size=head_dim,
        scale=sm_scale,
        num_kv_heads=n_kv_heads,
        alibi_slopes=None,
        sliding_window=window,
        kv_cache_dtype="auto",
        logits_soft_cap=soft_cap,
        attn_type=AttentionType.DECODER,
    )
    impl.vmem_limit_bytes = vmem_limit

    class FakeAttentionLayer:
        _k_scale_float: float
        _v_scale_float: float

    fake_layer = FakeAttentionLayer()
    fake_layer._k_scale_float = 1.0
    fake_layer._v_scale_float = 1.0

    # Dummy all-zero inputs; only their presence matters since the kernel
    # is mocked.
    n_tokens = 16
    n_blocks = 1024
    block_sz = 16
    q = torch.zeros(n_tokens, n_heads * head_dim)
    k = torch.zeros(n_tokens, n_kv_heads * head_dim)
    v = torch.zeros(n_tokens, n_kv_heads * head_dim)
    cache = torch.zeros(n_blocks, block_sz, n_kv_heads * 2, head_dim)
    slots = torch.zeros(n_tokens, dtype=torch.int64)

    max_reqs = 8
    max_blocks_per_req = 8
    tables = torch.zeros((max_reqs, max_blocks_per_req), dtype=torch.int32)
    ctx_lens = torch.ones((max_reqs, ), dtype=torch.int32)
    # One query token per request; start offsets are the running sum.
    per_req_query_lens = [1] * max_reqs
    q_start = torch.cumsum(torch.tensor([0] + per_req_query_lens,
                                        dtype=torch.int32),
                           dim=0,
                           dtype=torch.int32)
    seq_count = torch.tensor([max_reqs], dtype=torch.int32)

    metadata = PallasMetadata(
        slot_mapping=slots,
        block_tables=tables,
        context_lens=ctx_lens,
        query_start_loc=q_start,
        num_seqs=seq_count,
    )

    with patch("torch.ops.xla.ragged_paged_attention") as kernel_mock:
        impl.forward(
            layer=fake_layer,
            query=q,
            key=k,
            value=v,
            kv_cache=cache,
            attn_metadata=metadata,
        )

        kernel_mock.assert_called_once_with(
            ANY,  # query
            ANY,  # kv_cache
            ANY,  # context_lens
            ANY,  # block_tables
            ANY,  # query_start_loc
            ANY,  # num_seqs
            num_kv_pages_per_block=NUM_KV_PAGES_PER_BLOCK,
            num_queries_per_block=NUM_QUERIES_PER_BLOCK,
            vmem_limit_bytes=vmem_limit,
            use_kernel=True,
            sm_scale=sm_scale,
            sliding_window=window,
            soft_cap=soft_cap,
        )
# SPDX-License-Identifier: Apache-2.0
"""A basic performance regression test for TPUs
Run `pytest tests/v1/tpu/test_perf.py`.
"""
from __future__ import annotations
import time
from dataclasses import dataclass
from typing import TYPE_CHECKING
import numpy as np
import pytest
from vllm.platforms import current_platform
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer import get_tokenizer
if TYPE_CHECKING:
from tests.conftest import VllmRunner
@dataclass
class TestParams:
    """One benchmark configuration consumed by ``test_perf``."""

    # The "Test" prefix matches pytest's default class-collection pattern.
    # Opt out explicitly so pytest does not try to collect this dataclass
    # and warn that it cannot because the class has an __init__ constructor.
    # The attribute is unannotated, so it is not a dataclass field.
    __test__ = False

    # Model name handed to both the tokenizer loader and the runner.
    model: str
    # Number of random prompts generated per run.
    num_prompts: int
    # Number of random token ids decoded into each prompt.
    prefix_len: int
    # Maximum number of tokens to generate per prompt.
    decode_len: int
    # Reference average wall time of one generate() call, in seconds.
    expected_avg_time: float
    # Allowed slowdown over expected_avg_time, in seconds, before failing.
    err_tol: float
# Benchmark configurations fed to test_perf through pytest.mark.parametrize.
# Only the last entry is active; the two smaller ones stay commented out for
# the reason given in the TODO below.
TEST_PARAMS = [
    # TODO: Cannot run a series of tests because:
    # RuntimeError: Bad StatusOr access: UNKNOWN: TPU initialization failed:
    # open(/dev/vfio/0): Device or resource busy: Device or resource busy;
    # Couldn't open iommu group /dev/vfio/0
    # => Investigate
    # TestParams(
    #     model="Qwen/Qwen2.5-1.5B-Instruct",
    #     num_prompts=1,
    #     prefix_len=10,
    #     decode_len=5,
    #     expected_avg_time=0.03,
    #     err_tol=0.01,
    # ),
    # TestParams(
    #     model="Qwen/Qwen2.5-1.5B-Instruct",
    #     num_prompts=10,
    #     prefix_len=100,
    #     decode_len=50,
    #     expected_avg_time=0.234,
    #     err_tol=0.020,
    # ),
    TestParams(
        model="Qwen/Qwen2.5-1.5B-Instruct",
        num_prompts=64,
        prefix_len=500,
        decode_len=50,
        # (This is the active CI/CD instance)
        # commit id: ccb246776d93ef105904a8ec015b3587240a1183
        # tpu: v5lite (vllm CI/CD)
        expected_avg_time=1.4,
        err_tol=0.30,
        # (TODO: There is no v6e in CI/CD currently)
        # commit id: ccb246776d93ef105904a8ec015b3587240a1183
        # tpu: v6e
        # expected_avg_time=1.5,
        # err_tol=0.20,
    ),
]
# Untimed generate() calls made before measuring (warmup / compile phase).
NUM_WARMUPS = 5
# Timed generate() calls whose durations are averaged.
NUM_RUNS = 10
# Passed to the runner as both max_model_len and max_num_batched_tokens.
MAX_MODEL_LEN = 1024
# Passed to the runner as max_num_seqs.
MAX_NUM_SEQS = 32
# Passed to the runner as gpu_memory_utilization.
GPU_UTIL = 0.9
@pytest.mark.skipif(not current_platform.is_tpu(),
                    reason="This is a basic performance test for TPU only")
@pytest.mark.parametrize("params", TEST_PARAMS)
def test_perf(
    vllm_runner: type[VllmRunner],
    monkeypatch: pytest.MonkeyPatch,
    params: TestParams,
) -> None:
    """Fail when the average generation time regresses past the tolerance.

    Builds ``params.num_prompts`` prompts from random token ids, runs
    ``NUM_WARMUPS`` untimed generations (warmup / compile), then times
    ``NUM_RUNS`` generations and averages them.  The test fails when the
    average exceeds ``params.expected_avg_time`` by ``params.err_tol`` or
    more.  An improvement larger than the tolerance only prints a warning
    suggesting that the expected time be re-tuned.

    Args:
        vllm_runner: Runner factory fixture used as a context manager.
        monkeypatch: Used to set ``VLLM_USE_V1=1`` for the run.
        params: The benchmark configuration under test.
    """
    tokenizer = get_tokenizer(params.model,
                              tokenizer_mode="auto",
                              trust_remote_code=True)

    # Each prompt is the decoding of `prefix_len` uniformly random token ids.
    prompts = [
        tokenizer.decode(
            np.random.randint(0,
                              tokenizer.vocab_size,
                              size=params.prefix_len).tolist())
        for _ in range(params.num_prompts)
    ]

    print(f"-- Running: num_prompts = {len(prompts)} "
          f"prefix_len = {params.prefix_len} "
          f"decode_len = {params.decode_len}")

    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")

        sampling_params = SamplingParams(max_tokens=params.decode_len,
                                         temperature=1.0,
                                         min_p=0.0)

        with vllm_runner(params.model,
                         max_num_batched_tokens=MAX_MODEL_LEN,
                         max_model_len=MAX_MODEL_LEN,
                         max_num_seqs=MAX_NUM_SEQS,
                         gpu_memory_utilization=GPU_UTIL,
                         enforce_eager=False,
                         tensor_parallel_size=1) as vllm_model:
            print(" -- Warmup / Compile")
            for _ in range(NUM_WARMUPS):
                vllm_model.generate(prompts, sampling_params)

            print(" -- Benchmarking... ")
            durations = []
            for _ in range(NUM_RUNS):
                # perf_counter() is monotonic and high-resolution, so the
                # measurement cannot be skewed by system clock adjustments
                # the way time.time() can.
                started = time.perf_counter()
                vllm_model.generate(prompts, sampling_params)
                durations.append(time.perf_counter() - started)

            avg_time = sum(durations) / len(durations)

            print(f" -- avg_time = {avg_time}")
            print(f" -- expected_avg_time = {params.expected_avg_time} "
                  f"with err_tol = {params.err_tol}")

            # Positive diff means slower than expected, negative faster.
            diff = avg_time - params.expected_avg_time
            ok = diff < params.err_tol
            if diff < -params.err_tol:
                print(f" !! WARNING !! Performance has improved by {-diff}, "
                      "it may be necessary to fine-tune the "
                      f"expected_avg_time = {params.expected_avg_time}")

            assert ok, " !! ERROR !! Regression detected"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment