Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
fcfc474d
Commit
fcfc474d
authored
Apr 09, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.3' into v0.8.3-dev
parents
bb94d2e5
296c6572
Changes
503
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1521 additions
and
326 deletions
+1521
-326
tests/tool_use/test_tool_choice_required.py
tests/tool_use/test_tool_choice_required.py
+336
-0
tests/tpu/untest_compilation.py
tests/tpu/untest_compilation.py
+17
-40
tests/utils.py
tests/utils.py
+36
-1
tests/v1/core/test_kv_cache_utils.py
tests/v1/core/test_kv_cache_utils.py
+29
-13
tests/v1/core/test_prefix_caching.py
tests/v1/core/test_prefix_caching.py
+80
-75
tests/v1/core/test_scheduler.py
tests/v1/core/test_scheduler.py
+194
-3
tests/v1/core/test_scheduler_e2e.py
tests/v1/core/test_scheduler_e2e.py
+29
-0
tests/v1/core/test_specialized_manager.py
tests/v1/core/test_specialized_manager.py
+138
-0
tests/v1/e2e/test_correctness_sliding_window.py
tests/v1/e2e/test_correctness_sliding_window.py
+84
-0
tests/v1/engine/test_engine_args.py
tests/v1/engine/test_engine_args.py
+20
-0
tests/v1/engine/test_engine_core.py
tests/v1/engine/test_engine_core.py
+11
-7
tests/v1/engine/test_engine_core_client.py
tests/v1/engine/test_engine_core_client.py
+4
-4
tests/v1/entrypoints/llm/test_struct_output_generate.py
tests/v1/entrypoints/llm/test_struct_output_generate.py
+138
-163
tests/v1/sample/test_topk_topp_sampler.py
tests/v1/sample/test_topk_topp_sampler.py
+37
-0
tests/v1/structured_output/test_utils.py
tests/v1/structured_output/test_utils.py
+4
-4
tests/v1/test_async_llm_dp.py
tests/v1/test_async_llm_dp.py
+109
-0
tests/v1/test_oracle.py
tests/v1/test_oracle.py
+4
-10
tests/v1/tpu/test_basic.py
tests/v1/tpu/test_basic.py
+7
-6
tests/v1/tpu/test_pallas.py
tests/v1/tpu/test_pallas.py
+98
-0
tests/v1/tpu/test_perf.py
tests/v1/tpu/test_perf.py
+146
-0
No files found.
tests/tool_use/test_tool_choice_required.py
0 → 100644
View file @
fcfc474d
# SPDX-License-Identifier: Apache-2.0
import
json
import
re
from
copy
import
deepcopy
from
unittest.mock
import
MagicMock
import
pytest
from
pydantic
import
TypeAdapter
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
ChatCompletionToolsParam
)
from
vllm.entrypoints.openai.serving_chat
import
OpenAIServingChat
# Two function-tool definitions (OpenAI "tools" format) used as fixtures by
# the tests in this module. Key order is kept as-is because the dicts are
# later serialized with json.dumps.
EXAMPLE_TOOLS = [
    # Tool with a single required string argument ("city").
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {
                        "type": "string",
                        "description": ("The city to find the weather for"
                                        ", e.g. 'San Francisco'"),
                    },
                },
                "required": ["city"],
                "additionalProperties": False,
            },
        },
        "strict": True,
    },
    # Tool with two required arguments ("city" and "days").
    {
        "type": "function",
        "function": {
            "name": "get_forecast",
            "description": "Get the weather forecast for a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {
                        "type": "string",
                        "description":
                        "The city to get the forecast for, e.g. 'New York'",
                    },
                    "days": {
                        "type": "integer",
                        "description":
                        "Number of days to get the forecast for (1-7)",
                    },
                },
                "required": ["city", "days"],
                "additionalProperties": False,
            },
        },
        "strict": True,
    },
]
def _compile_and_check(tools: list[ChatCompletionToolsParam], sample_output,
                       should_match: bool):
    """Check whether ``sample_output`` is accepted by the guided-JSON schema.

    The schema is obtained by calling
    ``ChatCompletionRequest._get_guided_json_from_tool`` on a mock request
    carrying ``tool_choice="required"`` and the given ``tools``. It is then
    turned into a regex, and the JSON-serialized ``sample_output`` must fully
    match that regex exactly when ``should_match`` is true.

    Args:
        tools: Tool definitions placed on the mock request.
        sample_output: Any JSON-serializable candidate model output.
        should_match: Expected result of the full-match check.
    """
    # A MagicMock stands in for the request; only the two attributes set
    # here are configured explicitly.
    request_stub = MagicMock(tool_choice="required", tools=tools)
    schema = ChatCompletionRequest._get_guided_json_from_tool(request_stub)
    assert isinstance(schema, dict)

    # use build_regex_from_schema used in JSONLogitsProcessor to create Guide
    from outlines_core.fsm.json_schema import build_regex_from_schema
    pattern = re.compile(build_regex_from_schema(json.dumps(schema)))

    serialized_output = json.dumps(sample_output)
    did_match = pattern.fullmatch(serialized_output) is not None

    assert did_match == should_match
# (sample_output, should_match) pairs; each sample is a list of tool calls
# against EXAMPLE_TOOLS and every pair here is marked as matching.
VALID_TOOL_OUTPUTS = [
    # one call to get_current_weather
    (
        [
            {"name": "get_current_weather", "parameters": {"city": "Vienna"}},
        ],
        True,
    ),
    # two calls to the same tool
    (
        [
            {"name": "get_current_weather", "parameters": {"city": "Vienna"}},
            {"name": "get_current_weather", "parameters": {"city": "Berlin"}},
        ],
        True,
    ),
    # one call to get_forecast
    (
        [
            {"name": "get_forecast",
             "parameters": {"city": "Vienna", "days": 7}},
        ],
        True,
    ),
    # one call to each tool
    (
        [
            {"name": "get_forecast",
             "parameters": {"city": "Vienna", "days": 7}},
            {"name": "get_current_weather", "parameters": {"city": "Vienna"}},
        ],
        True,
    ),
    # four calls alternating between the two tools
    (
        [
            {"name": "get_forecast",
             "parameters": {"city": "Vienna", "days": 7}},
            {"name": "get_current_weather", "parameters": {"city": "Vienna"}},
            {"name": "get_forecast",
             "parameters": {"city": "Berlin", "days": 7}},
            {"name": "get_current_weather", "parameters": {"city": "Berlin"}},
        ],
        True,
    ),
]

# Just the sample outputs, without the should_match flag.
VALID_TOOLS = [sample for sample, _ in VALID_TOOL_OUTPUTS]
@pytest.mark.parametrize(
    "sample_output, should_match",
    VALID_TOOL_OUTPUTS + [
        (None, False),
        # empty list cannot be generated
        ([], False),
        # empty object cannot be generated
        ({}, False),
        # list with empty object cannot be generated
        ([{}], False),
        # function without required parameters cannot be generated
        ([{"name": "get_current_weather"}], False),
        # function without required parameters cannot be generated
        ([{"name": "get_current_weather", "parameters": {}}], False),
        # function without required parameters cannot be generated
        ([{"name": "get_current_weather", "parameters": None}], False),
        # tool call without lists cannot be generated
        (
            {"name": "get_current_weather", "parameters": {"city": "Vienna"}},
            False,
        ),
        # tool call with extra parameters cannot be generated
        (
            [{
                "name": "get_current_weather",
                "parameters": {"city": "Vienna", "extra": "value"},
            }],
            False,
        ),
        # tool call where parameters are first cannot be generated
        (
            [{
                "parameters": {"city": "Vienna"},
                "name": "get_current_weather",
            }],
            False,
        ),
        # tool call without all required parameters cannot be generated
        ([{"name": "get_forecast", "parameters": {"city": "Vienna"}}], False),
        # tool call with incorrect name/parameters cannot be generated
        (
            [{
                "name": "get_weather",
                "parameters": {"city": "Vienna", "days": 7},
            }],
            False,
        ),
        # tool call with both valid and empty function cannot be generated
        (
            [
                {"name": "get_current_weather",
                 "parameters": {"city": "Vienna"}},
                {},
            ],
            False,
        ),
    ])
def test_guided_json(sample_output, should_match):
    """Validate EXAMPLE_TOOLS and check each sample against their schema.

    The raw tool dicts are validated into ``ChatCompletionToolsParam``
    objects, then ``_compile_and_check`` asserts that ``sample_output`` is
    accepted (or rejected) as ``should_match`` says.
    """
    tools_adapter = TypeAdapter(list[ChatCompletionToolsParam])
    validated_tools = tools_adapter.validate_python(EXAMPLE_TOOLS)
    _compile_and_check(tools=validated_tools,
                       sample_output=sample_output,
                       should_match=should_match)
def update_parameters_none(
        tool: ChatCompletionToolsParam) -> ChatCompletionToolsParam:
    """Set ``tool.function.parameters`` to ``None`` and return ``tool``.

    The tool is modified in place; the returned object is the one passed in.
    """
    function_spec = tool.function
    function_spec.parameters = None
    return tool
def update_parameters_empty_dict(
        tool: ChatCompletionToolsParam) -> ChatCompletionToolsParam:
    """Set ``tool.function.parameters`` to ``{}`` and return ``tool``.

    The tool is modified in place; the returned object is the one passed in.
    """
    function_spec = tool.function
    function_spec.parameters = {}
    return tool
@pytest.mark.parametrize(
    "sample_output, should_match",
    [
        (None, False),
        # empty list cannot be generated
        ([], False),
        # empty object cannot be generated
        ({}, False),
        # list with empty object cannot be generated
        ([{}], False),
        # function without required parameters cannot be generated
        ([{"name": "get_current_weather"}], False),
        # function without required parameters cannot be generated
        ([{"name": "get_current_weather", "parameters": None}], False),
        # function with extra parameters cannot be generated
        (
            [{
                "name": "get_current_weather",
                "parameters": {"extra": "value"},
            }],
            False,
        ),
        # only function with empty parameters object is valid
        ([{"name": "get_current_weather", "parameters": {}}], True),
    ])
@pytest.mark.parametrize(
    "update_parameters",
    [update_parameters_none, update_parameters_empty_dict])
def test_guided_json_without_parameters(sample_output, should_match,
                                        update_parameters):
    """Check the schema for a tool whose parameters are ``None`` or ``{}``.

    A copy of the first example tool is validated, its parameters are
    replaced through ``update_parameters``, and ``_compile_and_check`` then
    asserts whether ``sample_output`` is accepted as ``should_match`` says.
    """
    # Work on a deep copy so the shared EXAMPLE_TOOLS dicts are not handed
    # to the validator directly.
    raw_tools = [deepcopy(EXAMPLE_TOOLS[0])]
    adapter = TypeAdapter(list[ChatCompletionToolsParam])
    tools = [update_parameters(tool)
             for tool in adapter.validate_python(raw_tools)]

    # Sanity check: the updater really cleared the parameters.
    assert all(tool.function.parameters is None
               or tool.function.parameters == {} for tool in tools)

    _compile_and_check(tools=tools,
                       sample_output=sample_output,
                       should_match=should_match)
@pytest.mark.parametrize("output", VALID_TOOLS)
@pytest.mark.parametrize("empty_params", [False, True])
@pytest.mark.parametrize("delta_len", list(range(1, 11)))
def test_streaming_output_valid(output, empty_params, delta_len):
    """Stream serialized tool calls in chunks and check they round-trip.

    The JSON form of ``output`` is fed to
    ``OpenAIServingChat.extract_tool_call_required_streaming`` in slices of
    ``delta_len`` characters. The returned delta messages are stitched back
    into a JSON document, which must equal the original ``output``.

    Args:
        output: List of tool-call dicts (``name`` / ``parameters``).
        empty_params: When true, every call's parameters are replaced by
            an empty dict before streaming.
        delta_len: Number of characters per streamed chunk.
    """
    serving_stub = MagicMock()

    output = deepcopy(output)
    if empty_params:
        output = [{"name": call["name"], "parameters": {}} for call in output]
    output_json = json.dumps(output)

    streamed_so_far = ""
    function_name_returned = False
    messages = []
    for start in range(0, len(output_json), delta_len):
        chunk = output_json[start:start + delta_len]
        text_with_chunk = streamed_so_far + chunk

        extracted = OpenAIServingChat.extract_tool_call_required_streaming(
            serving_stub,
            previous_text=streamed_so_far,
            current_text=text_with_chunk,
            delta_text=chunk,
            function_name_returned=function_name_returned)
        delta_message, function_name_returned = extracted

        if delta_message:
            messages.append(delta_message)

        streamed_so_far = text_with_chunk

    assert len(messages) > 0

    # Rebuild the JSON array from the deltas. A delta that carries a function
    # name starts a new tool-call object; any object already started is
    # closed with "}," first. Deltas without a name only append arguments.
    combined_messages = "["
    for message in messages:
        if not message.tool_calls[0].function.name:
            combined_messages += message.tool_calls[0].function.arguments
            continue

        if len(combined_messages) > 1:
            combined_messages += "},"

        combined_messages += ('{"name": "' +
                              message.tool_calls[0].function.name +
                              '", "parameters": ' +
                              message.tool_calls[0].function.arguments)
    combined_messages += "}]"

    reparsed = json.loads(combined_messages)
    assert reparsed == output
    assert json.dumps(json.loads(combined_messages)) == output_json
tests/tpu/untest_compilation.py
View file @
fcfc474d
...
...
@@ -5,12 +5,8 @@ import os
import
tempfile
import
depyf
import
pytest
from
vllm.config
import
CompilationLevel
@
pytest
.
mark
.
skip
(
reason
=
"Not working; needs investigation."
)
def
test_tpu_compilation
():
temp_dir
=
tempfile
.
mkdtemp
()
with
depyf
.
prepare_debug
(
temp_dir
):
...
...
@@ -22,27 +18,24 @@ def test_tpu_compilation():
"The greatest glory in living lies not in never falling,"
,
]
answers
=
[
" or, through inaction
, allow a human being to come to harm.
"
,
" what is essential
is invisible to the eye.
"
,
" but in rising
every time we fall.
"
,
" or, through inaction"
,
" what is essential "
,
" but in rising "
,
]
N
=
1
# Currently, top-p sampling is disabled. `top_p` should be 1.0.
N
=
1
sampling_params
=
SamplingParams
(
temperature
=
0.7
,
top_p
=
1.0
,
n
=
N
,
max_tokens
=
16
)
# Set `enforce_eager=True` to avoid ahead-of-time compilation.
# In real workloads, `enforce_eager` should be `False`.
llm
=
LLM
(
model
=
"Qwen/Qwen2-1.5B-Instruct"
,
max_num_batched_tokens
=
256
,
max_model_len
=
256
,
max_num_seqs
=
32
,
enforce_eager
=
False
)
# disable custom dispatcher, let Dynamo take over
# all the control
llm
=
LLM
(
model
=
"Qwen/Qwen2.5-1.5B-Instruct"
,
max_model_len
=
512
,
max_num_seqs
=
64
,
enforce_eager
=
True
,
compilation_config
=
{
"level"
:
CompilationLevel
.
DYNAMO_AS_IS
})
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
for
output
,
answer
in
zip
(
outputs
,
answers
):
prompt
=
output
.
prompt
...
...
@@ -56,16 +49,11 @@ def test_tpu_compilation():
for
i
,
compiled_code
in
enumerate
(
compiled_codes
):
print
(
"{} file: {}"
.
format
(
i
+
1
,
compiled_code
))
# We should only trigger Dynamo compilation 4 times:
# 1. forward pass (symbolic)
# 2. compute_logits (symbolic)
# 3. forward pass (shape 16)
# 4. forward pass (shape 32)
# and later calls should not trigger Dynamo compilation again.
# NOTE: It might still trigger XLA compilation.
# We should only trigger Dynamo compilation 2 times:
# 1. Forward pass without kv_caches
# 2. Forward pass with kv_caches
# Check we have 4 compiled codes
assert
len
(
compiled_codes
)
==
4
assert
len
(
compiled_codes
)
==
2
kv_cache_prefix
=
"kv_cache"
attn_prefix
=
"ragged_paged_attention"
...
...
@@ -77,24 +65,13 @@ def test_tpu_compilation():
for
i
,
compiled_fn
in
enumerate
(
compiled_fns
):
print
(
"{} file: {}"
.
format
(
i
+
1
,
compiled_fn
))
# The first compilation
is symbolic, so it
should not have any kv_caches
# The first compilation should not have any kv_caches
with
open
(
compiled_fns
[
0
])
as
f
:
content
=
f
.
read
()
assert
kv_cache_prefix
not
in
content
# The second compilation is symbolic, so it should not have any kv_caches
with
open
(
compiled_fns
[
1
])
as
f
:
content
=
f
.
read
()
assert
kv_cache_prefix
not
in
content
# The third compilation is shape 16, so it should have kv_caches and the
# The second compilation should have kv_caches and the
# ragged_paged_attention
with
open
(
compiled_fns
[
2
])
as
f
:
content
=
f
.
read
()
assert
(
kv_cache_prefix
in
content
and
attn_prefix
in
content
)
# The fourth compilation is shape 32, so it should have kv_caches and the
# ragged_paged_attention
with
open
(
compiled_fns
[
3
])
as
f
:
with
open
(
compiled_fns
[
1
])
as
f
:
content
=
f
.
read
()
assert
(
kv_cache_prefix
in
content
and
attn_prefix
in
content
)
tests/utils.py
View file @
fcfc474d
...
...
@@ -110,6 +110,9 @@ class RemoteOpenAIServer:
self
.
host
=
str
(
args
.
host
or
'localhost'
)
self
.
port
=
int
(
args
.
port
)
self
.
show_hidden_metrics
=
\
args
.
show_hidden_metrics_for_version
is
not
None
# download the model before starting the server to avoid timeout
is_local
=
os
.
path
.
isdir
(
model
)
if
not
is_local
:
...
...
@@ -323,6 +326,37 @@ def _test_completion_close(
return
results
def
_test_chat
(
client
:
openai
.
OpenAI
,
model
:
str
,
prompt
:
str
,
):
results
=
[]
messages
=
[{
"role"
:
"user"
,
"content"
:
[{
"type"
:
"text"
,
"text"
:
prompt
}]
}]
# test with text prompt
chat_response
=
client
.
chat
.
completions
.
create
(
model
=
model
,
messages
=
messages
,
max_tokens
=
5
,
temperature
=
0.0
)
results
.
append
({
"test"
:
"completion_close"
,
"text"
:
chat_response
.
choices
[
0
].
message
.
content
,
"finish_reason"
:
chat_response
.
choices
[
0
].
finish_reason
,
"usage"
:
chat_response
.
usage
,
})
return
results
def
_test_embeddings
(
client
:
openai
.
OpenAI
,
model
:
str
,
...
...
@@ -518,6 +552,8 @@ def compare_all_settings(model: str,
results
+=
_test_completion
(
client
,
model
,
prompt
,
token_ids
)
elif
method
==
"generate_close"
:
results
+=
_test_completion_close
(
client
,
model
,
prompt
)
elif
method
==
"generate_chat"
:
results
+=
_test_chat
(
client
,
model
,
prompt
)
elif
method
==
"generate_with_image"
:
results
+=
_test_image_text
(
client
,
model
,
...
...
@@ -585,7 +621,6 @@ def multi_process_parallel(
# as compared to multiprocessing.
# NOTE: We need to set working_dir for distributed tests,
# otherwise we may get import errors on ray workers
# ray.init(num_gpus=tp_size, runtime_env={"working_dir": VLLM_PATH}) xiabo
# NOTE: Force ray not to use gitignore file as excluding, otherwise
# it will not move .so files to working dir.
# So we have to manually add some of large directories
...
...
tests/v1/core/test_kv_cache_utils.py
View file @
fcfc474d
...
...
@@ -5,8 +5,12 @@ import torch
from
vllm.multimodal.inputs
import
MultiModalKwargs
from
vllm.sampling_params
import
SamplingParams
from
vllm.v1.core.kv_cache_utils
import
(
BlockHashType
,
FreeKVCacheBlockQueue
,
KVCacheBlock
,
PrefixCachingMetrics
,
from
vllm.utils
import
sha256
# disable yapf here as it formats differently than isort such that both fail
# yapf: disable
from
vllm.v1.core.kv_cache_utils
import
(
NONE_HASH
,
BlockHashType
,
FreeKVCacheBlockQueue
,
KVCacheBlock
,
PrefixCachingMetrics
,
generate_block_hash_extra_keys
,
hash_block_tokens
,
hash_request_tokens
,
...
...
@@ -16,6 +20,8 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
from
vllm.v1.metrics.stats
import
PrefixCacheStats
from
vllm.v1.request
import
Request
# yapf: enable
def
make_request
(
request_id
,
prompt_token_ids
,
...
...
@@ -40,6 +46,12 @@ def make_request(request_id,
)
def
test_none_hash
():
assert
NONE_HASH
is
not
None
assert
isinstance
(
NONE_HASH
,
int
)
assert
NONE_HASH
!=
0
def
test_kv_cache_block
():
# Test KVCacheBlock initialization
block
=
KVCacheBlock
(
block_id
=
0
)
...
...
@@ -190,21 +202,23 @@ def test_generate_block_hash_extra_keys_no_mm_inputs():
assert
next_mm_idx
==
0
def
test_hash_block_tokens
():
@
pytest
.
mark
.
parametrize
(
"hash_fn"
,
[
sha256
,
hash
])
def
test_hash_block_tokens
(
hash_fn
):
parent_block_hash
=
123
curr_block_token_ids
=
(
1
,
2
,
3
)
extra_keys
=
(
"key1"
,
"key2"
)
block_hash
=
hash_block_tokens
(
parent_block_hash
,
curr_block_token_ids
,
extra_keys
)
block_hash
=
hash_block_tokens
(
hash_fn
,
parent_block_hash
,
curr_block_token_ids
,
extra_keys
)
assert
isinstance
(
block_hash
,
BlockHashType
)
assert
block_hash
.
hash_value
==
hash
(
assert
block_hash
.
hash_value
==
hash
_fn
(
(
parent_block_hash
,
curr_block_token_ids
,
extra_keys
))
assert
block_hash
.
token_ids
==
curr_block_token_ids
assert
block_hash
.
extra_keys
==
extra_keys
def
test_hash_request_tokens
():
@
pytest
.
mark
.
parametrize
(
"hash_fn"
,
[
sha256
,
hash
])
def
test_hash_request_tokens
(
hash_fn
):
request
=
make_request
(
request_id
=
0
,
prompt_token_ids
=
[
_
for
_
in
range
(
6
)],
...
...
@@ -219,7 +233,7 @@ def test_hash_request_tokens():
)
block_size
=
3
block_hashes
=
hash_request_tokens
(
block_size
,
request
)
block_hashes
=
hash_request_tokens
(
hash_fn
,
block_size
,
request
)
assert
len
(
block_hashes
)
==
2
assert
isinstance
(
block_hashes
[
0
],
BlockHashType
)
...
...
@@ -234,7 +248,8 @@ def test_hash_request_tokens():
assert
block_hashes
[
1
].
extra_keys
==
(
"hash2"
,
)
def
test_hash_tokens_different_mm_input
():
@
pytest
.
mark
.
parametrize
(
"hash_fn"
,
[
sha256
,
hash
])
def
test_hash_tokens_different_mm_input
(
hash_fn
):
request1
=
make_request
(
request_id
=
0
,
prompt_token_ids
=
[
_
for
_
in
range
(
6
)],
...
...
@@ -260,13 +275,14 @@ def test_hash_tokens_different_mm_input():
mm_hashes
=
[
"hash3"
,
"hash2"
],
)
block_size
=
3
block_hashes1
=
hash_request_tokens
(
block_size
,
request1
)
block_hashes2
=
hash_request_tokens
(
block_size
,
request2
)
block_hashes1
=
hash_request_tokens
(
hash_fn
,
block_size
,
request1
)
block_hashes2
=
hash_request_tokens
(
hash_fn
,
block_size
,
request2
)
assert
block_hashes1
[
0
]
!=
block_hashes2
[
0
]
assert
block_hashes1
[
1
]
!=
block_hashes2
[
1
]
def
test_hash_request_tokens_no_mm_inputs
():
@
pytest
.
mark
.
parametrize
(
"hash_fn"
,
[
sha256
,
hash
])
def
test_hash_request_tokens_no_mm_inputs
(
hash_fn
):
request
=
make_request
(
request_id
=
0
,
prompt_token_ids
=
[
_
for
_
in
range
(
6
)],
...
...
@@ -275,7 +291,7 @@ def test_hash_request_tokens_no_mm_inputs():
)
block_size
=
3
block_hashes
=
hash_request_tokens
(
block_size
,
request
)
block_hashes
=
hash_request_tokens
(
hash_fn
,
block_size
,
request
)
assert
len
(
block_hashes
)
==
2
assert
block_hashes
[
0
].
token_ids
==
(
0
,
1
,
2
)
...
...
tests/v1/core/test_prefix_caching.py
View file @
fcfc474d
...
...
@@ -4,14 +4,17 @@
from
typing
import
Optional
import
pytest
import
torch
from
vllm.multimodal.inputs
import
MultiModalKwargs
,
PlaceholderRange
from
vllm.sampling_params
import
SamplingParams
from
vllm.utils
import
cdiv
from
vllm.utils
import
cdiv
,
sha256
from
vllm.v1.core.block_pool
import
BlockPool
from
vllm.v1.core.kv_cache_manager
import
KVCacheManager
,
Request
from
vllm.v1.core.kv_cache_utils
import
(
BlockHashType
,
KVCacheBlock
,
hash_block_tokens
)
from
vllm.v1.kv_cache_interface
import
(
FullAttentionSpec
,
KVCacheConfig
,
KVCacheGroupSpec
)
def
make_request
(
request_id
,
...
...
@@ -39,16 +42,31 @@ def make_request(request_id,
)
def
test_prefill
():
def
make_kv_cache_config
(
block_size
:
int
,
num_blocks
:
int
)
->
KVCacheConfig
:
return
KVCacheConfig
(
num_blocks
=
num_blocks
,
tensors
=
{},
kv_cache_groups
=
[
KVCacheGroupSpec
([
'layer'
],
FullAttentionSpec
(
block_size
,
1
,
1
,
torch
.
float32
,
False
))
],
)
@
pytest
.
mark
.
parametrize
(
"hash_algo"
,
[
"sha256"
,
"hash"
])
def
test_prefill
(
hash_algo
):
manager
=
KVCacheManager
(
block_size
=
16
,
num_gpu_blocks
=
10
,
make_kv_cache_config
(
16
,
11
),
max_model_len
=
8192
,
sliding_window
=
None
,
enable_caching
=
True
,
caching_hash_algo
=
hash_algo
,
num_preallocate_tokens
=
16
,
)
# choose the hash function according to the parameter
hash_fn
=
sha256
if
hash_algo
==
"sha256"
else
hash
# Complete 3 blocks (48 tokens)
common_token_ids
=
[
i
for
i
in
range
(
3
)
for
_
in
range
(
16
)]
...
...
@@ -62,19 +80,20 @@ def test_prefill():
assert
not
computed_blocks
assert
num_computed_tokens
==
0
blocks
=
manager
.
allocate_slots
(
req0
,
55
,
computed_blocks
)
assert
[
b
.
block_id
for
b
in
blocks
]
==
[
0
,
1
,
2
,
3
,
4
]
assert
[
b
.
block_id
for
b
in
blocks
]
==
[
1
,
2
,
3
,
4
,
5
]
# Check full block metadata
parent_block_hash
=
None
for
block_id
in
(
0
,
1
,
2
):
block_tokens
=
tuple
(
all_token_ids
[
block_id
*
16
:(
block_id
+
1
)
*
16
])
block_hash
=
hash_block_tokens
(
parent_block_hash
,
block_tokens
)
for
block_id
in
(
1
,
2
,
3
):
block_tokens
=
tuple
(
all_token_ids
[(
block_id
-
1
)
*
16
:
block_id
*
16
])
block_hash
=
hash_block_tokens
(
hash_fn
,
parent_block_hash
,
block_tokens
)
assert
manager
.
block_pool
.
blocks
[
block_id
].
block_hash
==
block_hash
assert
manager
.
block_pool
.
blocks
[
block_id
].
ref_cnt
==
1
parent_block_hash
=
block_hash
.
hash_value
# Check partial/preallocated block metadata
for
block_id
in
(
3
,
4
):
for
block_id
in
(
4
,
5
):
assert
manager
.
block_pool
.
blocks
[
block_id
].
block_hash
is
None
assert
manager
.
block_pool
.
blocks
[
block_id
].
ref_cnt
==
1
...
...
@@ -84,11 +103,11 @@ def test_prefill():
req1
=
make_request
(
"1"
,
common_token_ids
+
unique_token_ids
)
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req1
)
assert
len
(
manager
.
req_to_block_hashes
[
req1
.
request_id
])
==
3
assert
[
b
.
block_id
for
b
in
computed_blocks
]
==
[
0
,
1
,
2
]
assert
[
b
.
block_id
for
b
in
computed_blocks
]
==
[
1
,
2
,
3
]
assert
num_computed_tokens
==
3
*
16
num_new_tokens
=
53
-
3
*
16
blocks
=
manager
.
allocate_slots
(
req1
,
num_new_tokens
,
computed_blocks
)
assert
[
b
.
block_id
for
b
in
blocks
]
==
[
5
,
6
]
assert
[
b
.
block_id
for
b
in
blocks
]
==
[
6
,
7
]
for
block
in
computed_blocks
:
assert
block
.
ref_cnt
==
2
...
...
@@ -101,14 +120,14 @@ def test_prefill():
# All blocks should be available.
assert
manager
.
block_pool
.
free_block_queue
.
num_free_blocks
==
10
# The order should be
# [unallocated (
7,
8, 9)]
# [unique_req0 (
4
,
3
)]
# [unique_req1 (
6
,
5
)]
# [common (2, 1
, 0
)]
# [unallocated (8, 9
, 10
)]
# [unique_req0 (
5
,
4
)]
# [unique_req1 (
7
,
6
)]
# [common (
3,
2, 1)]
assert
[
b
.
block_id
for
b
in
manager
.
block_pool
.
free_block_queue
.
get_all_free_blocks
()
]
==
[
7
,
8
,
9
,
4
,
3
,
6
,
5
,
2
,
1
,
0
]
]
==
[
8
,
9
,
10
,
5
,
4
,
7
,
6
,
3
,
2
,
1
]
# Cache hit in the common prefix when the original block is already free.
# Incomplete 1 block (6 tokens)
...
...
@@ -116,11 +135,11 @@ def test_prefill():
req2
=
make_request
(
"2"
,
common_token_ids
+
unique_token_ids
)
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req2
)
assert
len
(
manager
.
req_to_block_hashes
[
req2
.
request_id
])
==
3
assert
[
b
.
block_id
for
b
in
computed_blocks
]
==
[
0
,
1
,
2
]
assert
[
b
.
block_id
for
b
in
computed_blocks
]
==
[
1
,
2
,
3
]
assert
num_computed_tokens
==
3
*
16
num_new_tokens
=
53
-
3
*
16
blocks
=
manager
.
allocate_slots
(
req2
,
num_new_tokens
,
computed_blocks
)
assert
[
b
.
block_id
for
b
in
blocks
]
==
[
7
,
8
]
assert
[
b
.
block_id
for
b
in
blocks
]
==
[
8
,
9
]
# Although we only have 5 free blocks, we have 8 blocks in
# the free block queue due to lazy removal.
...
...
@@ -142,7 +161,7 @@ def test_prefill():
assert
num_computed_tokens
==
0
blocks
=
manager
.
allocate_slots
(
req3
,
16
*
9
,
computed_blocks
)
# This block ID order also checks the eviction order.
assert
[
b
.
block_id
for
b
in
blocks
]
==
[
9
,
4
,
3
,
6
,
5
,
8
,
7
,
2
,
1
,
0
]
assert
[
b
.
block_id
for
b
in
blocks
]
==
[
10
,
5
,
4
,
7
,
6
,
9
,
8
,
3
,
2
,
1
]
assert
manager
.
block_pool
.
free_block_queue
.
num_free_blocks
==
0
assert
manager
.
block_pool
.
free_block_queue
.
free_list_head
is
None
assert
manager
.
block_pool
.
free_block_queue
.
free_list_tail
is
None
...
...
@@ -156,13 +175,13 @@ def test_prefill_plp():
3. Schedule plp request; no hit should occur; validate blocks
'''
manager
=
KVCacheManager
(
block_size
=
16
,
num_gpu_blocks
=
10
,
make_kv_cache_config
(
16
,
11
),
max_model_len
=
8192
,
sliding_window
=
None
,
enable_caching
=
True
,
num_preallocate_tokens
=
16
,
)
# the default hash function is hash
hash_fn
=
hash
# Complete 3 blocks (48 tokens)
common_token_ids
=
[
i
for
i
in
range
(
3
)
for
_
in
range
(
16
)]
...
...
@@ -178,20 +197,21 @@ def test_prefill_plp():
assert
not
computed_blocks
assert
num_computed_tokens
==
0
blocks
=
manager
.
allocate_slots
(
req0
,
55
,
computed_blocks
)
assert
[
b
.
block_id
for
b
in
blocks
]
==
[
0
,
1
,
2
,
3
,
4
]
assert
[
b
.
block_id
for
b
in
blocks
]
==
[
1
,
2
,
3
,
4
,
5
]
req0_block_hashes
=
[
b
.
block_hash
for
b
in
blocks
]
# Check full block metadata
parent_block_hash
=
None
for
block_id
in
(
0
,
1
,
2
):
block_tokens
=
tuple
(
all_token_ids
[
block_id
*
16
:(
block_id
+
1
)
*
16
])
block_hash
=
hash_block_tokens
(
parent_block_hash
,
block_tokens
)
for
block_id
in
(
1
,
2
,
3
):
block_tokens
=
tuple
(
all_token_ids
[(
block_id
-
1
)
*
16
:
block_id
*
16
])
block_hash
=
hash_block_tokens
(
hash_fn
,
parent_block_hash
,
block_tokens
)
assert
manager
.
block_pool
.
blocks
[
block_id
].
block_hash
==
block_hash
assert
manager
.
block_pool
.
blocks
[
block_id
].
ref_cnt
==
1
parent_block_hash
=
block_hash
.
hash_value
# Check partial/preallocated block metadata
for
block_id
in
(
3
,
4
):
for
block_id
in
(
4
,
5
):
assert
manager
.
block_pool
.
blocks
[
block_id
].
block_hash
is
None
assert
manager
.
block_pool
.
blocks
[
block_id
].
ref_cnt
==
1
...
...
@@ -202,11 +222,11 @@ def test_prefill_plp():
req1
=
make_request
(
"1"
,
common_token_ids
+
unique_token_ids
)
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req1
)
assert
len
(
manager
.
req_to_block_hashes
[
req1
.
request_id
])
==
3
assert
[
b
.
block_id
for
b
in
computed_blocks
]
==
[
0
,
1
,
2
]
assert
[
b
.
block_id
for
b
in
computed_blocks
]
==
[
1
,
2
,
3
]
assert
num_computed_tokens
==
3
*
16
num_new_tokens
=
53
-
3
*
16
blocks
=
manager
.
allocate_slots
(
req1
,
num_new_tokens
,
computed_blocks
)
assert
[
b
.
block_id
for
b
in
blocks
]
==
[
5
,
6
]
assert
[
b
.
block_id
for
b
in
blocks
]
==
[
6
,
7
]
for
block
in
computed_blocks
:
assert
block
.
ref_cnt
==
2
...
...
@@ -219,14 +239,14 @@ def test_prefill_plp():
# All blocks should be available.
assert
manager
.
block_pool
.
free_block_queue
.
num_free_blocks
==
10
# The order should be
# [unallocated (
7,
8, 9)]
# [unique_req0 (
4
,
3
)]
# [unique_req1 (
6
,
5
)]
# [common (2, 1
, 0
)]
# [unallocated (8, 9
, 10
)]
# [unique_req0 (
5
,
4
)]
# [unique_req1 (
7
,
6
)]
# [common (
3,
2, 1)]
assert
[
b
.
block_id
for
b
in
manager
.
block_pool
.
free_block_queue
.
get_all_free_blocks
()
]
==
[
7
,
8
,
9
,
4
,
3
,
6
,
5
,
2
,
1
,
0
]
]
==
[
8
,
9
,
10
,
5
,
4
,
7
,
6
,
3
,
2
,
1
]
# Request #2 is a prompt-logprobs request:
# NO cache hit in the common prefix; duplicates request #0 cached blocks
...
...
@@ -242,7 +262,7 @@ def test_prefill_plp():
block_ids
=
[
b
.
block_id
for
b
in
blocks
]
# Duplicate cached blocks have different ids but same hashes vs request #0
assert
[
b
.
block_hash
for
b
in
blocks
]
==
req0_block_hashes
assert
block_ids
!=
[
0
,
1
,
2
,
3
,
4
]
assert
block_ids
!=
[
1
,
2
,
3
,
4
,
5
]
# Request #2 block hashes are valid since request #0 hashes are.
# Check block reference counts.
...
...
@@ -254,10 +274,8 @@ def test_prefill_plp():
def
test_decode
():
manager
=
KVCacheManager
(
block_size
=
16
,
num_gpu_blocks
=
10
,
make_kv_cache_config
(
16
,
11
),
max_model_len
=
8192
,
sliding_window
=
None
,
enable_caching
=
True
,
num_preallocate_tokens
=
16
,
)
...
...
@@ -273,7 +291,7 @@ def test_decode():
assert
not
computed_blocks
assert
num_computed_tokens
==
0
blocks
=
manager
.
allocate_slots
(
req0
,
55
,
computed_blocks
)
assert
[
b
.
block_id
for
b
in
blocks
]
==
[
0
,
1
,
2
,
3
,
4
]
assert
[
b
.
block_id
for
b
in
blocks
]
==
[
1
,
2
,
3
,
4
,
5
]
# Append slots without allocating a new block.
req0
.
num_computed_tokens
=
55
...
...
@@ -307,10 +325,8 @@ def test_decode():
def
test_evict
():
manager
=
KVCacheManager
(
block_size
=
16
,
num_gpu_blocks
=
10
,
make_kv_cache_config
(
16
,
11
),
max_model_len
=
8192
,
sliding_window
=
None
,
enable_caching
=
True
,
num_preallocate_tokens
=
16
,
)
...
...
@@ -341,15 +357,15 @@ def test_evict():
assert
[
b
.
block_id
for
b
in
manager
.
block_pool
.
free_block_queue
.
get_all_free_blocks
()
]
==
[
6
,
5
,
4
,
3
,
2
,
1
,
0
,
9
,
8
,
7
]
]
==
[
7
,
6
,
5
,
4
,
3
,
2
,
1
,
1
0
,
9
,
8
]
# Touch the first 2 blocks.
req2
=
make_request
(
"2"
,
list
(
range
(
2
*
16
+
3
)))
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req2
)
assert
[
b
.
block_id
for
b
in
computed_blocks
]
==
[
0
,
1
]
assert
[
b
.
block_id
for
b
in
computed_blocks
]
==
[
1
,
2
]
assert
num_computed_tokens
==
2
*
16
blocks
=
manager
.
allocate_slots
(
req2
,
3
,
computed_blocks
)
assert
[
b
.
block_id
for
b
in
blocks
]
==
[
6
,
5
]
assert
[
b
.
block_id
for
b
in
blocks
]
==
[
7
,
6
]
assert
manager
.
block_pool
.
free_block_queue
.
num_free_blocks
==
6
...
...
@@ -360,10 +376,8 @@ def test_hash_block_correct_reuse():
"""
block_size
=
16
manager
=
KVCacheManager
(
block_size
=
block_size
,
num_gpu_blocks
=
1
,
make_kv_cache_config
(
16
,
2
),
max_model_len
=
8192
,
sliding_window
=
None
,
enable_caching
=
True
,
num_preallocate_tokens
=
0
,
)
...
...
@@ -399,10 +413,8 @@ def test_computed_blocks_not_evicted():
"""
block_size
=
16
manager
=
KVCacheManager
(
block_size
=
block_size
,
num_gpu_blocks
=
2
,
make_kv_cache_config
(
block_size
,
3
),
max_model_len
=
8192
,
sliding_window
=
None
,
enable_caching
=
True
,
num_preallocate_tokens
=
0
,
)
...
...
@@ -415,7 +427,7 @@ def test_computed_blocks_not_evicted():
assert
num_computed_tokens
==
0
blocks
=
manager
.
allocate_slots
(
req0
,
num_tokens
,
computed_blocks
)
assert
len
(
blocks
)
==
1
assert
blocks
[
0
].
block_id
==
0
assert
blocks
[
0
].
block_id
==
1
# Allocate another block.
req1
=
make_request
(
"1"
,
list
(
range
(
num_tokens
,
num_tokens
*
2
)))
...
...
@@ -424,7 +436,7 @@ def test_computed_blocks_not_evicted():
assert
num_computed_tokens
==
0
blocks
=
manager
.
allocate_slots
(
req1
,
num_tokens
,
computed_blocks
)
assert
len
(
blocks
)
==
1
assert
blocks
[
0
].
block_id
==
1
assert
blocks
[
0
].
block_id
==
2
# Free the blocks.
manager
.
free
(
req0
)
...
...
@@ -435,13 +447,13 @@ def test_computed_blocks_not_evicted():
req2
=
make_request
(
"2"
,
list
(
range
(
num_tokens
*
2
)))
computed_blocks
,
num_computed_tokens
=
manager
.
get_computed_blocks
(
req2
)
assert
len
(
computed_blocks
)
==
1
assert
computed_blocks
[
0
].
block_id
==
0
assert
computed_blocks
[
0
].
block_id
==
1
assert
num_computed_tokens
==
block_size
blocks
=
manager
.
allocate_slots
(
req2
,
num_tokens
*
2
-
num_tokens
,
computed_blocks
)
assert
len
(
blocks
)
==
1
assert
blocks
[
0
].
block_id
==
1
assert
blocks
[
0
].
block_id
==
2
def
test_basic_prefix_caching_disabled
():
...
...
@@ -450,10 +462,8 @@ def test_basic_prefix_caching_disabled():
"""
block_size
=
4
manager
=
KVCacheManager
(
block_size
=
block_size
,
num_gpu_blocks
=
4
,
make_kv_cache_config
(
block_size
,
5
),
max_model_len
=
8192
,
sliding_window
=
None
,
enable_caching
=
False
,
num_preallocate_tokens
=
0
,
)
...
...
@@ -493,10 +503,8 @@ def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int):
This tests that the preallocated blocks are correctly added.
"""
manager
=
KVCacheManager
(
block_size
=
block_size
,
num_gpu_blocks
=
10
,
make_kv_cache_config
(
block_size
,
11
),
max_model_len
=
8192
,
sliding_window
=
None
,
enable_caching
=
True
,
num_preallocate_tokens
=
num_preallocate_tokens
,
)
...
...
@@ -522,7 +530,8 @@ def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int):
assert
len
(
blocks
)
==
1
+
num_preallocated_blocks
def
test_cache_blocks
():
@
pytest
.
mark
.
parametrize
(
"hash_fn"
,
[
sha256
,
hash
])
def
test_cache_blocks
(
hash_fn
):
"""
This is a unit test that tests the correctness of the _cache_full_blocks
function of KVCacheManager.
...
...
@@ -550,6 +559,7 @@ def test_cache_blocks():
num_cached_blocks
=
0
,
num_full_blocks
=
2
,
block_size
=
block_size
,
hash_fn
=
hash_fn
,
)
assert
len
(
block_pool
.
cached_block_hash_to_block
)
==
2
...
...
@@ -564,6 +574,7 @@ def test_cache_blocks():
num_cached_blocks
=
2
,
num_full_blocks
=
3
,
block_size
=
block_size
,
hash_fn
=
hash_fn
,
)
assert
len
(
block_pool
.
cached_block_hash_to_block
)
==
3
assert
blocks
[
0
].
block_hash
is
not
None
...
...
@@ -574,10 +585,8 @@ def test_mm_prefix_caching():
This tests that the multi-modal prefix caching is correct.
"""
manager
=
KVCacheManager
(
block_size
=
16
,
num_gpu_blocks
=
10
,
make_kv_cache_config
(
16
,
11
),
max_model_len
=
8192
,
sliding_window
=
None
,
enable_caching
=
True
,
num_preallocate_tokens
=
16
,
)
...
...
@@ -617,7 +626,7 @@ def test_mm_prefix_caching():
assert
block_hashes
[
2
].
extra_keys
==
(
"bbb"
,
)
blocks
=
manager
.
allocate_slots
(
req0
,
59
,
computed_blocks
)
assert
[
b
.
block_id
for
b
in
blocks
]
==
[
0
,
1
,
2
,
3
,
4
]
assert
[
b
.
block_id
for
b
in
blocks
]
==
[
1
,
2
,
3
,
4
,
5
]
req0
.
num_computed_tokens
=
59
# Append slots without allocating a new block.
...
...
@@ -655,10 +664,8 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
"""
block_size
=
16
manager
=
KVCacheManager
(
block_size
=
block_size
,
num_gpu_blocks
=
10
,
make_kv_cache_config
(
block_size
,
11
),
max_model_len
=
8192
,
sliding_window
=
None
,
enable_caching
=
True
,
num_preallocate_tokens
=
0
,
)
...
...
@@ -711,10 +718,8 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
def
test_reset_prefix_cache
():
manager
=
KVCacheManager
(
block_size
=
16
,
num_gpu_blocks
=
10
,
make_kv_cache_config
(
16
,
11
),
max_model_len
=
8192
,
sliding_window
=
None
,
enable_caching
=
True
,
num_preallocate_tokens
=
0
,
)
...
...
@@ -724,7 +729,7 @@ def test_reset_prefix_cache():
all_token_ids
=
full_block_token_ids
+
unique_token_ids
req0
=
make_request
(
"0"
,
all_token_ids
)
blocks
=
manager
.
allocate_slots
(
req0
,
55
)
assert
[
b
.
block_id
for
b
in
blocks
]
==
[
0
,
1
,
2
,
3
]
assert
[
b
.
block_id
for
b
in
blocks
]
==
[
1
,
2
,
3
,
4
]
unique_token_ids
=
[
4
]
*
7
all_token_ids
=
full_block_token_ids
+
unique_token_ids
...
...
@@ -733,7 +738,7 @@ def test_reset_prefix_cache():
assert
len
(
manager
.
req_to_block_hashes
[
req1
.
request_id
])
==
3
assert
len
(
computed_blocks
)
==
3
blocks
=
manager
.
allocate_slots
(
req1
,
7
,
computed_blocks
)
assert
[
b
.
block_id
for
b
in
blocks
]
==
[
4
]
assert
[
b
.
block_id
for
b
in
blocks
]
==
[
5
]
# Failed to reset prefix cache because some blocks are not freed yet.
assert
not
manager
.
reset_prefix_cache
()
...
...
tests/v1/core/test_scheduler.py
View file @
fcfc474d
...
...
@@ -2,12 +2,15 @@
from
typing
import
Optional
import
pytest
import
torch
from
vllm.config
import
CacheConfig
,
ModelConfig
,
SchedulerConfig
,
VllmConfig
from
vllm.multimodal.inputs
import
MultiModalKwargs
,
PlaceholderRange
from
vllm.sampling_params
import
SamplingParams
from
vllm.v1.core.sched.output
import
SchedulerOutput
from
vllm.v1.core.sched.scheduler
import
Scheduler
from
vllm.v1.kv_cache_interface
import
(
FullAttentionSpec
,
KVCacheConfig
,
KVCacheGroupSpec
)
from
vllm.v1.outputs
import
ModelRunnerOutput
from
vllm.v1.request
import
Request
,
RequestStatus
from
vllm.v1.structured_output
import
StructuredOutputManager
...
...
@@ -20,9 +23,10 @@ def create_scheduler(
max_num_seqs
:
int
=
16
,
max_num_batched_tokens
:
int
=
8192
,
enable_prefix_caching
:
Optional
[
bool
]
=
None
,
long_prefill_token_threshold
:
int
=
0
,
)
->
Scheduler
:
'''Create scheduler under test.
Args:
model: model under test
max_num_seqs: max sequences to schedule
...
...
@@ -38,6 +42,7 @@ def create_scheduler(
max_num_seqs
=
max_num_seqs
,
max_num_batched_tokens
=
max_num_batched_tokens
,
max_model_len
=
max_num_batched_tokens
,
long_prefill_token_threshold
=
long_prefill_token_threshold
,
)
model_config
=
ModelConfig
(
model
=
model
,
...
...
@@ -64,13 +69,21 @@ def create_scheduler(
model_config
=
model_config
,
cache_config
=
cache_config
,
)
kv_cache_config
=
KVCacheConfig
(
num_blocks
=
10000
,
# A large number of blocks to hold all requests
tensors
=
{},
kv_cache_groups
=
[
KVCacheGroupSpec
([
'layer'
],
FullAttentionSpec
(
16
,
1
,
1
,
torch
.
float32
,
False
))
],
)
cache_config
.
num_gpu_blocks
=
10000
return
Scheduler
(
scheduler_config
,
model_config
,
cache_config
,
speculative_config
=
None
,
lora_config
=
None
,
kv_cache_config
=
kv_cache_config
,
log_stats
=
True
,
structured_output_manager
=
StructuredOutputManager
(
vllm_config
),
)
...
...
@@ -242,7 +255,9 @@ def test_schedule_partial_requests():
model_runner_output
=
ModelRunnerOutput
(
req_ids
=
[
request
.
request_id
for
request
in
requests
],
req_id_to_index
=
req_to_index
,
sampled_token_ids
=
[[
0
]
for
_
in
range
(
len
(
requests
))],
# Only the first request has a sampled token id because
# the remaining requests are still being prefilled.
sampled_token_ids
=
[[
0
],
[],
[]],
spec_token_ids
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
...
...
@@ -263,6 +278,86 @@ def test_schedule_partial_requests():
assert
requests
[
2
].
request_id
not
in
output
.
num_scheduled_tokens
@pytest.mark.parametrize("enable_prefix_caching", [True, False])
def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
    """Exercise chunked prefill with several long prompts in flight at once.

    Three 800-token requests are added to a scheduler created with a
    1024-token batch budget and a long-prefill threshold of 400. The test
    runs three scheduling steps and checks, for each step, how many
    requests are reported as new / cached / finished and how many tokens
    each request was given.
    """
    scheduler = create_scheduler(
        model="facebook/opt-125m",
        max_num_batched_tokens=1024,
        long_prefill_token_threshold=400,
        enable_prefix_caching=enable_prefix_caching,
    )
    requests = create_requests(num_requests=3, num_tokens=800)
    for req in requests:
        scheduler.add_request(req)

    index_of = {req.request_id: pos for pos, req in enumerate(requests)}

    def runner_output(sampled):
        # Build the model-runner result for one step; only the sampled
        # tokens differ between steps.
        return ModelRunnerOutput(
            req_ids=[req.request_id for req in requests],
            req_id_to_index=index_of,
            sampled_token_ids=sampled,
            spec_token_ids=None,
            logprobs=None,
            prompt_logprobs_dict={},
        )

    def check_step(step, num_new, num_cached, tokens_per_request):
        # Shared assertions on one SchedulerOutput.
        assert len(step.scheduled_new_reqs) == num_new
        assert len(step.scheduled_cached_reqs) == num_cached
        assert len(step.finished_req_ids) == 0
        for pos, expected_tokens in enumerate(tokens_per_request):
            scheduled = step.num_scheduled_tokens[requests[pos].request_id]
            assert scheduled == expected_tokens

    # Step 1: every request is new. The first two are capped at the
    # 400-token threshold; the third gets what is left of the budget,
    # 1024 - 400 - 400 = 224.
    output = scheduler.schedule()
    check_step(output, 3, 0, (400, 400, 224))
    scheduler.update_from_output(
        output, runner_output([[] for _ in range(len(requests))]))

    # Step 2: all three requests are running. The first and second
    # requests consume the rest of their prompts, the third is again
    # limited to the remaining budget.
    output1 = scheduler.schedule()
    assert len(scheduler.running) == 3
    check_step(output1, 0, 3, (400, 400, 224))

    # Step 3: the first and second requests are decoding (one token each)
    # and the third request processes all of its remaining prompt tokens.
    scheduler.update_from_output(
        output1,
        runner_output([[0], [0]] + [[] for _ in range(len(requests) - 2)]))
    output2 = scheduler.schedule()
    assert len(scheduler.running) == 3
    check_step(output2, 0, 3, (1, 1, 800 - 224 - 224))
def
test_stop_via_update_from_output
():
"""Test stopping behavior through update_from_output"""
scheduler
=
create_scheduler
()
...
...
@@ -516,3 +611,99 @@ def test_schedule_concurrent_batches(enable_prefix_caching: Optional[bool],
prompt_logprobs_dict
=
{},
)
scheduler
.
update_from_output
(
scheduler_output1
,
model_runner_output
)
# Note - these test cases mirror some of those in test_rejection_sampler.py
@pytest.mark.parametrize(
    "spec_tokens,output_tokens,expected",
    [
        ([[1, 2, 3]], [[1, 2, 3, 4]], (3, 3)),  # perfect match
        ([[1, 2, 3]], [[1, 5]], (3, 1)),  # early mismatch
        ([[1, 2], [3]], [[1, 2, 5], [3, 4]], (3, 3)),  # multiple sequences
        ([[1]], [[1, 2]], (1, 1)),  # single token sequence
        ([[]], [[5]], (0, 0)),  # empty sequence
        ([[1, 2, 3], [4, 5, 6]], [[1, 2, 7], [4, 8]],
         (6, 3)),  # multiple mismatches
    ])
def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
    """Check scheduling and stats bookkeeping for speculative decoding.

    Args:
        spec_tokens: draft token ids returned by the model runner after the
            first step, one list per request.
        output_tokens: sampled token ids returned after the second step,
            one list per request.
        expected: ``(num_draft_tokens, num_accepted_tokens)`` that the
            scheduler stats must report after the second step; a draft
            count of 0 means no spec decoding stats are expected at all.

    The test verifies that:
    1. Speculated tokens get scheduled correctly
    2. Spec decoding stats properly count draft and accepted tokens
    """
    want_draft = expected[0]
    want_accepted = expected[1]

    scheduler = create_scheduler()
    requests = create_requests(num_requests=len(spec_tokens), num_tokens=1)

    req_ids = []
    req_to_index = {}
    for pos, req in enumerate(requests):
        scheduler.add_request(req)
        req_ids.append(req.request_id)
        req_to_index[req.request_id] = pos

    num_requests = len(requests)

    # Schedule a decode, which will also draft speculative tokens
    output = scheduler.schedule()
    assert len(output.scheduled_new_reqs) == num_requests
    assert output.total_num_scheduled_tokens == num_requests
    for req in requests:
        rid = req.request_id
        assert output.num_scheduled_tokens[rid] == 1
        assert rid not in output.scheduled_spec_decode_tokens

    first_runner_output = ModelRunnerOutput(
        req_ids=req_ids,
        req_id_to_index=req_to_index,
        sampled_token_ids=[[0] for _ in range(num_requests)],
        spec_token_ids=spec_tokens,
        logprobs=None,
        prompt_logprobs_dict={},
    )
    engine_core_outputs = scheduler.update_from_output(output,
                                                       first_runner_output)

    for pos, _ in enumerate(requests):
        running_req = scheduler.running[pos]
        # The prompt token
        assert running_req.num_computed_tokens == 1
        # The prompt token and the sampled token
        assert running_req.num_tokens == 2
        # The prompt token, the sampled token, and the speculated tokens
        assert running_req.num_tokens_with_spec == 2 + len(spec_tokens[pos])

    # No draft or accepted tokens counted yet
    assert engine_core_outputs.scheduler_stats.spec_decoding_stats is None

    # Schedule the speculated tokens for validation
    output = scheduler.schedule()
    assert len(output.scheduled_new_reqs) == 0
    # The sampled token and speculated tokens
    total_drafts = sum(len(ids) for ids in spec_tokens)
    assert output.total_num_scheduled_tokens == num_requests + total_drafts
    for pos, req in enumerate(requests):
        rid = req.request_id
        drafts = spec_tokens[pos]
        assert output.num_scheduled_tokens[rid] == 1 + len(drafts)
        if not drafts:
            assert rid not in output.scheduled_spec_decode_tokens
        else:
            assert len(output.scheduled_spec_decode_tokens[rid]) == \
                len(drafts)

    second_runner_output = ModelRunnerOutput(
        req_ids=req_ids,
        req_id_to_index=req_to_index,
        sampled_token_ids=output_tokens,
        spec_token_ids=None,
        logprobs=None,
        prompt_logprobs_dict={},
    )
    engine_core_outputs = scheduler.update_from_output(output,
                                                       second_runner_output)

    spec_stats = engine_core_outputs.scheduler_stats.spec_decoding_stats
    if want_draft == 0:
        assert spec_stats is None
        return
    assert spec_stats is not None
    assert spec_stats.num_draft_tokens == want_draft
    assert spec_stats.num_accepted_tokens == want_accepted
tests/v1/core/test_scheduler_e2e.py
0 → 100644
View file @
fcfc474d
# SPDX-License-Identifier: Apache-2.0
import
os
import
pytest
from
vllm
import
LLM
if
os
.
getenv
(
"VLLM_USE_V1"
,
"0"
)
!=
"1"
:
pytest
.
skip
(
"Test package requires V1"
,
allow_module_level
=
True
)
MODEL
=
"meta-llama/Llama-3.2-1B"
PROMPT
=
"Hello my name is Robert and I"
@pytest.fixture(scope="module")
def model() -> LLM:
    """Module-scoped LLM built from ``MODEL`` for the partial-prefill test.

    The engine is created in eager mode with prefix caching enabled, a
    long-prefill threshold of 2 tokens, a batch budget of 6 tokens and at
    most 3 sequences.
    """
    engine_kwargs = dict(
        enforce_eager=True,
        enable_prefix_caching=True,
        long_prefill_token_threshold=2,
        max_num_batched_tokens=6,
        max_num_seqs=3,
    )
    return LLM(MODEL, **engine_kwargs)
def test_concurrent_partial_prefill(model):
    """Generate for three copies of ``PROMPT`` and check the result shape.

    Expects one result per prompt, each holding exactly one completion.
    """
    num_prompts = 3
    results = model.generate([PROMPT] * num_prompts)
    assert len(results) == 3
    for result in results:
        completions = result.outputs
        assert len(completions) == 1
tests/v1/core/test_specialized_manager.py
0 → 100644
View file @
fcfc474d
# SPDX-License-Identifier: Apache-2.0
import
torch
from
vllm.v1.core.block_pool
import
BlockPool
from
vllm.v1.core.kv_cache_utils
import
BlockHashType
,
KVCacheBlock
from
vllm.v1.core.specialized_manager
import
SlidingWindowManager
from
vllm.v1.kv_cache_interface
import
SlidingWindowSpec
def test_sliding_window_possible_cached_prefix():
    """Check ``SlidingWindowManager.find_longest_cache_hit`` on mocked pools.

    Each case marks which block hashes are present in the block pool's
    cache and states how many blocks the returned prefix must contain. For
    the returned list, every entry except the last two must be the pool's
    null block, and the last (up to) two entries must be the real blocks
    that were registered for those positions.
    """
    spec = SlidingWindowSpec(
        block_size=2,
        num_kv_heads=1,
        head_size=1,
        dtype=torch.float32,
        sliding_window=4,
        use_mla=False,
    )
    block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
    manager = SlidingWindowManager(spec, block_pool)

    # Block at hash position p is backed by pool block p + id_offset.
    id_offset = 10

    def run_one_case(block_is_cached, expect_length):
        hashes = [BlockHashType(pos, ()) for pos in range(len(block_is_cached))]

        # Mock the block pool with the cached blocks
        cache = block_pool.cached_block_hash_to_block
        cache.clear()
        for pos, cached in enumerate(block_is_cached):
            if not cached:
                continue
            cache[hashes[pos]] = {pos: block_pool.blocks[pos + id_offset]}

        hit = manager.find_longest_cache_hit(hashes)
        assert len(hit) == expect_length

        # For expect_length < 2 the negative stop index yields an empty
        # slice here, because len(hit) == expect_length was just asserted.
        for leading in hit[:expect_length - 2]:
            assert leading == block_pool.null_block

        # The last one or two entries (fewer if the hit is shorter) must be
        # the blocks registered above.
        for back in range(min(2, expect_length)):
            pos = expect_length - back - 1
            assert hit[pos].block_id == pos + id_offset

    cases = [
        ([False] * 10, 0),
        ([True], 1),
        ([True, False], 1),
        ([True, True], 2),
        ([True, True, False], 2),
        ([True, True, True], 3),
        ([True, True, True, False], 3),
        ([
            True, True, False, True, False, False, True, True, False, True,
            True, True
        ], 12),
        ([
            True, True, False, True, False, False, True, True, False, False,
            False
        ], 8),
        ([
            True, True, False, True, False, False, True, True, False, False,
            False, True
        ], 8),
    ]
    for cached_flags, want_length in cases:
        run_one_case(cached_flags, want_length)
def test_sliding_window_remove_skipped_blocks():
    """Check which blocks ``remove_skipped_blocks`` drops as tokens advance.

    A block table of 11 blocks (ids 1000-1010) is passed repeatedly to
    ``SlidingWindowManager.remove_skipped_blocks`` with a growing number of
    computed tokens. After every call the test checks both the returned
    list of removed blocks and the in-place state of the block table, where
    removed entries are expected to be replaced by the pool's null block.
    """
    sliding_window_spec = SlidingWindowSpec(
        block_size=2,
        num_kv_heads=1,
        head_size=1,
        dtype=torch.float32,
        sliding_window=4,
        use_mla=False,
    )

    block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True)

    manager = SlidingWindowManager(sliding_window_spec, block_pool)

    null_block_id = block_pool.null_block.block_id

    def id_to_block_table(ids):
        # Build a block table from ids, mapping the null id to the shared
        # null block object of the pool.
        return [
            KVCacheBlock(id_)
            if id_ != null_block_id else block_pool.null_block
            for id_ in ids
        ]

    def assert_block_id(block_table, ids):
        # zip() stops at the shorter input, so without this length check a
        # comparison against [] would pass no matter what block_table holds.
        assert len(block_table) == len(ids)
        for block, id_ in zip(block_table, ids):
            if id_ == null_block_id:
                assert block == block_pool.null_block
            else:
                assert block.block_id == id_

    original_block_ids = [
        1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010
    ]
    block_table = id_to_block_table(original_block_ids)
    removed = manager.remove_skipped_blocks(block_table, 0)
    assert_block_id(removed, [])
    assert_block_id(block_table, original_block_ids)

    # 4 tokens are computed. Only token 0 is out of the sliding window. As
    # block 1000 also contains token 1 that is in the sliding window, block
    # 1000 cannot be removed.
    removed = manager.remove_skipped_blocks(block_table, 4)
    assert_block_id(removed, [])
    assert_block_id(block_table, original_block_ids)

    # 5 tokens are computed. Token 0 & 1 are out of the sliding window.
    # Block 1000 can be removed.
    removed = manager.remove_skipped_blocks(block_table, 5)
    assert_block_id(removed, [original_block_ids[0]])
    assert_block_id(block_table, [null_block_id] + original_block_ids[1:])

    # 6 tokens are computed. Token 0-2 are out of the sliding window.
    # Cannot remove new block as the block 1001 is still used by token 3.
    removed = manager.remove_skipped_blocks(block_table, 6)
    assert_block_id(removed, [])
    assert_block_id(block_table, [null_block_id] + original_block_ids[1:])

    # 7 tokens are computed. Token 0-3 are out of the sliding window.
    # Block 1001 can be removed and block 1000 is already removed.
    removed = manager.remove_skipped_blocks(block_table, 7)
    assert_block_id(removed, [original_block_ids[1]])
    assert_block_id(block_table,
                    [null_block_id] * 2 + original_block_ids[2:])

    # 11 tokens are computed. Token 0-7 are out of the sliding window.
    # Block 1002 & 1003 can be removed now. Block 1003 represents a longer
    # sequence, and is expected to be evicted earlier than 1002, so the
    # order of removed blocks should be [1003, 1002].
    removed = manager.remove_skipped_blocks(block_table, 11)
    assert_block_id(removed, [original_block_ids[3], original_block_ids[2]])
    assert_block_id(block_table,
                    [null_block_id] * 4 + original_block_ids[4:])
tests/v1/e2e/test_correctness_sliding_window.py
0 → 100644
View file @
fcfc474d
# SPDX-License-Identifier: Apache-2.0
from
dataclasses
import
dataclass
import
pytest
from
vllm
import
LLM
,
SamplingParams
from
...core.block.e2e.test_correctness_sliding_window
import
(
check_answers
,
prep_prompts
)
@dataclass
class TestConfig:
    """Per-model settings for the sliding-window correctness test."""

    # The class name starts with "Test", so pytest would otherwise try to
    # collect it as a test class and warn about its generated __init__.
    # This is a plain class attribute (no annotation), not a dataclass field.
    __test__ = False

    # Sliding window size passed to check_length for this model.
    sliding_window: int
    # Forwarded to prep_prompts as its ``ln_range`` argument.
    ln_range: tuple[int, int]
model_config
=
{
"bigcode/starcoder2-3b"
:
TestConfig
(
4096
,
(
800
,
1100
)),
"google/gemma-2-2b-it"
:
TestConfig
(
4096
,
(
400
,
800
)),
}
@pytest.mark.parametrize(
    "model",
    [
        "bigcode/starcoder2-3b",  # sliding window only
        "google/gemma-2-2b-it",  # sliding window + full attention
    ])
@pytest.mark.parametrize("batch_size", [5])
@pytest.mark.parametrize("seed", [1])
def test_sliding_window_retrival(monkeypatch, model, batch_size, seed):
    """Retrieve a value that lies outside the sliding window.

    The prompts consist of a long run of assignments ("x1 = 10", "x2 = 33",
    ...) followed by a question about the value of one of them, which is
    outside the sliding window. Because the prompt states upfront which
    variable will be asked for, the model answers correctly (mostly).

    The same prompts are generated twice: once fresh, and once more so that
    the second pass exercises prefix caching. ``seed`` is only a
    parametrization axis; the body does not read it.
    """
    with monkeypatch.context() as env:
        env.setenv("VLLM_USE_V1", "1")

        cfg = model_config[model]

        llm = LLM(model=model)
        greedy_params = SamplingParams(temperature=0.0, max_tokens=100)

        prompts, answer, indices = prep_prompts(batch_size,
                                                ln_range=cfg.ln_range)

        check_length(prompts, llm, cfg.sliding_window)

        # Pass 1 is a fresh generation; pass 2 re-generates with the same
        # prompts to test prefix caching.
        for _ in range(2):
            generated = [
                result.outputs[0].text
                for result in llm.generate(prompts, greedy_params)
            ]
            check_answers(indices, answer, generated, accept_rate=1.0)
def check_length(prompts: list[str], llm: LLM, sliding_window: int):
    """
    Check that the prompt lengths are suitable for the test: at least one
    prompt must be longer than the sliding window size, and every prompt
    must fit within the model's max length.

    Args:
        prompts: list of prompts
        llm: LLM object
        sliding_window: Sliding window size

    Raises:
        AssertionError: if no prompt exceeds the sliding window, or if any
            prompt is longer than the model's max length.
    """
    tokenizer = llm.get_tokenizer()
    max_model_len = llm.llm_engine.model_config.max_model_len

    # Tokenize every prompt once and reuse the lengths for both checks.
    prompt_lens = [len(tokenizer.encode(prompt)) for prompt in prompts]

    assert any(prompt_len > sliding_window
               for prompt_len in prompt_lens), "Prompt is too short for test"
    assert all(prompt_len <= max_model_len
               for prompt_len in prompt_lens), "Prompt is too long for test"
tests/v1/engine/test_engine_args.py
View file @
fcfc474d
# SPDX-License-Identifier: Apache-2.0
import
os
from
argparse
import
ArgumentError
import
pytest
from
vllm
import
envs
...
...
@@ -34,6 +36,24 @@ def test_prefix_caching_from_cli():
vllm_config
=
EngineArgs
.
from_cli_args
(
args
=
args
).
create_engine_config
()
assert
vllm_config
.
cache_config
.
enable_prefix_caching
# default hash algorithm is "builtin"
assert
vllm_config
.
cache_config
.
prefix_caching_hash_algo
==
"builtin"
# set hash algorithm to sha256
args
=
parser
.
parse_args
([
"--prefix-caching-hash-algo"
,
"sha256"
])
vllm_config
=
EngineArgs
.
from_cli_args
(
args
=
args
).
create_engine_config
()
assert
vllm_config
.
cache_config
.
prefix_caching_hash_algo
==
"sha256"
# set hash algorithm to builtin
args
=
parser
.
parse_args
([
"--prefix-caching-hash-algo"
,
"builtin"
])
vllm_config
=
EngineArgs
.
from_cli_args
(
args
=
args
).
create_engine_config
()
assert
vllm_config
.
cache_config
.
prefix_caching_hash_algo
==
"builtin"
# an invalid hash algorithm raises an error
parser
.
exit_on_error
=
False
with
pytest
.
raises
(
ArgumentError
):
args
=
parser
.
parse_args
([
"--prefix-caching-hash-algo"
,
"invalid"
])
def
test_defaults_with_usage_context
():
engine_args
=
EngineArgs
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
))
...
...
tests/v1/engine/test_engine_core.py
View file @
fcfc474d
...
...
@@ -233,8 +233,10 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
Test that the engine can handle multiple concurrent batches.
"""
def
make_request_with_max_tokens
(
max_tokens
:
int
)
->
EngineCoreRequest
:
def
make_request_with_max_tokens
(
req_id
:
int
,
max_tokens
:
int
)
->
EngineCoreRequest
:
request
=
make_request
()
request
.
request_id
=
req_id
request
.
sampling_params
.
max_tokens
=
max_tokens
return
request
...
...
@@ -281,6 +283,8 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
# Avoid all requests being scheduled once.
enable_prefix_caching
=
False
,
max_num_batched_tokens
=
10
,
# Reduce startup time.
enforce_eager
=
True
,
)
vllm_config
=
engine_args
.
create_engine_config
()
engine_core
=
EngineCore
(
vllm_config
=
vllm_config
,
...
...
@@ -288,13 +292,13 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
executor_class
=
DummyExecutor
)
assert
engine_core
.
batch_queue
is
not
None
# Add two requests in a row.
req
=
make_request_with_max_tokens
(
5
)
engine_core
.
add_request
(
req
)
req
=
make_request_with_max_tokens
(
5
)
engine_core
.
add_request
(
req
)
# Add two requests in a row.
Each request has 12 prompt tokens.
req
0
=
make_request_with_max_tokens
(
0
,
5
)
engine_core
.
add_request
(
req
0
)
req
1
=
make_request_with_max_tokens
(
1
,
5
)
engine_core
.
add_request
(
req
1
)
#
First saturate the batch queue.
#
Schedule Batch 1: (10, req0)
assert
engine_core
.
step_with_batch_queue
()
is
None
assert
engine_core
.
batch_queue
.
qsize
()
==
1
assert
engine_core
.
step_with_batch_queue
()
is
None
...
...
tests/v1/engine/test_engine_core_client.py
View file @
fcfc474d
...
...
@@ -169,11 +169,11 @@ def test_engine_core_client(monkeypatch: pytest.MonkeyPatch,
core_client
:
SyncMPClient
=
client
result
=
core_client
.
_
call_utility
(
"echo"
,
"testarg"
)
result
=
core_client
.
call_utility
(
"echo"
,
"testarg"
)
assert
result
==
"testarg"
with
pytest
.
raises
(
Exception
)
as
e_info
:
core_client
.
_
call_utility
(
"echo"
,
None
,
"help!"
)
core_client
.
call_utility
(
"echo"
,
None
,
"help!"
)
assert
str
(
e_info
.
value
)
==
"Call to echo method failed: help!"
...
...
@@ -240,10 +240,10 @@ async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
core_client
:
AsyncMPClient
=
client
result
=
await
core_client
.
_
call_utility_async
(
"echo"
,
"testarg"
)
result
=
await
core_client
.
call_utility_async
(
"echo"
,
"testarg"
)
assert
result
==
"testarg"
with
pytest
.
raises
(
Exception
)
as
e_info
:
await
core_client
.
_
call_utility_async
(
"echo"
,
None
,
"help!"
)
await
core_client
.
call_utility_async
(
"echo"
,
None
,
"help!"
)
assert
str
(
e_info
.
value
)
==
"Call to echo method failed: help!"
tests/v1/entrypoints/llm/test_struct_output_generate.py
View file @
fcfc474d
...
...
@@ -4,78 +4,76 @@ from __future__ import annotations
import
json
import
re
from
enum
import
Enum
from
typing
import
Any
import
jsonschema
import
pytest
from
pydantic
import
BaseModel
from
vllm.entrypoints.llm
import
LLM
from
vllm.outputs
import
RequestOutput
from
vllm.sampling_params
import
GuidedDecodingParams
,
SamplingParams
GUIDED_DECODING_BACKENDS_V1
=
[
"xgrammar"
,
"guidance"
]
MODELS_TO_TEST
=
[
"Qwen/Qwen2.5-1.5B-Instruct"
,
"mistralai/Ministral-8B-Instruct-2410"
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE
=
[
(
"mistralai/Ministral-8B-Instruct-2410"
,
"xgrammar:disable-any-whitespace"
,
"auto"
),
(
"mistralai/Ministral-8B-Instruct-2410"
,
"guidance:disable-any-whitespace"
,
"auto"
),
(
"mistralai/Ministral-8B-Instruct-2410"
,
"xgrammar:disable-any-whitespace"
,
"mistral"
),
(
"Qwen/Qwen2.5-1.5B-Instruct"
,
"xgrammar:disable-any-whitespace"
,
"auto"
),
#FIXME: This test is flaky on CI thus disabled
#("Qwen/Qwen2.5-1.5B-Instruct", "guidance:disable-any-whitespace", "auto"),
]
PARAMS_MODELS_TOKENIZER_MODE
=
[
(
"mistralai/Ministral-8B-Instruct-2410"
,
"auto"
),
(
"Qwen/Qwen2.5-1.5B-Instruct"
,
"auto"
),
]
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
GUIDED_DECODING_BACKENDS_V1
)
@
pytest
.
mark
.
parametrize
(
"model_name"
,
MODELS_TO_TEST
)
def
test_guided_json_completion
(
monkeypatch
:
pytest
.
MonkeyPatch
,
sample_json_schema
:
dict
[
str
,
Any
],
guided_decoding_backend
:
str
,
model_name
:
str
,
):
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
llm
=
LLM
(
model
=
model_name
,
max_model_len
=
1024
,
guided_decoding_backend
=
guided_decoding_backend
)
sampling_params
=
SamplingParams
(
temperature
=
1.0
,
max_tokens
=
1000
,
guided_decoding
=
GuidedDecodingParams
(
json
=
sample_json_schema
))
outputs
=
llm
.
generate
(
prompts
=
[
f
"Give an example JSON for an employee profile "
f
"that fits this schema:
{
sample_json_schema
}
"
]
*
2
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
assert
outputs
is
not
None
class
CarType
(
str
,
Enum
):
sedan
=
"sedan"
suv
=
"SUV"
truck
=
"Truck"
coupe
=
"Coupe"
for
output
in
outputs
:
assert
output
is
not
None
assert
isinstance
(
output
,
RequestOutput
)
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
assert
generated_text
is
not
None
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
output_json
=
json
.
loads
(
generated_text
)
jsonschema
.
validate
(
instance
=
output_json
,
schema
=
sample_json_schema
)
class
CarDescription
(
BaseModel
):
brand
:
str
model
:
str
car_type
:
CarType
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
GUIDED_DECODING_BACKENDS_V1
)
@
pytest
.
mark
.
parametrize
(
"model_name"
,
MODELS_TO_TEST
)
def
test_guided_json_completion_disable_any_whitespace
(
@
pytest
.
mark
.
parametrize
(
"model_name, guided_decoding_backend, tokenizer_mode"
,
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE
)
def
test_structured_output
(
monkeypatch
:
pytest
.
MonkeyPatch
,
sample_json_schema
:
dict
[
str
,
Any
],
unsupported_json_schema
:
dict
[
str
,
Any
],
sample_sql_ebnf
:
str
,
sample_sql_lark
:
str
,
sample_regex
:
str
,
sample_guided_choice
:
str
,
guided_decoding_backend
:
str
,
tokenizer_mode
:
str
,
model_name
:
str
,
):
if
guided_decoding_backend
!=
"xgrammar"
:
pytest
.
skip
(
"disable-any-whitespace is only supported for xgrammar."
)
guided_decoding_backend
=
'xgrammar:disable-any-whitespace'
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
# Use a single LLM instance for several scenarios to
# speed up the test suite.
llm
=
LLM
(
model
=
model_name
,
enforce_eager
=
True
,
max_model_len
=
1024
,
guided_decoding_backend
=
guided_decoding_backend
)
guided_decoding_backend
=
guided_decoding_backend
,
tokenizer_mode
=
tokenizer_mode
)
#
# Test 1: Generate JSON output based on a provided schema
#
sampling_params
=
SamplingParams
(
temperature
=
1.0
,
max_tokens
=
1000
,
...
...
@@ -96,25 +94,15 @@ def test_guided_json_completion_disable_any_whitespace(
generated_text
=
output
.
outputs
[
0
].
text
assert
generated_text
is
not
None
assert
"
\n
"
not
in
generated_text
if
'disable-any-whitespace'
in
guided_decoding_backend
:
assert
"
\n
"
not
in
generated_text
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
output_json
=
json
.
loads
(
generated_text
)
jsonschema
.
validate
(
instance
=
output_json
,
schema
=
sample_json_schema
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
GUIDED_DECODING_BACKENDS_V1
)
@
pytest
.
mark
.
parametrize
(
"model_name"
,
MODELS_TO_TEST
)
def
test_guided_json_object
(
monkeypatch
:
pytest
.
MonkeyPatch
,
guided_decoding_backend
:
str
,
model_name
:
str
,
):
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
llm
=
LLM
(
model
=
model_name
,
max_model_len
=
1024
,
guided_decoding_backend
=
guided_decoding_backend
)
#
# Test 2: Generate JSON object without a schema
#
sampling_params
=
SamplingParams
(
temperature
=
1.0
,
max_tokens
=
100
,
...
...
@@ -137,38 +125,18 @@ def test_guided_json_object(
print
(
generated_text
)
assert
generated_text
is
not
None
# Parse to verify it is valid JSON
# Parse to verify it is
a
valid JSON
object
parsed_json
=
json
.
loads
(
generated_text
)
allowed_types
:
tuple
[
type
,
...]
=
(
dict
,
)
if
guided_decoding_backend
==
"xgrammar"
:
# TODO - we are currently too permissive with xgrammar and
# allow # any valid json (typically comes back as a list or
# object). We can fix this by specifying a jsonschema of
# {"type": "object"}, # but we need this fix in a release
# first: https://github.com/mlc-ai/xgrammar/pull/264
allowed_types
=
(
dict
,
list
)
assert
isinstance
(
parsed_json
,
allowed_types
)
assert
isinstance
(
parsed_json
,
dict
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
GUIDED_DECODING_BACKENDS_V1
+
[
"auto"
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
MODELS_TO_TEST
)
def
test_guided_json_unsupported_schema
(
monkeypatch
:
pytest
.
MonkeyPatch
,
unsupported_json_schema
:
dict
[
str
,
Any
],
guided_decoding_backend
:
str
,
model_name
:
str
,
):
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
llm
=
LLM
(
model
=
model_name
,
max_model_len
=
1024
,
guided_decoding_backend
=
guided_decoding_backend
)
#
# Test 3: test a jsonschema incompatible with xgrammar
#
sampling_params
=
SamplingParams
(
temperature
=
1.0
,
max_tokens
=
1000
,
guided_decoding
=
GuidedDecodingParams
(
json
=
unsupported_json_schema
))
if
guided_decoding_backend
==
"xgrammar"
:
if
guided_decoding_backend
.
startswith
(
"xgrammar"
)
:
with
pytest
.
raises
(
ValueError
,
match
=
"The provided JSON schema contains features "
"not supported by xgrammar."
):
...
...
@@ -179,8 +147,6 @@ def test_guided_json_unsupported_schema(
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
else
:
# This should work for both "guidance" and "auto".
outputs
=
llm
.
generate
(
prompts
=
(
"Give an example JSON object for a grade "
"that fits this schema: "
...
...
@@ -199,21 +165,9 @@ def test_guided_json_unsupported_schema(
parsed_json
=
json
.
loads
(
generated_text
)
assert
isinstance
(
parsed_json
,
dict
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
GUIDED_DECODING_BACKENDS_V1
)
@
pytest
.
mark
.
parametrize
(
"model_name"
,
MODELS_TO_TEST
)
def
test_guided_grammar_ebnf
(
monkeypatch
:
pytest
.
MonkeyPatch
,
sample_sql_ebnf
:
str
,
guided_decoding_backend
:
str
,
model_name
:
str
,
):
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
llm
=
LLM
(
model
=
model_name
,
max_model_len
=
1024
,
guided_decoding_backend
=
guided_decoding_backend
)
#
# Test 4: Generate SQL statement using EBNF grammar
#
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
,
...
...
@@ -243,21 +197,9 @@ def test_guided_grammar_ebnf(
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
GUIDED_DECODING_BACKENDS_V1
)
@
pytest
.
mark
.
parametrize
(
"model_name"
,
MODELS_TO_TEST
)
def
test_guided_grammar_lark
(
monkeypatch
:
pytest
.
MonkeyPatch
,
sample_sql_lark
:
str
,
guided_decoding_backend
:
str
,
model_name
:
str
,
):
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
llm
=
LLM
(
model
=
model_name
,
max_model_len
=
1024
,
guided_decoding_backend
=
guided_decoding_backend
)
#
# Test 5: Generate SQL statement using Lark grammar
#
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
,
...
...
@@ -292,20 +234,9 @@ def test_guided_grammar_lark(
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
GUIDED_DECODING_BACKENDS_V1
)
@
pytest
.
mark
.
parametrize
(
"model_name"
,
MODELS_TO_TEST
)
def
test_guided_grammar_ebnf_invalid
(
monkeypatch
:
pytest
.
MonkeyPatch
,
guided_decoding_backend
:
str
,
model_name
:
str
,
):
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
llm
=
LLM
(
model
=
model_name
,
max_model_len
=
1024
,
guided_decoding_backend
=
guided_decoding_backend
)
#
# Test 6: Test invalid grammar input
#
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
,
...
...
@@ -319,21 +250,9 @@ def test_guided_grammar_ebnf_invalid(
use_tqdm
=
True
,
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
GUIDED_DECODING_BACKENDS_V1
)
@
pytest
.
mark
.
parametrize
(
"model_name"
,
MODELS_TO_TEST
)
def
test_guided_regex
(
monkeypatch
:
pytest
.
MonkeyPatch
,
sample_regex
:
str
,
guided_decoding_backend
:
str
,
model_name
:
str
,
):
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
llm
=
LLM
(
model
=
model_name
,
max_model_len
=
1024
,
guided_decoding_backend
=
guided_decoding_backend
)
#
# Test 7: Generate text based on a regex pattern
#
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
,
...
...
@@ -357,21 +276,9 @@ def test_guided_regex(
assert
re
.
fullmatch
(
sample_regex
,
generated_text
)
is
not
None
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
GUIDED_DECODING_BACKENDS_V1
)
@
pytest
.
mark
.
parametrize
(
"model_name"
,
MODELS_TO_TEST
)
def
test_guided_choice_completion
(
monkeypatch
:
pytest
.
MonkeyPatch
,
sample_guided_choice
:
str
,
guided_decoding_backend
:
str
,
model_name
:
str
,
):
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
llm
=
LLM
(
model
=
model_name
,
max_model_len
=
1024
,
guided_decoding_backend
=
guided_decoding_backend
)
#
# Test 8: Generate text based on a choices
#
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
,
...
...
@@ -390,3 +297,71 @@ def test_guided_choice_completion(
assert
generated_text
is
not
None
assert
generated_text
in
sample_guided_choice
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
#
# Test 9: Generate structured output using a Pydantic model with an enum
#
json_schema
=
CarDescription
.
model_json_schema
()
sampling_params
=
SamplingParams
(
temperature
=
1.0
,
max_tokens
=
1000
,
guided_decoding
=
GuidedDecodingParams
(
json
=
json_schema
))
outputs
=
llm
.
generate
(
prompts
=
"Generate a JSON with the brand, model and car_type of"
"the most iconic car from the 90's"
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
assert
outputs
is
not
None
for
output
in
outputs
:
assert
output
is
not
None
assert
isinstance
(
output
,
RequestOutput
)
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
assert
generated_text
is
not
None
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
output_json
=
json
.
loads
(
generated_text
)
jsonschema
.
validate
(
instance
=
output_json
,
schema
=
json_schema
)
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("model_name, tokenizer_mode",
                         PARAMS_MODELS_TOKENIZER_MODE)
def test_structured_output_auto_mode(
    monkeypatch: pytest.MonkeyPatch,
    unsupported_json_schema: dict[str, Any],
    model_name: str,
    tokenizer_mode: str,
):
    """Generate JSON for ``unsupported_json_schema`` with the "auto" backend.

    The engine is created with ``VLLM_USE_V1=1`` and
    ``guided_decoding_backend="auto"``; every returned completion must
    parse as a JSON object (a ``dict``).
    """
    monkeypatch.setenv("VLLM_USE_V1", "1")

    engine_kwargs = {
        "model": model_name,
        "max_model_len": 1024,
        "guided_decoding_backend": "auto",
        "tokenizer_mode": tokenizer_mode,
    }
    llm = LLM(**engine_kwargs)

    guided_params = GuidedDecodingParams(json=unsupported_json_schema)
    sampling_params = SamplingParams(temperature=1.0,
                                     max_tokens=1000,
                                     guided_decoding=guided_params)

    # This would fail with the default of "xgrammar", but in "auto"
    # we will handle fallback automatically.
    prompt_text = ("Give an example JSON object for a grade "
                   "that fits this schema: "
                   f"{unsupported_json_schema}")
    request_outputs = llm.generate(prompts=prompt_text,
                                   sampling_params=sampling_params,
                                   use_tqdm=True)
    assert request_outputs is not None

    for request_output in request_outputs:
        assert request_output is not None
        assert isinstance(request_output, RequestOutput)

        completion_text = request_output.outputs[0].text
        assert completion_text is not None
        print(completion_text)

        # The completion has to decode as JSON, and specifically as an object.
        decoded = json.loads(completion_text)
        assert isinstance(decoded, dict)
tests/v1/sample/test_topk_topp_sampler.py
0 → 100644
View file @
fcfc474d
# SPDX-License-Identifier: Apache-2.0
import
torch
from
torch
import
Generator
from
vllm.v1.sample.ops.topk_topp_sampler
import
apply_top_k_top_p
# Device on which the test below creates its generator and tensors.
DEVICE = "cuda"

# Shape of the random logits used by the test: (BATCH_SIZE, VOCAB_SIZE).
BATCH_SIZE = 1024
VOCAB_SIZE = 128 * 1024
def test_topk_impl_equivalance():
    """Compare `apply_top_k_top_p` with `p=None` against `p=[1.0]`.

    Both calls receive a clone of the same seeded random logits and the
    same per-request `k`; the two results must be `torch.allclose`.
    """
    with torch.device(DEVICE):
        rng = Generator(device=DEVICE).manual_seed(33)
        per_request_shape = (BATCH_SIZE, )

        logits = torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=rng)

        # Random top-k values between 1 and 9.
        top_k = torch.randint(1, 10, per_request_shape, generator=rng)

        # Set k=vocab_size for ~50% of requests in the batch (top-k disabled).
        disable_top_k = torch.randint(0,
                                      2,
                                      per_request_shape,
                                      generator=rng,
                                      dtype=bool)
        top_k.masked_fill_(disable_top_k, VOCAB_SIZE)

        # Top-k only implementation
        top_k_only = apply_top_k_top_p(logits=logits.clone(),
                                       k=top_k,
                                       p=None)

        # Top-p + top-k
        no_op_top_p = torch.tensor([1.0])
        top_k_and_top_p = apply_top_k_top_p(logits=logits.clone(),
                                            k=top_k,
                                            p=no_op_top_p)

        assert torch.allclose(top_k_only, top_k_and_top_p)
tests/v1/structured_output/test_utils.py
View file @
fcfc474d
...
...
@@ -13,10 +13,6 @@ def unsupported_string_schemas():
"type"
:
"string"
,
"pattern"
:
"^[a-zA-Z]+$"
},
{
"type"
:
"string"
,
"enum"
:
[
"active"
,
"inactive"
,
"pending"
]
},
{
"type"
:
"string"
,
"minLength"
:
1
...
...
@@ -164,6 +160,10 @@ def supported_schema():
"type"
:
"number"
}
},
"car_type"
:
{
"type"
:
"string"
,
"enum"
:
[
"sedan"
,
"suv"
,
"truck"
]
},
"address"
:
{
"type"
:
"object"
,
"properties"
:
{
...
...
tests/v1/test_async_llm_dp.py
0 → 100644
View file @
fcfc474d
# SPDX-License-Identifier: Apache-2.0
import
asyncio
import
os
from
contextlib
import
ExitStack
from
typing
import
Optional
import
pytest
from
vllm
import
SamplingParams
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.inputs
import
PromptType
from
vllm.platforms
import
current_platform
from
vllm.sampling_params
import
RequestOutputKind
from
vllm.v1.engine.async_llm
import
AsyncLLM
from
vllm.v1.engine.core_client
import
DPAsyncMPClient
# Engine configuration shared by the tests in this module.
# The TP_SIZE and DP_SIZE environment variables override the tensor- and
# data-parallel sizes; when unset they default to 1 and 2 respectively.
engine_args = AsyncEngineArgs(
    model="ibm-research/PowerMoE-3b",
    enforce_eager=True,
    disable_log_requests=True,
    tensor_parallel_size=int(os.getenv("TP_SIZE", 1)),
    data_parallel_size=int(os.getenv("DP_SIZE", 2)),
)

# Skip the whole module at import time when the current platform reports
# that it does not support V1 for this model configuration.
if not current_platform.supports_v1(engine_args.create_model_config()):
    pytest.skip(reason="Requires V1-supporting platform.",
                allow_module_level=True)
async def generate(engine: AsyncLLM,
                   request_id: str,
                   prompt: PromptType,
                   output_kind: RequestOutputKind,
                   max_tokens: int,
                   prompt_logprobs: Optional[int] = None) -> tuple[int, str]:
    """Run one request through `engine` and count the generated tokens.

    Args:
        engine: Engine whose `generate` async iterator is consumed.
        request_id: Identifier forwarded to the engine and echoed back.
        prompt: Prompt forwarded to the engine.
        output_kind: With `RequestOutputKind.DELTA` the per-output token
            counts are summed; otherwise the count of the latest output
            is used.
        max_tokens: Forwarded to `SamplingParams(max_tokens=...)`.
        prompt_logprobs: Forwarded to `SamplingParams(prompt_logprobs=...)`.

    Returns:
        A `(token_count, request_id)` tuple.
    """
    # Ensure generate doesn't complete too fast for cancellation test.
    await asyncio.sleep(0.2)

    params = SamplingParams(max_tokens=max_tokens,
                            ignore_eos=True,
                            output_kind=output_kind,
                            temperature=0,
                            prompt_logprobs=prompt_logprobs)
    accumulate = output_kind == RequestOutputKind.DELTA

    total_tokens = 0
    stream = engine.generate(request_id=request_id,
                             prompt=prompt,
                             sampling_params=params)
    async for chunk in stream:
        chunk_tokens = len(chunk.outputs[0].token_ids)
        # Delta outputs each carry only new tokens, so add them up;
        # otherwise the latest output's length replaces the running value.
        total_tokens = (total_tokens +
                        chunk_tokens if accumulate else chunk_tokens)

        # Yield control to the event loop between outputs.
        await asyncio.sleep(0.)

    return total_tokens, request_id
@pytest.mark.parametrize(
    "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
@pytest.mark.asyncio
async def test_load(output_kind: RequestOutputKind):
    """Send 100 concurrent requests and check each yields 10 tokens.

    After the requests complete, the output processor must have no
    unfinished requests and the core client must report no running
    engines and no requests in flight.
    """
    with ExitStack() as after:

        prompt = "This is a test of data parallel"

        engine = AsyncLLM.from_engine_args(engine_args)
        # Make sure the engine is shut down when the block exits.
        after.callback(engine.shutdown)

        NUM_REQUESTS = 100
        NUM_EXPECTED_TOKENS = 10

        # Create concurrent requests.
        tasks = [
            asyncio.create_task(
                generate(engine, f"request-{i}", prompt, output_kind,
                         NUM_EXPECTED_TOKENS)) for i in range(NUM_REQUESTS)
        ]

        # Confirm that we got all the EXPECTED tokens from the requests.
        finished, unfinished = await asyncio.wait(
            tasks, return_when=asyncio.FIRST_EXCEPTION)
        for leftover in unfinished:
            leftover.cancel()
        for completed in finished:
            num_generated_tokens, request_id = await completed
            assert num_generated_tokens == NUM_EXPECTED_TOKENS, (
                f"{request_id} generated {num_generated_tokens} but "
                f"expected {NUM_EXPECTED_TOKENS}")

        assert not engine.output_processor.has_unfinished_requests()

        # testing internals here which may break
        core_client: DPAsyncMPClient = engine.engine_core
        # the engines only synchronize stopping every N steps so
        # allow a small amount of time here.
        polls_left = 10
        while polls_left and core_client.num_engines_running != 0:
            await asyncio.sleep(0.5)
            polls_left -= 1

        assert core_client.num_engines_running == 0
        assert not core_client.reqs_in_flight
tests/v1/test_oracle.py
View file @
fcfc474d
...
...
@@ -49,7 +49,9 @@ def test_unsupported_configs(monkeypatch):
with
pytest
.
raises
(
NotImplementedError
):
AsyncEngineArgs
(
model
=
MODEL
,
speculative_model
=
MODEL
,
speculative_config
=
{
"model"
:
MODEL
,
},
).
create_engine_config
()
with
pytest
.
raises
(
NotImplementedError
):
...
...
@@ -102,14 +104,6 @@ def test_enable_by_default_fallback(monkeypatch):
assert
envs
.
VLLM_USE_V1
m
.
delenv
(
"VLLM_USE_V1"
)
# Should fall back to V0 for experimental config.
_
=
AsyncEngineArgs
(
model
=
MODEL
,
enable_lora
=
True
,
).
create_engine_config
()
assert
not
envs
.
VLLM_USE_V1
m
.
delenv
(
"VLLM_USE_V1"
)
# Should fall back to V0 for supported model.
_
=
AsyncEngineArgs
(
model
=
UNSUPPORTED_MODELS_V1
[
0
]).
create_engine_config
()
...
...
@@ -123,7 +117,7 @@ def test_v1_llm_by_default(monkeypatch):
m
.
delenv
(
"VLLM_USE_V1"
)
# Should default to V1 for supported config.
model
=
LLM
(
MODEL
,
enforce_eager
=
True
)
model
=
LLM
(
MODEL
,
enforce_eager
=
True
,
enable_lora
=
True
)
print
(
model
.
generate
(
"Hello my name is"
))
assert
hasattr
(
model
.
llm_engine
,
"engine_core"
)
m
.
delenv
(
"VLLM_USE_V1"
)
...
...
tests/v1/tpu/test_basic.py
View file @
fcfc474d
...
...
@@ -31,14 +31,12 @@ TENSOR_PARALLEL_SIZES = [1]
reason
=
"This is a basic test for TPU only"
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"enforce_eager"
,
[
True
])
@
pytest
.
mark
.
parametrize
(
"tensor_parallel_size"
,
TENSOR_PARALLEL_SIZES
)
def
test_
models
(
def
test_
basic
(
vllm_runner
:
type
[
VllmRunner
],
monkeypatch
:
pytest
.
MonkeyPatch
,
model
:
str
,
max_tokens
:
int
,
enforce_eager
:
bool
,
tensor_parallel_size
:
int
,
)
->
None
:
prompt
=
"The next numbers of the sequence "
+
", "
.
join
(
...
...
@@ -50,12 +48,15 @@ def test_models(
with
vllm_runner
(
model
,
max_model_len
=
8192
,
enforce_eager
=
enforce_eager
,
# Note: max_num_batched_tokens == 1024 is needed here to
# actually test chunked prompt
max_num_batched_tokens
=
1024
,
max_model_len
=
8196
,
gpu_memory_utilization
=
0.7
,
max_num_seqs
=
16
,
tensor_parallel_size
=
tensor_parallel_size
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
output
=
vllm_outputs
[
0
][
1
]
assert
"1024"
in
output
assert
"1024"
in
output
or
"0, 1"
in
output
tests/v1/tpu/test_pallas.py
0 → 100644
View file @
fcfc474d
# SPDX-License-Identifier: Apache-2.0
from
unittest.mock
import
ANY
,
patch
import
torch
from
vllm.attention.backends.abstract
import
AttentionType
from
vllm.v1.attention.backends.pallas
import
(
NUM_KV_PAGES_PER_BLOCK
,
NUM_QUERIES_PER_BLOCK
,
PallasAttentionBackendImpl
,
PallasMetadata
)
def test_ragged_paged_attention():
    """Check the arguments `forward` passes to the ragged paged attention op.

    We verify that the kernel inputs such as sliding_window, etc. are passed
    in from the model correctly. The correctness of the paged attention
    kernel is tested in the kernel library.
    """
    # Attention hyper-parameters handed to the backend implementation.
    num_heads = 4
    num_kv_heads = 4
    head_size = 128
    scale = 1.0
    sliding_window = 128
    logits_soft_cap = 50.0

    attn_impl = PallasAttentionBackendImpl(
        num_heads=num_heads,
        head_size=head_size,
        scale=scale,
        num_kv_heads=num_kv_heads,
        alibi_slopes=None,
        sliding_window=sliding_window,
        kv_cache_dtype="auto",
        logits_soft_cap=logits_soft_cap,
        attn_type=AttentionType.DECODER,
    )
    # Overwrite the limit with a known value so it can be asserted on below.
    mock_vmem_limit_bytes = 1024
    attn_impl.vmem_limit_bytes = mock_vmem_limit_bytes

    class FakeAttentionLayer:
        _k_scale_float: float
        _v_scale_float: float

    layer = FakeAttentionLayer()
    layer._k_scale_float = 1.0
    layer._v_scale_float = 1.0

    # Dummy inputs for the forward call.
    num_tokens = 16
    num_blocks = 1024
    block_size = 16
    kv_width = num_kv_heads * head_size
    query = torch.zeros(num_tokens, num_heads * head_size)
    key = torch.zeros(num_tokens, kv_width)
    value = torch.zeros(num_tokens, kv_width)
    kv_cache = torch.zeros(num_blocks, block_size, num_kv_heads * 2, head_size)
    slot_mapping = torch.zeros(num_tokens, dtype=torch.int64)

    # Per-request metadata: every request has query length 1.
    max_num_reqs = 8
    max_num_blocks_per_req = 8
    block_tables = torch.zeros((max_num_reqs, max_num_blocks_per_req),
                               dtype=torch.int32)
    context_lens = torch.ones((max_num_reqs, ), dtype=torch.int32)
    query_lens = [1] * max_num_reqs
    padded_query_lens = torch.tensor([0] + query_lens, dtype=torch.int32)
    query_start_loc = torch.cumsum(padded_query_lens,
                                   dim=0,
                                   dtype=torch.int32)
    num_seqs = torch.tensor([max_num_reqs], dtype=torch.int32)
    attn_metadata = PallasMetadata(
        slot_mapping=slot_mapping,
        block_tables=block_tables,
        context_lens=context_lens,
        query_start_loc=query_start_loc,
        num_seqs=num_seqs,
    )

    # Keyword arguments the patched op is expected to receive.
    expected_kwargs = dict(
        num_kv_pages_per_block=NUM_KV_PAGES_PER_BLOCK,
        num_queries_per_block=NUM_QUERIES_PER_BLOCK,
        vmem_limit_bytes=mock_vmem_limit_bytes,
        use_kernel=True,
        sm_scale=scale,
        sliding_window=sliding_window,
        soft_cap=logits_soft_cap,
    )
    # Positional arguments are not inspected: query, kv_cache, context_lens,
    # block_tables, query_start_loc, num_seqs.
    expected_positional = (ANY, ) * 6

    with patch("torch.ops.xla.ragged_paged_attention") as mock_kernel:
        attn_impl.forward(
            layer=layer,
            query=query,
            key=key,
            value=value,
            kv_cache=kv_cache,
            attn_metadata=attn_metadata,
        )

        mock_kernel.assert_called_once_with(*expected_positional,
                                            **expected_kwargs)
tests/v1/tpu/test_perf.py
0 → 100644
View file @
fcfc474d
# SPDX-License-Identifier: Apache-2.0
"""A basic performance regression test for TPUs
Run `pytest tests/v1/tpu/test_perf.py`.
"""
from
__future__
import
annotations
import
time
from
dataclasses
import
dataclass
from
typing
import
TYPE_CHECKING
import
numpy
as
np
import
pytest
from
vllm.platforms
import
current_platform
from
vllm.sampling_params
import
SamplingParams
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
if
TYPE_CHECKING
:
from
tests.conftest
import
VllmRunner
@dataclass
class TestParams:
    """Parameters and expected timing for one `test_perf` case."""

    # The class name matches pytest's default "Test*" class pattern, so
    # pytest would try to collect it (and warn, since it has an __init__).
    # Unannotated, so this is a plain class attribute, not a dataclass field.
    __test__ = False

    # Model name passed to the tokenizer loader and to the runner.
    model: str
    # Number of random prompts to generate.
    num_prompts: int
    # Number of random token ids decoded into each prompt.
    prefix_len: int
    # Used as `max_tokens` for sampling.
    decode_len: int
    # Expected average duration of one `generate` call, in seconds.
    expected_avg_time: float
    # Allowed slowdown over `expected_avg_time` before the test fails.
    err_tol: float
# Cases fed to `test_perf` through `pytest.mark.parametrize`.
# Only one case is active; the others are commented out (see the TODO below).
TEST_PARAMS = [
    # TODO: Cannot run a series of tests because:
    #   RuntimeError: Bad StatusOr access: UNKNOWN: TPU initialization failed:
    #   open(/dev/vfio/0): Device or resource busy: Device or resource busy;
    #   Couldn't open iommu group /dev/vfio/0
    # => Investigate

    # TestParams(
    #     model="Qwen/Qwen2.5-1.5B-Instruct",
    #     num_prompts=1,
    #     prefix_len=10,
    #     decode_len=5,
    #     expected_avg_time=0.03,
    #     err_tol=0.01,
    # ),
    # TestParams(
    #     model="Qwen/Qwen2.5-1.5B-Instruct",
    #     num_prompts=10,
    #     prefix_len=100,
    #     decode_len=50,
    #     expected_avg_time=0.234,
    #     err_tol=0.020,
    # ),
    TestParams(
        model="Qwen/Qwen2.5-1.5B-Instruct",
        num_prompts=64,
        prefix_len=500,
        decode_len=50,

        # (This is the active CI/CD instance)
        # commit id: ccb246776d93ef105904a8ec015b3587240a1183
        # tpu: v5lite (vllm CI/CD)
        expected_avg_time=1.4,
        err_tol=0.30,

        # (TODO: There is no v6e in CI/CD currently)
        # commit id: ccb246776d93ef105904a8ec015b3587240a1183
        # tpu: v6e
        # expected_avg_time=1.5,
        # err_tol=0.20,
    ),
]
# Number of untimed `generate` calls made before measuring.
NUM_WARMUPS = 5
# Number of timed `generate` calls that are averaged.
NUM_RUNS = 10

# Runner settings: MAX_MODEL_LEN is used for both `max_model_len` and
# `max_num_batched_tokens`; the others map to `max_num_seqs` and
# `gpu_memory_utilization`.
MAX_MODEL_LEN = 1024
MAX_NUM_SEQS = 32
GPU_UTIL = 0.9
@pytest.mark.skipif(not current_platform.is_tpu(),
                    reason="This is a basic performance test for TPU only")
@pytest.mark.parametrize("params", TEST_PARAMS)
def test_perf(
    vllm_runner: type[VllmRunner],
    monkeypatch: pytest.MonkeyPatch,
    params: TestParams,
) -> None:
    """Fail when the average `generate` time regresses beyond tolerance.

    Builds `params.num_prompts` prompts by decoding random token ids, runs
    `NUM_WARMUPS` untimed and `NUM_RUNS` timed `generate` calls, and
    compares the average duration with `params.expected_avg_time`.

    Args:
        vllm_runner: Fixture used as a context manager yielding the model.
        monkeypatch: Used to set `VLLM_USE_V1=1` for the duration of the run.
        params: Workload description and timing expectation.

    Raises:
        AssertionError: If the average time exceeds the expected time by
            `params.err_tol` or more.
    """
    tokenizer = get_tokenizer(params.model,
                              tokenizer_mode="auto",
                              trust_remote_code=True)

    # Each prompt is `prefix_len` random token ids decoded back to text.
    prompts = [
        tokenizer.decode(
            np.random.randint(0, tokenizer.vocab_size,
                              size=params.prefix_len).tolist())
        for _ in range(params.num_prompts)
    ]

    print(f"-- Running: num_prompts = {len(prompts)} "
          f"prefix_len = {params.prefix_len} "
          f"decode_len = {params.decode_len}")

    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")

        sampling_params = SamplingParams(max_tokens=params.decode_len,
                                         temperature=1.0,
                                         min_p=0.0)

        with vllm_runner(params.model,
                         max_num_batched_tokens=MAX_MODEL_LEN,
                         max_model_len=MAX_MODEL_LEN,
                         max_num_seqs=MAX_NUM_SEQS,
                         gpu_memory_utilization=GPU_UTIL,
                         enforce_eager=False,
                         tensor_parallel_size=1) as vllm_model:
            print(" -- Warmup / Compile")
            for _ in range(NUM_WARMUPS):
                vllm_model.generate(prompts, sampling_params)

            print(" -- Benchmarking... ")
            times = []
            for _ in range(NUM_RUNS):
                # perf_counter is monotonic, so a system clock adjustment
                # during the run cannot distort the measured interval.
                start_time = time.perf_counter()
                vllm_model.generate(prompts, sampling_params)
                times.append(time.perf_counter() - start_time)

            avg_time = sum(times) / len(times)

            print(f" -- avg_time = {avg_time}")
            print(f" -- expected_avg_time = {params.expected_avg_time} "
                  f"with err_tol = {params.err_tol}")

            # Positive diff means slower than expected.
            diff = avg_time - params.expected_avg_time
            ok = diff < params.err_tol
            if diff < -params.err_tol:
                # Faster than the tolerance band: not a failure, but the
                # baseline is probably stale.
                print(f" !! WARNING !! Performance has improved by {-diff}, "
                      "it may be necessary to fine-tune the "
                      f"expected_avg_time = {params.expected_avg_time}")

            assert ok, " !! ERROR !! Regression detected"
Prev
1
…
9
10
11
12
13
14
15
16
17
…
26
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment