Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
b9e12416
Commit
b9e12416
authored
May 31, 2024
by
zhuwenwen
Browse files
merge v0.4.3
parents
e5d707db
e9d3aa04
Changes
345
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1182 additions
and
190 deletions
+1182
-190
tests/async_engine/test_chat_template.py
tests/async_engine/test_chat_template.py
+13
-17
tests/async_engine/test_openapi_server_ray.py
tests/async_engine/test_openapi_server_ray.py
+8
-51
tests/basic_correctness/__init__.py
tests/basic_correctness/__init__.py
+0
-0
tests/basic_correctness/test_basic_correctness.py
tests/basic_correctness/test_basic_correctness.py
+13
-0
tests/basic_correctness/test_preemption.py
tests/basic_correctness/test_preemption.py
+43
-1
tests/conftest.py
tests/conftest.py
+153
-37
tests/core/block/e2e/__init__.py
tests/core/block/e2e/__init__.py
+0
-0
tests/core/block/e2e/conftest.py
tests/core/block/e2e/conftest.py
+28
-1
tests/core/block/e2e/test_correctness.py
tests/core/block/e2e/test_correctness.py
+2
-9
tests/core/block/e2e/test_correctness_sliding_window.py
tests/core/block/e2e/test_correctness_sliding_window.py
+168
-0
tests/core/block/test_block_manager_v2.py
tests/core/block/test_block_manager_v2.py
+222
-1
tests/core/block/test_block_table.py
tests/core/block/test_block_table.py
+2
-4
tests/core/block/test_prefix_caching_block.py
tests/core/block/test_prefix_caching_block.py
+117
-0
tests/core/test_block_manager.py
tests/core/test_block_manager.py
+243
-13
tests/core/test_chunked_prefill_scheduler.py
tests/core/test_chunked_prefill_scheduler.py
+16
-16
tests/core/test_scheduler.py
tests/core/test_scheduler.py
+24
-23
tests/core/utils.py
tests/core/utils.py
+112
-8
tests/distributed/__init__.py
tests/distributed/__init__.py
+0
-0
tests/distributed/test_basic_distributed_correctness.py
tests/distributed/test_basic_distributed_correctness.py
+14
-9
tests/distributed/test_chunked_prefill_distributed.py
tests/distributed/test_chunked_prefill_distributed.py
+4
-0
No files found.
Too many changes to show.
To preserve performance only
345 of 345+
files are displayed.
Plain diff
Email patch
tests/async_engine/test_chat_template.py
View file @
b9e12416
...
...
@@ -60,13 +60,12 @@ class MockServingChat:
tokenizer
:
MockTokenizer
@
pytest
.
mark
.
asyncio
async
def
test_load_chat_template
():
def
test_load_chat_template
():
# Testing chatml template
tokenizer
=
MockTokenizer
()
mock_serving_chat
=
MockServingChat
(
tokenizer
)
await
OpenAIServingChat
.
_load_chat_template
(
mock_serving_chat
,
chat_template
=
chatml_jinja_path
)
OpenAIServingChat
.
_load_chat_template
(
mock_serving_chat
,
chat_template
=
chatml_jinja_path
)
template_content
=
tokenizer
.
chat_template
...
...
@@ -77,8 +76,7 @@ async def test_load_chat_template():
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant
\\
n' }}{% endif %}"""
# noqa: E501
@
pytest
.
mark
.
asyncio
async
def
test_no_load_chat_template_filelike
():
def
test_no_load_chat_template_filelike
():
# Testing chatml template
template
=
"../../examples/does_not_exist"
tokenizer
=
MockTokenizer
()
...
...
@@ -86,35 +84,33 @@ async def test_no_load_chat_template_filelike():
mock_serving_chat
=
MockServingChat
(
tokenizer
)
with
pytest
.
raises
(
ValueError
,
match
=
"looks like a file path"
):
await
OpenAIServingChat
.
_load_chat_template
(
mock_serving_chat
,
chat_template
=
template
)
OpenAIServingChat
.
_load_chat_template
(
mock_serving_chat
,
chat_template
=
template
)
@
pytest
.
mark
.
asyncio
async
def
test_no_load_chat_template_literallike
():
def
test_no_load_chat_template_literallike
():
# Testing chatml template
template
=
"{{ messages }}"
tokenizer
=
MockTokenizer
()
mock_serving_chat
=
MockServingChat
(
tokenizer
)
await
OpenAIServingChat
.
_load_chat_template
(
mock_serving_chat
,
chat_template
=
template
)
OpenAIServingChat
.
_load_chat_template
(
mock_serving_chat
,
chat_template
=
template
)
template_content
=
tokenizer
.
chat_template
assert
template_content
==
template
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model,template,add_generation_prompt,expected_output"
,
MODEL_TEMPLATE_GENERATON_OUTPUT
)
async
def
test_get_gen_prompt
(
model
,
template
,
add_generation_prompt
,
expected_output
):
def
test_get_gen_prompt
(
model
,
template
,
add_generation_prompt
,
expected_output
):
# Initialize the tokenizer
tokenizer
=
get_tokenizer
(
tokenizer_name
=
model
)
mock_serving_chat
=
MockServingChat
(
tokenizer
)
await
OpenAIServingChat
.
_load_chat_template
(
mock_serving_chat
,
chat_template
=
template
)
OpenAIServingChat
.
_load_chat_template
(
mock_serving_chat
,
chat_template
=
template
)
# Create a mock request object using keyword arguments
mock_request
=
ChatCompletionRequest
(
...
...
tests/async_engine/test_openapi_server_ray.py
View file @
b9e12416
# imports for guided decoding tests
import
os
import
subprocess
import
sys
import
time
import
openai
# use the official client for correctness check
import
pytest
# using Ray for overall ease of process management, parallel requests,
# and debugging.
import
ray
import
requests
MAX_SERVER_START_WAIT_S
=
600
# wait for server to start for 600 seconds
from
..utils
import
ServerRunner
# any model with a chat template should work here
MODEL_NAME
=
"facebook/opt-125m"
@
ray
.
remote
(
num_gpus
=
1
)
class
ServerRunner
:
def
__init__
(
self
,
args
):
env
=
os
.
environ
.
copy
()
env
[
"PYTHONUNBUFFERED"
]
=
"1"
self
.
proc
=
subprocess
.
Popen
(
[
"python3"
,
"-m"
,
"vllm.entrypoints.openai.api_server"
]
+
args
,
env
=
env
,
stdout
=
sys
.
stdout
,
stderr
=
sys
.
stderr
,
)
self
.
_wait_for_server
()
def
ready
(
self
):
return
True
def
_wait_for_server
(
self
):
# run health check
start
=
time
.
time
()
while
True
:
try
:
if
requests
.
get
(
"http://localhost:8000/health"
).
status_code
==
200
:
break
except
Exception
as
err
:
if
self
.
proc
.
poll
()
is
not
None
:
raise
RuntimeError
(
"Server exited unexpectedly."
)
from
err
time
.
sleep
(
0.5
)
if
time
.
time
()
-
start
>
MAX_SERVER_START_WAIT_S
:
raise
RuntimeError
(
"Server failed to start in time."
)
from
err
def
__del__
(
self
):
if
hasattr
(
self
,
"proc"
):
self
.
proc
.
terminate
()
@
pytest
.
fixture
(
scope
=
"session"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
ray
.
init
()
server_runner
=
ServerRunner
.
remote
([
...
...
@@ -74,7 +29,7 @@ def server():
ray
.
shutdown
()
@
pytest
.
fixture
(
scope
=
"
session
"
)
@
pytest
.
fixture
(
scope
=
"
module
"
)
def
client
():
client
=
openai
.
AsyncOpenAI
(
base_url
=
"http://localhost:8000/v1"
,
...
...
@@ -139,8 +94,10 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI):
chat_completion
.
choices
)
==
1
assert
chat_completion
.
choices
[
0
].
message
is
not
None
assert
chat_completion
.
choices
[
0
].
logprobs
is
not
None
assert
chat_completion
.
choices
[
0
].
logprobs
.
top_logprobs
is
not
None
assert
len
(
chat_completion
.
choices
[
0
].
logprobs
.
top_logprobs
[
0
])
==
5
assert
chat_completion
.
choices
[
0
].
logprobs
.
content
[
0
].
top_logprobs
is
not
None
assert
len
(
chat_completion
.
choices
[
0
].
logprobs
.
content
[
0
].
top_logprobs
)
==
5
message
=
chat_completion
.
choices
[
0
].
message
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
10
assert
message
.
role
==
"assistant"
...
...
tests/basic_correctness/__init__.py
0 → 100644
View file @
b9e12416
tests/basic_correctness/test_basic_correctness.py
View file @
b9e12416
...
...
@@ -3,9 +3,12 @@
Run `pytest tests/basic_correctness/test_basic_correctness.py`.
"""
import
os
import
weakref
import
pytest
from
vllm
import
LLM
MODELS
=
[
"facebook/opt-125m"
,
"meta-llama/Llama-2-7b-hf"
,
...
...
@@ -13,6 +16,16 @@ MODELS = [
VLLM_ATTENTION_BACKEND
=
"VLLM_ATTENTION_BACKEND"
def
test_vllm_gc_ed
():
"""Verify vllm instance is GC'ed when it is deleted"""
llm
=
LLM
(
"facebook/opt-125m"
)
weak_llm
=
weakref
.
ref
(
llm
)
del
llm
# If there's any circular reference to vllm, this fails
# because llm instance is not GC'ed.
assert
weak_llm
()
is
None
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
5
])
...
...
tests/basic_correctness/test_preemption.py
View file @
b9e12416
...
...
@@ -6,6 +6,7 @@ Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1
pytest tests/basic_correctness/test_preemption.py`.
"""
import
pytest
from
prometheus_client
import
REGISTRY
from
vllm
import
SamplingParams
from
vllm.core.scheduler
import
(
ARTIFICIAL_PREEMPTION_MAX_CNT
,
...
...
@@ -71,6 +72,7 @@ def test_chunked_prefill_recompute(
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
96
])
def
test_preemption
(
caplog_vllm
,
hf_runner
,
vllm_runner
,
example_prompts
,
...
...
@@ -87,10 +89,13 @@ def test_preemption(
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
,
disable_log_stats
=
False
,
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
.
artificial_preempt_cnt
<
ARTIFICIAL_PREEMPTION_MAX_CNT
)
total_preemption
=
(
vllm_model
.
model
.
llm_engine
.
scheduler
.
num_cumulative_preemption
)
del
vllm_model
for
i
in
range
(
len
(
example_prompts
)):
...
...
@@ -100,6 +105,20 @@ def test_preemption(
f
"Test
{
i
}
:
\n
HF:
{
hf_output_str
!
r
}
\n
vLLM:
{
vllm_output_str
!
r
}
"
)
assert
hf_output_ids
==
vllm_output_ids
,
(
f
"Test
{
i
}
:
\n
HF:
{
hf_output_ids
}
\n
vLLM:
{
vllm_output_ids
}
"
)
assert
(
"is preempted by PreemptionMode.RECOMPUTE mode because there "
"is not enough KV cache space."
in
caplog_vllm
.
text
)
# Ensure the preemption count recorded in the "vllm:num_preemptions" metric
# matches the scheduler's cumulative preemption count, as a simple sanity
# check that metrics are generated
preemption_metrics
=
None
for
m
in
REGISTRY
.
collect
():
if
m
.
name
==
"vllm:num_preemptions"
:
preemption_metrics
=
m
assert
preemption_metrics
is
not
None
total_recorded_preemption
=
0
for
sample
in
preemption_metrics
.
samples
:
total_recorded_preemption
+=
sample
.
value
assert
total_preemption
==
total_recorded_preemption
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
...
...
@@ -107,6 +126,7 @@ def test_preemption(
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
96
])
@
pytest
.
mark
.
parametrize
(
"beam_width"
,
[
4
])
def
test_swap
(
caplog_vllm
,
hf_runner
,
vllm_runner
,
example_prompts
,
...
...
@@ -122,11 +142,18 @@ def test_swap(
max_tokens
)
del
hf_model
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
,
swap_space
=
10
)
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
,
swap_space
=
10
,
disable_log_stats
=
False
,
)
vllm_outputs
=
vllm_model
.
generate_beam_search
(
example_prompts
,
beam_width
,
max_tokens
)
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
.
artificial_preempt_cnt
<
ARTIFICIAL_PREEMPTION_MAX_CNT
)
total_preemption
=
(
vllm_model
.
model
.
llm_engine
.
scheduler
.
num_cumulative_preemption
)
del
vllm_model
for
i
in
range
(
len
(
example_prompts
)):
...
...
@@ -138,6 +165,21 @@ def test_swap(
f
"Test
{
i
}
output
{
j
}
:
\n
HF:
{
hf_output_ids
}
\n
"
f
"vLLM:
{
vllm_output_ids
}
"
)
assert
(
"is preempted by PreemptionMode.SWAP mode because there "
"is not enough KV cache space."
in
caplog_vllm
.
text
)
# Ensure the preemption count recorded in the "vllm:num_preemptions" metric
# matches the scheduler's cumulative preemption count, as a simple sanity
# check that metrics are generated
preemption_metrics
=
None
for
m
in
REGISTRY
.
collect
():
if
m
.
name
==
"vllm:num_preemptions"
:
preemption_metrics
=
m
assert
preemption_metrics
is
not
None
total_recorded_preemption
=
0
for
sample
in
preemption_metrics
.
samples
:
total_recorded_preemption
+=
sample
.
value
assert
total_preemption
==
total_recorded_preemption
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
...
...
tests/conftest.py
View file @
b9e12416
import
contextlib
import
gc
import
os
from
typing
import
List
,
Optional
,
Tuple
from
typing
import
Any
,
Dict
,
List
,
Optional
,
Tuple
import
pytest
import
torch
from
PIL
import
Image
from
transformers
import
(
AutoModelForCausalLM
,
AutoProcessor
,
LlavaForConditionalGeneration
)
from
transformers
import
(
AutoModelForCausalLM
,
AutoProcessor
,
AutoTokenizer
,
LlavaConfig
,
LlavaForConditionalGeneration
)
from
vllm
import
LLM
,
SamplingParams
from
vllm.config
import
TokenizerPoolConfig
,
VisionLanguageConfig
from
vllm.distributed
import
destroy_model_parallel
from
vllm.inputs
import
PromptInputs
from
vllm.logger
import
init_logger
from
vllm.sequence
import
MultiModalData
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
logger
=
init_logger
(
__name__
)
_TEST_DIR
=
os
.
path
.
dirname
(
__file__
)
_TEST_PROMPTS
=
[
os
.
path
.
join
(
_TEST_DIR
,
"prompts"
,
"example.txt"
)]
...
...
@@ -129,9 +132,11 @@ _STR_DTYPE_TO_TORCH_DTYPE = {
"float"
:
torch
.
float
,
}
_VISION_LANGUAGE_MODELS
=
{
"llava-hf/llava-1.5-7b-hf"
:
LlavaForConditionalGeneration
,
}
AutoModelForCausalLM
.
register
(
LlavaConfig
,
LlavaForConditionalGeneration
)
_EMBEDDING_MODELS
=
[
"intfloat/e5-mistral-7b-instruct"
,
]
class
HfRunner
:
...
...
@@ -139,32 +144,44 @@ class HfRunner:
def
__init__
(
self
,
model_name
:
str
,
tokenizer_name
:
Optional
[
str
]
=
None
,
dtype
:
str
=
"half"
,
)
->
None
:
assert
dtype
in
_STR_DTYPE_TO_TORCH_DTYPE
torch_dtype
=
_STR_DTYPE_TO_TORCH_DTYPE
[
dtype
]
self
.
model_name
=
model_name
if
model_name
not
in
_VISION_LANGUAGE_MODELS
:
self
.
model
=
AutoModelForCausalLM
.
from_pretrained
(
if
model_name
in
_EMBEDDING_MODELS
:
# Lazy init required for AMD CI
from
sentence_transformers
import
SentenceTransformer
self
.
model
=
SentenceTransformer
(
model_name
,
torch_dtype
=
torch_dtype
,
trust_remote_code
=
True
,
).
cuda
()
self
.
processor
=
None
device
=
"cpu"
,
).
to
(
dtype
=
torch_dtype
).
cuda
()
else
:
self
.
model
=
_VISION_LANGUAGE_MODELS
[
model_name
]
.
from_pretrained
(
self
.
model
=
AutoModelForCausalLM
.
from_pretrained
(
model_name
,
torch_dtype
=
torch_dtype
,
trust_remote_code
=
True
,
).
cuda
()
self
.
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
torch_dtype
=
torch_dtype
,
trust_remote_code
=
True
,
)
try
:
self
.
processor
=
AutoProcessor
.
from_pretrained
(
model_name
,
torch_dtype
=
torch_dtype
,
trust_remote_code
=
True
,
)
if
tokenizer_name
is
None
:
tokenizer_name
=
model_name
self
.
tokenizer
=
get_tokenizer
(
tokenizer_name
,
trust_remote_code
=
True
)
except
Exception
:
logger
.
warning
(
"Unable to auto-load processor from HuggingFace for "
"model %s. Using tokenizer instead."
,
model_name
)
self
.
processor
=
self
.
tokenizer
def
generate
(
self
,
...
...
@@ -176,19 +193,19 @@ class HfRunner:
if
images
:
assert
len
(
prompts
)
==
len
(
images
)
for
i
,
prompt
in
enumerate
(
prompts
):
if
self
.
model_name
not
in
_VISION_LANGUAGE_MODELS
:
input_ids
=
self
.
tokenizer
(
prompt
,
return_tensors
=
"pt"
).
input_ids
inputs
=
{
"input_ids"
:
input_ids
.
cuda
()
}
els
e
:
image
=
images
[
i
]
if
images
else
None
inputs
=
self
.
processor
(
text
=
prompt
,
images
=
image
,
return_tensors
=
"pt"
)
inputs
=
{
key
:
value
.
cuda
()
if
value
is
not
None
else
None
for
key
,
value
in
inputs
.
items
()
}
processor_kwargs
:
Dict
[
str
,
Any
]
=
{
"text"
:
prompt
,
"return_tensors"
:
"pt"
,
}
if
images
is
not
None
and
images
[
i
]
is
not
Non
e
:
processor_kwargs
[
"
image
s"
]
=
images
[
i
]
inputs
=
self
.
processor
(
**
processor_kwargs
)
inputs
=
{
key
:
value
.
cuda
()
if
value
is
not
None
else
None
for
key
,
value
in
inputs
.
items
()
}
output_ids
=
self
.
model
.
generate
(
**
inputs
,
use_cache
=
True
,
...
...
@@ -272,6 +289,71 @@ class HfRunner:
all_logprobs
.
append
(
seq_logprobs
)
return
all_logprobs
def
generate_greedy_logprobs_limit
(
self
,
prompts
:
List
[
str
],
max_tokens
:
int
,
num_logprobs
:
int
,
)
->
List
[
Tuple
[
List
[
int
],
str
]]:
all_logprobs
=
[]
all_output_ids
=
[]
all_output_strs
=
[]
for
prompt
in
prompts
:
input_ids
=
self
.
tokenizer
(
prompt
,
return_tensors
=
"pt"
).
input_ids
output
=
self
.
model
.
generate
(
input_ids
.
cuda
(),
use_cache
=
True
,
do_sample
=
False
,
max_new_tokens
=
max_tokens
,
output_hidden_states
=
True
,
return_dict_in_generate
=
True
,
)
seq_logprobs
=
[]
for
_
,
hidden_states
in
enumerate
(
output
.
hidden_states
):
last_hidden_states
=
hidden_states
[
-
1
][
0
]
logits
=
torch
.
matmul
(
last_hidden_states
,
self
.
model
.
get_output_embeddings
().
weight
.
t
(),
)
if
getattr
(
self
.
model
.
get_output_embeddings
(),
"bias"
,
None
)
is
not
None
:
logits
+=
self
.
model
.
get_output_embeddings
(
).
bias
.
unsqueeze
(
0
)
logprobs
=
torch
.
nn
.
functional
.
log_softmax
(
logits
,
dim
=-
1
,
dtype
=
torch
.
float32
)
seq_logprobs
.
append
(
logprobs
)
# convert to dict
seq_logprobs_lst
=
[]
for
tok_idx
,
tok_logprobs
in
enumerate
(
seq_logprobs
):
# drop prompt logprobs
if
tok_idx
==
0
:
tok_logprobs
=
tok_logprobs
[
-
1
,
:].
reshape
(
1
,
-
1
)
topk
=
tok_logprobs
.
topk
(
num_logprobs
)
tok_logprobs_dct
=
{}
for
token_id
,
logprob
in
zip
(
topk
.
indices
[
0
],
topk
.
values
[
0
]):
tok_logprobs_dct
[
token_id
.
item
()]
=
logprob
.
item
()
seq_logprobs_lst
.
append
(
tok_logprobs_dct
)
all_logprobs
.
append
(
seq_logprobs_lst
)
seq_ids
=
output
.
sequences
[
0
]
output_len
=
seq_ids
.
shape
[
0
]
-
input_ids
.
shape
[
1
]
output_ids
=
seq_ids
[
-
output_len
:]
all_output_ids
.
append
(
output_ids
.
tolist
())
all_output_strs
.
append
(
self
.
tokenizer
.
decode
(
output_ids
))
outputs
=
zip
(
all_output_ids
,
all_output_strs
,
all_logprobs
)
return
[(
output_ids
,
output_str
,
output_logprobs
)
for
output_ids
,
output_str
,
output_logprobs
in
outputs
]
def
encode
(
self
,
prompts
:
List
[
str
])
->
List
[
List
[
torch
.
Tensor
]]:
return
self
.
model
.
encode
(
prompts
)
def
__del__
(
self
):
del
self
.
model
cleanup
()
...
...
@@ -321,12 +403,22 @@ class VllmRunner:
)
->
List
[
Tuple
[
List
[
int
],
str
]]:
if
images
is
not
None
:
assert
len
(
prompts
)
==
images
.
shape
[
0
]
req_outputs
=
self
.
model
.
generate
(
prompts
,
sampling_params
=
sampling_params
,
multi_modal_data
=
MultiModalData
(
type
=
MultiModalData
.
Type
.
IMAGE
,
data
=
images
)
if
images
is
not
None
else
None
)
prompt_inputs
:
List
[
PromptInputs
]
=
[]
for
i
,
prompt
in
enumerate
(
prompts
):
image
=
None
if
images
is
None
else
images
[
i
:
i
+
1
]
mm_data
=
None
if
image
is
None
else
MultiModalData
(
type
=
MultiModalData
.
Type
.
IMAGE
,
data
=
image
,
)
prompt_inputs
.
append
({
"prompt"
:
prompt
,
"multi_modal_data"
:
mm_data
,
})
req_outputs
=
self
.
model
.
generate
(
prompt_inputs
,
sampling_params
=
sampling_params
)
outputs
=
[]
for
req_output
in
req_outputs
:
prompt_str
=
req_output
.
prompt
...
...
@@ -397,6 +489,14 @@ class VllmRunner:
outputs
=
self
.
generate
(
prompts
,
beam_search_params
)
return
outputs
def
encode
(
self
,
prompts
:
List
[
str
])
->
List
[
List
[
float
]]:
req_outputs
=
self
.
model
.
encode
(
prompts
)
outputs
=
[]
for
req_output
in
req_outputs
:
embedding
=
req_output
.
outputs
.
embedding
outputs
.
append
(
embedding
)
return
outputs
def
__del__
(
self
):
del
self
.
model
cleanup
()
...
...
@@ -415,3 +515,19 @@ def get_tokenizer_pool_config(tokenizer_group_type):
pool_type
=
"ray"
,
extra_config
=
{})
raise
ValueError
(
f
"Unknown tokenizer_group_type:
{
tokenizer_group_type
}
"
)
@
pytest
.
fixture
()
def
temporary_enable_log_propagate
():
import
logging
logger
=
logging
.
getLogger
(
"vllm"
)
logger
.
propagate
=
True
yield
logger
.
propagate
=
False
@
pytest
.
fixture
()
def
caplog_vllm
(
temporary_enable_log_propagate
,
caplog
):
# To capture vllm log, we should enable propagate=True temporarily
# because caplog depends on logs propagated to the root logger.
yield
caplog
tests/core/block/e2e/__init__.py
0 → 100644
View file @
b9e12416
tests/core/block/e2e/conftest.py
View file @
b9e12416
from
typing
import
Callable
,
Iterable
,
Optional
import
pytest
from
tests.conftest
import
cleanup
from
vllm
import
LLM
from
vllm.model_executor.utils
import
set_random_seed
from
....conftest
import
cleanup
@
pytest
.
fixture
def
baseline_llm_generator
(
common_llm_kwargs
,
per_test_common_llm_kwargs
,
...
...
@@ -39,3 +42,27 @@ def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
for
llm
in
generator_inner
():
yield
llm
del
llm
def
get_text_from_llm_generator
(
llm_generator
:
Iterable
[
LLM
],
prompts
,
sampling_params
,
llm_cb
:
Optional
[
Callable
[[
LLM
],
None
]]
=
None
):
for
llm
in
llm_generator
:
if
llm_cb
:
llm_cb
(
llm
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
,
use_tqdm
=
True
)
text
=
[
output
.
outputs
[
0
].
text
for
output
in
outputs
]
del
llm
return
text
def
get_token_ids_from_llm_generator
(
llm_generator
,
prompts
,
sampling_params
):
for
llm
in
llm_generator
:
outputs
=
llm
.
generate
(
prompts
,
sampling_params
,
use_tqdm
=
True
)
token_ids
=
[
output
.
outputs
[
0
].
token_ids
for
output
in
outputs
]
del
llm
return
token_ids
tests/core/block/e2e/test_correctness.py
View file @
b9e12416
...
...
@@ -4,6 +4,8 @@ import pytest
from
vllm
import
SamplingParams
from
.conftest
import
get_token_ids_from_llm_generator
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
...
...
@@ -444,12 +446,3 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
assert
expected_token_ids
==
actual_token_ids
assert
baseline_token_ids
==
test_token_ids
def
get_token_ids_from_llm_generator
(
llm_generator
,
prompts
,
sampling_params
):
for
llm
in
llm_generator
:
outputs
=
llm
.
generate
(
prompts
,
sampling_params
,
use_tqdm
=
True
)
token_ids
=
[
output
.
outputs
[
0
].
token_ids
for
output
in
outputs
]
del
llm
return
token_ids
tests/core/block/e2e/test_correctness_sliding_window.py
0 → 100644
View file @
b9e12416
import
random
from
typing
import
List
import
pytest
from
vllm
import
LLM
,
SamplingParams
from
.conftest
import
get_text_from_llm_generator
# relatively small model with 4k sliding window
MODEL
=
"bigcode/starcoder2-3b"
BLOCK_SIZE
=
16
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
"model"
:
MODEL
,
# skip cuda graph creation for fast test.
"enforce_eager"
:
True
,
"block_size"
:
BLOCK_SIZE
,
# needed due to https://github.com/vllm-project/vllm/issues/1908#issuecomment-2101122008
"num_gpu_blocks_override"
:
100000
//
BLOCK_SIZE
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{
"use_v2_block_manager"
:
False
}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
"use_v2_block_manager"
:
True
}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_sliding_window_retrival
(
baseline_llm_generator
,
test_llm_generator
,
batch_size
,
seed
):
"""
The test does a bunch of assignments "x1 = 10
\n
x2 = 33
\n
..." and then
asks for value of one of them (which is outside the sliding window).
If we tell it upfront which we are going to be looking for, then
it answers correctly (mostly).
Additionally, we compare the results of the v1 and v2 managers.
"""
sampling_params
=
SamplingParams
(
max_tokens
=
1024
,
ignore_eos
=
True
,
temperature
=
0.0
,
)
prompts
,
answer
,
indices
=
prep_prompts
(
batch_size
)
print
(
'Getting token ids from block manager v1'
)
baseline_texts
=
get_text_from_llm_generator
(
baseline_llm_generator
,
prompts
,
sampling_params
,
llm_cb
=
check_window
(
prompts
))
check_answers
(
indices
,
answer
,
baseline_texts
)
print
(
'Getting token ids from block manager v2'
)
test_texts
=
get_text_from_llm_generator
(
test_llm_generator
,
prompts
,
sampling_params
)
check_answers
(
indices
,
answer
,
test_texts
)
cmp
=
[
expected_text
==
actual_text
for
expected_text
,
actual_text
in
zip
(
baseline_texts
,
test_texts
)
]
print
(
cmp
)
# make sure it's mostly OK; this is possibly because https://github.com/vllm-project/vllm/pull/4768
# however, https://github.com/vllm-project/vllm/issues/3385#issuecomment-1995924290
# states that xformers and flash_attn have different ideas about the window
# size anyways
assert
sum
(
cmp
)
>
0.7
*
len
(
cmp
)
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
"model"
:
MODEL
,
# skip cuda graph creation for fast test.
"enforce_eager"
:
True
,
"block_size"
:
BLOCK_SIZE
,
"num_gpu_blocks_override"
:
100000
//
BLOCK_SIZE
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
"use_v2_block_manager"
:
True
,
"enable_chunked_prefill"
:
True
}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_sliding_window_chunked_prefill
(
test_llm_generator
,
batch_size
,
seed
):
"""
This is similar to test_sliding_window_retrival, however, it doesn't
compare against the v1 block manager since v1 doesn't support
chunked prefill with sliding window.
The results with and without chunked prefill are not the same due to
numerical instabilities.
"""
sampling_params
=
SamplingParams
(
max_tokens
=
10
,
ignore_eos
=
True
,
temperature
=
0.0
,
)
prompts
,
answer
,
indices
=
prep_prompts
(
batch_size
)
# We don't compare with the baseline model here, since the results
# are slightly different due to different tailing in attention.
test_texts
=
get_text_from_llm_generator
(
test_llm_generator
,
prompts
,
sampling_params
,
llm_cb
=
check_window
(
prompts
))
check_answers
(
indices
,
answer
,
test_texts
)
def
prep_prompts
(
batch_size
:
int
):
"""
Generate prompts with a bunch of assignments,
then asking for the value of one of them.
The prompt is just under 10k tokens; sliding window is 4k
so the answer is outside sliding window, but should still be correct.
"""
prompts
:
List
[
str
]
=
[]
answer
:
List
[
int
]
=
[]
indices
:
List
[
int
]
=
[]
random
.
seed
(
1
)
for
_
in
range
(
batch_size
):
idx
=
random
.
randint
(
30
,
90
)
indices
.
append
(
idx
)
prompt
=
"```python
\n
# We set a number of variables, "
+
\
f
"x
{
idx
}
will be important later
\n
"
ln
=
random
.
randint
(
800
,
1100
)
for
k
in
range
(
30
,
ln
):
v
=
random
.
randint
(
10
,
99
)
if
k
==
idx
:
answer
.
append
(
v
)
prompt
+=
f
"x
{
k
}
=
{
v
}
\n
"
prompt
+=
f
"# Now, we check the value of x
{
idx
}
:
\n
"
prompt
+=
f
"assert x
{
idx
}
== "
prompts
.
append
(
prompt
)
return
prompts
,
answer
,
indices
def
check_answers
(
indices
:
List
[
int
],
answer
:
List
[
int
],
outputs
:
List
[
str
]):
answer2
=
[
int
(
text
[
0
:
2
].
strip
())
for
text
in
outputs
]
print
(
list
(
zip
(
indices
,
zip
(
answer
,
answer2
))))
numok
=
0
for
a1
,
a2
in
zip
(
answer
,
answer2
):
if
a1
==
a2
:
numok
+=
1
frac_ok
=
numok
/
len
(
answer
)
print
(
f
"Num OK:
{
numok
}
/
{
len
(
answer
)
}
{
frac_ok
}
"
)
assert
frac_ok
>
0.7
def
check_window
(
prompts
:
List
[
str
]):
def
inner
(
llm
:
LLM
):
sliding_window
=
llm
.
llm_engine
.
model_config
.
get_sliding_window
()
assert
sliding_window
and
sliding_window
>
0
assert
any
(
len
(
llm
.
get_tokenizer
().
tokenize
(
prompt
))
>
sliding_window
for
prompt
in
prompts
)
return
inner
tests/core/block/test_block_manager_v2.py
View file @
b9e12416
import
pytest
from
vllm.core.block.utils
import
(
STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE
,
STR_NOT_IMPL_ENC_DEC_SWA
)
from
vllm.core.block_manager_v2
import
BlockSpaceManagerV2
from
vllm.core.interfaces
import
AllocStatus
from
vllm.sequence
import
Logprob
,
SequenceStatus
from
vllm.utils
import
chunk_list
from
..utils
import
create_seq_group
from
..utils
import
create_seq_group
,
create_seq_group_encoder_decoder
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
16
])
...
...
@@ -52,6 +54,156 @@ def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int,
assert
can_allocate_result
==
AllocStatus
.
LATER
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
16
])
@
pytest
.
mark
.
parametrize
(
"num_gpu_blocks"
,
[
16
,
80
,
160
])
@
pytest
.
mark
.
parametrize
(
"num_seqs_per_group"
,
[
1
,
4
])
@
pytest
.
mark
.
parametrize
(
"watermark"
,
[
0.0
,
0.5
])
def
test_can_allocate_seq_group_encoder_decoder
(
block_size
:
int
,
num_seqs_per_group
:
int
,
num_gpu_blocks
:
int
,
watermark
:
float
):
block_manager
=
BlockSpaceManagerV2
(
block_size
=
block_size
,
num_gpu_blocks
=
num_gpu_blocks
,
num_cpu_blocks
=
1024
,
watermark
=
watermark
,
)
num_watermark_blocks
=
int
(
watermark
*
num_gpu_blocks
)
num_output_blocks_per_seq
=
1
# NOTE: This should be num_output_blocks_per_seq * num_seqs_per_group, but
# the current implementation assumes all seqs are new prompts / don't have
# different output lens.
num_output_blocks
=
num_output_blocks_per_seq
for
bdx
,
num_prompt_blocks
in
enumerate
(
range
(
1
,
num_gpu_blocks
-
num_output_blocks
)):
num_cross_blocks_per_seq
=
num_prompt_blocks
seq_group
=
create_seq_group_encoder_decoder
(
seq_prompt_len
=
block_size
*
num_prompt_blocks
,
seq_output_lens
=
[
block_size
*
num_output_blocks_per_seq
for
_
in
range
(
num_seqs_per_group
)
],
request_id
=
str
(
bdx
))
assert
num_prompt_blocks
+
num_output_blocks
<=
num_gpu_blocks
can_allocate_result
=
block_manager
.
can_allocate
(
seq_group
)
num_required_blocks
=
num_prompt_blocks
+
\
num_output_blocks
+
\
num_cross_blocks_per_seq
if
num_gpu_blocks
-
num_required_blocks
<
num_watermark_blocks
:
assert
can_allocate_result
==
AllocStatus
.
NEVER
elif
num_gpu_blocks
>=
num_required_blocks
:
assert
can_allocate_result
==
AllocStatus
.
OK
else
:
assert
can_allocate_result
==
AllocStatus
.
LATER
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
16
])
@
pytest
.
mark
.
parametrize
(
"num_gpu_blocks"
,
[
16
])
@
pytest
.
mark
.
parametrize
(
"num_seqs_per_group"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"watermark"
,
[
0.0
,
0.5
])
def
test_can_allocate_encoder_decoder_fails_with_swa
(
block_size
:
int
,
num_seqs_per_group
:
int
,
num_gpu_blocks
:
int
,
watermark
:
float
):
'''
SWA short for Sliding Window Attention.
At time of writing block manager v2 does not support SWA.
However even when SWA is implemented for block manager v2,
there will still most likely be a separate workstream required
to enable SWA for encoder/decoder models.
Therefore this test enforces that one of the following cases
hold true:
1. Block manager v2 does not support SWA at all (true at time of writing)
2. Block manager v2 fails with NotImplementedError when SWA is enabled
AND a SequenceGroup with an encoder sequence (i.e. in support of an
encoder/decoder model) is passed into can_allocate() as an argument
The setup for this test is a stripped-down version of
test_can_allocate_seq_group_encoder_decoder()
'''
with
pytest
.
raises
((
NotImplementedError
,
AssertionError
))
as
exc_info
:
block_manager
=
BlockSpaceManagerV2
(
block_size
=
block_size
,
num_gpu_blocks
=
num_gpu_blocks
,
num_cpu_blocks
=
1024
,
watermark
=
watermark
,
sliding_window
=
5
# SWA
)
num_output_blocks_per_seq
=
1
num_prompt_blocks
=
1
num_output_blocks
=
num_output_blocks_per_seq
seq_group
=
create_seq_group_encoder_decoder
(
seq_prompt_len
=
block_size
*
num_prompt_blocks
,
seq_output_lens
=
[
block_size
*
num_output_blocks_per_seq
for
_
in
range
(
num_seqs_per_group
)
],
request_id
=
"0"
)
assert
num_prompt_blocks
+
num_output_blocks
<=
num_gpu_blocks
block_manager
.
can_allocate
(
seq_group
)
# Assert that either
# 1. Block manager v2 constructor fails with assertion that sliding window
# is not yet supported (most likely near-term outcome at time of
# writing), or
# 2. can_allocate() fails with NotImplementedError due to combination of
# encoder/decoder and sliding window attention
if
isinstance
(
exc_info
.
value
,
NotImplementedError
):
assert
str
(
exc_info
.
value
)
==
STR_NOT_IMPL_ENC_DEC_SWA
elif
isinstance
(
exc_info
.
value
,
AssertionError
):
assert
str
(
exc_info
.
value
)
==
"Sliding window not yet supported"
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("num_gpu_blocks", [16])
@pytest.mark.parametrize("num_seqs_per_group", [1])
@pytest.mark.parametrize("watermark", [0.0, 0.5])
def test_can_allocate_encoder_decoder_fails_with_prefix_cache(
        block_size: int, num_seqs_per_group: int, num_gpu_blocks: int,
        watermark: float):
    """Check can_allocate() rejects encoder/decoder groups under prefix cache.

    A BlockSpaceManagerV2 is built with ``enable_caching=True`` and handed an
    encoder/decoder sequence group; the call must raise NotImplementedError
    carrying STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE as its message.
    """
    manager = BlockSpaceManagerV2(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        watermark=watermark,
        enable_caching=True,  # Prefix cache
    )

    # One prompt block and one output block per sequence.
    blocks_per_output_seq = 1
    prompt_block_count = 1
    output_block_count = blocks_per_output_seq

    output_lens = [block_size * blocks_per_output_seq] * num_seqs_per_group
    enc_dec_group = create_seq_group_encoder_decoder(
        seq_prompt_len=block_size * prompt_block_count,
        seq_output_lens=output_lens,
        request_id="0")

    # Sanity check: the request would fit if it were supported.
    assert prompt_block_count + output_block_count <= num_gpu_blocks

    # can_allocate() must fail with NotImplementedError because
    # encoder/decoder is combined with prefix caching.
    with pytest.raises(NotImplementedError) as raised:
        manager.can_allocate(enc_dec_group)

    assert str(raised.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
1
,
8
])
@
pytest
.
mark
.
parametrize
(
"prompt_len"
,
[
1
,
7
,
8
])
@
pytest
.
mark
.
parametrize
(
"num_slots_to_append"
,
[
1
,
8
,
129
])
...
...
@@ -101,3 +253,72 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append,
range
(
prompt_len
+
num_slots_to_append
+
num_lookahead_slots
)),
block_size
))
-
len
(
chunk_list
(
list
(
range
(
prompt_len
)),
block_size
))
assert
num_consumed_blocks
==
expected_consumed_blocks
@pytest.mark.parametrize("block_size", [8, 16])
@pytest.mark.parametrize("prompt_len", [10, 300, 1000])
@pytest.mark.parametrize("num_slots_to_append", [50])
@pytest.mark.parametrize("sliding_window", [20, 32, 200, 512])
def test_sliding_window(block_size, prompt_len, num_slots_to_append,
                        sliding_window):
    """Verify GPU block usage stays bounded while appending under a
    sliding window.

    After the prompt is allocated, tokens are appended one at a time and the
    number of used GPU blocks is checked against a bound derived from
    ``sliding_window`` and ``block_size``.
    """
    num_gpu_blocks = 1024
    watermark = 0.1
    manager = BlockSpaceManagerV2(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=0,
        watermark=watermark,
        sliding_window=sliding_window,
    )

    def check_used(min_n, max_n=None):
        # With a single argument the used-block count must match exactly.
        upper = min_n if max_n is None else max_n
        in_use = num_gpu_blocks - manager.get_num_free_gpu_blocks()
        assert min_n <= in_use
        assert in_use <= upper

    def num_blocks(num_tokens):
        # Ceiling division: blocks needed to hold num_tokens tokens.
        return (num_tokens + block_size - 1) // block_size

    check_used(0)

    group = create_seq_group(
        seq_prompt_len=prompt_len,
        seq_output_lens=[0],
    )

    # Creating the sequence group alone must not consume any block.
    check_used(0)

    # Allocate the sequence group.
    assert manager.can_allocate(group)
    manager.allocate(group)

    prompt_blocks = num_blocks(prompt_len)
    check_used(prompt_blocks)

    # Set the sequence to RUNNING.
    running_seq = group.get_seqs()[0]
    running_seq.status = SequenceStatus.RUNNING

    running_seq.data.update_num_computed_tokens(prompt_len)
    check_used(prompt_blocks)

    # Per the original author: this mirrors the computation in
    # BlockSpaceManagerV2.__init__ (window blocks + 2), plus one more block
    # for the null block.
    sliding_blocks = (sliding_window // block_size) + 2 + 1

    # Short prompts only get an upper bound; longer ones must also use at
    # least `sliding_blocks` blocks. The choice does not change in the loop.
    lower_bound = 0 if prompt_len < sliding_window + 10 else sliding_blocks
    upper_bound = sliding_blocks + 1

    # Append tokens to the sequence, checking usage after every step.
    for new_token in range(num_slots_to_append):
        running_seq.append_token_id(new_token, {new_token: Logprob(0.0)})
        running_seq.data.update_num_computed_tokens(1)
        manager.append_slots(running_seq, num_lookahead_slots=0)
        check_used(lower_bound, upper_bound)
tests/core/block/test_block_table.py
View file @
b9e12416
...
...
@@ -410,8 +410,7 @@ def test_cow(block_size: int, sequence_len: int, append_len: int,
expected_src
=
static_block_table
.
physical_block_ids
[
cow_block_id
]
expected_dst
=
appender_block_table
.
physical_block_ids
[
cow_block_id
]
assert
expected_src
in
cows
assert
expected_dst
in
cows
[
expected_src
]
assert
(
expected_src
,
expected_dst
)
in
cows
else
:
# Otherwise, there should be no copy-on-write.
assert
not
cows
...
...
@@ -490,8 +489,7 @@ def test_cow_lookahead_simple(block_size: int, sequence_len: int,
expected_src
=
static_block_table
.
physical_block_ids
[
cow_block_id
]
expected_dst
=
appender_block_table
.
physical_block_ids
[
cow_block_id
]
assert
expected_src
in
cows
assert
expected_dst
in
cows
[
expected_src
]
assert
(
expected_src
,
expected_dst
)
in
cows
static_block_table
.
free
()
appender_block_table
.
free
()
...
...
tests/core/block/test_prefix_caching_block.py
View file @
b9e12416
...
...
@@ -410,6 +410,123 @@ class TestPrefixCachingBlockAllocator:
assert
(
len
(
res
)
==
zero_point_blocks
)
# Test case that assumes a block promoted after the first immutable block
# is freed into the hashless allocator, while the first immutable block's
# ref count is increased.
@staticmethod
@pytest.mark.parametrize("num_blocks", [3])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(10)))
def test_alloc_promotion(num_blocks: int, block_size: int, seed: int):
    """Check a filled mutable block is deduplicated against an existing
    immutable block with the same tokens.

    An immutable block is allocated first; a mutable block is then filled
    with the same token ids one at a time. Afterwards the mutable block must
    report the immutable block's id, its original id must be back in the
    hashless allocator's free list, and the shared block's ref count must
    be 2.
    """
    random.seed(seed)

    allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                            block_size=block_size)
    full_block_tokens = list(range(block_size))

    immutable = allocator.allocate_immutable(prev_block=None,
                                             token_ids=full_block_tokens)
    assert allocator._refcounter.get(immutable.block_id) == 1

    mutable = allocator.allocate_mutable(prev_block=None)
    # Remember the id the mutable block held before it was filled.
    original_mutable_id = mutable.block_id

    for token in range(block_size):
        mutable.append_token_ids([token])

    # After the block is promoted from mutable to immutable, if a block with
    # the same content hash already exists, the promoted block is released
    # into the hashless allocator and the first immutable block's ref count
    # is increased by 1.
    free_ids = allocator._hashless_allocator._free_block_indices
    assert mutable.block_id == immutable.block_id
    assert original_mutable_id in free_ids
    assert allocator._refcounter.get(immutable.block_id) == 2
# Test case when eviction and allocation are mixed,
# make sure they work as expected
@staticmethod
@pytest.mark.parametrize("num_blocks", [3])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(10)))
def test_eviction_alloc_mixed(num_blocks: int, block_size: int, seed: int):
    """Interleave allocation, freeing and eviction and check the allocator's
    bookkeeping after every step.

    The test inspects the allocator's internal tables (``_blocks``,
    ``_cached_blocks``, ``evictor``, ``_refcounter`` and the hashless
    allocator's free list) directly.
    """
    random.seed(seed)

    every_block_id = list(range(num_blocks))
    all_zero_refs = dict.fromkeys(range(num_blocks), 0)
    allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                            block_size=block_size)
    token_ids = list(range(num_blocks * block_size))

    # Fresh allocator: all num_blocks blocks are free in the hashless
    # allocator, the tracking tables (_blocks, _cached_blocks, evictor) are
    # empty, and every ref count is 0.
    hashless = allocator._hashless_allocator
    assert list(hashless._free_block_indices) == every_block_id
    assert len(allocator._blocks.keys()) == 0
    assert len(allocator._cached_blocks.values()) == 0
    assert len(allocator.evictor.free_table.keys()) == 0
    assert allocator._refcounter._refcounts == all_zero_refs

    # Allocate immutable chains that each consist of a single block.
    allocated = [
        allocator.allocate_immutable(
            prev_block=None,
            token_ids=token_ids[offset:offset + block_size])
        for offset in range(0, num_blocks * block_size, block_size)
    ]

    # Free all blocks. They shall now all sit in the evictor: no tracking
    # data is left in _blocks, every block is tracked in _cached_blocks, and
    # every ref count is zero.
    for freed in allocated:
        allocator.free(freed)

    assert len(allocator._blocks.keys()) == 0
    assert len(allocator._hashless_allocator._free_block_indices) == 0
    assert list(allocator._cached_blocks.values()) == every_block_id
    assert list(allocator.evictor.free_table.keys()) == every_block_id
    assert allocator._refcounter._refcounts == all_zero_refs

    # Allocate a mutable block: the first block shall be evicted, its
    # content hash reset to None and its ref count set to 1.
    mutable = allocator.allocate_mutable(prev_block=None)

    assert mutable.block_id == 0
    assert mutable.content_hash is None
    assert 0 in allocator._blocks
    assert allocator._refcounter.get(0) == 1
    assert 0 not in allocator._cached_blocks
    assert 0 not in allocator.evictor

    # The mutable block has no hash yet, so freeing it shall release it into
    # the hashless allocator.
    allocator.free(mutable)

    assert len(allocator._blocks.keys()) == 0
    assert allocator._refcounter._refcounts == all_zero_refs
    assert 0 not in allocator._cached_blocks
    assert 0 not in allocator.evictor
    assert 0 in allocator._hashless_allocator._free_block_indices

    # Allocating an immutable block with the first block_size tokens shall
    # take the free block from the hashless allocator, leaving it empty.
    first_tokens = token_ids[:block_size]
    block = allocator.allocate_immutable(prev_block=None,
                                         token_ids=first_tokens)

    assert block.block_id == 0
    assert len(allocator._hashless_allocator._free_block_indices) == 0
    assert 0 in allocator._blocks
    assert 0 in allocator._cached_blocks.values()
    assert allocator._refcounter.get(0) == 1
    assert 0 not in allocator.evictor

    # Allocate a mutable block again; it shall be popped from the evictor.
    mutable = allocator.allocate_mutable(prev_block=None)
    assert len(allocator._hashless_allocator._free_block_indices) == 0
    assert mutable.block_id not in allocator.evictor.free_table
    assert allocator._refcounter.get(mutable.block_id) == 1
# Test case where two last accessed times are equal
@
staticmethod
@
pytest
.
mark
.
parametrize
(
"num_blocks"
,
[
1024
])
...
...
tests/core/test_block_manager.py
View file @
b9e12416
import
time
from
collections
import
defaultdict
from
typing
import
List
import
pytest
from
vllm
import
SamplingParams
from
vllm.block
import
PhysicalTokenBlock
from
vllm.core.block.utils
import
(
STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE
,
STR_NOT_IMPL_ENC_DEC_SWA
)
from
vllm.core.block_manager_v1
import
(
BlockSpaceManagerV1
,
UncachedBlockAllocator
)
from
vllm.core.interfaces
import
AllocStatus
from
vllm.sequence
import
Logprob
,
Sequence
,
SequenceGroup
,
SequenceStatus
from
vllm.utils
import
Device
from
.utils
import
create_dummy_prompt
from
.utils
import
create_dummy_prompt
,
create_dummy_prompt_encoder_decoder
def
test_block_allocator_allocate
():
...
...
@@ -72,7 +75,7 @@ def test_allocate():
# Allocate same sequence group to all available gpu blocks.
for
i
in
range
(
num_gpu_blocks
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
block_size
)
assert
block_manager
.
can_allocate
(
seq_group
)
assert
block_manager
.
can_allocate
(
seq_group
)
==
AllocStatus
.
OK
block_manager
.
allocate
(
seq_group
)
assert
block_manager
.
can_allocate
(
seq_group
)
!=
AllocStatus
.
OK
...
...
@@ -84,11 +87,107 @@ def test_allocate():
watermark
=
1
/
num_gpu_blocks
)
for
i
in
range
(
num_gpu_blocks
-
1
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
block_size
)
assert
block_manager
.
can_allocate
(
seq_group
)
assert
block_manager
.
can_allocate
(
seq_group
)
==
AllocStatus
.
OK
block_manager
.
allocate
(
seq_group
)
assert
block_manager
.
can_allocate
(
seq_group
)
!=
AllocStatus
.
OK
def test_allocate_encoder_decoder():
    """Fill the GPU block pool with encoder/decoder groups and check that
    can_allocate() stops returning AllocStatus.OK once it is full.

    The scenario runs twice: once without a watermark and once with a
    watermark of ``1 / num_gpu_blocks``.
    """
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    # Each group is created with one block of decoder prompt and one block
    # of encoder prompt.
    block_req_per_seq_group = 2

    def allocate_groups(manager, group_count):
        # Allocate `group_count` fresh groups, asserting each one fits, and
        # return the last group created.
        last_group = None
        for idx in range(group_count):
            _, _, last_group = create_dummy_prompt_encoder_decoder(
                str(idx),
                decoder_prompt_length=block_size,
                encoder_prompt_length=block_size)
            assert manager.can_allocate(last_group) == AllocStatus.OK
            manager.allocate(last_group)
        return last_group

    # NOTE(review): the block counts are passed positionally (cpu before
    # gpu); both are 4 here, so the order cannot affect this test. Confirm
    # against the constructor signature.
    plain_manager = BlockSpaceManagerV1(block_size,
                                        num_cpu_blocks,
                                        num_gpu_blocks,
                                        watermark=0)

    # Allocate sequence groups until all gpu blocks are taken.
    seq_group = allocate_groups(plain_manager,
                                num_gpu_blocks // block_req_per_seq_group)
    assert plain_manager.can_allocate(seq_group) != AllocStatus.OK

    # Same again, but use the watermark to reserve one gpu block.
    watermark_manager = BlockSpaceManagerV1(block_size,
                                            num_cpu_blocks,
                                            num_gpu_blocks,
                                            watermark=1 / num_gpu_blocks)
    seq_group = allocate_groups(
        watermark_manager, (num_gpu_blocks - 1) // block_req_per_seq_group)
    assert watermark_manager.can_allocate(seq_group) != AllocStatus.OK
def test_allocate_encoder_decoder_fails_with_swa():
    """Check can_allocate() and allocate() reject encoder/decoder groups
    when sliding window attention (SWA) is enabled.

    Both calls must raise NotImplementedError with
    STR_NOT_IMPL_ENC_DEC_SWA as the message.
    """
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    swa_manager = BlockSpaceManagerV1(
        block_size,
        num_cpu_blocks,
        num_gpu_blocks,
        watermark=0,
        sliding_window=5,  # SWA enabled
    )

    # A single encoder/decoder group, one block per prompt.
    _, _, seq_group = create_dummy_prompt_encoder_decoder(
        "0",
        decoder_prompt_length=block_size,
        encoder_prompt_length=block_size)

    # First can_allocate(), then allocate(): each must fail due to SWA.
    for rejected_call in (swa_manager.can_allocate, swa_manager.allocate):
        with pytest.raises(NotImplementedError) as raised:
            rejected_call(seq_group)

        assert str(raised.value) == STR_NOT_IMPL_ENC_DEC_SWA
def test_allocate_encoder_decoder_fails_with_prefix_caching():
    """Check can_allocate() and allocate() reject encoder/decoder groups
    when prefix caching is enabled.

    Both calls must raise NotImplementedError with
    STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE as the message.
    """
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    caching_manager = BlockSpaceManagerV1(
        block_size,
        num_cpu_blocks,
        num_gpu_blocks,
        watermark=0,
        enable_caching=True,  # Prefix cache enabled
    )

    # A single encoder/decoder group, one block per prompt.
    _, _, seq_group = create_dummy_prompt_encoder_decoder(
        "0",
        decoder_prompt_length=block_size,
        encoder_prompt_length=block_size)

    # First can_allocate(), then allocate(): each must fail due to prefix
    # caching.
    for rejected_call in (caching_manager.can_allocate,
                          caching_manager.allocate):
        with pytest.raises(NotImplementedError) as raised:
            rejected_call(seq_group)

        assert str(raised.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE
def
test_append_slot_single_seq
():
block_size
=
4
num_cpu_blocks
=
4
...
...
@@ -132,8 +231,10 @@ def test_append_slot_cow():
# Allocate prompt to gpu block. There is one slot left in the block.
prompt
=
Sequence
(
seq_id
=
1
,
prompt
=
"one two three"
,
prompt_token_ids
=
[
1
,
2
,
3
],
inputs
=
{
"prompt"
:
"one two three"
,
"prompt_token_ids"
:
[
1
,
2
,
3
],
},
block_size
=
block_size
)
# Fork the sequence, such that a COW will be required when we append a new
...
...
@@ -141,8 +242,10 @@ def test_append_slot_cow():
child
=
prompt
.
fork
(
new_seq_id
=
2
)
# Allocate space for the sequence group.
seq_group
=
SequenceGroup
(
"1"
,
[
prompt
,
child
],
SamplingParams
(),
time
.
time
(),
time
.
perf_counter
)
seq_group
=
SequenceGroup
(
request_id
=
"1"
,
seqs
=
[
prompt
,
child
],
arrival_time
=
time
.
time
(),
sampling_params
=
SamplingParams
())
block_manager
.
allocate
(
seq_group
)
# Fork and append a new token id. We expect a COW to be scheduled.
...
...
@@ -155,7 +258,10 @@ def test_append_slot_cow():
cows
=
block_manager
.
append_slots
(
child
)
assert
cows
for
src_block
,
dst_blocks
in
cows
.
items
():
dict_cows
=
defaultdict
(
list
)
for
src_block
,
dst_block
in
cows
:
dict_cows
[
src_block
].
append
(
dst_block
)
for
src_block
,
dst_blocks
in
dict_cows
.
items
():
assert
src_block
not
in
dst_blocks
after_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
...
...
@@ -215,7 +321,7 @@ def test_swap():
before_cpu_blocks
=
block_manager
.
get_num_free_cpu_blocks
()
before_gpu_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
mapping
=
block_manager
.
swap_out
(
seq_group
)
assert
list
(
mapping
.
keys
())
==
gpu_blocks
assert
[
x
[
0
]
for
x
in
mapping
]
==
gpu_blocks
after_cpu_blocks
=
block_manager
.
get_num_free_cpu_blocks
()
after_gpu_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
assert
before_cpu_blocks
==
after_cpu_blocks
+
len
(
gpu_blocks
)
...
...
@@ -228,7 +334,63 @@ def test_swap():
before_cpu_blocks
=
block_manager
.
get_num_free_cpu_blocks
()
before_gpu_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
mapping
=
block_manager
.
swap_in
(
seq_group
)
assert
list
(
mapping
.
keys
())
==
cpu_blocks
assert
[
x
[
0
]
for
x
in
mapping
]
==
cpu_blocks
after_cpu_blocks
=
block_manager
.
get_num_free_cpu_blocks
()
after_gpu_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
assert
before_cpu_blocks
+
len
(
cpu_blocks
)
==
after_cpu_blocks
assert
before_gpu_blocks
==
after_gpu_blocks
+
len
(
cpu_blocks
)
def
test_swap_encoder_decoder
():
block_size
=
4
num_cpu_blocks
=
4
num_gpu_blocks
=
4
block_manager
=
BlockSpaceManagerV1
(
block_size
,
num_cpu_blocks
,
num_gpu_blocks
,
watermark
=
0
)
decoder_prompt
,
encoder_prompt
,
seq_group
=
\
create_dummy_prompt_encoder_decoder
(
"1"
,
decoder_prompt_length
=
block_size
,
encoder_prompt_length
=
block_size
)
decoder_prompt
.
status
=
SequenceStatus
.
WAITING
encoder_prompt
.
status
=
SequenceStatus
.
WAITING
block_manager
.
allocate
(
seq_group
)
# Emulate a forward pass by appending a single token.
# The block manager then knows how many unprocessed
# tokens will be written in the next forward pass.
token_id
=
0
decoder_prompt
.
status
=
SequenceStatus
.
RUNNING
decoder_prompt
.
append_token_id
(
token_id
,
{
token_id
:
Logprob
(
0.0
)})
# Swap encoder/decoder seq group from GPU -> CPU.
decoder_gpu_blocks
=
block_manager
.
get_block_table
(
decoder_prompt
)
cross_gpu_blocks
=
block_manager
.
get_cross_block_table
(
seq_group
)
gpu_blocks
=
decoder_gpu_blocks
+
cross_gpu_blocks
assert
block_manager
.
can_swap_out
(
seq_group
)
before_cpu_blocks
=
block_manager
.
get_num_free_cpu_blocks
()
before_gpu_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
mapping
=
block_manager
.
swap_out
(
seq_group
)
assert
[
x
[
0
]
for
x
in
mapping
]
==
gpu_blocks
#assert list(mapping.keys()) == gpu_blocks
after_cpu_blocks
=
block_manager
.
get_num_free_cpu_blocks
()
after_gpu_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
assert
before_cpu_blocks
==
after_cpu_blocks
+
len
(
gpu_blocks
)
assert
before_gpu_blocks
+
len
(
gpu_blocks
)
==
after_gpu_blocks
decoder_prompt
.
status
=
SequenceStatus
.
SWAPPED
# Swap encoder/decoder seq group from CPU -> GPU.
decoder_cpu_blocks
=
block_manager
.
get_block_table
(
decoder_prompt
)
cross_cpu_blocks
=
block_manager
.
get_cross_block_table
(
seq_group
)
cpu_blocks
=
decoder_cpu_blocks
+
cross_cpu_blocks
assert
block_manager
.
can_swap_in
(
seq_group
)
==
AllocStatus
.
OK
before_cpu_blocks
=
block_manager
.
get_num_free_cpu_blocks
()
before_gpu_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
mapping
=
block_manager
.
swap_in
(
seq_group
)
assert
[
x
[
0
]
for
x
in
mapping
]
==
cpu_blocks
after_cpu_blocks
=
block_manager
.
get_num_free_cpu_blocks
()
after_gpu_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
assert
before_cpu_blocks
+
len
(
cpu_blocks
)
==
after_cpu_blocks
...
...
@@ -259,6 +421,41 @@ def test_free():
block_manager
.
get_block_table
(
prompt
)
def test_free_encoder_decoder():
    """Check freeing an encoder/decoder group returns its decoder and cross
    blocks to the GPU free pool and removes its block tables.
    """
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    manager = BlockSpaceManagerV1(block_size,
                                  num_cpu_blocks,
                                  num_gpu_blocks,
                                  watermark=0)

    created = create_dummy_prompt_encoder_decoder(
        "1",
        decoder_prompt_length=block_size,
        encoder_prompt_length=block_size)
    decoder_prompt, encoder_prompt, seq_group = created
    manager.allocate(seq_group)

    # Count the blocks held by the group before freeing it.
    decoder_block_count = len(manager.get_block_table(decoder_prompt))
    cross_block_count = len(manager.get_cross_block_table(seq_group))
    held_blocks = decoder_block_count + cross_block_count

    # Free the allocated decoder sequence and the cross (encoder) blocks.
    free_before = manager.get_num_free_gpu_blocks()
    manager.free(decoder_prompt)
    manager.free_cross(seq_group)
    free_after = manager.get_num_free_gpu_blocks()
    assert free_after == free_before + held_blocks

    # Block tables for the freed encoder & decoder seqs are deleted.
    # NOTE(review): the encoder blocks were read via get_cross_block_table()
    # above, so the encoder_prompt lookup below may raise KeyError whether or
    # not free_cross() ran - confirm this is the intended check.
    for freed_seq in (decoder_prompt, encoder_prompt):
        with pytest.raises(KeyError):
            manager.get_block_table(freed_seq)
def
test_reset
():
block_size
=
4
num_cpu_blocks
=
4
...
...
@@ -280,6 +477,31 @@ def test_reset():
assert
block_manager
.
get_num_free_gpu_blocks
()
==
original_blocks
def test_reset_encoder_decoder():
    """Check reset() frees every GPU block held by encoder/decoder groups."""
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    # Each group is created with one block of decoder prompt and one block
    # of encoder prompt.
    block_req_per_seq_group = 2
    manager = BlockSpaceManagerV1(block_size,
                                  num_cpu_blocks,
                                  num_gpu_blocks,
                                  watermark=0)

    # Record the free-block count, then allocate groups until no gpu block
    # is left.
    free_at_start = manager.get_num_free_gpu_blocks()
    group_count = num_gpu_blocks // block_req_per_seq_group
    for idx in range(group_count):
        _, _, seq_group = create_dummy_prompt_encoder_decoder(
            str(idx),
            decoder_prompt_length=block_size,
            encoder_prompt_length=block_size)
        manager.allocate(seq_group)
    assert manager.get_num_free_gpu_blocks() == 0

    # Resetting the block manager frees all allocated blocks.
    manager.reset()
    assert manager.get_num_free_gpu_blocks() == free_at_start
def
test_sliding_window_multi_seq
():
"""
Tests that memory allocation and deallocation is handled
...
...
@@ -298,9 +520,17 @@ def test_sliding_window_multi_seq():
assert
block_manager
.
get_num_free_gpu_blocks
()
==
num_gpu_blocks
parent
=
Sequence
(
1
,
"one two three"
,
[
0
,
1
,
2
],
block_size
)
seq_group
=
SequenceGroup
(
"1"
,
[
parent
],
SamplingParams
(),
time
.
time
(),
None
)
parent
=
Sequence
(
seq_id
=
1
,
inputs
=
{
"prompt"
:
"one two three"
,
"prompt_token_ids"
:
[
0
,
1
,
2
],
},
block_size
=
block_size
)
seq_group
=
SequenceGroup
(
request_id
=
"1"
,
seqs
=
[
parent
],
arrival_time
=
time
.
time
(),
sampling_params
=
SamplingParams
(),
lora_request
=
None
)
block_manager
.
allocate
(
seq_group
)
# assert the number of blocks allocated is correct
...
...
tests/core/test_chunked_prefill_scheduler.py
View file @
b9e12416
...
...
@@ -355,8 +355,8 @@ def test_swap():
_
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
assert
len
(
out
.
scheduled_seq_groups
)
==
0
assert
out
.
num_batched_tokens
==
0
assert
out
.
blocks_to_swap_out
!=
{}
assert
out
.
blocks_to_swap_in
==
{}
assert
out
.
blocks_to_swap_out
!=
[]
assert
out
.
blocks_to_swap_in
==
[]
# Add 1 more task. Swap should be prioritized over new prefill.
_
,
seq_group
=
create_dummy_prompt
(
"2"
,
prompt_length
=
60
)
...
...
@@ -365,8 +365,8 @@ def test_swap():
assert
len
(
out
.
scheduled_seq_groups
)
==
1
# 3 decodes. It is swapped in.
assert
out
.
num_batched_tokens
==
30
assert
out
.
blocks_to_swap_in
!=
{}
assert
out
.
blocks_to_swap_out
==
{}
assert
out
.
blocks_to_swap_in
!=
[]
assert
out
.
blocks_to_swap_out
==
[]
def
test_running_prefill_prioritized_over_swap
():
...
...
@@ -406,8 +406,8 @@ def test_running_prefill_prioritized_over_swap():
_
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
assert
len
(
out
.
scheduled_seq_groups
)
==
0
assert
out
.
num_batched_tokens
==
0
assert
out
.
blocks_to_swap_out
!=
{}
assert
out
.
blocks_to_swap_in
==
{}
assert
out
.
blocks_to_swap_out
!=
[]
assert
out
.
blocks_to_swap_in
==
[]
# Add 1 more task. Swap is not possible, so prefill is running.
scheduler
.
block_manager
.
can_swap_in
=
MagicMock
()
...
...
@@ -419,8 +419,8 @@ def test_running_prefill_prioritized_over_swap():
assert
len
(
out
.
scheduled_seq_groups
)
==
1
# 3 decodes. It is swapped in.
assert
out
.
num_batched_tokens
==
30
assert
out
.
blocks_to_swap_in
==
{}
assert
out
.
blocks_to_swap_out
==
{}
assert
out
.
blocks_to_swap_in
==
[]
assert
out
.
blocks_to_swap_out
==
[]
assert
out
.
scheduled_seq_groups
[
0
].
seq_group
==
seq_group2
# Now although swap is possible, running prefill is prioritized.
...
...
@@ -429,8 +429,8 @@ def test_running_prefill_prioritized_over_swap():
assert
len
(
out
.
scheduled_seq_groups
)
==
1
# 3 decodes. It is swapped in.
assert
out
.
num_batched_tokens
==
30
assert
out
.
blocks_to_swap_in
==
{}
assert
out
.
blocks_to_swap_out
==
{}
assert
out
.
blocks_to_swap_in
==
[]
assert
out
.
blocks_to_swap_out
==
[]
assert
not
seq_group2
.
is_prefill
()
assert
out
.
scheduled_seq_groups
[
0
].
seq_group
==
seq_group2
append_new_token
(
seq_group2
,
1
)
...
...
@@ -440,8 +440,8 @@ def test_running_prefill_prioritized_over_swap():
assert
len
(
out
.
scheduled_seq_groups
)
==
1
# 3 decodes. It is swapped in.
assert
out
.
num_batched_tokens
==
1
assert
out
.
blocks_to_swap_in
==
{}
assert
out
.
blocks_to_swap_out
==
{}
assert
out
.
blocks_to_swap_in
==
[]
assert
out
.
blocks_to_swap_out
==
[]
assert
not
seq_group2
.
is_prefill
()
assert
out
.
scheduled_seq_groups
[
0
].
seq_group
==
seq_group2
append_new_token
(
seq_group2
,
1
)
...
...
@@ -451,8 +451,8 @@ def test_running_prefill_prioritized_over_swap():
_
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
assert
len
(
out
.
scheduled_seq_groups
)
==
1
assert
out
.
num_batched_tokens
==
30
assert
out
.
blocks_to_swap_in
!=
{}
assert
out
.
blocks_to_swap_out
==
{}
assert
out
.
blocks_to_swap_in
!=
[]
assert
out
.
blocks_to_swap_out
==
[]
def
test_chunked_prefill_preempt
():
...
...
@@ -493,8 +493,8 @@ def test_chunked_prefill_preempt():
_
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
assert
len
(
out
.
scheduled_seq_groups
)
==
0
assert
out
.
num_batched_tokens
==
0
assert
out
.
blocks_to_swap_out
==
{}
assert
out
.
blocks_to_swap_in
==
{}
assert
out
.
blocks_to_swap_out
==
[]
assert
out
.
blocks_to_swap_in
==
[]
# Make sure we can reschedule preempted request.
_
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
...
...
tests/core/test_scheduler.py
View file @
b9e12416
...
...
@@ -180,6 +180,7 @@ def test_scheduler_schedule_preempt_abort():
and
not
out
.
blocks_to_swap_out
)
assert
len
(
seq_group_meta
)
==
1
assert
scheduler
.
get_num_unfinished_seq_groups
()
==
2
assert
out
.
preempted
==
1
# Abort seq group a. Re-schedule seq group b prompt with recomputation.
scheduler
.
abort_seq_group
(
"1"
)
...
...
@@ -293,8 +294,8 @@ def test_swapped_out_prioritized():
seq_group_meta
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
assert
len
(
out
.
scheduled_seq_groups
)
==
2
assert
out
.
num_batched_tokens
==
2
assert
out
.
blocks_to_swap_out
!=
{}
assert
out
.
blocks_to_swap_in
==
{}
assert
out
.
blocks_to_swap_out
!=
[]
assert
out
.
blocks_to_swap_in
==
[]
append_new_token
(
out
,
1
)
# Add 1 more task. Swap should be prioritized over prefill.
...
...
@@ -305,8 +306,8 @@ def test_swapped_out_prioritized():
assert
len
(
out
.
scheduled_seq_groups
)
==
3
# 3 decodes. It is swapped in.
assert
out
.
num_batched_tokens
==
3
assert
out
.
blocks_to_swap_in
!=
{}
assert
out
.
blocks_to_swap_out
==
{}
assert
out
.
blocks_to_swap_in
!=
[]
assert
out
.
blocks_to_swap_out
==
[]
def
initialize_scheduler
(
*
,
...
...
@@ -566,9 +567,9 @@ def test_decode_schedule_preempted():
# NOTE: When enable_chunk is False, num_seqs budget is not updated.
# assert budget.num_curr_seqs == 1
# Both should be preempted, not swapped.
assert
output
.
blocks_to_swap_out
==
{}
assert
output
.
blocks_to_swap_out
==
[]
# Nothing is copied.
assert
output
.
blocks_to_copy
==
{}
assert
output
.
blocks_to_copy
==
[]
def
test_decode_swap_beam_search
():
...
...
@@ -599,7 +600,7 @@ def test_decode_swap_beam_search():
scheduler
.
block_manager
.
can_append_slots
.
side_effect
=
(
cannot_append_second_group
)
scheduler
.
block_manager
.
swap_out
=
MagicMock
()
expected_swap_mapping
=
{
"5"
:
"7"
}
expected_swap_mapping
=
[(
"5"
,
"7"
)]
scheduler
.
block_manager
.
swap_out
.
return_value
=
expected_swap_mapping
remainig_running
,
output
=
scheduler
.
_schedule_running
(
...
...
@@ -618,7 +619,7 @@ def test_decode_swap_beam_search():
# Both should be preempted, not swapped.
assert
output
.
blocks_to_swap_out
==
expected_swap_mapping
# Nothing is copied.
assert
output
.
blocks_to_copy
==
{}
assert
output
.
blocks_to_copy
==
[]
def
test_schedule_decode_blocks_to_copy_update
():
...
...
@@ -636,7 +637,7 @@ def test_schedule_decode_blocks_to_copy_update():
# The last request should be swapped out.
scheduler
.
block_manager
.
append_slots
=
MagicMock
()
scheduler
.
block_manager
.
append_slots
.
return_value
=
{
2
:
[
3
]}
scheduler
.
block_manager
.
append_slots
.
return_value
=
[(
2
,
3
)]
budget
=
create_token_budget
()
remaining_running
,
output
=
scheduler
.
_schedule_running
(
...
...
@@ -647,10 +648,10 @@ def test_schedule_decode_blocks_to_copy_update():
assert
len
(
output
.
preempted
)
==
0
assert
len
(
output
.
swapped_out
)
==
0
# Nothing is preempted.
assert
output
.
blocks_to_swap_out
==
{}
assert
output
.
blocks_to_swap_out
==
[]
# Since append_slots returns the source -> dest mapping, it should
# be applied.
assert
output
.
blocks_to_copy
==
{
2
:
[
3
]}
assert
output
.
blocks_to_copy
==
[(
2
,
3
)]
def
test_schedule_swapped_simple
():
...
...
@@ -658,7 +659,7 @@ def test_schedule_swapped_simple():
swapped
=
deque
()
policy
=
PolicyFactory
.
get_policy
(
policy_name
=
"fcfs"
)
curr_loras
=
None
blocks_to_swap_out
=
{}
blocks_to_swap_out
=
[]
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
...
...
@@ -674,9 +675,9 @@ def test_schedule_swapped_simple():
assert
len
(
output
.
decode_seq_groups
)
==
1
assert
len
(
output
.
prefill_seq_groups
)
==
0
# swap in is the reverse of swap out
blocks_to_swap_in_reverse
=
{}
for
swapin
,
swapout
in
output
.
blocks_to_swap_in
.
items
()
:
blocks_to_swap_in_reverse
[
swapout
]
=
swapin
blocks_to_swap_in_reverse
=
[]
for
swapin
,
swapout
in
output
.
blocks_to_swap_in
:
blocks_to_swap_in_reverse
.
append
((
swapout
,
swapin
))
assert
blocks_to_swap_out
==
blocks_to_swap_in_reverse
...
...
@@ -685,7 +686,7 @@ def test_schedule_swapped_max_token_budget():
swapped
=
deque
()
policy
=
PolicyFactory
.
get_policy
(
policy_name
=
"fcfs"
)
curr_loras
=
None
blocks_to_swap_out
=
{}
blocks_to_swap_out
=
[]
for
_
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
...
...
@@ -719,7 +720,7 @@ def test_schedule_swapped_max_seqs():
swapped
=
deque
()
policy
=
PolicyFactory
.
get_policy
(
policy_name
=
"fcfs"
)
curr_loras
=
None
blocks_to_swap_out
=
{}
blocks_to_swap_out
=
[]
for
i
in
range
(
4
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
...
...
@@ -752,7 +753,7 @@ def test_schedule_swapped_max_loras():
swapped
=
deque
()
policy
=
PolicyFactory
.
get_policy
(
policy_name
=
"fcfs"
)
curr_loras
=
set
()
blocks_to_swap_out
=
{}
blocks_to_swap_out
=
[]
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
...
...
@@ -781,7 +782,7 @@ def test_schedule_swapped_cannot_swap_in():
swapped
=
deque
()
policy
=
PolicyFactory
.
get_policy
(
policy_name
=
"fcfs"
)
curr_loras
=
None
blocks_to_swap_out
=
{}
blocks_to_swap_out
=
[]
for
_
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
...
...
@@ -808,7 +809,7 @@ def test_infeasible_swap():
swapped
=
deque
()
policy
=
PolicyFactory
.
get_policy
(
policy_name
=
"fcfs"
)
curr_loras
=
None
blocks_to_swap_out
=
{}
blocks_to_swap_out
=
[]
for
_
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
...
...
@@ -839,13 +840,13 @@ def test_schedule_swapped_blocks_to_copy():
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
blocks_to_swap_out
=
{}
blocks_to_swap_out
=
[]
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
swapped
.
append
(
seq_group
)
# The last request should be swapped out.
scheduler
.
block_manager
.
append_slots
=
MagicMock
()
scheduler
.
block_manager
.
append_slots
.
return_value
=
{
2
:
[
3
]}
scheduler
.
block_manager
.
append_slots
.
return_value
=
[(
2
,
3
)]
budget
=
create_token_budget
()
remaining_swapped
,
output
=
scheduler
.
_schedule_swapped
(
...
...
@@ -853,7 +854,7 @@ def test_schedule_swapped_blocks_to_copy():
assert
len
(
remaining_swapped
)
==
0
assert
len
(
output
.
decode_seq_groups
)
==
1
assert
len
(
output
.
prefill_seq_groups
)
==
0
assert
output
.
blocks_to_copy
==
{
2
:
[
3
]}
assert
output
.
blocks_to_copy
==
[(
2
,
3
)]
def
test_scheduling_budget
():
...
...
tests/core/utils.py
View file @
b9e12416
...
...
@@ -21,15 +21,69 @@ def create_dummy_prompt(
# and prompt "0 ... block_size".
prompt_tokens
=
list
(
range
(
prompt_length
))
prompt_str
=
" "
.
join
([
str
(
t
)
for
t
in
prompt_tokens
])
prompt
=
Sequence
(
int
(
request_id
),
prompt_str
,
prompt_tokens
,
block_size
)
seq_group
=
SequenceGroup
(
request_id
,
[
prompt
],
SamplingParams
(
use_beam_search
=
use_beam_search
,
best_of
=
best_of
),
time
.
time
(),
lora_request
)
prompt
=
Sequence
(
int
(
request_id
),
inputs
=
{
"prompt"
:
prompt_str
,
"prompt_token_ids"
:
prompt_tokens
,
},
block_size
=
block_size
)
seq_group
=
SequenceGroup
(
request_id
=
request_id
,
seqs
=
[
prompt
],
arrival_time
=
time
.
time
(),
sampling_params
=
SamplingParams
(
use_beam_search
=
use_beam_search
,
best_of
=
best_of
),
lora_request
=
lora_request
)
return
prompt
,
seq_group
def create_dummy_prompt_encoder_decoder(
    request_id: str,
    decoder_prompt_length: int,
    encoder_prompt_length: int,
    block_size: Optional[int] = None,
    lora_request: Optional[LoRARequest] = None,
    use_beam_search: bool = False,
    best_of: int = 1,
) -> Tuple[Sequence, Sequence, SequenceGroup]:
    """Create a dummy decoder prompt, encoder prompt and their sequence group.

    Args:
        request_id: Request identifier; it is passed as-is to the
            ``SequenceGroup`` and converted with ``int()`` to obtain the
            sequence id of both sequences.
        decoder_prompt_length: Number of tokens in the decoder prompt.
        encoder_prompt_length: Number of tokens in the encoder prompt.
        block_size: Block size handed to both sequences. When falsy
            (``None`` or ``0``) it defaults to ``decoder_prompt_length``.
        lora_request: Optional LoRA request forwarded to the sequence group.
        use_beam_search: Forwarded to ``SamplingParams``.
        best_of: Forwarded to ``SamplingParams``.

    Returns:
        A ``(decoder_prompt, encoder_prompt, seq_group)`` tuple, where
        ``seq_group`` holds the decoder prompt as its only sequence and the
        encoder prompt as its ``encoder_seq``.
    """
    if not block_size:
        block_size = decoder_prompt_length

    # Decoder prompt: tokens 0 .. decoder_prompt_length - 1, with the prompt
    # text being the same numbers joined by single spaces.
    decoder_prompt_tokens = list(range(decoder_prompt_length))
    decoder_prompt_str = " ".join(str(t) for t in decoder_prompt_tokens)
    decoder_prompt = Sequence(
        int(request_id),
        inputs={
            "prompt": decoder_prompt_str,
            "prompt_token_ids": decoder_prompt_tokens,
            "multi_modal_data": None,
        },
        block_size=block_size,
    )

    # Encoder prompt: the descending range encoder_prompt_length - 1 .. 0.
    encoder_prompt_tokens = list(reversed(range(encoder_prompt_length)))
    encoder_prompt_str = " ".join(str(t) for t in encoder_prompt_tokens)
    # NOTE: the encoder sequence is given the same sequence id as the decoder
    # sequence (both use int(request_id)).
    encoder_prompt = Sequence(
        int(request_id),
        inputs={
            "prompt": encoder_prompt_str,
            "prompt_token_ids": encoder_prompt_tokens,
            "multi_modal_data": None,
        },
        block_size=block_size,
    )

    seq_group = SequenceGroup(
        request_id=request_id,
        seqs=[decoder_prompt],
        sampling_params=SamplingParams(use_beam_search=use_beam_search,
                                       best_of=best_of),
        arrival_time=time.time(),
        lora_request=lora_request,
        encoder_seq=encoder_prompt,
    )

    return decoder_prompt, encoder_prompt, seq_group
def
create_seq_group
(
seq_prompt_len
:
int
=
1024
,
seq_output_lens
:
Iterable
[
int
]
=
(
128
,
),
...
...
@@ -48,8 +102,7 @@ def create_seq_group(
for
seq_id_offset
,
output_len
in
enumerate
(
seq_output_lens
):
seq
=
Sequence
(
seq_id
=
seq_id_start
+
seq_id_offset
,
prompt
=
""
,
prompt_token_ids
=
prompt_token_ids
,
inputs
=
{
"prompt_token_ids"
:
prompt_token_ids
},
block_size
=
16
,
)
...
...
@@ -70,5 +123,56 @@ def create_seq_group(
return
seq_group
def create_seq_group_encoder_decoder(
        seq_prompt_len: int = 1024,
        seq_output_lens: Iterable[int] = (128, ),
        request_id: str = '0',
        seq_id_start: int = 0,
        sampling_params: Optional[SamplingParams] = None) -> SequenceGroup:
    """Build a sequence group with decoder sequences and one encoder sequence.

    One decoder sequence is created per entry of ``seq_output_lens``; each
    gets a prompt of ``seq_prompt_len`` zero tokens and then has
    ``output_len`` tokens (ids ``0 .. output_len - 1``, each with a
    ``Logprob(0.0)``) appended. Decoder sequence ids start at
    ``seq_id_start``; the encoder sequence takes the next id after the last
    decoder sequence. Default ``SamplingParams()`` are used when
    ``sampling_params`` is None.
    """
    assert len(seq_output_lens) > 0

    if sampling_params is None:
        sampling_params = SamplingParams()

    # Every sequence (decoder and encoder) shares this all-zero prompt list.
    prompt_token_ids = [0] * seq_prompt_len

    def _new_sequence(seq_id):
        # All sequences use an empty prompt string and a block size of 16.
        return Sequence(
            seq_id=seq_id,
            inputs={
                "prompt": "",
                "prompt_token_ids": prompt_token_ids,
                "multi_modal_data": None,
            },
            block_size=16,
        )

    decoder_seqs = []
    for offset, num_output_tokens in enumerate(seq_output_lens):
        decoder_seq = _new_sequence(seq_id_start + offset)
        # Simulate already-generated output tokens 0 .. num_output_tokens - 1.
        for token in range(num_output_tokens):
            decoder_seq.append_token_id(
                token_id=token,
                logprobs={token: Logprob(0.0)},
            )
        decoder_seqs.append(decoder_seq)

    # Encoder sequence
    encoder_seq = _new_sequence(seq_id_start + len(seq_output_lens))

    return SequenceGroup(request_id=request_id,
                         seqs=decoder_seqs,
                         sampling_params=sampling_params,
                         arrival_time=time.time(),
                         encoder_seq=encoder_seq)
def round_up_to_next_block(seq_len: int, block_size: int) -> int:
    """Return the number of blocks needed to hold ``seq_len`` tokens.

    This is the ceiling of ``seq_len / block_size`` computed with integer
    arithmetic, i.e. the result is a block count, not a token count.

    Args:
        seq_len: Sequence length in tokens.
        block_size: Number of tokens per block.

    Returns:
        ``ceil(seq_len / block_size)`` as an int.
    """
    # Adding block_size - 1 before floor division rounds any partial block up.
    return (seq_len + block_size - 1) // block_size
\ No newline at end of file
tests/distributed/__init__.py
0 → 100644
View file @
b9e12416
tests/distributed/test_basic_distributed_correctness.py
View file @
b9e12416
...
...
@@ -4,10 +4,12 @@ by one. The solution is to pass arguments (model name) by environment
variables.
Run:
```sh
cd $VLLM_PATH/tests
TEST_DIST_MODEL=facebook/opt-125m pytest
\
test_basic_distributed_correctness.py
distributed/
test_basic_distributed_correctness.py
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf
\
test_basic_distributed_correctness.py
distributed/
test_basic_distributed_correctness.py
```
"""
import
os
...
...
@@ -18,6 +20,7 @@ import torch
MODELS
=
[
os
.
environ
[
"TEST_DIST_MODEL"
],
]
DISTRIBUTED_EXECUTOR_BACKEND
=
"DISTRIBUTED_EXECUTOR_BACKEND"
VLLM_ATTENTION_BACKEND
=
"VLLM_ATTENTION_BACKEND"
...
...
@@ -34,19 +37,21 @@ def test_models(
dtype
:
str
,
max_tokens
:
int
,
)
->
None
:
enforce_eager
=
False
distributed_executor_backend
=
os
.
getenv
(
DISTRIBUTED_EXECUTOR_BACKEND
)
backend_by_env_var
=
os
.
getenv
(
VLLM_ATTENTION_BACKEND
)
if
backend_by_env_var
==
"FLASHINFER"
:
enforce_eager
=
True
enforce_eager
=
backend_by_env_var
==
"FLASHINFER"
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
hf_model
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
,
tensor_parallel_size
=
2
,
enforce_eager
=
enforce_eager
)
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
,
tensor_parallel_size
=
2
,
enforce_eager
=
enforce_eager
,
distributed_executor_backend
=
distributed_executor_backend
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
vllm_model
...
...
tests/distributed/test_chunked_prefill_distributed.py
View file @
b9e12416
...
...
@@ -19,6 +19,7 @@ import torch
MODELS
=
[
os
.
environ
[
"TEST_DIST_MODEL"
],
]
DISTRIBUTED_EXECUTOR_BACKEND
=
"DISTRIBUTED_EXECUTOR_BACKEND"
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
...
...
@@ -36,6 +37,8 @@ def test_models(
max_tokens
:
int
,
chunked_prefill_token_size
:
int
,
)
->
None
:
distributed_executor_backend
=
os
.
getenv
(
DISTRIBUTED_EXECUTOR_BACKEND
)
# Add a chunked prefill config.
max_num_seqs
=
min
(
chunked_prefill_token_size
,
256
)
assert
chunked_prefill_token_size
!=
-
1
...
...
@@ -53,6 +56,7 @@ def test_models(
max_num_seqs
=
max_num_seqs
,
enable_chunked_prefill
=
enable_chunked_prefill
,
max_num_batched_tokens
=
max_num_batched_tokens
,
distributed_executor_backend
=
distributed_executor_backend
,
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
vllm_model
...
...
Prev
1
…
4
5
6
7
8
9
10
11
12
…
18
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment