Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7c4f76e3
Commit
7c4f76e3
authored
Apr 15, 2024
by
zhuwenwen
Browse files
merge v0.4.0
parents
2da0dd3e
51c31bc1
Changes
332
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
2273 additions
and
86 deletions
+2273
-86
tests/conftest.py
tests/conftest.py
+156
-13
tests/core/__init__.py
tests/core/__init__.py
+0
-0
tests/core/block/__init__.py
tests/core/block/__init__.py
+0
-0
tests/core/block/e2e/conftest.py
tests/core/block/e2e/conftest.py
+56
-0
tests/core/block/e2e/test_correctness.py
tests/core/block/e2e/test_correctness.py
+86
-0
tests/core/block/test_block_space_manager.py
tests/core/block/test_block_space_manager.py
+50
-0
tests/core/block/test_block_table.py
tests/core/block/test_block_table.py
+500
-0
tests/core/block/test_common.py
tests/core/block/test_common.py
+42
-0
tests/core/block/test_cpu_gpu_block_allocator.py
tests/core/block/test_cpu_gpu_block_allocator.py
+93
-0
tests/core/block/test_naive_block.py
tests/core/block/test_naive_block.py
+102
-0
tests/core/block/test_prefix_caching_block.py
tests/core/block/test_prefix_caching_block.py
+384
-0
tests/core/test_block_manager.py
tests/core/test_block_manager.py
+367
-0
tests/core/test_scheduler.py
tests/core/test_scheduler.py
+209
-0
tests/core/utils.py
tests/core/utils.py
+64
-0
tests/distributed/test_basic_distributed_correctness.py
tests/distributed/test_basic_distributed_correctness.py
+13
-4
tests/distributed/test_comm_ops.py
tests/distributed/test_comm_ops.py
+24
-6
tests/distributed/test_custom_all_reduce.py
tests/distributed/test_custom_all_reduce.py
+1
-1
tests/distributed/test_pynccl.py
tests/distributed/test_pynccl.py
+92
-0
tests/engine/test_computed_prefix_blocks.py
tests/engine/test_computed_prefix_blocks.py
+34
-0
tests/engine/test_detokenize.py
tests/engine/test_detokenize.py
+0
-62
No files found.
tests/conftest.py
View file @
7c4f76e3
import
contextlib
import
gc
import
os
import
os
from
typing
import
List
,
Optional
,
Tuple
from
typing
import
List
,
Optional
,
Tuple
import
pytest
import
pytest
import
torch
import
torch
from
transformers
import
AutoModelForCausalLM
from
PIL
import
Image
from
transformers
import
(
AutoModelForCausalLM
,
AutoProcessor
,
LlavaForConditionalGeneration
)
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.config
import
TokenizerPoolConfig
,
VisionLanguageConfig
from
vllm.model_executor.parallel_utils.parallel_state
import
(
destroy_model_parallel
)
from
vllm.sequence
import
MultiModalData
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
_TEST_DIR
=
os
.
path
.
dirname
(
__file__
)
_TEST_DIR
=
os
.
path
.
dirname
(
__file__
)
_TEST_PROMPTS
=
[
os
.
path
.
join
(
_TEST_DIR
,
"prompts"
,
"example.txt"
)]
_TEST_PROMPTS
=
[
os
.
path
.
join
(
_TEST_DIR
,
"prompts"
,
"example.txt"
)]
_LONG_PROMPTS
=
[
os
.
path
.
join
(
_TEST_DIR
,
"prompts"
,
"summary.txt"
)]
_LONG_PROMPTS
=
[
os
.
path
.
join
(
_TEST_DIR
,
"prompts"
,
"summary.txt"
)]
# Multi modal related
_PIXEL_VALUES_FILES
=
[
os
.
path
.
join
(
_TEST_DIR
,
"images"
,
filename
)
for
filename
in
[
"stop_sign_pixel_values.pt"
,
"cherry_blossom_pixel_values.pt"
]
]
_IMAGE_FEATURES_FILES
=
[
os
.
path
.
join
(
_TEST_DIR
,
"images"
,
filename
)
for
filename
in
[
"stop_sign_image_features.pt"
,
"cherry_blossom_image_features.pt"
]
]
_IMAGE_FILES
=
[
os
.
path
.
join
(
_TEST_DIR
,
"images"
,
filename
)
for
filename
in
[
"stop_sign.jpg"
,
"cherry_blossom.jpg"
]
]
_IMAGE_PROMPTS
=
[
"<image>
\n
USER: What's the content of the image?
\n
ASSISTANT:"
,
"<image>
\n
USER: What is the season?
\n
ASSISTANT:"
]
assert
len
(
_PIXEL_VALUES_FILES
)
==
len
(
_IMAGE_FEATURES_FILES
)
==
len
(
_IMAGE_FILES
)
==
len
(
_IMAGE_PROMPTS
)
def
_read_prompts
(
filename
:
str
)
->
List
[
str
]:
def
_read_prompts
(
filename
:
str
)
->
List
[
str
]:
with
open
(
filename
,
"r"
)
as
f
:
with
open
(
filename
,
"r"
)
as
f
:
...
@@ -19,6 +47,53 @@ def _read_prompts(filename: str) -> List[str]:
...
@@ -19,6 +47,53 @@ def _read_prompts(filename: str) -> List[str]:
return
prompts
return
prompts
def cleanup():
    """Release distributed and GPU state left behind by a test.

    Calls destroy_model_parallel(), tears down the torch.distributed process
    group, forces a garbage-collection pass and finally empties the CUDA
    caching allocator.
    """
    destroy_model_parallel()
    # An AssertionError from destroy_process_group() is deliberately ignored
    # (presumably raised when no process group exists -- TODO confirm).
    with contextlib.suppress(AssertionError):
        torch.distributed.destroy_process_group()
    # Collect garbage before emptying the cache so that tensors that just
    # became unreachable no longer pin cached blocks.
    gc.collect()
    torch.cuda.empty_cache()
@pytest.fixture(autouse=True)
def cleanup_fixture():
    """Autouse fixture: run cleanup() once the test body has finished."""
    # There is no set-up step; everything after the yield is teardown.
    yield
    cleanup()
@pytest.fixture(scope="session")
def hf_image_prompts() -> List[str]:
    """Session-scoped fixture exposing the module-level _IMAGE_PROMPTS list.

    The list object itself is returned (not a copy).
    """
    return _IMAGE_PROMPTS
@pytest.fixture(scope="session")
def hf_images() -> List[Image.Image]:
    """Session-scoped fixture opening every path in _IMAGE_FILES with PIL.

    Returns:
        One PIL image per entry of _IMAGE_FILES, in the same order.
    """
    opened_images = []
    for image_path in _IMAGE_FILES:
        opened_images.append(Image.open(image_path))
    return opened_images
@pytest.fixture()
def vllm_images(request) -> "torch.Tensor":
    """Load the saved image tensors that match the model's image input type.

    The vision-language config is taken from element 1 of the
    "model_and_config" fixture. When its image_input_type is IMAGE_FEATURES
    the files in _IMAGE_FEATURES_FILES are loaded, otherwise the files in
    _PIXEL_VALUES_FILES.

    Returns:
        All loaded tensors concatenated along dimension 0.
    """
    vl_config = request.getfixturevalue("model_and_config")[1]
    wants_features = (vl_config.image_input_type ==
                      VisionLanguageConfig.ImageInputType.IMAGE_FEATURES)
    tensor_files = (_IMAGE_FEATURES_FILES
                    if wants_features else _PIXEL_VALUES_FILES)
    loaded = [torch.load(tensor_file) for tensor_file in tensor_files]
    return torch.concat(loaded, dim=0)
@pytest.fixture()
def vllm_image_prompts(request) -> List[str]:
    """Return _IMAGE_PROMPTS with extra "<image>" placeholders prepended.

    Each prompt is prefixed with (image_feature_size - 1) copies of "<image>",
    where image_feature_size comes from the vision-language config stored at
    index 1 of the "model_and_config" fixture.
    """
    vl_config = request.getfixturevalue("model_and_config")[1]
    # The prefix is identical for every prompt, so build it once.
    extra_placeholders = "<image>" * (vl_config.image_feature_size - 1)
    return [extra_placeholders + prompt for prompt in _IMAGE_PROMPTS]
@
pytest
.
fixture
@
pytest
.
fixture
def
example_prompts
()
->
List
[
str
]:
def
example_prompts
()
->
List
[
str
]:
prompts
=
[]
prompts
=
[]
...
@@ -41,6 +116,10 @@ _STR_DTYPE_TO_TORCH_DTYPE = {
...
@@ -41,6 +116,10 @@ _STR_DTYPE_TO_TORCH_DTYPE = {
"float"
:
torch
.
float
,
"float"
:
torch
.
float
,
}
}
_VISION_LANGUAGE_MODELS
=
{
"llava-hf/llava-1.5-7b-hf"
:
LlavaForConditionalGeneration
,
}
class
HfRunner
:
class
HfRunner
:
...
@@ -52,11 +131,24 @@ class HfRunner:
...
@@ -52,11 +131,24 @@ class HfRunner:
)
->
None
:
)
->
None
:
assert
dtype
in
_STR_DTYPE_TO_TORCH_DTYPE
assert
dtype
in
_STR_DTYPE_TO_TORCH_DTYPE
torch_dtype
=
_STR_DTYPE_TO_TORCH_DTYPE
[
dtype
]
torch_dtype
=
_STR_DTYPE_TO_TORCH_DTYPE
[
dtype
]
self
.
model
=
AutoModelForCausalLM
.
from_pretrained
(
self
.
model_name
=
model_name
model_name
,
if
model_name
not
in
_VISION_LANGUAGE_MODELS
:
torch_dtype
=
torch_dtype
,
self
.
model
=
AutoModelForCausalLM
.
from_pretrained
(
trust_remote_code
=
True
,
model_name
,
).
cuda
()
torch_dtype
=
torch_dtype
,
trust_remote_code
=
True
,
).
cuda
()
self
.
processor
=
None
else
:
self
.
model
=
_VISION_LANGUAGE_MODELS
[
model_name
].
from_pretrained
(
model_name
,
torch_dtype
=
torch_dtype
,
trust_remote_code
=
True
,
).
cuda
()
self
.
processor
=
AutoProcessor
.
from_pretrained
(
model_name
,
torch_dtype
=
torch_dtype
,
)
if
tokenizer_name
is
None
:
if
tokenizer_name
is
None
:
tokenizer_name
=
model_name
tokenizer_name
=
model_name
self
.
tokenizer
=
get_tokenizer
(
tokenizer_name
,
trust_remote_code
=
True
)
self
.
tokenizer
=
get_tokenizer
(
tokenizer_name
,
trust_remote_code
=
True
)
...
@@ -64,13 +156,28 @@ class HfRunner:
...
@@ -64,13 +156,28 @@ class HfRunner:
def
generate
(
def
generate
(
self
,
self
,
prompts
:
List
[
str
],
prompts
:
List
[
str
],
images
:
Optional
[
List
[
Image
.
Image
]]
=
None
,
**
kwargs
,
**
kwargs
,
)
->
List
[
Tuple
[
List
[
int
],
str
]]:
)
->
List
[
Tuple
[
List
[
int
],
str
]]:
outputs
:
List
[
Tuple
[
List
[
int
],
str
]]
=
[]
outputs
:
List
[
Tuple
[
List
[
int
],
str
]]
=
[]
for
prompt
in
prompts
:
if
images
:
input_ids
=
self
.
tokenizer
(
prompt
,
return_tensors
=
"pt"
).
input_ids
assert
len
(
prompts
)
==
len
(
images
)
for
i
,
prompt
in
enumerate
(
prompts
):
if
self
.
model_name
not
in
_VISION_LANGUAGE_MODELS
:
input_ids
=
self
.
tokenizer
(
prompt
,
return_tensors
=
"pt"
).
input_ids
inputs
=
{
"input_ids"
:
input_ids
.
cuda
()}
else
:
image
=
images
[
i
]
if
images
else
None
inputs
=
self
.
processor
(
text
=
prompt
,
images
=
image
,
return_tensors
=
"pt"
)
inputs
=
{
key
:
value
.
cuda
()
if
value
is
not
None
else
None
for
key
,
value
in
inputs
.
items
()
}
output_ids
=
self
.
model
.
generate
(
output_ids
=
self
.
model
.
generate
(
input
_ids
.
cuda
()
,
**
input
s
,
use_cache
=
True
,
use_cache
=
True
,
**
kwargs
,
**
kwargs
,
)
)
...
@@ -87,10 +194,12 @@ class HfRunner:
...
@@ -87,10 +194,12 @@ class HfRunner:
self
,
self
,
prompts
:
List
[
str
],
prompts
:
List
[
str
],
max_tokens
:
int
,
max_tokens
:
int
,
images
:
Optional
[
"torch.Tensor"
]
=
None
,
)
->
List
[
Tuple
[
List
[
int
],
str
]]:
)
->
List
[
Tuple
[
List
[
int
],
str
]]:
outputs
=
self
.
generate
(
prompts
,
outputs
=
self
.
generate
(
prompts
,
do_sample
=
False
,
do_sample
=
False
,
max_new_tokens
=
max_tokens
)
max_new_tokens
=
max_tokens
,
images
=
images
)
for
i
in
range
(
len
(
outputs
)):
for
i
in
range
(
len
(
outputs
)):
output_ids
,
output_str
=
outputs
[
i
]
output_ids
,
output_str
=
outputs
[
i
]
outputs
[
i
]
=
(
output_ids
[
0
],
output_str
[
0
])
outputs
[
i
]
=
(
output_ids
[
0
],
output_str
[
0
])
...
@@ -150,6 +259,10 @@ class HfRunner:
...
@@ -150,6 +259,10 @@ class HfRunner:
all_logprobs
.
append
(
seq_logprobs
)
all_logprobs
.
append
(
seq_logprobs
)
return
all_logprobs
return
all_logprobs
def __del__(self):
    """Drop the model reference and run cleanup() when the runner dies."""
    # Remove the attribute first so that cleanup()'s gc / empty_cache pass
    # is not blocked by this instance still referencing the model.
    del self.model
    cleanup()
@
pytest
.
fixture
@
pytest
.
fixture
def
hf_runner
():
def
hf_runner
():
...
@@ -162,9 +275,14 @@ class VllmRunner:
...
@@ -162,9 +275,14 @@ class VllmRunner:
self
,
self
,
model_name
:
str
,
model_name
:
str
,
tokenizer_name
:
Optional
[
str
]
=
None
,
tokenizer_name
:
Optional
[
str
]
=
None
,
# Use smaller max model length, otherwise bigger model cannot run due
# to kv cache size limit.
max_model_len
=
1024
,
dtype
:
str
=
"half"
,
dtype
:
str
=
"half"
,
disable_log_stats
:
bool
=
True
,
disable_log_stats
:
bool
=
True
,
tensor_parallel_size
:
int
=
1
,
tensor_parallel_size
:
int
=
1
,
block_size
:
int
=
16
,
enable_chunked_prefill
:
bool
=
False
,
**
kwargs
,
**
kwargs
,
)
->
None
:
)
->
None
:
self
.
model
=
LLM
(
self
.
model
=
LLM
(
...
@@ -175,6 +293,9 @@ class VllmRunner:
...
@@ -175,6 +293,9 @@ class VllmRunner:
swap_space
=
0
,
swap_space
=
0
,
disable_log_stats
=
disable_log_stats
,
disable_log_stats
=
disable_log_stats
,
tensor_parallel_size
=
tensor_parallel_size
,
tensor_parallel_size
=
tensor_parallel_size
,
max_model_len
=
max_model_len
,
block_size
=
block_size
,
enable_chunked_prefill
=
enable_chunked_prefill
,
**
kwargs
,
**
kwargs
,
)
)
...
@@ -182,9 +303,16 @@ class VllmRunner:
...
@@ -182,9 +303,16 @@ class VllmRunner:
self
,
self
,
prompts
:
List
[
str
],
prompts
:
List
[
str
],
sampling_params
:
SamplingParams
,
sampling_params
:
SamplingParams
,
images
:
Optional
[
"torch.Tensor"
]
=
None
,
)
->
List
[
Tuple
[
List
[
int
],
str
]]:
)
->
List
[
Tuple
[
List
[
int
],
str
]]:
req_outputs
=
self
.
model
.
generate
(
prompts
,
if
images
is
not
None
:
sampling_params
=
sampling_params
)
assert
len
(
prompts
)
==
images
.
shape
[
0
]
req_outputs
=
self
.
model
.
generate
(
prompts
,
sampling_params
=
sampling_params
,
multi_modal_data
=
MultiModalData
(
type
=
MultiModalData
.
Type
.
IMAGE
,
data
=
images
)
if
images
is
not
None
else
None
)
outputs
=
[]
outputs
=
[]
for
req_output
in
req_outputs
:
for
req_output
in
req_outputs
:
prompt_str
=
req_output
.
prompt
prompt_str
=
req_output
.
prompt
...
@@ -221,9 +349,10 @@ class VllmRunner:
...
@@ -221,9 +349,10 @@ class VllmRunner:
self
,
self
,
prompts
:
List
[
str
],
prompts
:
List
[
str
],
max_tokens
:
int
,
max_tokens
:
int
,
images
:
Optional
[
torch
.
Tensor
]
=
None
,
)
->
List
[
Tuple
[
List
[
int
],
str
]]:
)
->
List
[
Tuple
[
List
[
int
],
str
]]:
greedy_params
=
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
max_tokens
)
greedy_params
=
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
max_tokens
)
outputs
=
self
.
generate
(
prompts
,
greedy_params
)
outputs
=
self
.
generate
(
prompts
,
greedy_params
,
images
=
images
)
return
[(
output_ids
[
0
],
output_str
[
0
])
return
[(
output_ids
[
0
],
output_str
[
0
])
for
output_ids
,
output_str
in
outputs
]
for
output_ids
,
output_str
in
outputs
]
...
@@ -254,7 +383,21 @@ class VllmRunner:
...
@@ -254,7 +383,21 @@ class VllmRunner:
outputs
=
self
.
generate
(
prompts
,
beam_search_params
)
outputs
=
self
.
generate
(
prompts
,
beam_search_params
)
return
outputs
return
outputs
def __del__(self):
    """Drop the LLM reference and run cleanup() when the runner dies."""
    # Remove the attribute first so that cleanup()'s gc / empty_cache pass
    # is not blocked by this instance still referencing the engine.
    del self.model
    cleanup()
@
pytest
.
fixture
@
pytest
.
fixture
def
vllm_runner
():
def
vllm_runner
():
return
VllmRunner
return
VllmRunner
def get_tokenizer_pool_config(tokenizer_group_type):
    """Translate a tokenizer group type into a TokenizerPoolConfig.

    Args:
        tokenizer_group_type: None, or the string "ray".

    Returns:
        None when tokenizer_group_type is None; otherwise a
        TokenizerPoolConfig with pool_size=1, pool_type="ray" and an empty
        extra_config.

    Raises:
        ValueError: for any value other than None or "ray".
    """
    if tokenizer_group_type is None:
        return None
    if tokenizer_group_type != "ray":
        raise ValueError(
            f"Unknown tokenizer_group_type: {tokenizer_group_type}")
    return TokenizerPoolConfig(pool_size=1, pool_type="ray", extra_config={})
tests/
worker/spec_de
co
d
e/__init__.py
→
tests/co
r
e/__init__.py
View file @
7c4f76e3
File moved
vllm/model_executor/layers/triton_kernel
/__init__.py
→
tests/core/block
/__init__.py
View file @
7c4f76e3
File moved
tests/core/block/e2e/conftest.py
0 → 100644
View file @
7c4f76e3
import
contextlib
import
gc
import
pytest
import
ray
import
torch
from
vllm
import
LLM
from
vllm.model_executor.parallel_utils.parallel_state
import
(
destroy_model_parallel
)
from
vllm.model_executor.utils
import
set_random_seed
def cleanup():
    """Release distributed, GPU and Ray state left behind by an LLM.

    Calls destroy_model_parallel(), tears down the torch.distributed process
    group, forces a garbage-collection pass, empties the CUDA caching
    allocator and finally shuts Ray down.
    """
    destroy_model_parallel()
    # An AssertionError from destroy_process_group() is deliberately ignored
    # (presumably raised when no process group exists -- TODO confirm).
    with contextlib.suppress(AssertionError):
        torch.distributed.destroy_process_group()
    # Collect garbage before emptying the cache so that tensors that just
    # became unreachable no longer pin cached blocks.
    gc.collect()
    torch.cuda.empty_cache()
    ray.shutdown()
@pytest.fixture
def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                           baseline_llm_kwargs, seed):
    """Fixture returning the LLM generator for the baseline configuration.

    baseline_llm_kwargs is passed as the "distinct" kwargs, so it overrides
    the common and per-test kwargs on key collisions.
    """
    return create_llm_generator(
        common_llm_kwargs=common_llm_kwargs,
        per_test_common_llm_kwargs=per_test_common_llm_kwargs,
        distinct_llm_kwargs=baseline_llm_kwargs,
        seed=seed,
    )
@pytest.fixture
def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                       test_llm_kwargs, seed):
    """Fixture returning the LLM generator for the configuration under test.

    test_llm_kwargs is passed as the "distinct" kwargs, so it overrides the
    common and per-test kwargs on key collisions.
    """
    return create_llm_generator(
        common_llm_kwargs=common_llm_kwargs,
        per_test_common_llm_kwargs=per_test_common_llm_kwargs,
        distinct_llm_kwargs=test_llm_kwargs,
        seed=seed,
    )
def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                         distinct_llm_kwargs, seed):
    """Lazily build an LLM, yield it once, and clean up afterwards.

    This is a generator function: nothing (including the kwargs merge) runs
    until the caller starts iterating. The LLM is constructed on the first
    step; when the caller advances past the yielded value the local
    references are dropped and cleanup() runs.

    Args:
        common_llm_kwargs: Base keyword arguments for LLM().
        per_test_common_llm_kwargs: Per-test overrides of the base kwargs.
        distinct_llm_kwargs: Highest-priority overrides.
        seed: Value passed to set_random_seed() after the LLM is built.

    Yields:
        The constructed LLM instance (exactly once).
    """
    # Later dicts win on duplicate keys: common < per-test < distinct.
    llm_kwargs = dict(common_llm_kwargs)
    llm_kwargs.update(per_test_common_llm_kwargs)
    llm_kwargs.update(distinct_llm_kwargs)

    def llm_lifecycle():
        engine = LLM(**llm_kwargs)
        set_random_seed(seed)

        yield engine
        # Drop this frame's reference before cleanup() so the engine can be
        # garbage-collected there.
        del engine
        cleanup()

    for engine in llm_lifecycle():
        yield engine
        # Likewise drop the outer frame's reference before the inner
        # generator resumes and calls cleanup().
        del engine
tests/core/block/e2e/test_correctness.py
0 → 100644
View file @
7c4f76e3
from
itertools
import
cycle
import
pytest
from
vllm
import
SamplingParams
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # Use a small model for a fast test.
        "model": "facebook/opt-125m",

        # skip cuda graph creation for fast test.
        "enforce_eager": True,

        # Budget of 5 * (64 + 1) blocks of 16 tokens each; 64 blocks hold
        # the 1024 generated tokens of one sequence.
        "block_size": 16,
        "forced_num_gpu_blocks": 5 * (64 + 1),
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs",
                         [{
                             "use_v2_block_manager": False
                         }])
@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}])
@pytest.mark.parametrize("batch_size", [10])
@pytest.mark.parametrize("seed", [1])
def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,
                                               test_llm_generator, batch_size):
    """Check that block manager v2 matches v1 token-for-token under greedy
    sampling when GPU blocks are scarce.

    Two LLMs are built from the same kwargs except for use_v2_block_manager.
    Both get the same, deliberately small, number of GPU blocks while the
    batch holds more sequences than that budget can keep resident at full
    length, which is intended to force preemption as the sequences grow.

    Identical output token ids give confidence that the v2 block manager
    does not corrupt the KV cache.

    NOTE: A long generation (1024 tokens, EOS ignored) is used so that any
    incorrect KV mapping has time to show up in the output.
    """
    seed_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    # Repeat the seed prompts round-robin until there are batch_size of them.
    prompts = [
        prompt for _, prompt in zip(range(batch_size), cycle(seed_prompts))
    ]

    # temperature=0.0 selects greedy decoding, so both runs are comparable.
    sampling_params = SamplingParams(
        max_tokens=1024,
        ignore_eos=True,
        temperature=0.0,
    )

    print('Getting token ids from block manager v1')
    baseline_token_ids = get_token_ids_from_llm_generator(
        baseline_llm_generator, prompts, sampling_params)

    print('Getting token ids from block manager v2')
    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
                                                      prompts, sampling_params)

    # Per-sequence comparison first, so a failure points at one sequence.
    for expected_ids, actual_ids in zip(baseline_token_ids, test_token_ids):
        assert expected_ids == actual_ids

    # Whole-list comparison also catches a difference in list length.
    assert baseline_token_ids == test_token_ids
def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params):
    """Run generation on each LLM from llm_generator and return token ids.

    Args:
        llm_generator: Iterable yielding objects with a generate() method.
        prompts: Prompts forwarded to generate().
        sampling_params: Sampling parameters forwarded to generate().

    Returns:
        For the last LLM yielded, a list with the token ids of the first
        completion of every request output.
    """
    for llm in llm_generator:
        request_outputs = llm.generate(prompts,
                                       sampling_params,
                                       use_tqdm=True)
        token_ids = [
            request_output.outputs[0].token_ids
            for request_output in request_outputs
        ]
        # Drop the reference before the generator is advanced again so the
        # producer can release the LLM.
        del llm

    return token_ids
tests/core/block/test_block_space_manager.py
0 → 100644
View file @
7c4f76e3
import
pytest
from
vllm.core.block_manager_v2
import
BlockSpaceManagerV2
from
vllm.core.interfaces
import
AllocStatus
from
..utils
import
create_seq_group
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("num_gpu_blocks", [8, 40, 80])
@pytest.mark.parametrize("num_seqs_per_group", [1, 4])
@pytest.mark.parametrize("watermark", [0.0, 0.5])
def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int,
                                num_gpu_blocks: int, watermark: float):
    """Check BlockSpaceManagerV2.can_allocate() for growing prompt sizes.

    For every prompt size from one block up to (but excluding)
    num_gpu_blocks - 1 blocks, a sequence group is created and the returned
    AllocStatus is compared with the value expected from the GPU block
    count and the watermark.
    """
    block_manager = BlockSpaceManagerV2(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        watermark=watermark,
    )
    num_watermark_blocks = int(watermark * num_gpu_blocks)

    num_output_blocks_per_seq = 1

    # NOTE: This should be num_output_blocks_per_seq * num_seqs_per_group, but
    # the current implementation assumes all seqs are new prompts / don't have
    # different output lens.
    num_output_blocks = num_output_blocks_per_seq

    for num_prompt_blocks in range(1, num_gpu_blocks - num_output_blocks):
        num_required_blocks = num_prompt_blocks + num_output_blocks

        seq_group = create_seq_group(
            seq_prompt_lens=block_size * num_prompt_blocks,
            seq_output_lens=[
                block_size * num_output_blocks_per_seq
                for _ in range(num_seqs_per_group)
            ],
        )

        # Sanity check on the test itself: the request fits on the device.
        assert num_required_blocks <= num_gpu_blocks

        can_allocate_result = block_manager.can_allocate(seq_group)

        if num_gpu_blocks - num_required_blocks < num_watermark_blocks:
            expected_status = AllocStatus.NEVER
        elif num_gpu_blocks >= num_required_blocks:
            expected_status = AllocStatus.OK
        else:
            # NOTE: with the loop bounds above num_required_blocks is always
            # below num_gpu_blocks, so this branch is never taken.
            expected_status = AllocStatus.LATER

        assert can_allocate_result == expected_status
tests/core/block/test_block_table.py
0 → 100644
View file @
7c4f76e3
import
pytest
from
vllm.core.block.block_table
import
BlockTable
from
vllm.core.block.cpu_gpu_block_allocator
import
CpuGpuBlockAllocator
from
vllm.utils
import
Device
,
cdiv
,
chunk_list
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
def test_allocate_naive(block_size: int, sequence_len: int):
    """Allocate several BlockTables with the naive allocator.

    Five BlockTables are allocated for the same token sequence. Before each
    allocation the number of free GPU blocks must have dropped by exactly
    one sequence's worth of blocks per earlier allocation, i.e. nothing is
    shared between tables.
    """
    assert block_size > 1
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type="naive",
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))
    # One block per chunk of up to block_size tokens.
    num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))

    block_tables = []
    for num_prior_allocs in range(5):
        expected_free = num_gpu_blocks - num_prior_allocs * num_blocks_per_alloc
        assert allocator.get_num_free_blocks(
            device=Device.GPU) == expected_free

        table = BlockTable(
            block_size=block_size,
            block_allocator=allocator,
        )
        # Keep the table referenced for the rest of the test.
        block_tables.append(table)
        table.allocate(token_ids=token_ids, device=Device.GPU)
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
def test_allocate_prefix_caching(block_size: int, sequence_len: int):
    """Allocate several BlockTables with the prefix-caching allocator.

    Five BlockTables are allocated for the same token sequence. Full blocks
    are expected to be shared between all tables, so they are only counted
    once; a trailing partially-filled block is expected to be allocated
    separately for every table.
    """
    assert block_size > 1
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type="prefix_caching",
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))
    chunked_tokens = list(chunk_list(token_ids, block_size))

    # Only a partially-filled last chunk yields a per-table (mutable) block.
    last_chunk_is_full = len(chunked_tokens[-1]) == block_size
    num_mutable_blocks_per_alloc = 0 if last_chunk_is_full else 1
    num_immutable_blocks_per_alloc = (len(chunked_tokens) -
                                      num_mutable_blocks_per_alloc)

    block_tables = []
    for num_allocs_done in range(1, 6):
        table = BlockTable(
            block_size=block_size,
            block_allocator=allocator,
        )
        block_tables.append(table)
        table.allocate(token_ids=token_ids, device=Device.GPU)

        # Expect all sequences to share allocations, except for their last
        # block (which may be mutable).
        num_used_blocks = (num_immutable_blocks_per_alloc +
                           num_mutable_blocks_per_alloc * num_allocs_done)
        assert allocator.get_num_free_blocks(
            device=Device.GPU) == num_gpu_blocks - num_used_blocks
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
@pytest.mark.parametrize("device", ["cpu", "gpu"])
def test_allocate_free(block_size: int, sequence_len: int,
                       allocator_type: str, device: str):
    """Allocate and free one BlockTable repeatedly on a given device.

    After every allocation the free-block count on the device must have
    dropped by the number of blocks the sequence needs and every physical
    block id must be set; after every free the count must be back at the
    device's full capacity.
    """
    # Map the parametrized string ("cpu"/"gpu") onto the Device enum.
    device = Device[device.upper()]

    num_device_blocks = 1024
    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_device_blocks,
        num_cpu_blocks=num_device_blocks,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))
    num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))
    num_free_when_allocated = num_device_blocks - num_blocks_per_alloc

    block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )

    for _ in range(5):
        block_table.allocate(token_ids=token_ids, device=device)
        assert allocator.get_num_free_blocks(device) == num_free_when_allocated
        assert all(physical_id is not None
                   for physical_id in block_table.physical_block_ids)

        block_table.free()
        assert allocator.get_num_free_blocks(device) == num_device_blocks
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_append_token_ids_allocation(block_size: int, sequence_len: int,
                                     append_len: int, allocator_type: str):
    """Check how many blocks a BlockTable holds before and after an append.

    A BlockTable is allocated for an initial sequence and further token ids
    are appended. The number of physical blocks must equal the number of
    block_size chunks of the initial sequence before the append, and of the
    combined sequence after it.
    """
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))
    token_ids_to_append = list(range(append_len))

    block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )

    num_expected_blocks_before_append = len(
        list(chunk_list(token_ids, block_size)))
    num_expected_blocks_after_append = len(
        list(chunk_list(token_ids + token_ids_to_append, block_size)))
    num_expected_appended_blocks = (num_expected_blocks_after_append -
                                    num_expected_blocks_before_append)

    block_table.allocate(token_ids=token_ids, device=Device.GPU)
    assert len(
        block_table.physical_block_ids) == num_expected_blocks_before_append

    block_table.append_token_ids(token_ids_to_append)
    assert len(block_table.physical_block_ids) == (
        num_expected_blocks_before_append + num_expected_appended_blocks)
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("num_empty_slots", [1, 16, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_ensure_num_empty_slots_allocation(block_size: int, sequence_len: int,
                                           num_empty_slots: int,
                                           allocator_type: str):
    """Check block usage of BlockTable.ensure_num_empty_slots().

    A BlockTable is allocated for an initial sequence, then asked to
    guarantee num_empty_slots free slots. The block count must grow exactly
    as if that many extra tokens had been stored, and subsequently appending
    that many real tokens must not take any further block from the
    allocator.
    """
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))

    block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )

    num_expected_blocks_before_append = len(
        list(chunk_list(token_ids, block_size)))
    # -1 entries are placeholders: only the total length matters here.
    padded_token_ids = token_ids + [-1] * num_empty_slots
    num_expected_appended_blocks = len(
        list(chunk_list(padded_token_ids,
                        block_size))) - num_expected_blocks_before_append

    block_table.allocate(token_ids=token_ids, device=Device.GPU)

    # Right after allocation only the initial sequence occupies blocks.
    assert len(
        block_table.physical_block_ids) == num_expected_blocks_before_append

    # Reserving the empty slots consumes the expected additional blocks.
    block_table.ensure_num_empty_slots(num_empty_slots)
    assert len(block_table.physical_block_ids) == (
        num_expected_blocks_before_append + num_expected_appended_blocks)

    # Now, ensure no additional blocks consumed as we fill up the empty slots.
    num_free_blocks = allocator.get_num_free_blocks(device=Device.GPU)
    block_table.append_token_ids(token_ids=list(range(num_empty_slots)))
    assert num_free_blocks == allocator.get_num_free_blocks(device=Device.GPU)
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 9])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("append_size", [1, 4, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_append_token_ids_correct_content(block_size: int, sequence_len: int,
                                          append_len: int, allocator_type: str,
                                          append_size: int):
    """Check that appended token ids end up stored in the right order.

    append_len token ids are appended in pieces of at most append_size
    tokens. After every piece, and once more at the end, the table's full
    token id list must equal the initial sequence followed by everything
    appended so far.
    """
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))
    token_ids_to_append = list(range(append_len))

    block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )
    block_table.allocate(token_ids=token_ids, device=Device.GPU)

    appended_so_far = []
    for piece in chunk_list(token_ids_to_append, append_size):
        block_table.append_token_ids(piece)
        appended_so_far.extend(piece)

        expected_content = token_ids + appended_so_far
        assert block_table._get_all_token_ids() == expected_content

    assert block_table._get_all_token_ids() == token_ids + token_ids_to_append
@pytest.mark.parametrize("seq_len", [1, 9, 129])
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_fork(seq_len: int, block_size: int, allocator_type: str):
    """Fork a BlockTable and check block sharing and freeing.

    1. After forking, the free block count is unchanged.
    2. The fork has the same physical block ids and token ids as the
       original.
    3. Freeing the original leaves the free block count unchanged and the
       fork's block ids intact.
    4. Freeing the fork as well returns every GPU block to the allocator
       (the free block count is back at num_gpu_blocks).
    """
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=0,
        block_size=block_size,
    )

    token_ids = list(range(seq_len))

    block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )
    block_table.allocate(token_ids)

    num_free_blocks_before_fork = allocator.get_num_free_blocks(
        device=Device.GPU)

    forked_block_table = block_table.fork()

    # Expect physical_block_ids and token_ids to match.
    original_ids = block_table.physical_block_ids
    forked_ids = forked_block_table.physical_block_ids
    assert original_ids == forked_ids
    original_tokens = block_table._get_all_token_ids()
    forked_tokens = forked_block_table._get_all_token_ids()
    assert original_tokens == forked_tokens

    # Do not expect any additional allocations.
    assert allocator.get_num_free_blocks(
        device=Device.GPU) == num_free_blocks_before_fork

    # Free the original blocks. Assert num free blocks does not change, since
    # refcount is nonzero.
    block_table.free()
    assert allocator.get_num_free_blocks(
        device=Device.GPU) == num_free_blocks_before_fork

    # Expect the forked block table to be unaffected by the free.
    assert all(physical_id is not None
               for physical_id in forked_block_table.physical_block_ids)

    # Free the forked blocks. Assert num free blocks does change, since
    # refcount is now zero.
    forked_block_table.free()
    assert allocator.get_num_free_blocks(device=Device.GPU) == num_gpu_blocks
@pytest.mark.parametrize("block_size", [8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("appender", ["forked", "original"])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_cow(block_size: int, sequence_len: int, append_len: int,
             allocator_type: str, appender: str):
    """Fork a sequence, append to one of the two tables, and check for CoW.

    The appender parameter selects whether the fork or the original receives
    the appended tokens; the other table must stay untouched. A
    copy-on-write record is expected exactly when the last block of the
    initial sequence is only partially filled.
    """
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=0,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))
    token_ids_to_append = list(range(append_len))

    original_block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )

    # Blocks held by the initial sequence (shared by both tables).
    num_expected_non_cow_blocks = cdiv(sequence_len, block_size)
    # Blocks newly taken by the appender: all blocks of the combined
    # sequence except the full blocks of the initial sequence.
    num_full_initial_blocks = sequence_len // block_size
    num_expected_cow_blocks = cdiv(sequence_len + append_len,
                                   block_size) - num_full_initial_blocks

    original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
    original_block_ids = original_block_table.physical_block_ids

    forked_block_table = original_block_table.fork()

    # Expect no additional allocation (copy on _write_).
    num_free_after_fork = num_gpu_blocks - num_expected_non_cow_blocks
    assert allocator.get_num_free_blocks(Device.GPU) == num_free_after_fork

    if appender == "original":
        appender_block_table = original_block_table
        static_block_table = forked_block_table
    elif appender == "forked":
        appender_block_table = forked_block_table
        static_block_table = original_block_table
    else:
        raise ValueError(f"unknown test config {appender=}")

    # Write tokens.
    appender_block_table.append_token_ids(token_ids_to_append)

    # Expect the non-appending block table to have no change.
    assert static_block_table.physical_block_ids == original_block_ids
    assert appender_block_table.physical_block_ids != original_block_ids

    # Expect the blocks changed during append to have a CoW.
    num_used_after_append = (num_expected_non_cow_blocks +
                             num_expected_cow_blocks)
    assert allocator.get_num_free_blocks(
        Device.GPU) == num_gpu_blocks - num_used_after_append

    cows = allocator.clear_copy_on_writes()
    if sequence_len % block_size == 0:
        # The last block of the initial sequence is full, so there should
        # be no copy-on-write.
        assert not cows
    else:
        # If the last block in the sequence is not full, then when appending
        # we expect a CoW.
        assert cows

        cow_block_id = sequence_len // block_size
        expected_src = static_block_table.physical_block_ids[cow_block_id]
        expected_dst = appender_block_table.physical_block_ids[cow_block_id]

        assert expected_src in cows
        assert expected_dst in cows[expected_src]

    static_block_table.free()
    appender_block_table.free()

    # After free, expect all blocks to be freed.
    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
@pytest.mark.parametrize("block_size", [8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("lookahead_slots", [1, 16, 129])
@pytest.mark.parametrize("appender", ["forked", "original"])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_cow_lookahead_simple(block_size: int, sequence_len: int,
                              append_len: int, lookahead_slots: int,
                              allocator_type: str, appender: str):
    """Variant of test_cow where lookahead slots are reserved before forking.

    The checks are deliberately looser than in test_cow: the test only
    requires that some copy-on-write was recorded and, when the prompt's
    last block was partially filled, that this block was copied.
    """
    num_gpu_blocks = 1024
    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=0,
        block_size=block_size,
    )

    prompt_token_ids = list(range(sequence_len))
    appended_token_ids = list(range(append_len))

    original_block_table = BlockTable(block_size=block_size,
                                      block_allocator=allocator)
    original_block_table.allocate(token_ids=prompt_token_ids,
                                  device=Device.GPU)

    # Reserve the lookahead slots on the original table before forking.
    original_block_table.ensure_num_empty_slots(lookahead_slots)
    block_ids_before_append = original_block_table.physical_block_ids

    forked_block_table = original_block_table.fork()

    # Map the test parameter to (table that appends, table left untouched).
    roles = {
        "forked": (forked_block_table, original_block_table),
        "original": (original_block_table, forked_block_table),
    }
    if appender not in roles:
        raise ValueError(f"unknown test config {appender=}")
    appender_block_table, static_block_table = roles[appender]

    # Write tokens through the appending table only.
    appender_block_table.append_token_ids(appended_token_ids)

    # The untouched table keeps its block ids; the appender's ids change.
    assert static_block_table.physical_block_ids == block_ids_before_append
    assert appender_block_table.physical_block_ids != block_ids_before_append

    # A copy-on-write is expected for every parametrization of this test.
    cows = allocator.clear_copy_on_writes()
    assert cows

    if sequence_len % block_size != 0:
        # The prompt's last block was partially filled, so that block must
        # have been copied from the static table to the appender.
        partial_block_idx = sequence_len // block_size
        src_block_id = static_block_table.physical_block_ids[
            partial_block_idx]
        dst_block_id = appender_block_table.physical_block_ids[
            partial_block_idx]

        assert src_block_id in cows
        assert dst_block_id in cows[src_block_id]

    static_block_table.free()
    appender_block_table.free()

    # Freeing both tables must return every block to the allocator.
    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
tests/core/block/test_common.py
0 → 100644
View file @
7c4f76e3
import
random
import
pytest
from
vllm.core.block.common
import
RefCounter
@pytest.mark.parametrize("seed", list(range(20)))
@pytest.mark.parametrize("num_incrs", [1, 100])
@pytest.mark.parametrize("num_blocks", [1024])
def test_incr(seed: int, num_incrs: int, num_blocks: int):
    """Each incr() on a randomly chosen block returns the running count."""
    random.seed(seed)

    counter = RefCounter(all_block_indices=list(range(num_blocks)))
    block_id = random.randint(0, num_blocks - 1)

    # The n-th increment must report a refcount of n.
    for expected_refcount in range(1, num_incrs + 1):
        assert counter.incr(block_id) == expected_refcount
@pytest.mark.parametrize("seed", list(range(20)))
@pytest.mark.parametrize("num_incrs", [1, 100])
@pytest.mark.parametrize("num_blocks", [1024])
def test_incr_decr(seed: int, num_incrs: int, num_blocks: int):
    """Increment a block num_incrs times, then decrement it back to zero.

    One further decr() is expected to raise AssertionError.
    """
    random.seed(seed)

    counter = RefCounter(all_block_indices=list(range(num_blocks)))
    block_id = random.randint(0, num_blocks - 1)

    # Count up: the n-th increment must report n.
    for expected_refcount in range(1, num_incrs + 1):
        assert counter.incr(block_id) == expected_refcount

    # Count down: the reported value walks num_incrs - 1, ..., 1, 0.
    for expected_refcount in reversed(range(num_incrs)):
        assert counter.decr(block_id) == expected_refcount

    # The count is back at zero; another decrement must be rejected.
    with pytest.raises(AssertionError):
        counter.decr(block_id)
tests/core/block/test_cpu_gpu_block_allocator.py
0 → 100644
View file @
7c4f76e3
import
pytest
from
vllm.core.block.cpu_gpu_block_allocator
import
CpuGpuBlockAllocator
from
vllm.utils
import
Device
,
chunk_list
@pytest.mark.parametrize("num_cpu_blocks", [0, 512])
@pytest.mark.parametrize("num_gpu_blocks", [1024])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_allocate_mutable(num_cpu_blocks: int, num_gpu_blocks: int,
                          block_size: int, allocator_type: str):
    """Allocate and free mutable blocks per device; counts stay separate.

    All CPU blocks are taken, then all GPU blocks, then each group is freed
    in the same order. After every step the free-block count of both
    devices is checked.
    """
    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=num_cpu_blocks,
        block_size=block_size,
    )

    def check_free_counts(cpu: int, gpu: int) -> None:
        assert allocator.get_num_free_blocks(Device.CPU) == cpu
        assert allocator.get_num_free_blocks(Device.GPU) == gpu

    check_free_counts(cpu=num_cpu_blocks, gpu=num_gpu_blocks)

    # Take every CPU block; the GPU pool must be unaffected.
    cpu_blocks = []
    for _ in range(num_cpu_blocks):
        cpu_blocks.append(
            allocator.allocate_mutable(prev_block=None, device=Device.CPU))
    check_free_counts(cpu=0, gpu=num_gpu_blocks)

    # Take every GPU block; both pools are now empty.
    gpu_blocks = []
    for _ in range(num_gpu_blocks):
        gpu_blocks.append(
            allocator.allocate_mutable(prev_block=None, device=Device.GPU))
    check_free_counts(cpu=0, gpu=0)

    # Return the CPU blocks; only the CPU count recovers.
    for block in cpu_blocks:
        allocator.free(block)
    check_free_counts(cpu=num_cpu_blocks, gpu=0)

    # Return the GPU blocks; both pools are full again.
    for block in gpu_blocks:
        allocator.free(block)
    check_free_counts(cpu=num_cpu_blocks, gpu=num_gpu_blocks)
@pytest.mark.parametrize("num_cpu_blocks", [0, 512])
@pytest.mark.parametrize("num_gpu_blocks", [1024])
@pytest.mark.parametrize("block_size", [2])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_allocate_immutable(num_cpu_blocks: int, num_gpu_blocks: int,
                            block_size: int, allocator_type: str):
    """Allocate and free immutable blocks per device; counts stay separate.

    Every block receives its own slice of a strictly increasing token-id
    range, so no two blocks are created from the same token ids. After each
    step the free-block count of both devices is checked.
    """
    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=num_cpu_blocks,
        block_size=block_size,
    )

    def check_free_counts(cpu: int, gpu: int) -> None:
        assert allocator.get_num_free_blocks(Device.CPU) == cpu
        assert allocator.get_num_free_blocks(Device.GPU) == gpu

    # One distinct token id per slot across both devices; the first
    # num_gpu_blocks * block_size ids go to the GPU, the rest to the CPU.
    num_gpu_tokens = num_gpu_blocks * block_size
    unique_token_ids = list(
        range((num_cpu_blocks + num_gpu_blocks) * block_size))
    gpu_token_ids = chunk_list(unique_token_ids[:num_gpu_tokens], block_size)
    cpu_token_ids = chunk_list(unique_token_ids[num_gpu_tokens:], block_size)

    check_free_counts(cpu=num_cpu_blocks, gpu=num_gpu_blocks)

    # Take every CPU block; the GPU pool must be unaffected.
    cpu_blocks = []
    for block_token_ids in cpu_token_ids:
        cpu_blocks.append(
            allocator.allocate_immutable(prev_block=None,
                                         token_ids=block_token_ids,
                                         device=Device.CPU))
    check_free_counts(cpu=0, gpu=num_gpu_blocks)

    # Take every GPU block; both pools are now empty.
    gpu_blocks = []
    for block_token_ids in gpu_token_ids:
        gpu_blocks.append(
            allocator.allocate_immutable(prev_block=None,
                                         token_ids=block_token_ids,
                                         device=Device.GPU))
    check_free_counts(cpu=0, gpu=0)

    # Return the CPU blocks; only the CPU count recovers.
    for block in cpu_blocks:
        allocator.free(block)
    check_free_counts(cpu=num_cpu_blocks, gpu=0)

    # Return the GPU blocks; both pools are full again.
    for block in gpu_blocks:
        allocator.free(block)
    check_free_counts(cpu=num_cpu_blocks, gpu=num_gpu_blocks)
tests/core/block/test_naive_block.py
0 → 100644
View file @
7c4f76e3
from
typing
import
List
,
Optional
import
pytest
from
vllm.core.block.interfaces
import
Block
,
BlockAllocator
from
vllm.core.block.naive_block
import
NaiveBlock
,
NaiveBlockAllocator
class TestNaiveBlockAllocator:
    """Exhaustion, reuse and free-count tests for NaiveBlockAllocator."""

    @staticmethod
    def create_allocate_lambda(allocate_type: str,
                               allocator: NaiveBlockAllocator,
                               prev_block: Optional[Block],
                               token_ids: List[int]):
        """Return a zero-argument callable that allocates one block.

        ``"immutable"`` binds ``allocate_immutable`` with ``prev_block`` and
        ``token_ids``; ``"mutable"`` binds ``allocate_mutable`` with
        ``prev_block`` only. Any other ``allocate_type`` raises ValueError.
        """
        if allocate_type == "immutable":

            def allocate_block():
                return allocator.allocate_immutable(prev_block=prev_block,
                                                    token_ids=token_ids)

            return allocate_block

        if allocate_type == "mutable":

            def allocate_block():
                return allocator.allocate_mutable(prev_block=prev_block)

            return allocate_block

        raise ValueError()

    @staticmethod
    @pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
    @pytest.mark.parametrize("num_blocks", [1, 1024])
    @pytest.mark.parametrize("block_size", [1, 16])
    def test_allocate_ooms(allocate_type: str, num_blocks: int,
                           block_size: int):
        """Allocating one block more than the pool holds must fail."""
        allocator = NaiveBlockAllocator(create_block=NaiveBlock,
                                        num_blocks=num_blocks,
                                        block_size=block_size)
        allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
            allocate_type,
            allocator,
            prev_block=None,
            token_ids=list(range(block_size)))

        # Drain the pool.
        for _ in range(num_blocks):
            allocate_block()

        with pytest.raises(BlockAllocator.NoFreeBlocksError):
            allocate_block()

    @staticmethod
    @pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
    @pytest.mark.parametrize("num_blocks", [1, 1024])
    @pytest.mark.parametrize("block_size", [1, 16])
    def test_free_prevents_oom(allocate_type: str, num_blocks: int,
                               block_size: int):
        """Freeing one block makes room for exactly one new allocation."""
        allocator = NaiveBlockAllocator(create_block=NaiveBlock,
                                        num_blocks=num_blocks,
                                        block_size=block_size)
        allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
            allocate_type,
            allocator,
            prev_block=None,
            token_ids=list(range(block_size)))

        # Drain the pool and confirm it is exhausted.
        blocks = [allocate_block() for _ in range(num_blocks)]
        with pytest.raises(BlockAllocator.NoFreeBlocksError):
            allocate_block()

        victim = blocks.pop()
        for _ in range(100):
            freed_block_id = victim.block_id
            allocator.free(victim)
            # free() is expected to clear the id on the block object.
            assert victim.block_id is None

            # The only free slot is the one just released.
            replacement = allocate_block()
            assert replacement.block_id == freed_block_id

            # The pool is exhausted again.
            with pytest.raises(BlockAllocator.NoFreeBlocksError):
                allocate_block()

            victim = replacement

    @staticmethod
    @pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
    @pytest.mark.parametrize("num_blocks", [1024])
    @pytest.mark.parametrize("block_size", [16])
    def test_get_num_free_blocks(allocate_type: str, num_blocks: int,
                                 block_size: int):
        """The free count rises by one for each block that is freed."""
        allocator = NaiveBlockAllocator(create_block=NaiveBlock,
                                        num_blocks=num_blocks,
                                        block_size=block_size)
        allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
            allocate_type,
            allocator,
            prev_block=None,
            token_ids=list(range(block_size)))

        assert allocator.get_num_free_blocks() == num_blocks

        blocks = [allocate_block() for _ in range(num_blocks)]

        # Before the k-th free (0-based), exactly k blocks are free.
        for num_already_freed, block in enumerate(blocks):
            assert allocator.get_num_free_blocks() == num_already_freed
            allocator.free(block)
tests/core/block/test_prefix_caching_block.py
0 → 100644
View file @
7c4f76e3
import
math
import
random
from
typing
import
List
,
Optional
from
unittest.mock
import
MagicMock
import
pytest
from
vllm.core.block.interfaces
import
Block
,
BlockAllocator
from
vllm.core.block.prefix_caching_block
import
(
PrefixCachingBlock
,
PrefixCachingBlockAllocator
)
class TestPrefixCachingBlock:
    """Content-hash tests for PrefixCachingBlock against a mocked allocator."""

    @staticmethod
    @pytest.mark.parametrize("seed", list(range(10)))
    @pytest.mark.parametrize("block_size", [1, 16])
    @pytest.mark.parametrize("is_curr_block_full", [True, False])
    def test_first_block_has_correct_content_hash(seed: int, block_size: int,
                                                  is_curr_block_full: bool):
        """A block with no predecessor has a hash only when it is full."""
        random.seed(seed)

        if is_curr_block_full:
            num_to_fill = block_size
        else:
            num_to_fill = random.randint(0, block_size - 1)
        token_ids = list(range(num_to_fill))

        mock_allocator = MagicMock(spec=PrefixCachingBlockAllocator)
        first_block = PrefixCachingBlock(
            prev_block=None,
            token_ids=token_ids,
            block_size=block_size,
            prefix_caching_allocator=mock_allocator)

        if not is_curr_block_full:
            # A partially filled block must not report a hash.
            assert first_block.content_hash is None
            return

        # A full first block hashes its own tokens with no previous hash.
        expected_hash = PrefixCachingBlock.hash_block_tokens(
            is_first_block=True,
            prev_block_hash=None,
            cur_block_token_ids=token_ids)
        assert first_block.content_hash == expected_hash

    @staticmethod
    @pytest.mark.parametrize("seed", list(range(10)))
    @pytest.mark.parametrize("block_size", [1, 16])
    @pytest.mark.parametrize("is_curr_block_full", [True, False])
    @pytest.mark.parametrize("prev_block_has_hash", [True, False])
    def test_nth_block_has_correct_content_hash(seed: int, block_size: int,
                                                is_curr_block_full: bool,
                                                prev_block_has_hash: bool):
        """A non-first block has a hash only if it is full and its predecessor
        has a hash.
        """
        random.seed(seed)

        # The predecessor is a mock whose content_hash is either a random
        # integer or None, depending on the parametrization.
        previous_block = MagicMock(spec=PrefixCachingBlock)
        prev_block_hash = random.randint(0, 1000)
        if prev_block_has_hash:
            previous_block.content_hash = prev_block_hash
        else:
            previous_block.content_hash = None

        if is_curr_block_full:
            num_to_fill = block_size
        else:
            num_to_fill = random.randint(0, block_size - 1)
        token_ids = list(range(num_to_fill))

        mock_allocator = MagicMock(spec=PrefixCachingBlockAllocator)
        nth_block = PrefixCachingBlock(
            prev_block=previous_block,
            token_ids=token_ids,
            block_size=block_size,
            prefix_caching_allocator=mock_allocator,
        )

        if not (is_curr_block_full and prev_block_has_hash):
            # Either the block is not full or the predecessor has no hash.
            assert nth_block.content_hash is None
            return

        # Full block with a hashed predecessor: the hash chains both.
        expected_hash = PrefixCachingBlock.hash_block_tokens(
            is_first_block=False,
            prev_block_hash=prev_block_hash,
            cur_block_token_ids=token_ids)
        assert nth_block.content_hash == expected_hash

    @staticmethod
    @pytest.mark.parametrize("block_size", [1, 2, 16])
    @pytest.mark.parametrize("num_tokens", list(range(3)))
    @pytest.mark.parametrize("num_empty_trailing_blocks", [0, 1, 10])
    def test_blocks_have_correct_hash_in_chain(block_size: int,
                                               num_tokens: int,
                                               num_empty_trailing_blocks: int):
        """Two chains built from identical contents have pairwise equal
        hashes.
        """
        random.seed(0)
        token_ids = [random.randint(0, 50_000) for _ in range(num_tokens)]

        first_chain = TestPrefixCachingBlock.create_chain(
            block_size=block_size,
            token_ids=token_ids,
            num_empty_trailing_blocks=num_empty_trailing_blocks)
        second_chain = TestPrefixCachingBlock.create_chain(
            block_size=block_size,
            token_ids=token_ids,
            num_empty_trailing_blocks=num_empty_trailing_blocks)

        for block_a, block_b in zip(first_chain, second_chain):
            assert block_a.content_hash == block_b.content_hash

        # An empty chain is only produced when there were no tokens, and
        # then both chains must be empty.
        if not (first_chain and second_chain):
            assert first_chain == second_chain
            assert num_tokens == 0

    @staticmethod
    def create_chain(block_size: int,
                     token_ids: List[int],
                     num_empty_trailing_blocks=0) -> List[PrefixCachingBlock]:
        """Build a linked list of blocks holding ``token_ids`` in order.

        Each block is created empty, linked to the previous one, and then
        given its slice of ``token_ids`` (if any). The chain is extended
        with ``num_empty_trailing_blocks`` further blocks that receive no
        tokens. All blocks share one mocked allocator.
        """
        num_blocks = (math.ceil(len(token_ids) / block_size) +
                      num_empty_trailing_blocks)
        if num_blocks == 0:
            return []

        allocator = MagicMock(spec=PrefixCachingBlockAllocator)

        chain = []
        for block_number in range(num_blocks):
            block = PrefixCachingBlock(
                prev_block=chain[-1] if chain else None,
                token_ids=[],
                block_size=block_size,
                prefix_caching_allocator=allocator,
            )

            start = block_number * block_size
            block_token_ids = token_ids[start:start + block_size]
            # Trailing blocks get an empty slice and stay empty.
            if block_token_ids:
                block.append_token_ids(block_token_ids)

            chain.append(block)

        return chain
class TestPrefixCachingBlockAllocator:
    """Exhaustion, sharing and free-count tests for the prefix-caching
    allocator.
    """

    @staticmethod
    def create_allocate_lambda(allocate_type: str, allocator: BlockAllocator,
                               prev_block: Optional[Block],
                               token_ids: List[int]):
        """Return a zero-argument callable that allocates one block.

        ``"immutable"`` binds ``allocate_immutable`` with ``prev_block`` and
        ``token_ids``; ``"mutable"`` binds ``allocate_mutable`` with
        ``prev_block`` only. Any other ``allocate_type`` raises ValueError.
        """
        if allocate_type == "immutable":

            def allocate_block():
                return allocator.allocate_immutable(prev_block=prev_block,
                                                    token_ids=token_ids)

            return allocate_block

        if allocate_type == "mutable":

            def allocate_block():
                return allocator.allocate_mutable(prev_block=prev_block)

            return allocate_block

        raise ValueError()

    @staticmethod
    @pytest.mark.parametrize("num_blocks", [1, 1024])
    @pytest.mark.parametrize("block_size", [1, 16])
    def test_allocate_mutable_ooms(num_blocks: int, block_size: int):
        """Allocating one mutable block more than the pool holds must fail."""
        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                                block_size=block_size)
        allocate_block = (
            TestPrefixCachingBlockAllocator.create_allocate_lambda(
                allocate_type="mutable",
                allocator=allocator,
                prev_block=None,
                token_ids=list(range(block_size)),
            ))

        # Drain the pool.
        for _ in range(num_blocks):
            allocate_block()

        with pytest.raises(BlockAllocator.NoFreeBlocksError):
            allocate_block()

    @staticmethod
    @pytest.mark.parametrize("num_blocks", [1, 1024])
    @pytest.mark.parametrize("block_size", [1, 16])
    def test_allocate_immutable_does_not_oom_single_hash(
            num_blocks: int, block_size: int):
        """Repeated immutable allocations of the same content share one
        block id.
        """
        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                                block_size=block_size)
        allocate_block = (
            TestPrefixCachingBlockAllocator.create_allocate_lambda(
                allocate_type="immutable",
                allocator=allocator,
                prev_block=None,
                token_ids=list(range(block_size)),
            ))

        blocks = [allocate_block() for _ in range(num_blocks)]

        # One more allocation than the pool size. With mutable blocks this
        # would raise; with identical immutable content it must succeed.
        non_oom_block = allocate_block()

        # Every allocation must have landed on the same physical block.
        for block in blocks:
            assert block.block_id == non_oom_block.block_id

    @staticmethod
    @pytest.mark.parametrize("num_blocks", [1, 1024])
    @pytest.mark.parametrize("block_size", [1, 16])
    def test_allocate_immutable_ooms_many_hash(num_blocks: int,
                                               block_size: int):
        """Fill the pool with one long chain of distinct blocks.

        Afterwards any allocation with unseen content (or any mutable
        allocation) must fail, while re-creating the identical chain must
        succeed and reuse the same block ids.
        """
        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                                block_size=block_size)

        # Enough distinct token ids to occupy every block of the pool.
        token_ids = list(range(num_blocks * block_size))

        chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
            block_size=block_size,
            token_ids=token_ids,
            allocator=allocator,
        )
        tail_block = chain[-1]

        # New content after the tail has never been seen: must fail.
        with pytest.raises(BlockAllocator.NoFreeBlocksError):
            allocator.allocate_immutable(prev_block=tail_block,
                                         token_ids=list(range(block_size)))

        # A mutable block always needs a fresh slot: must fail.
        with pytest.raises(BlockAllocator.NoFreeBlocksError):
            allocator.allocate_mutable(prev_block=tail_block)

        # The exact same chain can still be allocated a second time.
        second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
            block_size=block_size,
            token_ids=token_ids,
            allocator=allocator,
        )

        # Both chains must be non-empty and map to the same block ids.
        assert chain and second_chain
        for first_chain_block, second_chain_block in zip(chain, second_chain):
            assert first_chain_block.block_id == second_chain_block.block_id

    @staticmethod
    @pytest.mark.parametrize("num_blocks", [1, 1024])
    @pytest.mark.parametrize("block_size", [1, 16])
    def test_free_prevents_oom(num_blocks: int, block_size: int):
        """Freeing one block makes room for exactly one mutable allocation."""
        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                                block_size=block_size)

        # Enough distinct token ids to occupy every block of the pool.
        token_ids = list(range(num_blocks * block_size))

        chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
            block_size=block_size,
            token_ids=token_ids,
            allocator=allocator,
        )

        # The pool is full, so a mutable allocation must fail.
        with pytest.raises(BlockAllocator.NoFreeBlocksError):
            allocator.allocate_mutable(prev_block=None)

        victim = chain[-1]

        # Free one block and re-allocate it, many times over. The iteration
        # number is attached to each assertion to ease debugging.
        for i in range(100):
            freed_block_id = victim.block_id
            allocator.free(victim)
            assert victim.block_id is None, i

            replacement = allocator.allocate_mutable(prev_block=None)
            assert replacement.block_id == freed_block_id, i

            # The pool is exhausted again.
            with pytest.raises(BlockAllocator.NoFreeBlocksError):
                allocator.allocate_mutable(prev_block=None)

            victim = replacement

    @staticmethod
    @pytest.mark.parametrize("num_blocks", [1024])
    @pytest.mark.parametrize("block_size", [16])
    @pytest.mark.parametrize("seed", list(range(20)))
    def test_get_num_free_blocks(num_blocks: int, block_size: int, seed: int):
        """The free count rises by one for each chain block that is freed."""
        random.seed(seed)
        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                                block_size=block_size)
        num_blocks_to_consume = random.randint(1, num_blocks - 1)

        # Token ids that fill exactly num_blocks_to_consume blocks.
        token_ids = list(range(num_blocks_to_consume * block_size))

        chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
            block_size=block_size,
            token_ids=token_ids,
            allocator=allocator,
        )

        free_after_allocation = num_blocks - num_blocks_to_consume

        # Before the k-th free (0-based), k blocks have been returned.
        for num_already_freed, block in enumerate(chain):
            assert allocator.get_num_free_blocks() == (free_after_allocation +
                                                       num_already_freed)
            allocator.free(block)

    @staticmethod
    @pytest.mark.parametrize("num_blocks", [1024])
    @pytest.mark.parametrize("block_size", [16])
    @pytest.mark.parametrize("seed", list(range(20)))
    def test_get_num_free_blocks_shared(num_blocks: int, block_size: int,
                                        seed: int):
        """Two identical chains share blocks: freeing the first chain
        releases nothing, freeing the second releases one block per free.
        """
        random.seed(seed)
        allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                                block_size=block_size)
        num_blocks_to_consume = random.randint(1, num_blocks - 1)

        # Token ids that fill exactly num_blocks_to_consume blocks.
        token_ids = list(range(num_blocks_to_consume * block_size))

        first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
            block_size=block_size,
            token_ids=token_ids,
            allocator=allocator,
        )
        second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
            block_size=block_size,
            token_ids=token_ids,
            allocator=allocator,
        )

        free_after_allocation = num_blocks - num_blocks_to_consume

        # First chain: every block is still held by the second chain, so
        # the free count must not move.
        for block in first_chain:
            assert allocator.get_num_free_blocks() == free_after_allocation
            allocator.free(block)

        # Second chain: each free now releases a block.
        for num_already_freed, block in enumerate(second_chain):
            assert allocator.get_num_free_blocks() == (free_after_allocation +
                                                       num_already_freed)
            allocator.free(block)

    @staticmethod
    def create_immutable_chain(
        block_size: int,
        token_ids: List[int],
        allocator: PrefixCachingBlockAllocator,
    ) -> List[PrefixCachingBlock]:
        """Allocate a chain of immutable blocks holding ``token_ids``.

        ``token_ids`` is cut into consecutive slices of ``block_size``; each
        slice is allocated with the previously allocated block as its
        ``prev_block``. Returns the blocks in allocation order (an empty
        list when there are no tokens).
        """
        num_blocks = math.ceil(len(token_ids) / block_size)
        if num_blocks == 0:
            return []

        chain = []
        for block_number in range(num_blocks):
            start = block_number * block_size
            block = allocator.allocate_immutable(
                prev_block=chain[-1] if chain else None,
                token_ids=token_ids[start:start + block_size])
            chain.append(block)

        return chain
tests/core/test_block_manager.py
0 → 100644
View file @
7c4f76e3
import
time
from
typing
import
List
import
pytest
from
vllm
import
SamplingParams
from
vllm.block
import
PhysicalTokenBlock
from
vllm.core.block_manager_v1
import
(
BlockSpaceManagerV1
,
UncachedBlockAllocator
)
from
vllm.core.interfaces
import
AllocStatus
from
vllm.sequence
import
Logprob
,
Sequence
,
SequenceGroup
,
SequenceStatus
from
vllm.utils
import
Device
from
.utils
import
create_dummy_prompt
def test_block_allocator_allocate():
    """Allocate every CPU block one by one, then expect the next allocation
    to raise ValueError.
    """
    block_size = 4
    num_cpu_blocks = 4
    cpu_allocator = UncachedBlockAllocator(Device.CPU, block_size,
                                           num_cpu_blocks)

    # Nothing has been handed out yet.
    assert cpu_allocator.get_num_free_blocks() == num_cpu_blocks

    # After each allocation the free count drops by one and the block that
    # was handed out is no longer listed as free.
    for expected_free in reversed(range(num_cpu_blocks)):
        block = cpu_allocator.allocate()
        assert block not in cpu_allocator.free_blocks
        assert cpu_allocator.get_num_free_blocks() == expected_free

    # The pool is empty now.
    with pytest.raises(ValueError):
        cpu_allocator.allocate()
def test_block_allocator_free():
    """Allocate every CPU block, free them one by one, and expect a second
    free of the same block to raise ValueError.
    """
    block_size = 4
    num_cpu_blocks = 4
    cpu_allocator = UncachedBlockAllocator(Device.CPU, block_size,
                                           num_cpu_blocks)

    # Take every block out of the pool.
    blocks: List[PhysicalTokenBlock] = []
    for _ in range(num_cpu_blocks):
        block = cpu_allocator.allocate()
        blocks.append(block)
        assert block not in cpu_allocator.free_blocks

    assert cpu_allocator.get_num_free_blocks() == 0

    # Hand the blocks back; the free count rises by one each time.
    for expected_free, block in enumerate(blocks, start=1):
        cpu_allocator.free(block)
        assert block in cpu_allocator.free_blocks
        assert cpu_allocator.get_num_free_blocks() == expected_free

        # Freeing the same block a second time must be rejected.
        with pytest.raises(ValueError):
            cpu_allocator.free(block)
def test_allocate():
    """Fill the GPU pool with single-block prompts, with and without a
    watermark, and check what can_allocate() reports along the way.

    While there is room, can_allocate() must report AllocStatus.OK; once
    the pool (minus the watermark reserve) is used up it must report
    something else.
    """
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    block_manager = BlockSpaceManagerV1(block_size,
                                        num_cpu_blocks,
                                        num_gpu_blocks,
                                        watermark=0)

    # Allocate one fresh sequence group per available gpu block.
    for i in range(num_gpu_blocks):
        _, seq_group = create_dummy_prompt(str(i), block_size)
        # Compare against AllocStatus.OK explicitly. The check after the
        # loop shows can_allocate() returns a status value, not a bool, so a
        # bare truthiness assert would not tell OK apart from other
        # statuses.
        assert block_manager.can_allocate(seq_group) == AllocStatus.OK
        block_manager.allocate(seq_group)
    assert block_manager.can_allocate(seq_group) != AllocStatus.OK

    # Repeat with a watermark of 1 / num_gpu_blocks, which is meant to
    # hold back one gpu block, so only num_gpu_blocks - 1 prompts fit.
    block_manager = BlockSpaceManagerV1(block_size,
                                        num_cpu_blocks,
                                        num_gpu_blocks,
                                        watermark=1 / num_gpu_blocks)
    for i in range(num_gpu_blocks - 1):
        _, seq_group = create_dummy_prompt(str(i), block_size)
        assert block_manager.can_allocate(seq_group) == AllocStatus.OK
        block_manager.allocate(seq_group)
    assert block_manager.can_allocate(seq_group) != AllocStatus.OK
def test_append_slot_single_seq():
    """append_slot() takes a gpu block only once the sequence has outgrown
    the blocks it already owns.
    """
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    block_manager = BlockSpaceManagerV1(block_size,
                                        num_cpu_blocks,
                                        num_gpu_blocks,
                                        watermark=0)

    # Allocate a single sequence on the gpu.
    prompt, seq_group = create_dummy_prompt("1", block_size)
    block_manager.allocate(seq_group)

    # No tokens were added since allocation, so appending a slot must not
    # consume a block and must not report a copy.
    assert block_manager.can_append_slot(seq_group)
    free_before = block_manager.get_num_free_gpu_blocks()
    assert not block_manager.append_slot(prompt)
    free_after = block_manager.get_num_free_gpu_blocks()
    assert free_before == free_after

    # Grow the sequence by block_size tokens (ids 5 .. 5 + block_size - 1).
    for token_id in range(5, 5 + block_size):
        prompt.append_token_id(token_id, {token_id: Logprob(0.0)})

    # Now appending a slot must take exactly one more gpu block, still
    # without reporting a copy.
    assert block_manager.can_append_slot(seq_group)
    free_before = block_manager.get_num_free_gpu_blocks()
    assert not block_manager.append_slot(prompt)
    free_after = block_manager.get_num_free_gpu_blocks()
    assert free_before - free_after == 1
def test_append_slot_cow():
    """Appending to a forked sequence that shares a block with its parent
    must report a (source, destination) block pair and use one gpu block.
    """
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    block_manager = BlockSpaceManagerV1(block_size=block_size,
                                        num_cpu_blocks=num_cpu_blocks,
                                        num_gpu_blocks=num_gpu_blocks,
                                        watermark=0)

    # Three prompt tokens with block_size 4: one slot of the block is left.
    prompt = Sequence(seq_id=1,
                      prompt="one two three",
                      prompt_token_ids=[1, 2, 3],
                      block_size=block_size)

    # Fork the sequence so that appending a token to the child later on
    # requires a copy-on-write.
    child = prompt.fork(new_seq_id=2)

    # Allocate space for the group holding both sequences.
    # NOTE(review): time.perf_counter is passed uncalled as the fifth
    # positional argument -- confirm against the SequenceGroup signature
    # whether this argument is intended at all.
    seq_group = SequenceGroup("1", [prompt, child], SamplingParams(),
                              time.time(), time.perf_counter)
    block_manager.allocate(seq_group)

    # Append a token to the child and register the fork with the manager.
    token_id = 4
    child.append_token_id(token_id, {token_id: Logprob(0.0)})
    block_manager.fork(prompt, child)

    assert block_manager.can_append_slot(seq_group)
    free_before = block_manager.get_num_free_gpu_blocks()

    # A copy is expected, reported as a pair of distinct blocks.
    maybe_src_dst_block = block_manager.append_slot(child)
    assert maybe_src_dst_block is not None
    src_block, dst_block = maybe_src_dst_block
    assert src_block != dst_block

    # The copy costs exactly one gpu block.
    free_after = block_manager.get_num_free_gpu_blocks()
    assert free_before - free_after == 1
def test_fork():
    """A forked sequence starts with its parent's block table and diverges
    once a token is appended to it.
    """
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    block_manager = BlockSpaceManagerV1(block_size,
                                        num_cpu_blocks,
                                        num_gpu_blocks,
                                        watermark=0)

    # A prompt one token short of a full block.
    prompt, seq_group = create_dummy_prompt("1",
                                            block_size - 1,
                                            block_size=block_size)
    block_manager.allocate(seq_group)

    # Fork the prompt; right after the fork both block tables are equal.
    child = prompt.fork(2)
    block_manager.fork(prompt, child)
    parent_table = block_manager.get_block_table(prompt)
    child_table = block_manager.get_block_table(child)
    assert parent_table == child_table

    # Append a token to the child only. The block is shared, so the
    # manager has to give the child its own copy.
    token_id = 4
    child.append_token_id(token_id, {token_id: Logprob(0.0)})
    block_manager.append_slot(child)

    parent_table = block_manager.get_block_table(prompt)
    child_table = block_manager.get_block_table(child)
    assert parent_table != child_table
def test_swap():
    """Swap a sequence group GPU -> CPU and back again.

    For each direction the keys of the returned mapping must equal the
    block table read just before the swap, and the free-block counts of
    both devices must move by the size of that table.
    """
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    block_manager = BlockSpaceManagerV1(block_size,
                                        num_cpu_blocks,
                                        num_gpu_blocks,
                                        watermark=0)

    prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1)
    prompt.status = SequenceStatus.WAITING
    block_manager.allocate(seq_group)

    # Emulate a forward pass by appending a single token, so the block
    # manager knows how many unprocessed tokens the next pass will write.
    token_id = 0
    prompt.status = SequenceStatus.RUNNING
    prompt.append_token_id(token_id, {token_id: Logprob(0.0)})

    # --- GPU -> CPU ---
    gpu_blocks = block_manager.get_block_table(prompt)
    assert block_manager.can_swap_out(seq_group)
    cpu_free_before = block_manager.get_num_free_cpu_blocks()
    gpu_free_before = block_manager.get_num_free_gpu_blocks()

    mapping = block_manager.swap_out(seq_group)
    assert list(mapping.keys()) == gpu_blocks

    cpu_free_after = block_manager.get_num_free_cpu_blocks()
    gpu_free_after = block_manager.get_num_free_gpu_blocks()
    # The CPU loses as many free blocks as the GPU regains.
    assert cpu_free_before - cpu_free_after == len(gpu_blocks)
    assert gpu_free_after - gpu_free_before == len(gpu_blocks)
    prompt.status = SequenceStatus.SWAPPED

    # --- CPU -> GPU ---
    cpu_blocks = block_manager.get_block_table(prompt)
    assert block_manager.can_swap_in(seq_group)
    cpu_free_before = block_manager.get_num_free_cpu_blocks()
    gpu_free_before = block_manager.get_num_free_gpu_blocks()

    mapping = block_manager.swap_in(seq_group)
    assert list(mapping.keys()) == cpu_blocks

    cpu_free_after = block_manager.get_num_free_cpu_blocks()
    gpu_free_after = block_manager.get_num_free_gpu_blocks()
    # The CPU regains as many free blocks as the GPU loses.
    assert cpu_free_after - cpu_free_before == len(cpu_blocks)
    assert gpu_free_before - gpu_free_after == len(cpu_blocks)
def test_free():
    """Freeing a sequence returns its blocks to the GPU pool and removes
    its block table."""
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    manager = BlockSpaceManagerV1(block_size,
                                  num_cpu_blocks,
                                  num_gpu_blocks,
                                  watermark=0)

    seq, group = create_dummy_prompt("1", block_size)
    manager.allocate(group)

    # Record how many blocks the sequence holds, then free it.
    held_blocks = len(manager.get_block_table(seq))
    free_before = manager.get_num_free_gpu_blocks()
    manager.free(seq)
    free_after = manager.get_num_free_gpu_blocks()
    assert free_after == free_before + held_blocks

    # Looking up the block table of a freed sequence raises KeyError.
    with pytest.raises(KeyError):
        manager.get_block_table(seq)
def test_reset():
    """Resetting the block manager makes every GPU block free again."""
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    manager = BlockSpaceManagerV1(block_size,
                                  num_cpu_blocks,
                                  num_gpu_blocks,
                                  watermark=0)

    # Allocate one block-sized prompt per GPU block until none is left.
    initially_free = manager.get_num_free_gpu_blocks()
    for request_index in range(num_gpu_blocks):
        group = create_dummy_prompt(str(request_index), block_size)[1]
        manager.allocate(group)
    assert manager.get_num_free_gpu_blocks() == 0

    # After reset() the free count is back at its initial value.
    manager.reset()
    assert manager.get_num_free_gpu_blocks() == initially_free
def test_sliding_window_multi_seq():
    """
    Check block accounting (allocate, fork, append, free) for two
    sequences that are longer than the sliding window.
    """
    block_size = 1
    num_cpu_blocks = 8
    num_gpu_blocks = 8
    sliding_window = 2
    manager = BlockSpaceManagerV1(block_size,
                                  num_cpu_blocks,
                                  num_gpu_blocks,
                                  sliding_window=sliding_window,
                                  watermark=0)

    def free_gpu_blocks():
        # Shorthand for the counter every assertion below inspects.
        return manager.get_num_free_gpu_blocks()

    assert free_gpu_blocks() == num_gpu_blocks

    parent_seq = Sequence(1, "one two three", [0, 1, 2], block_size)
    group = SequenceGroup("1", [parent_seq], SamplingParams(), time.time(),
                          None)
    manager.allocate(group)

    # The parent has 3 tokens, but with sliding_window == 2 at most
    # 2 blocks are used.
    assert free_gpu_blocks() == num_gpu_blocks - sliding_window

    # Fork the parent and copy its block table.
    child_seq = parent_seq.fork(2)
    manager.fork(parent_seq, child_seq)

    # Forking does not consume additional blocks ...
    assert free_gpu_blocks() == num_gpu_blocks - sliding_window

    # ... and parent and child share every block.
    parent_table = manager.get_block_table(parent_seq)
    child_table = manager.get_block_table(child_seq)
    assert parent_table == child_table

    # Append a token to the child. The block is shared, so copy on write
    # occurs.
    new_token = 4
    child_seq.append_token_id(new_token, {new_token: Logprob(0.0)})
    manager.append_slot(child_seq)

    # One more block is in use now: each sequence uses 2 blocks, but only
    # one of them can be shared.
    assert free_gpu_blocks() == num_gpu_blocks - sliding_window - 1

    new_token = 5
    parent_seq.append_token_id(new_token, {new_token: Logprob(0.0)})
    manager.append_slot(parent_seq)

    # No change: both sequences are still sharing exactly one block.
    assert free_gpu_blocks() == num_gpu_blocks - sliding_window - 1

    parent_table = manager.get_block_table(parent_seq)
    child_table = manager.get_block_table(child_seq)
    assert parent_table != child_table

    # The shared block is the second-to-last entry of both tables.
    assert parent_table[-2] == child_table[-2]

    # Clean up: free the parent first.
    manager.free(parent_seq)

    # Freeing one sequence drops the ref count of two blocks by one. One
    # of them was used only by the parent and is now free; the child still
    # occupies sliding_window blocks.
    assert free_gpu_blocks() == num_gpu_blocks - sliding_window

    # Free the child as well; every block must be free again.
    manager.free(child_seq)
    assert free_gpu_blocks() == num_gpu_blocks
tests/core/test_scheduler.py
0 → 100644
View file @
7c4f76e3
import
time
from
typing
import
List
import
pytest
# noqa
from
vllm.config
import
CacheConfig
,
SchedulerConfig
from
vllm.core.scheduler
import
Scheduler
from
vllm.sequence
import
Logprob
,
SequenceGroup
from
.utils
import
create_dummy_prompt
def get_sequence_groups(scheduler_output):
    """Return the ``seq_group`` of every entry in
    ``scheduler_output.scheduled_seq_groups``, in order."""
    groups = []
    for scheduled in scheduler_output.scheduled_seq_groups:
        groups.append(scheduled.seq_group)
    return groups
def test_scheduler_add_seq_group():
    """Each added sequence group increases the unfinished count by one."""
    block_size = 4
    scheduler_config = SchedulerConfig(100, 64, 1)
    cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto")
    cache_config.num_cpu_blocks = 4
    cache_config.num_gpu_blocks = 4
    scheduler = Scheduler(scheduler_config, cache_config, None)

    # Add the groups one at a time and check the running total.
    num_seq_group = 4
    for request_index in range(num_seq_group):
        group = create_dummy_prompt(str(request_index), block_size)[1]
        scheduler.add_seq_group(group)
        expected_unfinished = request_index + 1
        assert scheduler.get_num_unfinished_seq_groups(
        ) == expected_unfinished
def test_scheduler_abort_seq_group():
    """Aborting every added request leaves no unfinished sequence groups."""
    block_size = 4
    scheduler_config = SchedulerConfig(100, 64, 1)
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 4
    cache_config.num_gpu_blocks = 4
    scheduler = Scheduler(scheduler_config, cache_config, None)

    # Register several sequence groups, remembering their request ids.
    num_seq_group = 4
    added_ids = set()
    for request_index in range(num_seq_group):
        request_id = str(request_index)
        group = create_dummy_prompt(request_id, block_size)[1]
        scheduler.add_seq_group(group)
        added_ids.add(request_id)

    # All of them are pending; aborting by id drops the count to zero.
    assert scheduler.get_num_unfinished_seq_groups() == num_seq_group
    scheduler.abort_seq_group(added_ids)
    assert scheduler.get_num_unfinished_seq_groups() == 0
def test_scheduler_schedule_simple():
    """Schedule a prompt step and then a generation step for a batch that
    fits in the cache, expecting no copy or swap operations."""
    block_size = 4
    num_seq_group = 4
    max_model_len = 16
    scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len)
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 8
    cache_config.num_gpu_blocks = 8
    scheduler = Scheduler(scheduler_config, cache_config, None)

    # Add the sequence groups, keeping a reference to each for comparison.
    expected_groups: List[SequenceGroup] = []
    for request_index in range(num_seq_group):
        group = create_dummy_prompt(str(request_index),
                                    prompt_length=block_size)[1]
        scheduler.add_seq_group(group)
        expected_groups.append(group)

    # First round schedules the prompts (block_size tokens per group);
    # second round schedules generation (one token per group).
    prompt_tokens = block_size * num_seq_group
    for expected_tokens in (prompt_tokens, num_seq_group):
        metadata, out = scheduler.schedule()
        assert set(get_sequence_groups(out)) == set(expected_groups)
        assert out.num_batched_tokens == expected_tokens
        assert not (out.blocks_to_copy or out.blocks_to_swap_in
                    or out.blocks_to_swap_out)
        assert len(metadata) == num_seq_group
def test_scheduler_schedule_preempt_abort():
    """With only two GPU blocks, group b is preempted during generation
    and re-scheduled with recomputation once group a is aborted."""
    block_size = 4
    max_model_len = 16
    scheduler_config = SchedulerConfig(64, 2, max_model_len)
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 2
    cache_config.num_gpu_blocks = 2
    scheduler = Scheduler(scheduler_config, cache_config, None)

    # Add two block-sized prompts.
    seq_a, group_a = create_dummy_prompt("1", block_size)
    seq_b, group_b = create_dummy_prompt("2", block_size)
    for group in (group_a, group_b):
        scheduler.add_seq_group(group)

    # Prompt step: both groups are scheduled.
    metadata, out = scheduler.schedule()
    assert get_sequence_groups(out) == [group_a, group_b]
    assert out.num_batched_tokens == block_size * 2  # seq_a and seq_b
    assert not (out.blocks_to_copy or out.blocks_to_swap_in
                or out.blocks_to_swap_out)
    assert len(metadata) == 2
    assert scheduler.get_num_unfinished_seq_groups() == 2

    # Append "generated" tokens, allowing each sequence to mark its prompt
    # tokens as processed.
    generated = 0
    for seq in (seq_a, seq_b):
        seq.append_token_id(generated, {generated: Logprob(0.0)})

    # Generation step: only group a runs; group b is preempted.
    metadata, out = scheduler.schedule()
    assert get_sequence_groups(out) == [group_a]
    assert out.num_batched_tokens == 1
    assert not (out.blocks_to_copy or out.blocks_to_swap_in
                or out.blocks_to_swap_out)
    assert len(metadata) == 1
    assert scheduler.get_num_unfinished_seq_groups() == 2

    # Abort group a; group b's prompt is re-scheduled with recomputation.
    scheduler.abort_seq_group("1")
    metadata, out = scheduler.schedule()
    assert get_sequence_groups(out) == [group_b]
    assert out.num_batched_tokens == 5  # 4 prompt + 1 generation.
    assert not (out.blocks_to_copy or out.blocks_to_swap_in
                or out.blocks_to_swap_out)
    assert len(metadata) == 1
    assert scheduler.get_num_unfinished_seq_groups() == 1
def test_scheduler_max_seqs():
    """The scheduler never runs more sequence groups than the configured
    maximum (2 here)."""
    block_size = 4
    num_seq_group = 4
    max_seq_group = 2
    max_model_len = 16
    scheduler_config = SchedulerConfig(64, max_seq_group, max_model_len)
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 8
    cache_config.num_gpu_blocks = 8
    scheduler = Scheduler(scheduler_config, cache_config, None)

    # Build all the sequence groups up front; they are added selectively.
    candidates: List[SequenceGroup] = [
        create_dummy_prompt(str(request_index), prompt_length=block_size)[1]
        for request_index in range(num_seq_group)
    ]
    first, second, third = candidates[:3]

    # Add a single group and run its prompt step.
    scheduler.add_seq_group(first)
    _, out = scheduler.schedule()
    assert set(get_sequence_groups(out)) == {first}

    # Generation step for that same group.
    _, out = scheduler.schedule()
    assert set(get_sequence_groups(out)) == {first}

    # Add two more groups.
    scheduler.add_seq_group(second)
    scheduler.add_seq_group(third)

    # Only one of the new prompts should be scheduled, since max_seq_group
    # is 2 and one group is already scheduled.
    _, out = scheduler.schedule()
    assert set(get_sequence_groups(out)) == {second}
def test_scheduler_delay_factor():
    """With delay_factor=0.5, a freshly added prompt is not scheduled on
    the first attempt but is scheduled after a further 0.6 s wait."""
    block_size = 4
    scheduler_config = SchedulerConfig(100, 64, 16, delay_factor=0.5)
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 8
    cache_config.num_gpu_blocks = 8
    scheduler = Scheduler(scheduler_config, cache_config, None)

    # The first prompt is scheduled immediately.
    first_group = create_dummy_prompt("0", prompt_length=block_size)[1]
    scheduler.add_seq_group(first_group)
    metadata, out = scheduler.schedule()
    assert out.prompt_run
    assert metadata[0].request_id == "0"

    # Wait one second, then queue the second prompt.
    time.sleep(1)
    second_group = create_dummy_prompt("1", prompt_length=block_size)[1]
    scheduler.add_seq_group(second_group)

    # The second prompt must *not* be scheduled yet; request "0" runs again.
    metadata, out = scheduler.schedule()
    assert not out.prompt_run
    assert metadata[0].request_id == "0"

    # After waiting more than 0.5 s, the second prompt is scheduled.
    time.sleep(0.6)
    metadata, out = scheduler.schedule()
    assert out.prompt_run
    assert metadata[0].request_id == "1"
tests/core/utils.py
0 → 100644
View file @
7c4f76e3
import
time
from
typing
import
Tuple
from
vllm
import
SamplingParams
from
vllm.sequence
import
Logprob
,
Sequence
,
SequenceGroup
def create_dummy_prompt(
        request_id: str,
        prompt_length: int,
        block_size: int = None) -> Tuple[Sequence, SequenceGroup]:
    """Build a single-sequence group with a synthetic prompt.

    The prompt tokens are ``0 .. prompt_length - 1`` and the prompt text is
    those numbers joined by spaces. ``request_id`` must be parseable by
    ``int()`` because it is also used as the sequence id.

    Args:
        request_id: Request id of the group; ``int(request_id)`` is the
            sequence id.
        prompt_length: Number of prompt tokens to generate.
        block_size: Block size passed to ``Sequence``; any falsy value
            (including the default ``None``) falls back to
            ``prompt_length``.

    Returns:
        The ``(sequence, sequence_group)`` pair.
    """
    block_size = block_size or prompt_length

    token_ids = [*range(prompt_length)]
    prompt_text = " ".join(map(str, token_ids))
    sequence = Sequence(int(request_id), prompt_text, token_ids, block_size)
    group = SequenceGroup(request_id, [sequence], SamplingParams(),
                          time.time(), None)
    return sequence, group
def create_seq_group(
    seq_prompt_lens=1024,
    seq_output_lens=(128, ),
    request_id='0',
    seq_id_start=0,
) -> SequenceGroup:
    """Build a sequence group whose sequences already hold output tokens.

    One sequence is created per entry of ``seq_output_lens``. Every
    sequence gets a prompt of ``seq_prompt_lens`` zero tokens, an empty
    prompt string and block size 16, then has tokens ``0 .. output_len - 1``
    appended with a log-probability of 0.0.

    Args:
        seq_prompt_lens: Number of prompt tokens for every sequence.
        seq_output_lens: Output length of each sequence; must be non-empty.
        request_id: Request id of the resulting group.
        seq_id_start: Sequence id of the first sequence; later ones count up.

    Returns:
        The populated ``SequenceGroup``.
    """
    assert len(seq_output_lens) > 0

    # One prompt token list is shared by all sequences, as in the original.
    shared_prompt_ids = [0] * seq_prompt_lens

    sequences = []
    for offset, num_outputs in enumerate(seq_output_lens):
        sequence = Sequence(
            seq_id=seq_id_start + offset,
            prompt="",
            prompt_token_ids=shared_prompt_ids,
            block_size=16,
        )
        for token in range(num_outputs):
            sequence.append_token_id(
                token_id=token,
                logprobs={token: Logprob(0.0)},
            )
        sequences.append(sequence)

    return SequenceGroup(
        request_id=request_id,
        seqs=sequences,
        sampling_params=SamplingParams(),
        arrival_time=time.time(),
    )
def round_up_to_next_block(seq_len: int, block_size: int) -> int:
    """Return how many blocks of ``block_size`` are needed for ``seq_len``.

    Despite the name, the result is a block count (ceiling division), not
    a length rounded up to a multiple of ``block_size``.
    """
    padded_len = seq_len + block_size - 1
    num_blocks, _ = divmod(padded_len, block_size)
    return num_blocks
tests/distributed/test_basic_distributed_correctness.py
View file @
7c4f76e3
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
vLLM will allocate all the available memory, so we need to run the tests one
Run `pytest tests/distributed/test_basic_distributed_correctness.py --forked`.
by one. The solution is to pass arguments (model name) by environment
variables.
Run:
```sh
TEST_DIST_MODEL=facebook/opt-125m pytest
\
test_basic_distributed_correctness.py
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf
\
test_basic_distributed_correctness.py
```
"""
"""
import
os
import
pytest
import
pytest
import
torch
import
torch
MODELS
=
[
MODELS
=
[
"facebook/opt-125m"
,
os
.
environ
[
"TEST_DIST_MODEL"
],
"meta-llama/Llama-2-7b-hf"
,
]
]
...
...
tests/distributed/test_comm_ops.py
View file @
7c4f76e3
"""Test the communication operators.
"""Test the communication operators.
Run `pytest tests/distributed/test_comm_ops.py
--forked
`.
Run `pytest tests/distributed/test_comm_ops.py`.
"""
"""
import
os
import
pytest
import
pytest
import
torch
import
ray
import
ray
import
torch
from
vllm.model_executor.parallel_utils.communication_op
import
(
from
vllm.model_executor.parallel_utils.communication_op
import
(
tensor_model_parallel_all_reduce
,
broadcast_tensor_dict
,
tensor_model_parallel_all_gather
,
tensor_model_parallel_all_gather
,
tensor_model_parallel_all_reduce
)
broadcast_tensor_dict
,
)
from
vllm.test_utils
import
(
init_test_distributed_environment
,
from
vllm.test_utils
import
(
init_test_distributed_environment
,
multi_process_tensor_parallel
)
multi_process_tensor_parallel
)
...
@@ -18,6 +18,12 @@ from vllm.test_utils import (init_test_distributed_environment,
...
@@ -18,6 +18,12 @@ from vllm.test_utils import (init_test_distributed_environment,
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
def
all_reduce_test_worker
(
tensor_parallel_size
:
int
,
rank
:
int
,
def
all_reduce_test_worker
(
tensor_parallel_size
:
int
,
rank
:
int
,
distributed_init_port
:
str
):
distributed_init_port
:
str
):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
del
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
init_test_distributed_environment
(
1
,
tensor_parallel_size
,
rank
,
init_test_distributed_environment
(
1
,
tensor_parallel_size
,
rank
,
distributed_init_port
)
distributed_init_port
)
num_elements
=
8
num_elements
=
8
...
@@ -34,6 +40,12 @@ def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
...
@@ -34,6 +40,12 @@ def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
def
all_gather_test_worker
(
tensor_parallel_size
:
int
,
rank
:
int
,
def
all_gather_test_worker
(
tensor_parallel_size
:
int
,
rank
:
int
,
distributed_init_port
:
str
):
distributed_init_port
:
str
):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
del
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
init_test_distributed_environment
(
1
,
tensor_parallel_size
,
rank
,
init_test_distributed_environment
(
1
,
tensor_parallel_size
,
rank
,
distributed_init_port
)
distributed_init_port
)
num_dimensions
=
3
num_dimensions
=
3
...
@@ -56,6 +68,12 @@ def all_gather_test_worker(tensor_parallel_size: int, rank: int,
...
@@ -56,6 +68,12 @@ def all_gather_test_worker(tensor_parallel_size: int, rank: int,
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
def
broadcast_tensor_dict_test_worker
(
tensor_parallel_size
:
int
,
rank
:
int
,
def
broadcast_tensor_dict_test_worker
(
tensor_parallel_size
:
int
,
rank
:
int
,
distributed_init_port
:
str
):
distributed_init_port
:
str
):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
del
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
init_test_distributed_environment
(
1
,
tensor_parallel_size
,
rank
,
init_test_distributed_environment
(
1
,
tensor_parallel_size
,
rank
,
distributed_init_port
)
distributed_init_port
)
test_dict
=
{
test_dict
=
{
...
...
tests/distributed/test_custom_all_reduce.py
View file @
7c4f76e3
import
os
import
random
import
random
import
os
import
pytest
import
pytest
import
ray
import
ray
import
torch
import
torch
...
...
tests/distributed/test_pynccl.py
0 → 100644
View file @
7c4f76e3
import
multiprocessing
import
os
import
pytest
import
torch
from
vllm.model_executor.parallel_utils.pynccl
import
(
NCCLCommunicator
,
ncclGetUniqueId
)
def distributed_run(fn, world_size):
    """Run ``fn`` in ``world_size`` child processes and fail if any fails.

    Each child receives, as its single argument, a copy of the current
    environment extended with the usual distributed variables (``RANK``,
    ``LOCAL_RANK``, ``WORLD_SIZE``, ``LOCAL_WORLD_SIZE``, ``MASTER_ADDR``,
    ``MASTER_PORT``). ``fn`` is expected to apply that mapping itself
    (see ``update_env``).

    Args:
        fn: Callable taking one argument, the environment dict.
        world_size: Number of processes to launch.

    Raises:
        AssertionError: If any child exits with a non-zero exit code, for
            example because an assertion inside the worker failed.
    """
    processes = []
    for rank in range(world_size):
        env = os.environ.copy()
        env['RANK'] = str(rank)
        env['LOCAL_RANK'] = str(rank)
        env['WORLD_SIZE'] = str(world_size)
        env['LOCAL_WORLD_SIZE'] = str(world_size)
        env['MASTER_ADDR'] = 'localhost'
        env['MASTER_PORT'] = '12345'
        p = multiprocessing.Process(target=fn, args=(env, ))
        processes.append(p)
        p.start()

    # Wait for every worker before inspecting results so none is left
    # running when a failure is reported.
    for p in processes:
        p.join()

    # An exception in a child only shows up as a non-zero exit code;
    # without this check a failing worker would leave the test green.
    for rank, p in enumerate(processes):
        assert p.exitcode == 0, (
            f"worker with rank {rank} exited with code {p.exitcode}")
def update_env(fn):
    """Decorate ``fn`` so it applies an environment mapping before running.

    `multiprocessing.Process` cannot accept environment variables directly,
    so the environment is passed as an argument and applied to
    ``os.environ`` inside the child before ``fn`` is called.

    Args:
        fn: Zero-argument callable to run after the environment update.

    Returns:
        A one-argument wrapper ``wrapper(env)``. It keeps ``fn``'s name and
        docstring, so the decorated module-level function can still be
        looked up (and pickled) under its own name.
    """
    import functools

    @functools.wraps(fn)
    def wrapper(env):
        import os
        os.environ.update(env)
        fn()

    return wrapper
@update_env
def worker_fn():
    """All-reduce a tensor of ones and check that its mean equals the
    communicator's world size."""
    communicator = NCCLCommunicator()
    ones_on_host = torch.ones(16, 1024, 1024, dtype=torch.float32)
    payload = ones_on_host.cuda(communicator.rank)
    communicator.all_reduce(payload)
    mean_value = payload.mean().cpu().item()
    assert mean_value == communicator.world_size
@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason="Need at least 2 GPUs to run the test.")
def test_pynccl():
    """Run ``worker_fn`` in two processes; skipped with fewer than 2 GPUs."""
    world_size = 2
    distributed_run(worker_fn, world_size)
@update_env
def worker_fn_with_cudagraph():
    """Capture an all-reduce in a CUDA graph and check it runs on replay.

    The all-reduce is captured on ``comm.stream``. The tensor of ones must
    still have mean 1 (``world_size**0``) after capture, and mean
    ``world_size`` (``world_size**1``) after the graph is replayed once.
    """
    with torch.no_grad():
        graph = torch.cuda.CUDAGraph()
        comm = NCCLCommunicator()
        # run something in the default stream to initialize torch engine
        a = torch.ones((4, 4), device=f'cuda:{comm.rank}')
        torch.cuda.synchronize()
        with torch.cuda.graph(graph, stream=comm.stream):
            # operation during the graph capture is recorded but not executed
            # see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#creating-a-graph-using-stream-capture # noqa
            comm.all_reduce(a)
        comm.stream.synchronize()
        # Capture only recorded the all-reduce, so `a` is still all ones.
        assert a.mean().cpu().item() == comm.world_size**0
        graph.replay()
        comm.stream.synchronize()
        # One replay performs the all-reduce once: the mean is world_size.
        assert a.mean().cpu().item() == comm.world_size**1
@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason="Need at least 2 GPUs to run the test.")
def test_pynccl_with_cudagraph():
    """Run ``worker_fn_with_cudagraph`` in two processes; skipped with
    fewer than 2 GPUs."""
    world_size = 2
    distributed_run(worker_fn_with_cudagraph, world_size)
def test_ncclGetUniqueId():
    """Smoke test: ``ncclGetUniqueId`` returns an object without raising."""
    # `list(unique_id.internal)` is a long list of small signed integers,
    # for example [34, -16, 23, 83, 109, -19, 59, 95, 2, 0, -86, 55, 10,
    # -128, 0, 29, 0, 0, ...] followed by zeros. The exact values do not
    # matter: as long as the call does not raise, we're good.
    assert ncclGetUniqueId() is not None
tests/engine/test_computed_prefix_blocks.py
0 → 100644
View file @
7c4f76e3
import
pytest
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.sampling_params
import
SamplingParams
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("block_size", [16])
def test_computed_prefix_blocks(model: str, block_size: int):
    """Run the engine with prefix caching on without triggering asserts.

    The second request's prompt is a prefix of the first request's prompt,
    so all blocks of the second prompt are full and already computed when
    the second request arrives.
    """
    shared_prefix = (
        "You are a helpful assistant. How do I build a car from cardboard and "
        "paper clips? Is there an easy to follow video tutorial available "
        "online for free?")
    extra_suffix = (
        " Please recommend to me some resources where I can learn not only to "
        "handle technical difficulties of building a car, but also "
        "decoration.")

    engine = LLMEngine.from_engine_args(
        EngineArgs(model=model,
                   block_size=block_size,
                   enable_prefix_caching=True))
    params = SamplingParams()

    # First request: the long prompt (prefix + suffix).
    engine.add_request("0", shared_prefix + extra_suffix, params)
    engine.step()
    # Second request: only the prefix, added after the first step.
    engine.add_request("1", shared_prefix, params)
    engine.step()
tests/engine/test_detokenize.py
deleted
100644 → 0
View file @
2da0dd3e
import
pytest
from
transformers
import
AutoTokenizer
from
vllm.transformers_utils.tokenizer
import
detokenize_incrementally
# Reference strings: test_decode_streaming tokenizes each one and expects
# incremental detokenization to reproduce it exactly.
TRUTH = [
    "Hello here, this is a simple test",  # noqa: E501
    "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be used in production environments, where inference and serving",  # noqa: E501
    "我很感谢你的热情"  # noqa: E501
]
# Tokenizer ids that test_decode_streaming loads with
# AutoTokenizer.from_pretrained.
TOKENIZERS = [
    "facebook/opt-125m",
    "gpt2",
    "bigcode/tiny_starcoder_py",
    "EleutherAI/gpt-j-6b",
    "EleutherAI/pythia-70m",
    "bigscience/bloom-560m",
    "mosaicml/mpt-7b",
    "tiiuae/falcon-7b",
    "meta-llama/Llama-2-7b-hf",
    "codellama/CodeLlama-7b-hf",
]
def _run_incremental_decode(tokenizer, all_input_ids,
                            skip_special_tokens: bool):
    """Decode ``all_input_ids`` one token at a time and return the text.

    ``detokenize_incrementally`` is called once per growing prefix of the
    ids, feeding back the offsets and accumulated tokens it returned on the
    previous call; the text pieces are concatenated in order.
    """
    pieces = []
    read_offset = 0
    tok_offset = 0
    seen_tokens = None
    for prefix_len in range(1, len(all_input_ids) + 1):
        fresh_tokens, piece, read_offset, tok_offset = (
            detokenize_incrementally(
                tokenizer,
                all_input_ids[:prefix_len],
                seen_tokens,
                read_offset,
                tok_offset,
                skip_special_tokens=skip_special_tokens))
        pieces.append(piece)
        if seen_tokens is not None:
            seen_tokens += fresh_tokens
        else:
            seen_tokens = fresh_tokens
    return "".join(pieces)
@pytest.mark.parametrize("truth", TRUTH)
@pytest.mark.parametrize("tokenizer_id", TOKENIZERS)
@pytest.mark.parametrize("skip_special_tokens", (True, False))
def test_decode_streaming(tokenizer_id, truth, skip_special_tokens):
    """Incremental decoding of the tokenized ``truth`` must reproduce it.

    When ``skip_special_tokens`` is set, the ids are wrapped in BOS (if the
    tokenizer defines one) and EOS, which must then be dropped from the
    decoded text.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
    token_ids = tokenizer(truth, add_special_tokens=False)["input_ids"]
    if skip_special_tokens:
        if tokenizer.bos_token_id is not None:
            leading = [tokenizer.bos_token_id]
        else:
            leading = []
        token_ids = leading + token_ids + [tokenizer.eos_token_id]

    streamed_text = _run_incremental_decode(
        tokenizer, token_ids, skip_special_tokens=skip_special_tokens)

    assert streamed_text == truth
Prev
1
2
3
4
5
6
7
8
9
10
…
17
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment