Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4d3a2c28
Commit
4d3a2c28
authored
Dec 30, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.6.5' into v0.6.5-dev
parents
92ec5d8e
2d1b9baa
Changes
435
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1693 additions
and
1075 deletions
+1693
-1075
tests/compile/piecewise/__init__.py
tests/compile/piecewise/__init__.py
+0
-0
tests/compile/piecewise/test_simple.py
tests/compile/piecewise/test_simple.py
+109
-0
tests/compile/piecewise/test_toy_llama.py
tests/compile/piecewise/test_toy_llama.py
+436
-0
tests/compile/test_basic_correctness.py
tests/compile/test_basic_correctness.py
+142
-0
tests/compile/test_full_graph.py
tests/compile/test_full_graph.py
+11
-4
tests/compile/test_full_graph_multi_gpu.py
tests/compile/test_full_graph_multi_gpu.py
+0
-22
tests/compile/test_full_graph_smoke.py
tests/compile/test_full_graph_smoke.py
+0
-13
tests/compile/test_functionalization.py
tests/compile/test_functionalization.py
+102
-0
tests/compile/test_fusion.py
tests/compile/test_fusion.py
+116
-0
tests/compile/test_pass_manager.py
tests/compile/test_pass_manager.py
+35
-0
tests/compile/test_wrapper.py
tests/compile/test_wrapper.py
+3
-1
tests/compile/utils.py
tests/compile/utils.py
+17
-23
tests/conftest.py
tests/conftest.py
+344
-192
tests/core/block/e2e/conftest.py
tests/core/block/e2e/conftest.py
+2
-3
tests/core/block/e2e/test_correctness.py
tests/core/block/e2e/test_correctness.py
+28
-110
tests/core/block/e2e/test_correctness_sliding_window.py
tests/core/block/e2e/test_correctness_sliding_window.py
+13
-11
tests/core/block/test_block_manager.py
tests/core/block/test_block_manager.py
+70
-24
tests/core/block/test_naive_block.py
tests/core/block/test_naive_block.py
+10
-9
tests/core/block/test_prefix_caching_block.py
tests/core/block/test_prefix_caching_block.py
+255
-26
tests/core/test_block_manager.py
tests/core/test_block_manager.py
+0
-637
No files found.
Too many changes to show.
To preserve performance only
435 of 435+
files are displayed.
Plain diff
Email patch
tests/compile/piecewise/__init__.py
0 → 100644
View file @
4d3a2c28
tests/compile/piecewise/test_simple.py
0 → 100644
View file @
4d3a2c28
"""
Test the piecewise compilation with a simple model so that we
can exactly calculate the expected output and side effects.
"""
import
torch
from
torch
import
nn
from
torch.library
import
Library
from
vllm.compilation.counter
import
compilation_counter
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
(
CompilationConfig
,
CompilationLevel
,
VllmConfig
,
set_current_vllm_config
)
from
vllm.utils
import
direct_register_custom_op
# Incremented by the real `silly_attention` implementation every time it runs;
# the test below resets it to 0 and asserts on it to observe how many times
# the custom op actually executed.
global_counter = 0

# create a library to hold the custom op
# (the op registered into it further down is invoked as
# torch.ops.silly.attention)
silly_lib = Library("silly", "FRAGMENT")  # noqa
def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                    out: torch.Tensor) -> None:
    """Toy "attention" kernel with observable side effects.

    Increments the module-level ``global_counter`` and prints it, copies
    ``q`` into ``out`` and then adds 1 to ``out[0]``. ``k`` and ``v`` are
    accepted but not read.
    """
    global global_counter
    global_counter = global_counter + 1
    print(f"{global_counter=}")
    out.copy_(q)
    # `out[0]` is a view of `out`, so the in-place add writes through to it.
    first = out[0]
    first += 1
def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                         out: torch.Tensor) -> None:
    """Fake implementation passed as ``fake_impl`` when the op is registered.

    Does nothing: no argument is read or written and ``None`` is returned.
    """
    return None
# Register `silly_attention` under the op name "attention" in `silly_lib`,
# declaring that it mutates its `out` argument and supplying
# `silly_attention_fake` as the fake implementation.
direct_register_custom_op(
    op_name="attention",
    op_func=silly_attention,
    mutates_args=["out"],
    fake_impl=silly_attention_fake,
    target_lib=silly_lib,
)
@support_torch_compile
class SillyModel(nn.Module):
    """Parameter-free toy model that calls ``torch.ops.silly.attention`` twice.

    The arithmetic around the two op calls is simple enough that both the
    output and the op's side effects can be computed by hand.
    """

    def __init__(self,
                 *,
                 vllm_config: VllmConfig,
                 prefix: str = '',
                 **kwargs) -> None:
        # `vllm_config`, `prefix` and `kwargs` are not used in this body.
        # NOTE(review): presumably they are consumed by the
        # `support_torch_compile` decorator - confirm.
        super().__init__()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Overall effect:
        x += 1
        x[0] += 2
        global_counter += 2
        """
        x = x + 1
        x = x + 2
        # first op call: out = x, out[0] += 1, global_counter += 1
        out = torch.empty_like(x)
        torch.ops.silly.attention(x, x, x, out)
        x = out
        # undo the +1 / +2 from above
        x = x - 2
        x = x - 1
        # second op call: out = x, out[0] += 1, global_counter += 1
        out = torch.empty_like(x)
        torch.ops.silly.attention(x, x, x, out)
        x = out
        x = x + 1
        return x
def test_simple_piecewise_compile():
    """Compile ``SillyModel`` piecewise and check counters, output, effects.

    The model is split at ``silly.attention`` with cudagraph capture sizes
    1 and 2. After warm-up calls with 100, 2 and 1 elements, a zero input of
    size 2 must yield ``[3., 1.]`` and run the custom op exactly twice.
    """
    global global_counter

    compilation_config = CompilationConfig(
        level=CompilationLevel.PIECEWISE,
        use_cudagraph=True,
        splitting_ops=["silly.attention"],
        cudagraph_copy_inputs=True,
        cudagraph_capture_sizes=[1, 2],
    )
    vllm_config = VllmConfig(compilation_config=compilation_config)
    with set_current_vllm_config(vllm_config):
        model = SillyModel(vllm_config=vllm_config, prefix='')

    warmup_input = torch.randn(100).cuda()

    expected_counts = {
        # one graph for the model
        "num_graphs_seen": 1,
        # 2 * num_layers + 1
        "num_piecewise_graphs_seen": 5,
        # 1 + num_layers
        "num_piecewise_capturable_graphs_seen": 3,
        # num_piecewise_capturable_graphs_seen
        "num_inductor_compilations": 3,
        # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
        # (the key spelling is the keyword this counter is called with)
        "num_cudagraph_caputured": 6,
    }
    with compilation_counter.expect(**expected_counts):
        model(warmup_input)

        # one call per cudagraph capture size
        for size in (2, 1):
            model(torch.randn(size).cuda())

        zero_input = torch.zeros(2).cuda()
        global_counter = 0
        output = model(zero_input)
        assert global_counter == 2
        assert torch.allclose(output.cpu(), torch.tensor([3., 1.]))
tests/compile/piecewise/test_toy_llama.py
0 → 100644
View file @
4d3a2c28
"""
Test the piecewise compilation with a simple model, comparing the output
with and without the piecewise compilation.
This is a tractable model: the weights and computation are specially designed
when the config `tractable_init` is set to True. Otherwise, the weights are
initialized randomly with a fixed seed.
"""
from
dataclasses
import
dataclass
from
typing
import
Optional
,
Tuple
import
torch
from
torch
import
nn
from
torch.library
import
Library
from
vllm.compilation.counter
import
compilation_counter
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
(
CompilationConfig
,
CompilationLevel
,
VllmConfig
,
set_current_vllm_config
)
from
vllm.utils
import
direct_register_custom_op
# create a library to hold the custom op
# (the op registered into it below is invoked as torch.ops.silly.attention)
silly_lib = Library("silly", "FRAGMENT")  # noqa
def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                    out: torch.Tensor) -> None:
    """Toy "attention": write ``q + k + v`` into ``out`` in place."""
    # copy_ and add_ both return the tensor they modified, so the chain
    # keeps updating `out` itself.
    out.copy_(q).add_(k).add_(v)
def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                         out: torch.Tensor) -> None:
    """Fake implementation passed as ``fake_impl`` when the op is registered.

    Does nothing: no argument is read or written and ``None`` is returned.
    """
    return None
# Register `silly_attention` under the op name "attention" in `silly_lib`,
# declaring that it mutates its `out` argument and supplying
# `silly_attention_fake` as the fake implementation.
direct_register_custom_op(
    op_name="attention",
    op_func=silly_attention,
    mutates_args=["out"],
    fake_impl=silly_attention_fake,
    target_lib=silly_lib,
)
@dataclass
class LlamaConfig:
    """Hyper-parameters of the toy Llama model defined in this file.

    ``mlp_size`` must be at least ``hidden_size``; this is asserted on
    construction.
    """
    # layer widths and model depth
    hidden_size: int = 128
    mlp_size: int = 256
    vocab_size: int = 128
    num_layers: int = 2

    # value every embedding weight is filled with
    init_value: float = 1.0
    # True: identity (eye_) projection weights; False: seeded xavier-normal
    tractable_init: bool = False
    # seed for the generator used by the xavier-normal initialisation
    random_seed: int = 0

    def __post_init__(self):
        assert self.hidden_size <= self.mlp_size
class LlamaMLP(nn.Module):
    """Bias-free gated MLP: fused gate/up projection, then down projection."""

    def __init__(self, config: LlamaConfig) -> None:
        super().__init__()
        hidden, inner = config.hidden_size, config.mlp_size
        self.gate_up_projection = nn.Linear(
            in_features=hidden,
            out_features=inner * 2,
            bias=False,
        )
        self.down_projection = nn.Linear(
            in_features=inner,
            out_features=hidden,
            bias=False,
        )

        gate_up_weight = self.gate_up_projection.weight.data
        down_weight = self.down_projection.weight.data

        if not config.tractable_init:
            # Each weight gets its own generator seeded with the same value.
            for weight in (gate_up_weight, down_weight):
                seeded = torch.Generator().manual_seed(config.random_seed)
                nn.init.xavier_normal_(weight, generator=seeded, gain=0.001)
            return

        # Tractable init: both halves of the fused projection and the down
        # projection are (rectangular) identity matrices.
        for block in (gate_up_weight[:inner], gate_up_weight[inner:],
                      down_weight):
            nn.init.eye_(block)

    def forward(self, x):
        # for tractable_init and positive input, this is
        # essentially an elementwise-square
        projected = self.gate_up_projection(x)
        half = projected.size(1) // 2
        first_half = projected[:, :half]
        second_half = projected[:, half:]
        gated = first_half * torch.nn.functional.relu(second_half)
        return self.down_projection(gated)
class LlamaAttention(nn.Module):
    """Bias-free fused QKV projection, the ``silly.attention`` op, and an
    output projection."""

    def __init__(self, config: LlamaConfig) -> None:
        super().__init__()
        hidden = config.hidden_size
        self.qkv_projection = nn.Linear(
            in_features=hidden,
            out_features=hidden * 3,
            bias=False,
        )
        self.output_projection = nn.Linear(
            in_features=hidden,
            out_features=hidden,
            bias=False,
        )

        qkv_weight = self.qkv_projection.weight.data
        out_weight = self.output_projection.weight.data

        if not config.tractable_init:
            # Each weight gets its own generator seeded with the same value.
            for weight in (qkv_weight, out_weight):
                seeded = torch.Generator().manual_seed(config.random_seed)
                nn.init.xavier_normal_(weight, generator=seeded, gain=0.001)
            return

        # Tractable init: the q, k and v slices of the fused weight and the
        # output projection are all identity matrices.
        for block in (qkv_weight[:hidden], qkv_weight[hidden:2 * hidden],
                      qkv_weight[2 * hidden:], out_weight):
            nn.init.eye_(block)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        # for tractable_init, this is:
        # output = (hidden_states * 3 + positions * 2)
        qkv = self.qkv_projection(hidden_states)
        width = qkv.size(-1) // 3
        q, k, v = qkv.split([width] * 3, dim=-1)

        # add the position to every column of q and of k
        q = q + positions.unsqueeze(1)
        k = k + positions.unsqueeze(1)

        attn_output = torch.empty_like(q)
        torch.ops.silly.attention(q, k, v, attn_output)

        return self.output_projection(attn_output)
class LlamaDecoderLayer(nn.Module):
    """One decoder layer: attention then MLP, each preceded by a residual
    update and a ``+ 1``."""

    def __init__(self, config: LlamaConfig) -> None:
        super().__init__()
        self.self_attention = LlamaAttention(config)
        self.mlp = LlamaMLP(config)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        residual: Optional[torch.Tensor],
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return ``(hidden_states, residual)`` for the next layer.

        For tractable computation, with ``h`` the incoming hidden states
        (plus the incoming residual when it is not None):
        - residual = (h + 1) * 3 + positions * 2 + h
                   = h * 4 + positions * 2 + 3
        - hidden_states = (residual + 1) ** 2
        """
        # Fold the incoming residual (if any) into the hidden states; the
        # sum is the residual carried across the attention block.
        if residual is None:
            attn_residual = hidden_states
        else:
            attn_residual = hidden_states + residual

        attn_output = self.self_attention(positions=positions,
                                          hidden_states=attn_residual + 1)

        # Residual carried across the MLP block; also returned to the caller.
        mlp_residual = attn_output + attn_residual
        mlp_output = self.mlp(mlp_residual + 1)

        return mlp_output, mlp_residual
@support_torch_compile
class LlamaModel(nn.Module):
    """Toy Llama: a constant-filled embedding followed by
    ``config.num_layers`` decoder layers."""

    def __init__(self,
                 *,
                 vllm_config: VllmConfig,
                 config: LlamaConfig,
                 prefix: str = '',
                 **kwargs) -> None:
        # `vllm_config`, `prefix` and `kwargs` are not used in this body.
        # NOTE(review): presumably they are consumed by the
        # `support_torch_compile` decorator - confirm.
        super().__init__()
        self.embedding_tokens = nn.Embedding(
            num_embeddings=config.vocab_size,
            embedding_dim=config.hidden_size,
        )
        self.layers = nn.ModuleList(
            [LlamaDecoderLayer(config) for _ in range(config.num_layers)])

        # this is the initial value of the hidden states
        # (every embedding row is the same constant, so the output does not
        # depend on which token ids are looked up)
        self.embedding_tokens.weight.data.fill_(config.init_value)

    def forward(
        self,
        input_ids: Optional[torch.Tensor],
        positions: torch.Tensor,
    ) -> torch.Tensor:
        """Embed `input_ids` and run all layers; return the final hidden
        states (the last residual is dropped)."""
        hidden_states = self.embedding_tokens(input_ids)
        # the first layer receives residual=None
        residual = None
        for layer in self.layers:
            hidden_states, residual = layer(positions, hidden_states, residual)
        return hidden_states
def tractable_computation(input_ids: torch.Tensor,
                          positions: torch.Tensor,
                          config: LlamaConfig,
                          init_value: Optional[float] = None) -> torch.Tensor:
    """Closed-form reference output of the model under tractable init.

    Args:
        input_ids: only its length, device and dtype are used.
        positions: one position per row; broadcast across the hidden
            dimension via ``unsqueeze(1)``.
        config: supplies ``hidden_size``, ``num_layers`` and, when
            ``init_value`` is not given, ``init_value``.
        init_value: initial value of every hidden state. Defaults to
            ``config.init_value`` (the value the model fills its embedding
            with), so the reference stays in sync with the model.

    Returns:
        The expected hidden states after ``config.num_layers`` layers.
    """
    if init_value is None:
        init_value = config.init_value

    hidden_states = torch.ones(input_ids.size(0),
                               config.hidden_size,
                               device=input_ids.device,
                               dtype=input_ids.dtype) * init_value

    # first layer (no incoming residual)
    residual = hidden_states * 4 + positions.unsqueeze(1) * 2 + 3
    hidden_states = (residual + 1)**2

    # following layers (incoming residual is added first)
    for _ in range(config.num_layers - 1):
        hidden_states = hidden_states + residual
        residual = hidden_states * 4 + positions.unsqueeze(1) * 2 + 3
        hidden_states = (residual + 1)**2

    return hidden_states
@torch.inference_mode
def run_model(llama_config,
              use_compile: bool,
              split_attn: bool = False) -> torch.Tensor:
    """Build a ``LlamaModel`` on CUDA, warm it up and return one output.

    Args:
        llama_config: the ``LlamaConfig`` of the model to build.
        use_compile: PIECEWISE compilation with cudagraphs (capture sizes
            1 and 2) when True, NO_COMPILATION otherwise.
        split_attn: when compiling, additionally split the graph at
            ``silly.attention``.

    Returns:
        The CPU copy of the output for the first two rows of a zeroed
        input. With ``llama_config.tractable_init`` the output is also
        checked against ``tractable_computation`` before being returned.
    """
    if use_compile:
        compilation_config = CompilationConfig(
            level=CompilationLevel.PIECEWISE,
            use_cudagraph=True,
            cudagraph_capture_sizes=[1, 2],
        )
        if split_attn:
            compilation_config.splitting_ops = ["silly.attention"]
    else:
        compilation_config = CompilationConfig(
            level=CompilationLevel.NO_COMPILATION, )

    vllm_config = VllmConfig(compilation_config=compilation_config)
    with set_current_vllm_config(vllm_config):
        model = LlamaModel(config=llama_config,
                           vllm_config=vllm_config,
                           prefix="").eval().cuda()

    B = 16  # max batch size
    input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda()
    positions = torch.arange(B).cuda()

    # warm-up: the full batch, then one call per cudagraph capture size
    model(input_ids, positions)
    model(input_ids[:2], positions[:2])
    model(input_ids[:1], positions[:1])

    input_ids[:2].zero_()
    output = model(input_ids[:2], positions[:2])

    # copy to CPU right away, before anything else runs on the device
    output = output.cpu()

    if llama_config.tractable_init:
        expected_output = tractable_computation(input_ids[:2], positions[:2],
                                                llama_config).cpu()

        assert torch.allclose(output, expected_output)

    # Returned on both branches so the result matches the annotation;
    # `output` is already on CPU, so no second `.cpu()` is needed.
    return output
def test_toy_llama():
    """Compare model outputs with and without piecewise compilation.

    A 12-layer randomly initialised model is run uncompiled, compiled as a
    single piece, and compiled with the graph split at ``silly.attention``;
    the compilation counters are checked for each run and all three outputs
    must be close. A 2-layer tractable model is run in the same three modes
    (outside the counter checks) so ``run_model`` can verify it against the
    closed-form result.
    """
    # compare output with and without piecewise compilation
    llama_config = LlamaConfig(hidden_size=128,
                               mlp_size=256,
                               vocab_size=128,
                               num_layers=12)
    tractable_config = LlamaConfig(hidden_size=128,
                                   mlp_size=256,
                                   vocab_size=128,
                                   num_layers=2,
                                   tractable_init=True)
    num_layers = llama_config.num_layers

    # (run_model keyword arguments, counter deltas expected for that run)
    scenarios = [
        (
            {"use_compile": False},
            {
                "num_graphs_seen": 0,
                "num_piecewise_graphs_seen": 0,
                "num_piecewise_capturable_graphs_seen": 0,
                "num_inductor_compilations": 0,
                "num_cudagraph_caputured": 0,
            },
        ),
        (
            {"use_compile": True},
            {
                # one graph for the model
                "num_graphs_seen": 1,
                "num_piecewise_graphs_seen": 1,
                "num_piecewise_capturable_graphs_seen": 1,
                # num_piecewise_capturable_graphs_seen
                "num_inductor_compilations": 1,
                # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
                "num_cudagraph_caputured": 2,
            },
        ),
        (
            {"use_compile": True, "split_attn": True},
            {
                # one graph for the model
                "num_graphs_seen": 1,
                # 2 * num_layers + 1
                "num_piecewise_graphs_seen": 2 * num_layers + 1,
                # 1 + num_layers
                "num_piecewise_capturable_graphs_seen": 1 + num_layers,
                # num_piecewise_capturable_graphs_seen
                "num_inductor_compilations": 1 + num_layers,
                # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
                "num_cudagraph_caputured": 2 * (1 + num_layers),
            },
        ),
    ]

    outputs = []
    for run_kwargs, expected_counts in scenarios:
        with compilation_counter.expect(**expected_counts):
            outputs.append(run_model(llama_config, **run_kwargs))
        # not counted: the expected deltas above describe one model only
        run_model(tractable_config, **run_kwargs)

    reference = outputs[0]
    for candidate in outputs[1:]:
        assert torch.allclose(reference, candidate)
@torch.inference_mode
def benchmark():
    """Time the toy model per batch size and print a table.

    Two passes are made. With `piecewise` False the model is captured
    manually into one `torch.cuda.CUDAGraph` per batch size and both the
    graph replay ("full cudagraph") and a direct model call ("eager mode")
    are timed. With `piecewise` True the model is built with
    `use_cudagraph=True` and split at `silly.attention`, and the direct
    model call is timed ("piecewise cudagraph").
    """
    # imported here, so triton is only needed when the benchmark is run
    from triton.testing import do_bench

    # similar to llama 3.1-8B
    # (immediately replaced by the tiny config below)
    llama_config = LlamaConfig(hidden_size=4096,
                               mlp_size=14336,
                               vocab_size=128 * 1024,
                               num_layers=32)

    # a tiny model to measure the overhead
    # of piecewise cudagraph
    llama_config = LlamaConfig(hidden_size=40,
                               mlp_size=80,
                               vocab_size=128,
                               num_layers=2)

    # 1, 2, 4, then every multiple of 8 up to 256
    cudagraph_sizes = [1, 2, 4] + [i * 8 for i in range(1, 33)]

    # batch size -> runtime as returned by do_bench
    eager_time = {}
    full_cudagraph_time = {}
    piecewise_cudagraph_time = {}

    # one pool handle passed to every manual graph capture below
    pool = torch.cuda.graph_pool_handle()

    for piecewise in [False, True]:
        if piecewise:
            compilation_config = CompilationConfig(
                level=CompilationLevel.PIECEWISE,
                use_cudagraph=True,
                splitting_ops=["silly.attention"],
                cudagraph_capture_sizes=cudagraph_sizes,
            )
        else:
            compilation_config = CompilationConfig(
                level=CompilationLevel.PIECEWISE,
                cudagraph_capture_sizes=cudagraph_sizes,
            )

        vllm_config = VllmConfig(compilation_config=compilation_config)
        with set_current_vllm_config(vllm_config):
            model = LlamaModel(config=llama_config,
                               vllm_config=vllm_config,
                               prefix="").eval().cuda().to(torch.bfloat16)

        B = 256  # max batch size
        input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda()
        positions = torch.arange(B).cuda().to(torch.bfloat16)

        # batch size -> (callable or captured graph, output of that run)
        graphs = {}

        # first call with the full batch, before any capture
        model(input_ids, positions)
        # largest batch size first
        for b in cudagraph_sizes[::-1]:
            if not piecewise:
                graph = torch.cuda.CUDAGraph()
                with torch.cuda.graph(graph, pool=pool):
                    output = model(input_ids[:b], positions[:b])
                graphs[b] = (graph, output)
            else:
                output = model(input_ids[:b], positions[:b])
                graphs[b] = (model, output)
        for b in cudagraph_sizes:
            if piecewise:
                # noqa is for `Function definition does not bind loop variable`
                # it will be problematic if we save the created lambda function
                # and use it later, because it will look up the name `b` in the
                # enclosing scope, and the value of `b` will always be 256.
                # it is fine here, because we only use the lambda function once.
                runtime = do_bench(lambda: graphs[b][0]  # noqa
                                   (input_ids[:b], positions[:b]))  # noqa
                piecewise_cudagraph_time[b] = runtime
            else:
                runtime = do_bench(lambda: graphs[b][0].replay())  # noqa
                eager_runtime = do_bench(
                    lambda: model(input_ids[:b], positions[:b]))  # noqa
                full_cudagraph_time[b] = runtime
                eager_time[b] = eager_runtime

    # print in tabular format
    print("batch size\teager mode\tfull cudagraph\tpiecewise cudagraph")
    for b in cudagraph_sizes:
        print(f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}"
              f"\t{piecewise_cudagraph_time[b]:.3f}")
# Running this file as a script (rather than importing it) runs the benchmark.
if __name__ == "__main__":
    benchmark()
tests/compile/test_basic_correctness.py
0 → 100644
View file @
4d3a2c28
import
dataclasses
from
typing
import
Dict
,
List
,
Optional
import
pytest
import
os
from
vllm.config
import
CompilationLevel
from
vllm.utils
import
cuda_device_count_stateless
from
..utils
import
compare_all_settings
,
models_path_prefix
@dataclasses.dataclass
class TestSetting:
    """One model/parallelism/backend combination for the correctness test."""
    # The class name starts with "Test" and the dataclass generates an
    # __init__, so pytest would try to collect it and warn; opt out.
    # (Un-annotated, so this is a plain class attribute, not a field.)
    __test__ = False

    # model path handed to the comparison helper
    model: str
    # extra command-line arguments for this model
    model_args: List[str]
    # passed as "-pp" / "-tp"; their product must equal the visible GPU count
    pp_size: int
    tp_size: int
    # value written to the VLLM_ATTENTION_BACKEND environment variable
    attn_backend: str
    # comparison method name forwarded to the comparison helper
    method: str
    # False adds VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE=0 for some levels
    fullgraph: bool
# representative settings for testing
# (each entry is one parametrized case of test_compile_correctness; a case
# only runs when pp_size * tp_size equals the number of visible CUDA devices)
test_settings = [
    # basic llama model
    TestSetting(
        model=os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B"),
        model_args=[],
        pp_size=2,
        tp_size=2,
        attn_backend="FLASHINFER",
        method="generate",
        fullgraph=True,
    ),
    # llama model with quantization
    TestSetting(
        model=os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"),
        model_args=["--quantization", "gptq"],
        pp_size=1,
        tp_size=1,
        attn_backend="FLASH_ATTN",
        method="generate",
        fullgraph=True,
    ),
    # MoE model
    TestSetting(
        model=os.path.join(models_path_prefix, "ibm/PowerMoE-3b"),
        model_args=[],
        pp_size=1,
        tp_size=2,
        attn_backend="FLASH_ATTN",
        method="generate",
        fullgraph=True,
    ),
    # embedding model
    TestSetting(
        model=os.path.join(models_path_prefix, "BAAI/bge-multilingual-gemma2"),
        model_args=["--task", "embed"],
        pp_size=1,
        tp_size=1,
        attn_backend="FLASHINFER",
        method="encode",
        fullgraph=True,
    ),
    # encoder-based embedding model (BERT)
    TestSetting(
        model=os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5"),
        model_args=["--task", "embed"],
        pp_size=1,
        tp_size=1,
        attn_backend="XFORMERS",
        method="encode",
        fullgraph=True,
    ),
    # vision language model
    # (the only entry with fullgraph=False)
    TestSetting(
        model=os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct"),
        model_args=["--trust-remote-code", "--max-model-len", "2048"],
        pp_size=2,
        tp_size=1,
        attn_backend="FLASH_ATTN",
        method="generate_with_image",
        fullgraph=False,
    ),
]
# we cannot afford testing the full Cartesian product
# of all models and all levels
@pytest.mark.parametrize("test_setting", test_settings)
def test_compile_correctness(test_setting: TestSetting):
    """Compare outputs of one model across compilation levels.

    Two comparisons are made: NO_COMPILATION vs. PIECEWISE (outputs only
    need to be close for "generate"), then NO_COMPILATION vs. DYNAMO_AS_IS
    vs. DYNAMO_ONCE with the setting's own method. The test is skipped
    unless the visible CUDA device count equals ``pp_size * tp_size``.
    """
    # this test is run under multiple suites, with different GPUs.
    # make sure we only run the test with correct CUDA devices.
    # don't use "<", as it will duplicate the tests.
    model = test_setting.model
    model_args = test_setting.model_args
    pp_size = test_setting.pp_size
    tp_size = test_setting.tp_size
    attn_backend = test_setting.attn_backend
    method = test_setting.method
    fullgraph = test_setting.fullgraph
    if cuda_device_count_stateless() != pp_size * tp_size:
        pytest.skip("Not correct CUDA devices for the test.")

    # Select the attention backend for this case only: remember the previous
    # value and put it back afterwards so it does not leak into other tests
    # running in the same process.
    previous_backend = os.environ.get("VLLM_ATTENTION_BACKEND")
    os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend
    try:
        final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \
                    ["-tp", str(tp_size)]

        all_args: List[List[str]] = []
        all_envs: List[Optional[Dict[str, str]]] = []

        for level in [
                CompilationLevel.NO_COMPILATION,
                CompilationLevel.PIECEWISE,
        ]:
            all_args.append(final_args + [f"-O{level}"])
            all_envs.append({})

        # inductor will change the output, so we only compare if the output
        # is close, not exactly the same.
        compare_all_settings(
            model,
            all_args,
            all_envs,
            method=method if method != "generate" else "generate_close")
        all_envs.clear()
        all_args.clear()

        for level in [
                CompilationLevel.NO_COMPILATION,
                CompilationLevel.DYNAMO_AS_IS,
                CompilationLevel.DYNAMO_ONCE,
        ]:
            all_args.append(final_args + [f"-O{level}"])
            all_envs.append({})
            if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
                # "DYNAMO_ONCE" will always use fullgraph
                all_envs[-1][
                    "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0"  # type: ignore

        # NOTE(review): `all_args` is repeated 3x here while `all_envs` is
        # not; how the two lists are paired is decided inside
        # `compare_all_settings` - confirm this is intended.
        compare_all_settings(model, all_args * 3, all_envs, method=method)
    finally:
        if previous_backend is None:
            os.environ.pop("VLLM_ATTENTION_BACKEND", None)
        else:
            os.environ["VLLM_ATTENTION_BACKEND"] = previous_backend
tests/compile/test_full_graph.py
View file @
4d3a2c28
import
pytest
from
vllm.co
mpilation.backends
import
vllm_backend
from
vllm.co
nfig
import
CompilationLevel
from
..utils
import
fork_new_process_for_each_test
from
.utils
import
TEST_MODELS
,
check_full_graph_support
@
pytest
.
mark
.
parametrize
(
"model_info"
,
TEST_MODELS
)
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"eager"
,
vllm_backend
])
def
test_full_graph
(
model_info
,
backend
):
@
pytest
.
mark
.
parametrize
(
"optimization_level"
,
[
CompilationLevel
.
DYNAMO_ONCE
,
CompilationLevel
.
PIECEWISE
])
@
fork_new_process_for_each_test
def
test_full_graph
(
model_info
,
optimization_level
):
model
=
model_info
[
0
]
model_kwargs
=
model_info
[
1
]
check_full_graph_support
(
model
,
model_kwargs
,
backend
,
tp_size
=
1
)
check_full_graph_support
(
model
,
model_kwargs
,
optimization_level
,
tp_size
=
1
)
tests/compile/test_full_graph_multi_gpu.py
deleted
100644 → 0
View file @
92ec5d8e
import
pytest
from
vllm.compilation.backends
import
vllm_backend
from
vllm.utils
import
cuda_device_count_stateless
from
..utils
import
fork_new_process_for_each_test
from
.utils
import
TEST_MODELS_SMOKE
,
check_full_graph_support
@pytest.mark.parametrize("model_info", TEST_MODELS_SMOKE)
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("backend", ["eager", vllm_backend])
@fork_new_process_for_each_test
def test_full_graph_multi_gpu(model_info, tp_size, backend):
    """Check full-graph support for one smoke model with tensor parallelism.

    ``model_info`` is indexed as ``(model, model_kwargs)``. The test is
    skipped when fewer than ``tp_size`` CUDA devices are available.
    """
    model, model_kwargs = model_info[0], model_info[1]

    # Skip the test if there are not enough CUDA devices.
    available_devices = cuda_device_count_stateless()
    if available_devices < tp_size:
        pytest.skip("Not enough CUDA devices for the test.")

    check_full_graph_support(model, model_kwargs, backend, tp_size=tp_size)
tests/compile/test_full_graph_smoke.py
deleted
100644 → 0
View file @
92ec5d8e
import
pytest
from
vllm.compilation.backends
import
vllm_backend
from
.utils
import
TEST_MODELS_SMOKE
,
check_full_graph_support
@pytest.mark.parametrize("model_info", TEST_MODELS_SMOKE)
@pytest.mark.parametrize("backend", ["eager", vllm_backend])
def test_full_graph(model_info, backend):
    """Check full-graph support for one smoke model on a single GPU.

    ``model_info`` is indexed as ``(model, model_kwargs)``.
    """
    model, model_kwargs = model_info[0], model_info[1]
    check_full_graph_support(model, model_kwargs, backend, tp_size=1)
tests/compile/test_functionalization.py
0 → 100644
View file @
4d3a2c28
import
os
import
pytest
import
torch
import
vllm.envs
as
envs
from
vllm
import
LLM
,
SamplingParams
from
vllm.compilation.fix_functionalization
import
FixFunctionalizationPass
from
vllm.compilation.fusion
import
(
FUSED_OPS
,
FusionPass
,
QuantKey
,
kFp8DynamicTokenSym
,
kFp8StaticTensorSym
)
from
vllm.compilation.fx_utils
import
find_auto_fn
,
find_auto_fn_maybe
,
is_func
from
vllm.compilation.reshapes
import
RedundantReshapesPass
from
vllm.config
import
CompilationConfig
from
.backend
import
TestBackend
from
..utils
import
models_path_prefix
# Custom ops the test looks for in the compiled graph regardless of whether
# fusion is enabled.
OPS_IN_MODEL = [
    torch.ops._C.rotary_embedding.default,
    torch.ops._C.fused_add_rms_norm.default,
    torch.ops._C.silu_and_mul.default,
]

# Plain RMS norm op; looked for only when fusion is disabled.
RMS_OP = torch.ops._C.rms_norm.default

# NOTE(review): not referenced by the test below, which looks the fused ops
# up in FUSED_OPS instead - confirm whether this is still needed.
RMS_QUANT_OPS = {
    "static_fp8": [
        torch.ops._C.rms_norm_static_fp8_quant.default,
        torch.ops._C.fused_add_rms_norm_static_fp8_quant.default
    ],
}

# Prompts generated for both compiled variants of the model.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
@pytest.mark.parametrize(
    "model, quant_key",
    [(os.path.join(models_path_prefix, "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"), kFp8StaticTensorSym),
     (os.path.join(models_path_prefix, "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8_DYNAMIC-e2e"),
      kFp8DynamicTokenSym)])
@pytest.mark.parametrize("do_fusion", [True, False])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE != "cuda",
                    reason="Only test on CUDA")
def test_fix_functionalization(model: str, quant_key: QuantKey,
                               do_fusion: bool):
    """Check that FixFunctionalizationPass de-functionalizes the custom ops.

    The same model is compiled twice, with and without the pass. Both must
    generate identical text; without the pass every expected op is found by
    ``find_auto_fn``, with the pass none is, and each op must instead
    satisfy ``is_func`` for some node of the post-pass graph.
    """
    torch.set_default_device("cuda")

    config = CompilationConfig.PassConfig(enable_fusion=do_fusion,
                                          enable_reshape=True)
    reshape_pass = RedundantReshapesPass(config)
    fusion_pass = FusionPass.instance(config)

    passes = [reshape_pass, fusion_pass] if do_fusion else [reshape_pass]
    func_pass = FixFunctionalizationPass(config)
    backend_func = TestBackend(*passes, func_pass)
    backend_no_func = TestBackend(*passes)

    # instantiate a full engine and manually compile the model 2x
    # (with and without FixFunctionalizationPass)
    llm = LLM(model=model, enforce_eager=True)
    model_runner = llm.llm_engine.model_executor.driver_worker.model_runner
    orig_model = model_runner.model
    # TODO mark inputs dynamic? (currently torch.compile is triggered 4x)
    # Can only do that by using the decorator but then we'd have to instantiate
    # 2 LLM instances.

    sampling_params = SamplingParams(temperature=0.0, top_p=1.0)
    model_runner.model = torch.compile(orig_model,
                                       fullgraph=True,
                                       backend=backend_func)
    gen_func = llm.generate(prompts, sampling_params)

    model_runner.model = torch.compile(orig_model,
                                       fullgraph=True,
                                       backend=backend_no_func)
    gen_no_func = llm.generate(prompts, sampling_params)

    for output_func, output_no_func in zip(gen_func, gen_no_func):
        assert output_func.outputs[0].text == output_no_func.outputs[0].text

    # OPS_IN_MODEL always appear. RMS_OP is fused away if we run fusion,
    # and replaced by the fused quantized ops looked up in FUSED_OPS.
    rms_ops = [FUSED_OPS[(quant_key, True)], FUSED_OPS[(quant_key, False)]
               ] if do_fusion else [RMS_OP]
    ops = OPS_IN_MODEL + rms_ops

    for op in ops:
        find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
        assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes,
                                  op) is None  # noqa: E501

    # make sure the ops were all de-functionalized
    # (collect the ops that were seen, then report the missing ones by name
    # instead of failing with a KeyError on the first absent op)
    found = set()
    for node in backend_func.graph_post_pass.nodes:
        for op in ops:
            if is_func(node, op):
                found.add(op)
    missing = [op for op in ops if op not in found]
    assert not missing, f"ops not de-functionalized: {missing}"
tests/compile/test_fusion.py
0 → 100644
View file @
4d3a2c28
import
pytest
import
torch
from
compressed_tensors.quantization
import
FP8_DTYPE
import
vllm.envs
as
envs
from
vllm.compilation.fusion
import
(
FUSED_OPS
,
QUANT_OPS
,
FusedRMSQuantKey
,
FusionPass
,
QuantKey
)
from
vllm.compilation.fx_utils
import
find_auto_fn
,
find_auto_fn_maybe
from
vllm.compilation.reshapes
import
RedundantReshapesPass
from
vllm.config
import
CompilationConfig
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.quantization.utils.w8a8_utils
import
(
apply_fp8_linear
)
from
.backend
import
TestBackend
class TestModel(torch.nn.Module):
    """Three RMSNorm layers interleaved with two fp8 linear layers.

    `static` selects whether the two linear layers get an input scale
    tensor (True) or None (False).
    """

    def __init__(self, hidden_size: int, eps: float, static: bool, *args,
                 **kwargs):
        super().__init__(*args, **kwargs)
        # kept in plain Python lists, not nn.ModuleList
        self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)]
        # weight scales for the two linear layers
        self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(2)]
        # input scales: random tensors when static, otherwise None
        if static:
            self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(2)]
        else:
            self.scale = [None for _ in range(2)]
        # square fp8 weights, transposed
        self.w = [
            torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t()
            for _ in range(2)
        ]

    def forward(self, x):
        resid = torch.sqrt(x)
        # first norm is called without a residual
        y = self.norm[0](x)

        x2 = apply_fp8_linear(y,
                              self.w[0],
                              self.wscale[0],
                              self.scale[0],
                              use_per_token_if_dynamic=True)
        # make sure resid is used for replacement to work
        y2, resid = self.norm[1](x2, resid)

        x3 = apply_fp8_linear(y2,
                              self.w[1],
                              self.wscale[1],
                              self.scale[1],
                              use_per_token_if_dynamic=True)
        y3, resid = self.norm[2](x3, resid)  # use resid here
        return y3
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("hidden_size", [64, 3392, 4096])
@pytest.mark.parametrize("num_tokens", [7, 256, 533, 2048, 2049])
@pytest.mark.parametrize("eps", [1e-5, 1e-6])
@pytest.mark.parametrize("static", [True, False])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE != "cuda",
                    reason="Only test on CUDA")
def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static):
    """Check that the fusion pass replaces RMSNorm + fp8 quant.

    The eager and compiled outputs of ``TestModel`` must be close, the
    pre-pass graph must contain the fp8 quant op and no fused op, and the
    post-pass graph must contain both fused ops and no fp8 quant op.
    """
    torch.set_default_device("cuda")
    torch.set_default_dtype(dtype)
    torch.manual_seed(1)

    # Reshape pass is needed for the fusion pass to work
    pass_config = CompilationConfig.PassConfig(enable_fusion=True,
                                               enable_reshape=True)
    backend = TestBackend(RedundantReshapesPass(pass_config),
                          FusionPass.instance(pass_config))
    model = TestModel(hidden_size, eps, static)

    # First dimension dynamic
    x = torch.rand(num_tokens, hidden_size)
    torch._dynamo.mark_dynamic(x, 0)

    eager_result = model(x)
    compiled_result = torch.compile(model, backend=backend)(x)

    # Higher tol for dynamic, even higher for bfloat16
    if static:
        tolerance = 1e-3
    elif dtype == torch.float16:
        tolerance = 2e-3
    else:
        tolerance = 1e-2

    torch.testing.assert_close(eager_result,
                               compiled_result,
                               atol=tolerance,
                               rtol=tolerance)

    # Check substitution worked
    pre_nodes = backend.graph_pre_pass.nodes
    post_nodes = backend.graph_post_pass.nodes

    # static is per-tensor, dynamic is per-token
    key = QuantKey(dtype=FP8_DTYPE,
                   static=static,
                   per_tensor=static,
                   symmetric=True)
    fp8_quant = QUANT_OPS[key]
    # order: rms_quant (no residual add), then add_rms_quant
    fused_ops = [FUSED_OPS[FusedRMSQuantKey(key, fused_add)]
                 for fused_add in (False, True)]

    # In pre-nodes, fp8 quant should be present and fused kernels should not
    for fused_op in fused_ops:
        assert find_auto_fn_maybe(pre_nodes, fused_op) is None
    find_auto_fn(pre_nodes, fp8_quant)

    # In post-nodes, fused kernels should be present and fp8 quant should not
    for fused_op in fused_ops:
        find_auto_fn(post_nodes, fused_op)
    assert find_auto_fn_maybe(post_nodes, fp8_quant) is None
tests/compile/test_pass_manager.py
0 → 100644
View file @
4d3a2c28
import
pickle
import
pytest
import
torch
from
torch._inductor.codecache
import
BypassFxGraphCache
from
vllm.compilation.config
import
CompilationConfig
from
vllm.compilation.inductor_pass
import
(
CallableInductorPass
,
as_inductor_pass
)
from
vllm.compilation.pass_manager
import
PostGradPassManager
def simple_callable(graph: torch.fx.Graph):
    """Undecorated no-op pass: ignores ``graph`` and returns ``None``."""
    return None
@
as_inductor_pass
(
files
=
(
__file__
,
))
def
callable_decorated
(
graph
:
torch
.
fx
.
Graph
):
pass
@
pytest
.
mark
.
parametrize
(
"works, callable"
,
[(
False
,
simple_callable
),
(
True
,
callable_decorated
),
(
True
,
CallableInductorPass
(
simple_callable
,
"simple_callable"
))])
def
test_pass_manager
(
works
:
bool
,
callable
):
config
=
CompilationConfig
().
pass_config
pass_manager
=
PostGradPassManager
([
callable
])
pass_manager
.
configure
(
config
)
# Adds default passes
if
works
:
pickle
.
dumps
(
pass_manager
)
else
:
with
pytest
.
raises
(
BypassFxGraphCache
):
pickle
.
dumps
(
pass_manager
)
tests/compile/test_wrapper.py
View file @
4d3a2c28
...
...
@@ -3,6 +3,7 @@ from typing import Optional
import
torch
from
vllm.compilation.wrapper
import
TorchCompileWrapperWithCustomDispatcher
from
vllm.config
import
CompilationLevel
class
MyMod
(
torch
.
nn
.
Module
):
...
...
@@ -18,7 +19,8 @@ class MyWrapper(TorchCompileWrapperWithCustomDispatcher):
def
__init__
(
self
,
model
):
self
.
model
=
model
compiled_callable
=
torch
.
compile
(
self
.
forward
,
backend
=
"eager"
)
super
().
__init__
(
compiled_callable
)
super
().
__init__
(
compiled_callable
,
compilation_level
=
CompilationLevel
.
DYNAMO_ONCE
)
def
forward
(
self
,
x
:
torch
.
Tensor
,
cache
:
Optional
[
torch
.
Tensor
]
=
None
):
# this is the function to be compiled
...
...
tests/compile/utils.py
View file @
4d3a2c28
...
...
@@ -4,18 +4,11 @@ import torch
from
tests.quantization.utils
import
is_quant_method_supported
from
vllm
import
LLM
,
SamplingParams
from
vllm.
plugins
import
set_torch_compile_backend
from
vllm.
util
s
import
is_hip
from
vllm.
config
import
CompilationLevel
from
vllm.
platform
s
import
current_platform
import
os
from
..utils
import
models_path_prefix
TEST_MODELS_SMOKE
=
[
(
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples"
),
{
"quantization"
:
"compressed-tensors"
}),
(
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Meta-Llama-3-8B"
),
{}),
]
TEST_MODELS
=
[
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
),
{}),
(
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
),
{
...
...
@@ -32,13 +25,12 @@ TEST_MODELS = [
(
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Meta-Llama-3-8B"
),
{}),
]
# TODO: enable in pytorch 2.5
if
False
and
is_quant_method_supported
(
"aqlm"
):
# noqa: SIM223
if
is_quant_method_supported
(
"aqlm"
):
TEST_MODELS
.
append
((
os
.
path
.
join
(
models_path_prefix
,
"ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"
),
{
"quantization"
:
"aqlm"
}))
# TODO:
enable in pytorch 2.5
# TODO:
figure out why this fails.
if
False
and
is_quant_method_supported
(
"gguf"
):
# noqa: SIM223
TEST_MODELS
.
append
((
os
.
path
.
join
(
models_path_prefix
,
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
),
{
"quantization"
:
"gguf"
...
...
@@ -64,25 +56,26 @@ if is_quant_method_supported("gptq"):
# "quantization": "marlin"
# }))
if
not
is_hip
()
and
is_quant_method_supported
(
"awq"
):
if
not
current_platform
.
is_rocm
()
and
is_quant_method_supported
(
"awq"
):
TEST_MODELS
.
append
((
os
.
path
.
join
(
models_path_prefix
,
"TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ"
),
{
"quantization"
:
"AWQ"
}))
def
check_full_graph_support
(
model
,
model_kwargs
,
backend
,
tp_size
=
1
):
def
check_full_graph_support
(
model
,
model_kwargs
,
optimization_level
,
tp_size
=
1
):
# make sure these models can be captured in full graph mode
if
"VLLM_TEST_DYNAMO_GRAPH_CAPTURE"
not
in
os
.
environ
:
os
.
environ
[
"VLLM_TEST_DYNAMO_GRAPH_CAPTURE"
]
=
"1"
os
.
environ
[
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"
]
=
"1"
# Inductor doesn't support fp8/gptq_marlin_24 yet.
quantization
=
model_kwargs
.
get
(
"quantization"
)
if
(
quantization
==
"fp8"
or
quantization
==
"gptq_marlin"
or
quantization
==
"gptq_marlin_24"
)
and
backend
!=
"eager"
:
# The base meta llama uses too much memory.
if
(
model
==
"meta-llama/Meta-Llama-3-8B"
and
optimization_level
>=
CompilationLevel
.
PIECEWISE
):
return
set_torch_compile_backend
(
backend
)
print
(
f
"MODEL=
{
model
}
"
)
prompts
=
[
"Hello, my name is"
,
...
...
@@ -95,6 +88,7 @@ def check_full_graph_support(model, model_kwargs, backend, tp_size=1):
enforce_eager
=
True
,
tensor_parallel_size
=
tp_size
,
disable_custom_all_reduce
=
True
,
compilation_config
=
optimization_level
,
**
model_kwargs
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
...
...
tests/conftest.py
View file @
4d3a2c28
import
contextlib
import
gc
import
json
import
os
import
sys
import
tempfile
from
collections
import
UserList
from
enum
import
Enum
...
...
@@ -27,18 +24,19 @@ from tests.models.utils import (TokensTextLogprobs,
from
vllm
import
LLM
,
SamplingParams
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.video
import
VideoAsset
from
vllm.config
import
TokenizerPoolConfig
from
vllm.config
import
TaskOption
,
TokenizerPoolConfig
from
vllm.connections
import
global_http_connection
from
vllm.distributed
import
(
destroy_distributed_environment
,
destroy_model_parallel
,
from
vllm.distributed
import
(
cleanup_dist_env_and_memory
,
init_distributed_environment
,
initialize_model_parallel
)
from
vllm.inputs
import
(
ExplicitEncoderDecoderPrompt
,
TextPrompt
,
to_enc_dec_tuple_list
,
zip_enc_dec_prompts
)
from
vllm.logger
import
init_logger
from
vllm.outputs
import
RequestOutput
from
vllm.platforms
import
current_platform
from
vllm.sampling_params
import
BeamSearchParams
from
vllm.utils
import
(
STR_DTYPE_TO_TORCH_DTYPE
,
cuda_device_count_stateless
,
identity
,
is_cpu
)
identity
)
from
.utils
import
models_path_prefix
logger
=
init_logger
(
__name__
)
...
...
@@ -47,14 +45,16 @@ _TEST_DIR = os.path.dirname(__file__)
_TEST_PROMPTS
=
[
os
.
path
.
join
(
_TEST_DIR
,
"prompts"
,
"example.txt"
)]
_LONG_PROMPTS
=
[
os
.
path
.
join
(
_TEST_DIR
,
"prompts"
,
"summary.txt"
)]
PromptImageInput
=
Union
[
List
[
Image
.
Image
],
List
[
List
[
Image
.
Image
]]]
PromptAudioInput
=
Union
[
List
[
Tuple
[
np
.
ndarray
,
int
]],
List
[
List
[
Tuple
[
np
.
ndarray
,
int
]]]]
PromptVideoInput
=
Union
[
List
[
np
.
ndarray
],
List
[
List
[
np
.
ndarray
]]]
_M
=
TypeVar
(
"_M"
)
_PromptMultiModalInput
=
Union
[
List
[
_M
],
List
[
List
[
_M
]]]
PromptImageInput
=
_PromptMultiModalInput
[
Image
.
Image
]
PromptAudioInput
=
_PromptMultiModalInput
[
Tuple
[
np
.
ndarray
,
int
]]
PromptVideoInput
=
_PromptMultiModalInput
[
np
.
ndarray
]
def
_read_prompts
(
filename
:
str
)
->
List
[
str
]:
with
open
(
filename
,
"r"
)
as
f
:
with
open
(
filename
)
as
f
:
prompts
=
f
.
readlines
()
return
prompts
...
...
@@ -64,13 +64,7 @@ class _ImageAssetPrompts(TypedDict):
cherry_blossom
:
str
if
sys
.
version_info
<
(
3
,
9
):
# UserList cannot be subscripted
class
_ImageAssetsBase
(
UserList
):
pass
else
:
class
_ImageAssetsBase
(
UserList
[
ImageAsset
]):
class
_ImageAssetsBase
(
UserList
[
ImageAsset
]):
pass
...
...
@@ -96,13 +90,7 @@ class _VideoAssetPrompts(TypedDict):
sample_demo_1
:
str
if
sys
.
version_info
<
(
3
,
9
):
# UserList cannot be subscripted
class
_VideoAssetsBase
(
UserList
):
pass
else
:
class
_VideoAssetsBase
(
UserList
[
VideoAsset
]):
class
_VideoAssetsBase
(
UserList
[
VideoAsset
]):
pass
...
...
@@ -123,6 +111,23 @@ VIDEO_ASSETS = _VideoAssets()
"""Singleton instance of :class:`_VideoAssets`."""
@
pytest
.
fixture
(
params
=
[
True
,
False
])
def
run_with_both_engines
(
request
,
monkeypatch
):
# Automatically runs tests twice, once with V1 and once without
use_v1
=
request
.
param
# Tests decorated with `@skip_v1` are only run without v1
skip_v1
=
request
.
node
.
get_closest_marker
(
"skip_v1"
)
if
use_v1
:
if
skip_v1
:
pytest
.
skip
(
"Skipping test on vllm V1"
)
monkeypatch
.
setenv
(
'VLLM_USE_V1'
,
'1'
)
else
:
monkeypatch
.
setenv
(
'VLLM_USE_V1'
,
'0'
)
yield
@
pytest
.
fixture
(
autouse
=
True
)
def
init_test_http_connection
():
# pytest_asyncio may use a different event loop per test
...
...
@@ -142,17 +147,7 @@ def dist_init():
)
initialize_model_parallel
(
1
,
1
)
yield
cleanup
()
def
cleanup
():
destroy_model_parallel
()
destroy_distributed_environment
()
with
contextlib
.
suppress
(
AssertionError
):
torch
.
distributed
.
destroy_process_group
()
gc
.
collect
()
if
not
is_cpu
():
torch
.
cuda
.
empty_cache
()
cleanup_dist_env_and_memory
()
@
pytest
.
fixture
()
...
...
@@ -169,7 +164,7 @@ def should_do_global_cleanup_after_test(request) -> bool:
def
cleanup_fixture
(
should_do_global_cleanup_after_test
:
bool
):
yield
if
should_do_global_cleanup_after_test
:
cleanup
()
cleanup
_dist_env_and_memory
()
@
pytest
.
fixture
(
autouse
=
True
)
...
...
@@ -244,22 +239,25 @@ def video_assets() -> _VideoAssets:
return
VIDEO_ASSETS
_T
=
TypeVar
(
"_T"
,
nn
.
Module
,
torch
.
Tensor
,
BatchEncoding
,
BatchFeature
)
_T
=
TypeVar
(
"_T"
,
nn
.
Module
,
torch
.
Tensor
,
BatchEncoding
,
BatchFeature
,
dict
)
class
HfRunner
:
def
wrap_device
(
self
,
input
:
_T
)
->
_T
:
if
not
is_cpu
():
# Check if the input is already on the GPU
if
hasattr
(
input
,
'device'
)
and
input
.
device
.
type
==
"cuda"
:
return
input
# Already on GPU, no need to move
return
input
.
to
(
"cuda"
)
else
:
# Check if the input is already on the CPU
if
hasattr
(
input
,
'device'
)
and
input
.
device
.
type
==
"cpu"
:
return
input
# Already on CPU, no need to move
return
input
.
to
(
"cpu"
)
def
wrap_device
(
self
,
x
:
_T
,
device
:
Optional
[
str
]
=
None
)
->
_T
:
if
x
is
None
or
isinstance
(
x
,
(
bool
,
)):
return
x
if
device
is
None
:
device
=
"cpu"
if
current_platform
.
is_cpu
()
else
"cuda"
if
isinstance
(
x
,
dict
):
return
{
k
:
self
.
wrap_device
(
v
,
device
)
for
k
,
v
in
x
.
items
()}
if
hasattr
(
x
,
"device"
)
and
x
.
device
.
type
==
device
:
return
x
return
x
.
to
(
device
)
def
__init__
(
self
,
...
...
@@ -267,23 +265,33 @@ class HfRunner:
dtype
:
str
=
"half"
,
*
,
model_kwargs
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
is_embedding_model
:
bool
=
False
,
is_sentence_transformer
:
bool
=
False
,
is_cross_encoder
:
bool
=
False
,
skip_tokenizer_init
:
bool
=
False
,
auto_cls
:
Type
[
_BaseAutoModelClass
]
=
AutoModelForCausalLM
,
postprocess_inputs
:
Callable
[[
BatchEncoding
],
BatchEncoding
]
=
identity
,
postprocess_inputs
:
Callable
[...,
BatchEncoding
]
=
identity
,
)
->
None
:
torch_dtype
=
STR_DTYPE_TO_TORCH_DTYPE
[
dtype
]
self
.
model_name
=
model_name
if
is_
embedding_model
:
if
is_
sentence_transformer
:
# Lazy init required for AMD CI
from
sentence_transformers
import
SentenceTransformer
self
.
model
=
self
.
wrap_device
(
SentenceTransformer
(
model_name
,
device
=
"cpu"
,
trust_remote_code
=
True
,
).
to
(
dtype
=
torch_dtype
))
elif
is_cross_encoder
:
# Lazy init required for AMD CI
from
sentence_transformers
import
CrossEncoder
self
.
model
=
CrossEncoder
(
model_name
,
device
=
"cpu"
,
trust_remote_code
=
True
)
self
.
model
.
model
=
self
.
wrap_device
(
self
.
model
.
model
)
\
.
to
(
dtype
=
torch_dtype
)
else
:
model_kwargs
=
model_kwargs
if
model_kwargs
is
not
None
else
{}
self
.
model
=
self
.
wrap_device
(
...
...
@@ -294,6 +302,7 @@ class HfRunner:
**
model_kwargs
,
))
if
not
skip_tokenizer_init
:
self
.
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
torch_dtype
=
torch_dtype
,
...
...
@@ -308,35 +317,78 @@ class HfRunner:
torch_dtype
=
torch_dtype
,
trust_remote_code
=
True
,
)
if
skip_tokenizer_init
:
self
.
tokenizer
=
self
.
processor
.
tokenizer
self
.
dtype
=
dtype
self
.
postprocess_inputs
=
postprocess_inputs
def
ge
nerate
(
def
ge
t_inputs
(
self
,
prompts
:
List
[
str
],
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
List
[
np
.
ndarray
]
]
=
None
,
**
kwargs
:
Any
,
)
->
List
[
Tuple
[
List
[
List
[
int
]],
List
[
str
]]
]:
if
images
:
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
)
->
List
[
BatchEncoding
]:
if
images
is
not
None
:
assert
len
(
prompts
)
==
len
(
images
)
outputs
:
List
[
Tuple
[
List
[
List
[
int
]],
List
[
str
]]]
=
[]
if
videos
is
not
None
:
assert
len
(
prompts
)
==
len
(
videos
)
if
audios
is
not
None
:
assert
len
(
prompts
)
==
len
(
audios
)
all_inputs
:
List
[
BatchEncoding
]
=
[]
for
i
,
prompt
in
enumerate
(
prompts
):
processor_kwargs
:
Dict
[
str
,
Any
]
=
{
"text"
:
prompt
,
"return_tensors"
:
"pt"
,
}
if
images
is
not
None
and
images
[
i
]
is
not
None
:
processor_kwargs
[
"images"
]
=
images
[
i
]
if
videos
is
not
None
and
videos
[
i
]
is
not
None
:
processor_kwargs
[
"videos"
]
=
videos
[
i
]
if
images
is
not
None
and
(
image
:
=
images
[
i
])
is
not
None
:
processor_kwargs
[
"images"
]
=
image
if
videos
is
not
None
and
(
video
:
=
videos
[
i
])
is
not
None
:
processor_kwargs
[
"videos"
]
=
video
if
audios
is
not
None
and
(
audio_tuple
:
=
audios
[
i
])
is
not
None
:
audio
,
sr
=
audio_tuple
processor_kwargs
[
"audio"
]
=
audio
processor_kwargs
[
"sampling_rate"
]
=
sr
inputs
=
self
.
processor
(
**
processor_kwargs
)
inputs
=
self
.
postprocess_inputs
(
inputs
)
inputs
=
self
.
postprocess_inputs
(
inputs
,
dtype
=
self
.
dtype
)
all_inputs
.
append
(
inputs
)
return
all_inputs
def
classify
(
self
,
prompts
:
List
[
str
])
->
List
[
str
]:
# output is final logits
all_inputs
=
self
.
get_inputs
(
prompts
)
outputs
=
[]
for
inputs
in
all_inputs
:
output
=
self
.
model
(
**
self
.
wrap_device
(
inputs
))
logits
=
output
.
logits
.
softmax
(
dim
=-
1
)[
0
].
tolist
()
outputs
.
append
(
logits
)
return
outputs
def
generate
(
self
,
prompts
:
List
[
str
],
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
**
kwargs
:
Any
,
)
->
List
[
Tuple
[
List
[
List
[
int
]],
List
[
str
]]]:
all_inputs
=
self
.
get_inputs
(
prompts
,
images
=
images
,
videos
=
videos
,
audios
=
audios
)
outputs
:
List
[
Tuple
[
List
[
List
[
int
]],
List
[
str
]]]
=
[]
for
inputs
in
all_inputs
:
output_ids
=
self
.
model
.
generate
(
**
self
.
wrap_device
(
inputs
),
**
self
.
wrap_device
(
inputs
,
device
=
self
.
model
.
device
.
type
),
use_cache
=
True
,
**
kwargs
,
)
...
...
@@ -354,12 +406,16 @@ class HfRunner:
prompts
:
List
[
str
],
max_tokens
:
int
,
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
**
kwargs
:
Any
,
)
->
List
[
Tuple
[
List
[
int
],
str
]]:
outputs
=
self
.
generate
(
prompts
,
do_sample
=
False
,
max_new_tokens
=
max_tokens
,
images
=
images
,
videos
=
videos
,
audios
=
audios
,
**
kwargs
)
return
[(
output_ids
[
0
],
output_str
[
0
])
...
...
@@ -391,25 +447,19 @@ class HfRunner:
prompts
:
List
[
str
],
max_tokens
:
int
,
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
List
[
np
.
ndarray
]]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
**
kwargs
:
Any
,
)
->
List
[
List
[
torch
.
Tensor
]]:
all_logprobs
:
List
[
List
[
torch
.
Tensor
]]
=
[]
for
i
,
prompt
in
enumerate
(
prompts
):
processor_kwargs
:
Dict
[
str
,
Any
]
=
{
"text"
:
prompt
,
"return_tensors"
:
"pt"
,
}
if
images
is
not
None
and
images
[
i
]
is
not
None
:
processor_kwargs
[
"images"
]
=
images
[
i
]
if
videos
is
not
None
and
videos
[
i
]
is
not
None
:
processor_kwargs
[
"videos"
]
=
videos
[
i
]
inputs
=
self
.
processor
(
**
processor_kwargs
)
inputs
=
self
.
postprocess_inputs
(
inputs
)
all_inputs
=
self
.
get_inputs
(
prompts
,
images
=
images
,
videos
=
videos
,
audios
=
audios
)
all_logprobs
:
List
[
List
[
torch
.
Tensor
]]
=
[]
for
inputs
in
all_inputs
:
output
=
self
.
model
.
generate
(
**
self
.
wrap_device
(
inputs
),
**
self
.
wrap_device
(
inputs
,
device
=
self
.
model
.
device
.
type
),
use_cache
=
True
,
do_sample
=
False
,
max_new_tokens
=
max_tokens
,
...
...
@@ -417,40 +467,39 @@ class HfRunner:
return_dict_in_generate
=
True
,
**
kwargs
,
)
seq_logprobs
:
List
[
torch
.
Tensor
]
=
[]
for
hidden_states
in
output
.
hidden_states
:
last_hidden_states
=
hidden_states
[
-
1
][
0
]
logits
=
torch
.
matmul
(
last_hidden_states
,
self
.
model
.
get_output_embeddings
().
weight
.
t
(),
)
if
self
.
model
.
get_output_embeddings
().
bias
is
not
None
:
logits
+=
self
.
model
.
get_output_embeddings
(
).
bias
.
unsqueeze
(
0
)
logprobs
=
F
.
log_softmax
(
logits
,
dim
=-
1
,
dtype
=
torch
.
float32
)
seq_logprobs
.
append
(
logprobs
)
seq_logprobs
=
self
.
_hidden_states_to_seq_logprobs
(
output
.
hidden_states
)
all_logprobs
.
append
(
seq_logprobs
)
return
all_logprobs
def
_hidden_states_to_logprobs
(
def
_hidden_states_to_
seq_
logprobs
(
self
,
hidden_states
,
num_logprobs
,
)
->
Tuple
[
List
[
Dict
[
int
,
float
]],
int
]:
hidden_states
:
Tuple
[
Tuple
[
torch
.
Tensor
,
...],
...],
)
->
List
[
torch
.
Tensor
]:
output_embeddings
=
self
.
model
.
get_output_embeddings
()
seq_logprobs
:
List
[
torch
.
Tensor
]
=
[]
output_len
=
len
(
hidden_states
)
for
_
,
hidden_state
in
enumerate
(
hidden_states
):
last_hidden_states
=
hidden_state
[
-
1
][
0
]
logits
=
torch
.
matmul
(
last_hidden_states
,
self
.
model
.
get_
output_embeddings
()
.
weight
.
t
(),
last_hidden_states
.
to
(
output_embeddings
.
weight
.
device
)
,
output_embeddings
.
weight
.
t
(),
)
if
getattr
(
self
.
model
.
get_output_embeddings
(),
"bias"
,
None
)
is
not
None
:
logits
+=
self
.
model
.
get_output_embeddings
().
bias
.
unsqueeze
(
0
)
if
getattr
(
output_embeddings
,
"bias"
,
None
)
is
not
None
:
logits
+=
output_embeddings
.
bias
.
unsqueeze
(
0
)
logprobs
=
F
.
log_softmax
(
logits
,
dim
=-
1
,
dtype
=
torch
.
float32
)
seq_logprobs
.
append
(
logprobs
)
return
seq_logprobs
def
_hidden_states_to_logprobs
(
self
,
hidden_states
:
Tuple
[
Tuple
[
torch
.
Tensor
,
...],
...],
num_logprobs
:
int
,
)
->
Tuple
[
List
[
Dict
[
int
,
float
]],
int
]:
seq_logprobs
=
self
.
_hidden_states_to_seq_logprobs
(
hidden_states
)
output_len
=
len
(
hidden_states
)
# convert to dict
seq_logprobs_lst
:
List
[
Dict
[
int
,
float
]]
=
[]
for
tok_idx
,
tok_logprobs
in
enumerate
(
seq_logprobs
):
...
...
@@ -477,33 +526,21 @@ class HfRunner:
num_logprobs
:
int
,
images
:
Optional
[
PromptImageInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
videos
:
Optional
[
List
[
np
.
ndarray
]
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
**
kwargs
:
Any
,
)
->
List
[
TokensTextLogprobs
]:
all_inputs
=
self
.
get_inputs
(
prompts
,
images
=
images
,
videos
=
videos
,
audios
=
audios
)
all_logprobs
:
List
[
List
[
Dict
[
int
,
float
]]]
=
[]
all_output_ids
:
List
[
List
[
int
]]
=
[]
all_output_strs
:
List
[
str
]
=
[]
for
i
,
prompt
in
enumerate
(
prompts
):
processor_kwargs
:
Dict
[
str
,
Any
]
=
{
"text"
:
prompt
,
"return_tensors"
:
"pt"
,
}
if
images
is
not
None
and
images
[
i
]
is
not
None
:
processor_kwargs
[
"images"
]
=
images
[
i
]
if
audios
is
not
None
:
audio
,
sr
=
audios
[
i
]
processor_kwargs
[
"audio"
]
=
audio
processor_kwargs
[
"sampling_rate"
]
=
sr
if
videos
is
not
None
:
processor_kwargs
[
"videos"
]
=
videos
[
i
]
inputs
=
self
.
processor
(
**
processor_kwargs
)
inputs
=
self
.
postprocess_inputs
(
inputs
)
for
inputs
in
all_inputs
:
output
=
self
.
model
.
generate
(
**
self
.
wrap_device
(
inputs
),
**
self
.
wrap_device
(
inputs
,
device
=
self
.
model
.
device
.
type
),
use_cache
=
True
,
do_sample
=
False
,
max_new_tokens
=
max_tokens
,
...
...
@@ -534,6 +571,7 @@ class HfRunner:
encoder_decoder_prompts
:
List
[
ExplicitEncoderDecoderPrompt
[
str
,
str
]],
max_tokens
:
int
,
num_logprobs
:
int
,
images
:
Optional
[
PromptImageInput
]
=
None
,
**
kwargs
:
Any
,
)
->
List
[
TokensTextLogprobs
]:
'''
...
...
@@ -544,14 +582,28 @@ class HfRunner:
all_output_ids
:
List
[
List
[
int
]]
=
[]
all_output_strs
:
List
[
str
]
=
[]
for
(
encoder_prompt
,
decoder_prompt
)
in
to_enc_dec_tuple_list
(
encoder_decoder_prompts
):
for
i
,
(
encoder_prompt
,
decoder_prompt
)
in
enumerate
(
to_enc_dec_tuple_list
(
encoder_decoder_prompts
)):
processor_kwargs
:
Dict
[
str
,
Any
]
=
{
"text"
:
encoder_prompt
,
"return_tensors"
:
"pt"
,
}
if
images
is
not
None
and
images
[
i
]
is
not
None
:
processor_kwargs
[
"images"
]
=
images
[
i
]
encoder_input_ids
=
self
.
wrap_device
(
self
.
tokenizer
(
encoder_prompt
,
return_tensors
=
"pt"
).
input_ids
)
decoder_input_ids
=
(
None
if
decoder_prompt
is
None
else
self
.
wrap_device
(
self
.
processor
(
**
processor_kwargs
).
input_ids
,
device
=
self
.
model
.
device
.
type
,
)
if
decoder_prompt
is
None
:
decoder_input_ids
=
None
else
:
decoder_input_ids
=
self
.
wrap_device
(
self
.
tokenizer
(
decoder_prompt
,
return_tensors
=
"pt"
).
input_ids
))
return_tensors
=
"pt"
).
input_ids
,
device
=
self
.
model
.
device
.
type
,
)
output
=
self
.
model
.
generate
(
encoder_input_ids
,
...
...
@@ -583,12 +635,15 @@ class HfRunner:
def
encode
(
self
,
prompts
:
List
[
str
])
->
List
[
List
[
torch
.
Tensor
]]:
return
self
.
model
.
encode
(
prompts
)
def
predict
(
self
,
prompts
:
List
[
List
[
str
]])
->
torch
.
Tensor
:
return
self
.
model
.
predict
(
prompts
,
convert_to_tensor
=
True
)
def
__enter__
(
self
):
return
self
def
__exit__
(
self
,
exc_type
,
exc_value
,
traceback
):
del
self
.
model
cleanup
()
cleanup
_dist_env_and_memory
()
@
pytest
.
fixture
(
scope
=
"session"
)
...
...
@@ -601,7 +656,9 @@ class VllmRunner:
def
__init__
(
self
,
model_name
:
str
,
task
:
TaskOption
=
"auto"
,
tokenizer_name
:
Optional
[
str
]
=
None
,
tokenizer_mode
:
str
=
"auto"
,
# Use smaller max model length, otherwise bigger model cannot run due
# to kv cache size limit.
max_model_len
:
int
=
1024
,
...
...
@@ -616,7 +673,9 @@ class VllmRunner:
)
->
None
:
self
.
model
=
LLM
(
model
=
model_name
,
task
=
task
,
tokenizer
=
tokenizer_name
,
tokenizer_mode
=
tokenizer_mode
,
trust_remote_code
=
True
,
dtype
=
dtype
,
swap_space
=
swap_space
,
...
...
@@ -629,20 +688,53 @@ class VllmRunner:
**
kwargs
,
)
def
ge
nerate
(
def
ge
t_inputs
(
self
,
prompts
:
List
[
str
],
sampling_params
:
SamplingParams
,
images
:
Optional
[
PromptImageInput
]
=
None
,
)
->
List
[
Tuple
[
List
[
List
[
int
]],
List
[
str
]]]:
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
)
->
List
[
TextPrompt
]:
if
images
is
not
None
:
assert
len
(
prompts
)
==
len
(
images
)
if
videos
is
not
None
:
assert
len
(
prompts
)
==
len
(
videos
)
if
audios
is
not
None
:
assert
len
(
prompts
)
==
len
(
audios
)
inputs
=
[
TextPrompt
(
prompt
=
prompt
)
for
prompt
in
prompts
]
if
images
is
not
None
:
for
i
,
image
in
enumerate
(
images
):
if
image
is
not
None
:
inputs
[
i
][
"multi_modal_data"
]
=
{
"image"
:
image
}
if
videos
is
not
None
:
for
i
,
video
in
enumerate
(
videos
):
if
video
is
not
None
:
inputs
[
i
][
"multi_modal_data"
]
=
{
"video"
:
video
}
if
audios
is
not
None
:
for
i
,
audio
in
enumerate
(
audios
):
if
audio
is
not
None
:
inputs
[
i
][
"multi_modal_data"
]
=
{
"audio"
:
audio
}
return
inputs
def
generate
(
self
,
prompts
:
List
[
str
],
sampling_params
:
SamplingParams
,
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
)
->
List
[
Tuple
[
List
[
List
[
int
]],
List
[
str
]]]:
inputs
=
self
.
get_inputs
(
prompts
,
images
=
images
,
videos
=
videos
,
audios
=
audios
)
req_outputs
=
self
.
model
.
generate
(
inputs
,
sampling_params
=
sampling_params
)
...
...
@@ -684,25 +776,10 @@ class VllmRunner:
videos
:
Optional
[
PromptVideoInput
]
=
None
,
)
->
Union
[
List
[
TokensTextLogprobs
],
List
[
TokensTextLogprobsPromptLogprobs
]]:
if
images
is
not
None
:
assert
len
(
prompts
)
==
len
(
images
)
if
videos
is
not
None
:
assert
len
(
prompts
)
==
len
(
videos
)
inputs
=
[
TextPrompt
(
prompt
=
prompt
)
for
prompt
in
prompts
]
if
images
is
not
None
:
for
i
,
image
in
enumerate
(
images
):
inputs
[
i
][
"multi_modal_data"
]
=
{
"image"
:
image
}
if
audios
is
not
None
:
for
i
,
audio
in
enumerate
(
audios
):
inputs
[
i
][
"multi_modal_data"
]
=
{
"audio"
:
audio
}
if
videos
is
not
None
:
for
i
,
video
in
enumerate
(
videos
):
inputs
[
i
][
"multi_modal_data"
]
=
{
"video"
:
video
}
print
(
f
"[INPUTS!!!!]:
{
inputs
}
,
{
sampling_params
}
"
)
inputs
=
self
.
get_inputs
(
prompts
,
images
=
images
,
videos
=
videos
,
audios
=
audios
)
req_outputs
=
self
.
model
.
generate
(
inputs
,
sampling_params
=
sampling_params
)
...
...
@@ -739,9 +816,15 @@ class VllmRunner:
prompts
:
List
[
str
],
max_tokens
:
int
,
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
)
->
List
[
Tuple
[
List
[
int
],
str
]]:
greedy_params
=
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
max_tokens
)
outputs
=
self
.
generate
(
prompts
,
greedy_params
,
images
=
images
)
outputs
=
self
.
generate
(
prompts
,
greedy_params
,
images
=
images
,
videos
=
videos
,
audios
=
audios
)
return
[(
output_ids
[
0
],
output_str
[
0
])
for
output_ids
,
output_str
in
outputs
]
...
...
@@ -755,6 +838,7 @@ class VllmRunner:
audios
:
Optional
[
PromptAudioInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
stop_token_ids
:
Optional
[
List
[
int
]]
=
None
,
stop
:
Optional
[
List
[
str
]]
=
None
,
)
->
Union
[
List
[
TokensTextLogprobs
],
List
[
TokensTextLogprobsPromptLogprobs
]]:
greedy_logprobs_params
=
SamplingParams
(
...
...
@@ -762,7 +846,8 @@ class VllmRunner:
max_tokens
=
max_tokens
,
logprobs
=
num_logprobs
,
prompt_logprobs
=
num_prompt_logprobs
,
stop_token_ids
=
stop_token_ids
)
stop_token_ids
=
stop_token_ids
,
stop
=
stop
)
return
self
.
generate_w_logprobs
(
prompts
,
greedy_logprobs_params
,
...
...
@@ -780,7 +865,6 @@ class VllmRunner:
List
[
TokensTextLogprobsPromptLogprobs
]]:
greedy_logprobs_params
=
SamplingParams
(
temperature
=
0.0
,
use_beam_search
=
False
,
max_tokens
=
max_tokens
,
logprobs
=
num_logprobs
,
prompt_logprobs
=
(
num_prompt_logprobs
),
...
...
@@ -793,25 +877,14 @@ class VllmRunner:
encoder_decoder_prompts
,
greedy_logprobs_params
)
def
generate_beam_search
(
self
,
prompts
:
List
[
str
],
beam_width
:
int
,
max_tokens
:
int
,
)
->
List
[
Tuple
[
List
[
List
[
int
]],
List
[
str
]]]:
beam_search_params
=
SamplingParams
(
n
=
beam_width
,
use_beam_search
=
True
,
temperature
=
0.0
,
max_tokens
=
max_tokens
)
outputs
=
self
.
generate
(
prompts
,
beam_search_params
)
return
outputs
def
generate_beam_search_new
(
self
,
prompts
:
Union
[
List
[
str
],
List
[
List
[
int
]]],
beam_width
:
int
,
max_tokens
:
int
,
)
->
List
[
Tuple
[
List
[
List
[
int
]],
List
[
str
]]]:
outputs
=
self
.
model
.
beam_search
(
prompts
,
beam_width
,
max_tokens
)
outputs
=
self
.
model
.
beam_search
(
prompts
,
BeamSearchParams
(
beam_width
=
beam_width
,
max_tokens
=
max_tokens
))
returned_outputs
=
[]
for
output
in
outputs
:
token_ids
=
[
x
.
tokens
for
x
in
output
.
sequences
]
...
...
@@ -819,20 +892,39 @@ class VllmRunner:
returned_outputs
.
append
((
token_ids
,
texts
))
return
returned_outputs
def
encode
(
self
,
prompts
:
List
[
str
])
->
List
[
List
[
float
]]:
req_outputs
=
self
.
model
.
encode
(
prompts
)
outputs
=
[]
for
req_output
in
req_outputs
:
embedding
=
req_output
.
outputs
.
embedding
outputs
.
append
(
embedding
)
return
outputs
def
classify
(
self
,
prompts
:
List
[
str
])
->
List
[
List
[
float
]]:
req_outputs
=
self
.
model
.
classify
(
prompts
)
return
[
req_output
.
outputs
.
probs
for
req_output
in
req_outputs
]
def
encode
(
self
,
prompts
:
List
[
str
],
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
)
->
List
[
List
[
float
]]:
inputs
=
self
.
get_inputs
(
prompts
,
images
=
images
,
videos
=
videos
,
audios
=
audios
)
req_outputs
=
self
.
model
.
embed
(
inputs
)
return
[
req_output
.
outputs
.
embedding
for
req_output
in
req_outputs
]
def
score
(
self
,
text_1
:
Union
[
str
,
List
[
str
]],
text_2
:
Union
[
str
,
List
[
str
]],
)
->
List
[
float
]:
req_outputs
=
self
.
model
.
score
(
text_1
,
text_2
)
return
[
req_output
.
outputs
.
score
for
req_output
in
req_outputs
]
def
__enter__
(
self
):
return
self
def
__exit__
(
self
,
exc_type
,
exc_value
,
traceback
):
del
self
.
model
cleanup
()
cleanup
_dist_env_and_memory
()
@
pytest
.
fixture
(
scope
=
"session"
)
...
...
@@ -879,27 +971,30 @@ def num_gpus_available():
# temp_dir = tempfile.gettempdir()
# _dummy_path = os.path.join(temp_dir, "dummy_opt")
_dummy_path
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
_dummy_opt_path
=
os
.
path
.
join
(
models_path_prefix
,
"dummy_opt"
)
_dummy_llava_path
=
os
.
path
.
join
(
models_path_prefix
,
"dummy_llava"
)
_dummy_gemma2_embedding_path
=
os
.
path
.
join
(
models_path_prefix
,
"dummy_gemma2_embedding"
)
@
pytest
.
fixture
def
dummy_opt_path
():
json_path
=
os
.
path
.
join
(
_dummy_path
,
"config.json"
)
if
not
os
.
path
.
exists
(
_dummy_path
):
json_path
=
os
.
path
.
join
(
_dummy_
opt_
path
,
"config.json"
)
if
not
os
.
path
.
exists
(
_dummy_
opt_
path
):
snapshot_download
(
repo_id
=
"facebook/opt-125m"
,
local_dir
=
_dummy_path
,
local_dir
=
_dummy_
opt_
path
,
ignore_patterns
=
[
"*.bin"
,
"*.bin.index.json"
,
"*.pt"
,
"*.h5"
,
"*.msgpack"
])
assert
os
.
path
.
exists
(
json_path
)
with
open
(
json_path
,
"r"
)
as
f
:
with
open
(
json_path
)
as
f
:
config
=
json
.
load
(
f
)
config
[
"architectures"
]
=
[
"MyOPTForCausalLM"
]
with
open
(
json_path
,
"w"
)
as
f
:
json
.
dump
(
config
,
f
)
return
_dummy_path
return
_dummy_opt_path
# 定义一个 pytest 钩子,在测试后生成报告
@
pytest
.
hookimpl
(
tryfirst
=
True
,
hookwrapper
=
True
)
...
...
@@ -918,3 +1013,60 @@ def pytest_runtest_makereport(item, call):
# 如果测试结果有 extra 属性,则添加截图
if
hasattr
(
result
,
"extra"
):
result
.
extra
.
append
(
pytest_html
.
extras
.
image
(
screenshot_path
))
@
pytest
.
fixture
def
dummy_llava_path
():
json_path
=
os
.
path
.
join
(
_dummy_llava_path
,
"config.json"
)
if
not
os
.
path
.
exists
(
_dummy_llava_path
):
snapshot_download
(
repo_id
=
"llava-hf/llava-1.5-7b-hf"
,
local_dir
=
_dummy_llava_path
,
ignore_patterns
=
[
"*.bin"
,
"*.bin.index.json"
,
"*.pt"
,
"*.h5"
,
"*.msgpack"
])
assert
os
.
path
.
exists
(
json_path
)
with
open
(
json_path
)
as
f
:
config
=
json
.
load
(
f
)
config
[
"architectures"
]
=
[
"MyLlava"
]
with
open
(
json_path
,
"w"
)
as
f
:
json
.
dump
(
config
,
f
)
return
_dummy_llava_path
@
pytest
.
fixture
def
dummy_gemma2_embedding_path
():
json_path
=
os
.
path
.
join
(
_dummy_gemma2_embedding_path
,
"config.json"
)
if
not
os
.
path
.
exists
(
_dummy_gemma2_embedding_path
):
snapshot_download
(
repo_id
=
"BAAI/bge-multilingual-gemma2"
,
local_dir
=
_dummy_gemma2_embedding_path
,
ignore_patterns
=
[
"*.bin"
,
"*.bin.index.json"
,
"*.pt"
,
"*.h5"
,
"*.msgpack"
])
assert
os
.
path
.
exists
(
json_path
)
with
open
(
json_path
)
as
f
:
config
=
json
.
load
(
f
)
config
[
"architectures"
]
=
[
"MyGemma2Embedding"
]
with
open
(
json_path
,
"w"
)
as
f
:
json
.
dump
(
config
,
f
)
return
_dummy_gemma2_embedding_path
# Add the flag `--optional` to allow run tests
# that are marked with @pytest.mark.optional
def
pytest_addoption
(
parser
):
parser
.
addoption
(
"--optional"
,
action
=
"store_true"
,
default
=
False
,
help
=
"run optional test"
)
def
pytest_collection_modifyitems
(
config
,
items
):
if
config
.
getoption
(
"--optional"
):
# --optional given in cli: do not skip optional tests
return
skip_optional
=
pytest
.
mark
.
skip
(
reason
=
"need --optional option to run"
)
for
item
in
items
:
if
"optional"
in
item
.
keywords
:
item
.
add_marker
(
skip_optional
)
tests/core/block/e2e/conftest.py
View file @
4d3a2c28
...
...
@@ -3,10 +3,9 @@ from typing import Callable, Iterable, Optional
import
pytest
from
vllm
import
LLM
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.model_executor.utils
import
set_random_seed
from
....conftest
import
cleanup
@
pytest
.
fixture
def
baseline_llm_generator
(
common_llm_kwargs
,
per_test_common_llm_kwargs
,
...
...
@@ -37,7 +36,7 @@ def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
yield
llm
del
llm
cleanup
()
cleanup
_dist_env_and_memory
()
for
llm
in
generator_inner
():
yield
llm
...
...
tests/core/block/e2e/test_correctness.py
View file @
4d3a2c28
...
...
@@ -23,32 +23,32 @@ from ....utils import models_path_prefix
"num_gpu_blocks_override"
:
5
*
(
64
+
1
),
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{
"use_v2_block_manager"
:
False
}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
"use_v2_block_manager"
:
True
,
"preemption_mode"
:
"swap"
},
{
"use_v2_block_manager"
:
True
,
"preemption_mode"
:
"recompute"
}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
10
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_
v1_v2_greedy_equality
_with_preemption
(
baseline_llm_generator
,
def
test_
block_manager
_with_preemption
(
baseline_llm_generator
,
test_llm_generator
,
batch_size
):
"""Verify block manager v2 produces same outputs as block manager v1, even
when there is preemption.
"""Verify block manager produces same outputs even when there is preemption.
This constructs two LLM, each with limited number of GPU blocks. The limit
is decided such that as the sequences in the batch grow, sequences must be
preempted and removed from cache.
If the output token ids are equivalent, then we have confidence that the KV
cache is not corrupted
in the v2 block manager
.
cache is not corrupted.
NOTE: We want a significant number of generated tokens so that any incorrect
KV mapping has time to build up error.
NOTE(Kuntai): Though we have removed block manager v1, this test is still
useful as it asserts the behavior of block manager v2 (now it is called
SelfAttnBlockSpaceManager) is the same when swapping / preemption, so we
keep this test.
"""
output_len
=
1024
temperature
=
0.0
...
...
@@ -72,78 +72,9 @@ def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,
temperature
=
temperature
,
)
print
(
'Getting token ids from block manager v1'
)
baseline_token_ids
=
get_token_ids_from_llm_generator
(
baseline_llm_generator
,
prompts
,
sampling_params
)
print
(
'Getting token ids from block manager v2'
)
test_token_ids
=
get_token_ids_from_llm_generator
(
test_llm_generator
,
prompts
,
sampling_params
)
for
expected_token_ids
,
actual_token_ids
in
zip
(
baseline_token_ids
,
test_token_ids
):
assert
expected_token_ids
==
actual_token_ids
assert
baseline_token_ids
==
test_token_ids
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
# Use a small model for a fast test.
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
),
# skip cuda graph creation for fast test.
"enforce_eager"
:
True
,
# Use a large block size to trigger more copy-on-writes.
"block_size"
:
32
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{
"use_v2_block_manager"
:
False
}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
"use_v2_block_manager"
:
True
,
"preemption_mode"
:
"swap"
},
{
"use_v2_block_manager"
:
True
,
"preemption_mode"
:
"recompute"
}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
10
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_v1_v2_greedy_equality_with_cow
(
baseline_llm_generator
,
test_llm_generator
,
batch_size
):
"""Verify beam search equality with block manager v1 and v2.
This requires copy-on-writes; if the v1 and v2 output is the same, then
we have some confidence cow is working.
"""
output_len
=
128
temperature
=
0.0
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
prompts
=
[
prompt
for
prompt
,
_
in
zip
(
cycle
(
prompts
),
range
(
batch_size
))]
sampling_params
=
SamplingParams
(
max_tokens
=
output_len
,
ignore_eos
=
True
,
temperature
=
temperature
,
use_beam_search
=
True
,
best_of
=
2
,
)
print
(
'Getting token ids from block manager v1'
)
baseline_token_ids
=
get_token_ids_from_llm_generator
(
baseline_llm_generator
,
prompts
,
sampling_params
)
print
(
'Getting token ids from block manager v2'
)
test_token_ids
=
get_token_ids_from_llm_generator
(
test_llm_generator
,
prompts
,
sampling_params
)
...
...
@@ -166,9 +97,6 @@ def test_v1_v2_greedy_equality_with_cow(baseline_llm_generator,
# skip cuda graph creation for fast test.
"enforce_eager"
:
True
,
# Lookahead scheduling only supported in v2 block manager.
"use_v2_block_manager"
:
True
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
...
...
@@ -280,26 +208,22 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
"max_num_seqs"
:
10
,
}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[
{
"use_v2_block_manager"
:
False
,
},
{},
])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
{
"use_v2_block_manager"
:
True
,
"num_lookahead_slots"
:
0
,
},
{
"use_v2_block_manager"
:
True
,
"num_lookahead_slots"
:
5
,
},
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
4
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_chunked_prefill_block_manager
_v2
(
baseline_llm_generator
,
def
test_chunked_prefill_block_manager
(
baseline_llm_generator
,
test_llm_generator
,
batch_size
):
"""Verify that chunked prefill works with
BlockManagerV2, with and without
lookahead scheduling.
"""Verify that chunked prefill works with
SelfAttnBlockSpaceManager,
with and without
lookahead scheduling.
"""
output_len
=
32
temperature
=
0.0
...
...
@@ -320,11 +244,11 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
temperature
=
temperature
,
)
print
(
'Getting token ids with BlockManager
V1
'
)
print
(
'Getting token ids with BlockManager'
)
baseline_token_ids
=
get_token_ids_from_llm_generator
(
baseline_llm_generator
,
prompts
,
sampling_params
)
print
(
'Getting token ids with BlockManager
V2
'
)
print
(
'Getting token ids with BlockManager
, with lookahead slots.
'
)
test_token_ids
=
get_token_ids_from_llm_generator
(
test_llm_generator
,
prompts
,
sampling_params
)
...
...
@@ -352,32 +276,32 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
"enable_prefix_caching"
:
True
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{
"use_v2_block_manager"
:
False
}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
"use_v2_block_manager"
:
True
,
"preemption_mode"
:
"swap"
},
{
"use_v2_block_manager"
:
True
,
"preemption_mode"
:
"recompute"
}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
10
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_
v1_v2_greedy_equality
_prefix_caching_enabled_with_preemption
(
def
test_
block_manager
_prefix_caching_enabled_with_preemption
(
baseline_llm_generator
,
test_llm_generator
,
batch_size
):
"""Verify block manager v2 produces same outputs as block manager v1, even
when there is preemption.
"""Verify block manager produces same outputs even when there is preemption.
This constructs two LLM, each with limited number of GPU blocks. The limit
is decided such that as the sequences in the batch grow, sequences must be
preempted and removed from cache.
If the output token ids are equivalent, then we have confidence that the KV
cache is not corrupted
in the v2 block manager
.
cache is not corrupted.
NOTE: We want a significant number of generated tokens so that any incorrect
KV mapping has time to build up error.
NOTE(Kuntai): Though we have removed block manager v1, this test is still
useful as it asserts the behavior of block manager v2 (now it is called
SelfAttnBlockSpaceManager) is the same when swapping / preemption, so we
keep this test.
"""
output_len
=
1024
temperature
=
0.0
...
...
@@ -401,11 +325,11 @@ def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption(
temperature
=
temperature
,
)
print
(
'Getting token ids from block manager
v1
'
)
print
(
'Getting token ids from block manager'
)
baseline_token_ids
=
get_token_ids_from_llm_generator
(
baseline_llm_generator
,
prompts
,
sampling_params
)
print
(
'Getting token ids from block manager
v2
'
)
print
(
'Getting token ids from block manager
, with preemption
'
)
test_token_ids
=
get_token_ids_from_llm_generator
(
test_llm_generator
,
prompts
,
sampling_params
)
...
...
@@ -428,9 +352,6 @@ def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption(
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size"
:
16
,
"num_gpu_blocks_override"
:
5
*
(
64
+
1
),
# Test APC in v2 block
"use_v2_block_manager"
:
True
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{
...
...
@@ -506,9 +427,6 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
"max_model_len"
:
48
,
"block_size"
:
16
,
"num_gpu_blocks_override"
:
3
,
# Test APC in v2 block
"use_v2_block_manager"
:
True
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{
...
...
tests/core/block/e2e/test_correctness_sliding_window.py
View file @
4d3a2c28
...
...
@@ -4,6 +4,7 @@ from typing import List
import
pytest
import
os
from
tests.kernels.utils
import
override_backend_env_variable
from
vllm
import
LLM
,
SamplingParams
from
.conftest
import
get_text_from_llm_generator
...
...
@@ -26,14 +27,13 @@ BLOCK_SIZE = 16
"num_gpu_blocks_override"
:
100000
//
BLOCK_SIZE
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{
"use_v2_block_manager"
:
False
}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
"use_v2_block_manager"
:
True
}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"FLASH_ATTN"
,
"FLASHINFER"
,
"XFORMERS"
])
def
test_sliding_window_retrival
(
baseline_llm_generator
,
test_llm_generator
,
batch_size
,
seed
):
batch_size
,
seed
,
backend
,
monkeypatch
):
"""
The test does a bunch of assignments "x1 = 10
\n
x2 = 33
\n
..." and then
asks for value of one of them (which is outside the sliding window).
...
...
@@ -42,6 +42,8 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
Additionally, we compare the results of the v1 and v2 managers.
"""
override_backend_env_variable
(
monkeypatch
,
backend
)
sampling_params
=
SamplingParams
(
max_tokens
=
1024
,
ignore_eos
=
True
,
...
...
@@ -50,7 +52,6 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
prompts
,
answer
,
indices
=
prep_prompts
(
batch_size
)
print
(
'Getting token ids from block manager v1'
)
baseline_texts
=
get_text_from_llm_generator
(
baseline_llm_generator
,
prompts
,
sampling_params
,
...
...
@@ -86,13 +87,12 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
"num_gpu_blocks_override"
:
100000
//
BLOCK_SIZE
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
"use_v2_block_manager"
:
True
,
"enable_chunked_prefill"
:
True
}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
"enable_chunked_prefill"
:
True
}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_sliding_window_chunked_prefill
(
test_llm_generator
,
batch_size
,
seed
):
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"FLASH_ATTN"
,
"FLASHINFER"
,
"XFORMERS"
])
def
test_sliding_window_chunked_prefill
(
test_llm_generator
,
batch_size
,
seed
,
backend
,
monkeypatch
):
"""
This is similar to test_sliding_window_retrival, however, it doesn't
compare against the v1 block manager since v1 doesn't support
...
...
@@ -101,6 +101,8 @@ def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed):
The results with and without chunked prefill are not the same due to
numerical instabilities.
"""
override_backend_env_variable
(
monkeypatch
,
backend
)
sampling_params
=
SamplingParams
(
max_tokens
=
10
,
ignore_eos
=
True
,
...
...
tests/core/block/test_block_manager
_v2
.py
→
tests/core/block/test_block_manager.py
View file @
4d3a2c28
...
...
@@ -2,7 +2,7 @@ import pytest
from
vllm.core.block.utils
import
(
STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE
,
STR_NOT_IMPL_ENC_DEC_SWA
)
from
vllm.core.block_manager
_v2
import
BlockSpaceManager
V2
from
vllm.core.block_manager
import
SelfAttn
BlockSpaceManager
from
vllm.core.interfaces
import
AllocStatus
from
vllm.sequence
import
Logprob
,
SequenceStatus
from
vllm.utils
import
chunk_list
...
...
@@ -17,7 +17,7 @@ from ..utils import (create_dummy_prompt, create_seq_group,
@
pytest
.
mark
.
parametrize
(
"watermark"
,
[
0.0
,
0.5
])
def
test_can_allocate_seq_group
(
block_size
:
int
,
num_seqs_per_group
:
int
,
num_gpu_blocks
:
int
,
watermark
:
float
):
block_manager
=
BlockSpaceManager
V2
(
block_manager
=
SelfAttn
BlockSpaceManager
(
block_size
=
block_size
,
num_gpu_blocks
=
num_gpu_blocks
,
num_cpu_blocks
=
1024
,
...
...
@@ -63,7 +63,7 @@ def test_can_allocate_seq_group_encoder_decoder(block_size: int,
num_seqs_per_group
:
int
,
num_gpu_blocks
:
int
,
watermark
:
float
):
block_manager
=
BlockSpaceManager
V2
(
block_manager
=
SelfAttn
BlockSpaceManager
(
block_size
=
block_size
,
num_gpu_blocks
=
num_gpu_blocks
,
num_cpu_blocks
=
1024
,
...
...
@@ -117,16 +117,16 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int,
'''
SWA short for Sliding Window Attention.
At time of writing block manager
v2
does not support SWA.
At time of writing block manager does not support SWA.
However even when SWA is implemented for block manager
v2
,
However even when SWA is implemented for block manager,
there will still most likely be a separate workstream required
to enable SWA for encoder/decoder models.
Therefore this test enforces that one of the following cases
hold true:
1. Block manager
v2
does not support SWA at all (true at time of writing)
2. Block manager
v2
fails with NotImplementError when SWA is enabled
1. Block manager does not support SWA at all (true at time of writing)
2. Block manager fails with NotImplementError when SWA is enabled
AND a SequenceGroup with an encoder sequence (i.e. in support of an
encoder/decoder model) is passed into can_allocate() as an argument
...
...
@@ -135,7 +135,7 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int,
'''
with
pytest
.
raises
((
NotImplementedError
,
AssertionError
))
as
exc_info
:
block_manager
=
BlockSpaceManager
V2
(
block_manager
=
SelfAttn
BlockSpaceManager
(
block_size
=
block_size
,
num_gpu_blocks
=
num_gpu_blocks
,
num_cpu_blocks
=
1024
,
...
...
@@ -158,7 +158,7 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int,
block_manager
.
can_allocate
(
seq_group
)
# Assert that either
# 1. Block manager
v2
constructor fails with assertion that sliding window
# 1. Block manager constructor fails with assertion that sliding window
# is not yet supported (most likely near-term outcome at time of
# writing), or
# 2. can_allocate() fails with NotImplementedError due to combination of
...
...
@@ -177,7 +177,7 @@ def test_can_allocate_encoder_decoder_fails_with_prefix_cache(
block_size
:
int
,
num_seqs_per_group
:
int
,
num_gpu_blocks
:
int
,
watermark
:
float
):
block_manager
=
BlockSpaceManager
V2
(
block_manager
=
SelfAttn
BlockSpaceManager
(
block_size
=
block_size
,
num_gpu_blocks
=
num_gpu_blocks
,
num_cpu_blocks
=
1024
,
...
...
@@ -217,7 +217,7 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append,
num_gpu_blocks
=
1024
watermark
=
0.1
block_manager
=
BlockSpaceManager
V2
(
block_manager
=
SelfAttn
BlockSpaceManager
(
block_size
=
block_size
,
num_gpu_blocks
=
num_gpu_blocks
,
num_cpu_blocks
=
0
,
...
...
@@ -269,7 +269,7 @@ def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots,
"""Verify blocks number on src/desc device is correct after swapping in/out
sequence group (not missing or extra blocks).
"""
block_manager
=
BlockSpaceManager
V2
(
block_size
,
block_manager
=
SelfAttn
BlockSpaceManager
(
block_size
,
num_cpu_blocks
,
num_gpu_blocks
,
watermark
=
0
,
...
...
@@ -277,6 +277,7 @@ def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots,
prompt
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
block_size
-
1
)
prompt
.
status
=
SequenceStatus
.
WAITING
block_manager
.
allocate
(
seq_group
)
# Emulate a forward pass by appending a single token.
# The block manager then knows how many unprocessed
# tokens will be written in the next forward pass.
...
...
@@ -321,7 +322,7 @@ def test_can_swap(block_size, num_gpu_blocks, num_lookahead_slots,
can be swapped in/out.
"""
num_cpu_blocks
=
num_gpu_blocks
block_manager
=
BlockSpaceManager
V2
(
block_size
,
block_manager
=
SelfAttn
BlockSpaceManager
(
block_size
,
num_cpu_blocks
,
num_gpu_blocks
,
watermark
=
0
,
...
...
@@ -373,6 +374,52 @@ def test_can_swap(block_size, num_gpu_blocks, num_lookahead_slots,
seq_group
,
num_lookahead_slots
)
==
AllocStatus
.
NEVER
@pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10])
@pytest.mark.parametrize("enable_caching", [False, True])
def test_swap_in_infeasible(num_lookahead_slots, enable_caching):
    """Check that swap-in is refused when the GPU cannot hold the sequence.

    A block manager with a single CPU block and a single GPU block is used.
    After one emulated decode step and a swap-out, ``can_swap_in`` must
    report ``AllocStatus.OK`` only if the prompt tokens, the one unseen token
    and the lookahead slots all fit into the GPU blocks; otherwise it must
    report ``AllocStatus.NEVER``.
    """
    block_size = 8
    num_cpu_blocks = 1
    num_gpu_blocks = 1
    block_manager = SelfAttnBlockSpaceManager(block_size,
                                              num_cpu_blocks,
                                              num_gpu_blocks,
                                              watermark=0,
                                              enable_caching=enable_caching)

    prompt_length = block_size - 3
    assert prompt_length > 0

    prompt, seq_group = create_dummy_prompt("1", prompt_length=prompt_length)
    prompt.status = SequenceStatus.WAITING
    block_manager.allocate(seq_group)

    # Emulate a forward pass by appending a single token, so that the block
    # manager knows how many unprocessed tokens the next pass will write.
    token_id = 0
    prompt.status = SequenceStatus.RUNNING
    prompt.append_token_id(token_id, {token_id: Logprob(0.0)})

    # GPU -> CPU.
    assert block_manager.can_swap_out(seq_group)
    block_manager.swap_out(seq_group)
    prompt.status = SequenceStatus.SWAPPED

    # CPU -> GPU. Exactly one token is unseen. The swap must fail when the
    # existing tokens plus the unseen one and the lookahead slots exceed the
    # capacity of the available GPU blocks.
    num_unseen_tokens = 1
    slots_needed = num_lookahead_slots + num_unseen_tokens + prompt_length
    gpu_capacity = block_size * num_gpu_blocks
    if slots_needed <= gpu_capacity:
        expected_status = AllocStatus.OK
    else:
        expected_status = AllocStatus.NEVER

    assert block_manager.can_swap_in(seq_group,
                                     num_lookahead_slots) == expected_status
# TODO(cade/kaiyang): add comprehensive tests for swapping at allocator level.
...
...
@@ -388,7 +435,7 @@ def test_sliding_window(block_size, prompt_len, num_slots_to_append,
num_gpu_blocks
=
1024
watermark
=
0.1
block_manager
=
BlockSpaceManager
V2
(
block_manager
=
SelfAttn
BlockSpaceManager
(
block_size
=
block_size
,
num_gpu_blocks
=
num_gpu_blocks
,
num_cpu_blocks
=
0
,
...
...
@@ -400,7 +447,6 @@ def test_sliding_window(block_size, prompt_len, num_slots_to_append,
if
max_n
is
None
:
max_n
=
min_n
used
=
num_gpu_blocks
-
block_manager
.
get_num_free_gpu_blocks
()
#print("check", min_n, used, max_n)
assert
min_n
<=
used
assert
used
<=
max_n
...
...
@@ -429,7 +475,7 @@ def test_sliding_window(block_size, prompt_len, num_slots_to_append,
seq
.
data
.
update_num_computed_tokens
(
prompt_len
)
check_used
(
num_blocks
(
prompt_len
))
# this is how we compute it in BlockSpaceManager
V2
.__init__
# this is how we compute it in
SelfAttn
BlockSpaceManager.__init__
sliding_blocks
=
(
sliding_window
//
block_size
)
+
2
# plus one block for null block
sliding_blocks
+=
1
...
...
tests/core/block/test_naive_block.py
View file @
4d3a2c28
...
...
@@ -104,9 +104,9 @@ class TestNaiveBlockAllocator:
@
staticmethod
@
pytest
.
mark
.
parametrize
(
"num_blocks"
,
[
4
])
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
8
])
def
test_naive_block_get_num_blocks_touched
(
num_blocks
,
block_size
):
def
test_naive_block_get_num_
full_
blocks_touched
(
num_blocks
,
block_size
):
""" Verify the allocator can correctly return the number of
blocks touched
, with different lookahead slots
.
full
blocks touched.
"""
allocator_src
=
NaiveBlockAllocator
(
create_block
=
NaiveBlock
,
num_blocks
=
num_blocks
,
...
...
@@ -124,7 +124,7 @@ class TestNaiveBlockAllocator:
src_blocks
=
[
allocate_block
()
for
_
in
range
(
num_blocks
-
1
)]
# All blocks are cached
assert
allocator_dst
.
get_num_blocks_touched
(
assert
allocator_dst
.
get_num_
full_
blocks_touched
(
src_blocks
)
==
num_blocks
-
1
# Insert one non-full block in the src
...
...
@@ -136,9 +136,10 @@ class TestNaiveBlockAllocator:
src_blocks
.
append
(
allocate_non_full_block
())
src_blocks
[
-
1
].
append_token_ids
([
0
])
assert
allocator_dst
.
get_num_blocks_touched
(
src_blocks
,
num_lookahead_slots
=
1
)
==
num_blocks
assert
allocator_dst
.
get_num_blocks_touched
(
src_blocks
,
num_lookahead_slots
=
block_size
-
1
)
==
num_blocks
assert
allocator_dst
.
get_num_blocks_touched
(
src_blocks
,
num_lookahead_slots
=
block_size
)
==
(
num_blocks
+
1
)
assert
allocator_dst
.
get_num_full_blocks_touched
(
src_blocks
)
==
num_blocks
-
1
# Fill up the last source block and then invoke
# get_num_full_blocks_touched
src_blocks
[
-
1
].
append_token_ids
([
0
]
*
(
block_size
-
1
))
assert
allocator_dst
.
get_num_full_blocks_touched
(
src_blocks
)
==
num_blocks
tests/core/block/test_prefix_caching_block.py
View file @
4d3a2c28
...
...
@@ -5,9 +5,14 @@ from unittest.mock import MagicMock
import
pytest
from
tests.core.utils
import
create_dummy_lora_sequence
,
create_dummy_sequence
from
vllm.core.block.cpu_gpu_block_allocator
import
CpuGpuBlockAllocator
from
vllm.core.block.interfaces
import
Block
,
BlockAllocator
from
vllm.core.block.prefix_caching_block
import
(
PrefixCachingBlock
,
from
vllm.core.block.prefix_caching_block
import
(
ComputedBlocksTracker
,
PrefixCachingBlock
,
PrefixCachingBlockAllocator
)
from
vllm.sequence
import
Logprob
from
vllm.utils
import
Device
class
TestPrefixCachingBlock
:
...
...
@@ -99,13 +104,11 @@ class TestPrefixCachingBlock:
token_ids
=
[
random
.
randint
(
0
,
50_000
)
for
_
in
range
(
num_tokens
)]
first_chain
,
second_chain
=
[
TestPrefixCachingBlock
.
create_chain
(
first_chain
,
second_chain
=
(
TestPrefixCachingBlock
.
create_chain
(
block_size
=
block_size
,
token_ids
=
token_ids
,
num_empty_trailing_blocks
=
num_empty_trailing_blocks
)
for
_
in
range
(
2
)
]
for
_
in
range
(
2
))
for
first_chain_block
,
second_chain_block
in
zip
(
first_chain
,
second_chain
):
...
...
@@ -318,11 +321,10 @@ class TestPrefixCachingBlockAllocator:
@
staticmethod
@
pytest
.
mark
.
parametrize
(
"num_blocks"
,
[
4
])
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
8
])
def
test_prefix_caching_block_get_num_blocks_touched
(
def
test_prefix_caching_block_get_num_
full_
blocks_touched
(
num_blocks
,
block_size
):
""" Verify the allocator can correctly return the number of
blocks touched, when there are cached prefixes and different
lookahead slots.
blocks touched, when there are cached prefixes.
"""
allocator_src
=
PrefixCachingBlockAllocator
(
num_blocks
=
num_blocks
,
block_size
=
block_size
)
...
...
@@ -346,28 +348,30 @@ class TestPrefixCachingBlockAllocator:
token_ids
=
token_ids
,
allocator
=
allocator_src
,
)
# All blocks are cached
assert
allocator_dst
.
get_num_blocks_touched
(
blocks_to_swap_in
)
==
0
assert
allocator_dst
.
get_num_full_blocks_touched
(
blocks_to_swap_in
)
==
0
# Free the first block in the dst
allocator_dst
.
free
(
cached_blocks
[
0
])
# Now the first block becomes dangling, the swapped blocks need
# to reclaim the first block in the dst
assert
allocator_dst
.
get_num_blocks_touched
(
blocks_to_swap_in
)
==
1
assert
allocator_dst
.
get_num_full_blocks_touched
(
blocks_to_swap_in
)
==
1
# Insert one non-full block in the src
non_full_block
=
allocator_src
.
allocate_mutable_block
(
blocks_to_swap_in
[
-
1
])
non_full_block
.
append_token_ids
([
0
])
blocks_to_swap_in
.
append
(
non_full_block
)
assert
allocator_dst
.
get_num_blocks_touched
(
blocks_to_swap_in
,
num_lookahead_slots
=
1
)
==
2
assert
allocator_dst
.
get_num_blocks_touched
(
blocks_to_swap_in
,
num_lookahead_slots
=
block_size
-
1
)
==
2
assert
allocator_dst
.
get_num_blocks_touched
(
blocks_to_swap_in
,
num_lookahead_slots
=
block_size
)
==
3
assert
allocator_dst
.
get_num_full_blocks_touched
(
blocks_to_swap_in
)
==
1
# Fill up the last mutable block and invoke get_num_full_blocks_touched.
# Note: The last block is not cached so it will be touched.
non_full_block
.
append_token_ids
([
0
]
*
(
block_size
-
1
))
assert
allocator_dst
.
get_num_full_blocks_touched
(
blocks_to_swap_in
)
==
2
@
staticmethod
@
pytest
.
mark
.
parametrize
(
"num_blocks"
,
[
1024
])
...
...
@@ -727,23 +731,77 @@ class TestPrefixCachingBlockAllocator:
token_ids
=
common_token_ids
,
allocator
=
allocator
,
)
block_
id
s
=
[
block
.
block_id
for
block
in
blocks
]
block_
hashe
s
=
[
block
.
content_hash
for
block
in
blocks
]
# The allocated blocks should be marked as touched
# but not computed.
computed_block_ids
=
allocator
.
get_computed_block_ids
(
[],
block_
ids
,
skip_last_block_id
=
False
)
computed_block_ids
=
allocator
.
find_cached_blocks_prefix
(
block_
hashes
)
assert
len
(
computed_block_ids
)
==
0
allocator
.
mark_blocks_as_computed
([])
computed_block_ids
=
allocator
.
get_computed_block_ids
(
[],
block_
ids
,
skip_last_block_id
=
False
)
computed_block_ids
=
allocator
.
find_cached_blocks_prefix
(
block_
hashes
=
block_hashes
)
assert
len
(
computed_block_ids
)
==
common_blocks
@staticmethod
def test_find_cached_blocks_prefix():
    """Exercise find_cached_blocks_prefix across allocate, free and reuse.

    A first chain fills the whole allocator, is marked computed and then
    freed. A second chain with different tokens is allocated afterwards.
    The number of blocks reported as cached is checked after each step.
    """
    block_size = 4
    num_blocks = 8
    total_test_blocks = 12
    allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                            block_size=block_size)

    all_token_ids = list(range(total_test_blocks * block_size))
    split_at = num_blocks * block_size

    # First sequence: as many blocks as the allocator holds.
    first_tokens = all_token_ids[:split_at]
    first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
        block_size=block_size,
        token_ids=first_tokens,
        allocator=allocator,
    )
    first_hashes = [blk.content_hash for blk in first_chain]
    allocator.mark_blocks_as_computed([])

    # All blocks should be cached.
    cached_first = allocator.find_cached_blocks_prefix(
        block_hashes=first_hashes)
    assert len(cached_first) == num_blocks

    # Free the first sequence.
    for blk in first_chain:
        allocator.free(blk)

    # All blocks should still be cached as long as nothing else has been
    # allocated.
    cached = allocator.find_cached_blocks_prefix(block_hashes=first_hashes)
    assert len(cached) == num_blocks

    # Second sequence: the remaining tokens.
    second_tokens = all_token_ids[split_at:]
    second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
        block_size=block_size,
        token_ids=second_tokens,
        allocator=allocator,
    )
    second_hashes = [blk.content_hash for blk in second_chain]
    allocator.mark_blocks_as_computed([])

    cached = allocator.find_cached_blocks_prefix(block_hashes=second_hashes)
    assert len(cached) == len(second_chain)

    # Half of the blocks from the first sequence should still be cached.
    num_evicted_blocks = len(second_chain)
    cached = allocator.find_cached_blocks_prefix(block_hashes=first_hashes)
    assert len(cached) == len(first_chain) - num_evicted_blocks
@
staticmethod
def
create_immutable_chain
(
block_size
:
int
,
token_ids
:
List
[
int
],
allocator
:
PrefixCachingBlockAllocator
,
extra_hash
:
Optional
[
int
]
=
None
,
)
->
List
[
PrefixCachingBlock
]:
"""Helper method which creates a chain of blocks.
"""
...
...
@@ -759,7 +817,178 @@ class TestPrefixCachingBlockAllocator:
block_size
:(
block_number
+
1
)
*
block_size
]
prev_block
=
allocator
.
allocate_immutable_block
(
prev_block
=
prev_block
,
token_ids
=
block_token_ids
)
prev_block
=
prev_block
,
token_ids
=
block_token_ids
,
extra_hash
=
extra_hash
)
blocks
.
append
(
prev_block
)
return
blocks
class
TestComputedBlocksTracker
:
@
staticmethod
def
_get_mock_allocator
():
return
MagicMock
(
spec
=
PrefixCachingBlockAllocator
)
@
staticmethod
def
test_get_num_cached_tokens
():
"""
Test it correctly computes the number of cached tokens for a given
sequence:
- The cache token count is derived from the number of cached blocks.
- The cache token count is updated when the allocator is updated.
- When a sequence is removed, the cache token count should be updated
accordingly.
# TODO(rickyx): This behaviour for prefill sequence is a hack until
we fix the computed blocks tracking.
- The cache token count for prefill sequence doesn't change while
the sequence is in continuous prefill (chunked prefill).
"""
block_size
=
4
mock_allocator
=
TestComputedBlocksTracker
.
_get_mock_allocator
()
tracker
=
ComputedBlocksTracker
(
allocator
=
mock_allocator
,
block_size
=
block_size
,
enable_caching
=
True
,
)
# Not yet allocated.
tokens
=
[
0
,
1
,
2
,
3
,
4
,
5
]
seq1
=
create_dummy_sequence
(
request_id
=
0
,
token_ids
=
tokens
,
block_size
=
block_size
)
mock_allocator
.
find_cached_blocks_prefix
.
return_value
=
[]
assert
tracker
.
get_num_cached_tokens
(
seq1
)
==
0
mock_allocator
.
find_cached_blocks_prefix
.
return_value
=
[
None
]
# 1 block cached.
# Result is cached for prefill sequence.
assert
tracker
.
get_num_cached_tokens
(
seq1
)
==
0
# Mark the sequence as non-prefill.
seq1
.
data
.
update_num_computed_tokens
(
len
(
tokens
))
# 6 tokens computed.
assert
not
seq1
.
is_prefill
()
# Recomputes for decoding sequence.
assert
tracker
.
get_num_cached_tokens
(
seq1
)
==
4
# Append new tokens to the sequence.
num_new_tokens
=
3
for
i
in
range
(
num_new_tokens
):
seq1
.
append_token_id
(
i
,
{
i
:
Logprob
(
logprob
=
0.0
)})
assert
tracker
.
get_num_cached_tokens
(
seq1
)
==
4
# Update the allocator.
mock_allocator
.
find_cached_blocks_prefix
.
return_value
=
[
None
]
*
2
# 2 blocks cached.
assert
tracker
.
get_num_cached_tokens
(
seq1
)
==
8
# Remove the sequence.
tracker
.
remove_seq
(
seq1
.
seq_id
)
# Re-create the sequence with the same request id to simulate recompute.
seq1
=
create_dummy_sequence
(
request_id
=
0
,
token_ids
=
tokens
,
block_size
=
block_size
)
mock_allocator
.
find_cached_blocks_prefix
.
return_value
=
[
]
# no cached block
assert
tracker
.
get_num_cached_tokens
(
seq1
)
==
0
@
staticmethod
def
test_correct_block_hash
():
"""
Test that the block hash is correctly computed for a sequence (should
match the underlying block allocator's block hash). So the number of
cached tokens is correctly retrieved.
"""
block_size
=
4
allocator
=
CpuGpuBlockAllocator
.
create
(
allocator_type
=
"prefix_caching"
,
num_gpu_blocks
=
16
,
num_cpu_blocks
=
16
,
block_size
=
block_size
,
)
gpu_allocator
=
allocator
.
_allocators
[
Device
.
GPU
]
tracker
=
ComputedBlocksTracker
(
allocator
=
allocator
,
block_size
=
block_size
,
enable_caching
=
True
,
)
tokens
=
list
(
range
(
block_size
*
4
))
# 4 blocks.
seq
=
create_dummy_sequence
(
request_id
=
0
,
token_ids
=
tokens
,
block_size
=
block_size
)
_
=
TestPrefixCachingBlockAllocator
.
create_immutable_chain
(
block_size
=
block_size
,
token_ids
=
tokens
,
allocator
=
gpu_allocator
,
)
allocator
.
mark_blocks_as_computed
([])
assert
tracker
.
get_num_cached_tokens
(
seq
)
==
len
(
tokens
)
@staticmethod
def test_correct_extra_hash():
    """
    Check that the extra hash (the LoRA case) takes part in the block hash:
    only the sequence whose extra hash was used to build the allocator's
    chain sees its tokens as cached; sequences with the same tokens but no
    LoRA, or a different LoRA ID, see none.
    """
    block_size = 4
    token_ids = list(range(block_size * 4))

    cpu_gpu_allocator = CpuGpuBlockAllocator.create(
        allocator_type="prefix_caching",
        num_gpu_blocks=16,
        num_cpu_blocks=16,
        block_size=block_size,
    )
    device_allocator = cpu_gpu_allocator._allocators[Device.GPU]

    computed_tracker = ComputedBlocksTracker(
        allocator=cpu_gpu_allocator,
        block_size=block_size,
        enable_caching=True,
    )

    # The sequence whose extra hash seeds the cached chain (LoRA ID 1).
    cached_lora_seq = create_dummy_lora_sequence(request_id=0,
                                                 token_ids=token_ids,
                                                 block_size=block_size,
                                                 lora_int_id=1)

    _chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
        block_size=block_size,
        token_ids=token_ids,
        allocator=device_allocator,
        extra_hash=cached_lora_seq.extra_hash(),
    )
    cpu_gpu_allocator.mark_blocks_as_computed([])

    # Same token IDs, but either no LoRA or a different LoRA ID.
    plain_seq = create_dummy_sequence(request_id=1,
                                      token_ids=token_ids,
                                      block_size=block_size)
    other_lora_seq = create_dummy_lora_sequence(request_id=2,
                                                token_ids=token_ids,
                                                block_size=block_size,
                                                lora_int_id=2)

    # Neither of these matches the cached chain, so nothing is cached.
    for uncached_seq in (plain_seq, other_lora_seq):
        assert computed_tracker.get_num_cached_tokens(uncached_seq) == 0

    # The cached LoRA sequence sees every one of its tokens as cached.
    cached_count = computed_tracker.get_num_cached_tokens(cached_lora_seq)
    assert cached_count == len(token_ids)
tests/core/test_block_manager.py
deleted
100644 → 0
View file @
92ec5d8e
import
time
from
collections
import
defaultdict
from
typing
import
List
import
pytest
from
vllm
import
SamplingParams
from
vllm.block
import
PhysicalTokenBlock
from
vllm.core.block.utils
import
(
STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE
,
STR_NOT_IMPL_ENC_DEC_SWA
)
from
vllm.core.block_manager_v1
import
(
BlockSpaceManagerV1
,
UncachedBlockAllocator
)
from
vllm.core.interfaces
import
AllocStatus
from
vllm.sequence
import
Logprob
,
Sequence
,
SequenceGroup
,
SequenceStatus
from
vllm.utils
import
Device
from
.utils
import
create_dummy_prompt
,
create_dummy_prompt_encoder_decoder
def test_block_allocator_allocate():
    """Allocate every CPU block, tracking the free count, then expect an error.

    After the pool is drained, one more ``allocate()`` must raise
    ``ValueError``.
    """
    block_size = 4
    num_cpu_blocks = 4
    cpu_allocator = UncachedBlockAllocator(Device.CPU, block_size,
                                           num_cpu_blocks)

    # The pool starts out completely free.
    assert cpu_allocator.get_num_free_blocks() == num_cpu_blocks

    # Drain the pool; the free count drops by one per allocation.
    for expected_free in reversed(range(num_cpu_blocks)):
        allocated = cpu_allocator.allocate()
        assert allocated not in cpu_allocator.free_blocks
        assert cpu_allocator.get_num_free_blocks() == expected_free

    # Nothing is left to hand out.
    with pytest.raises(ValueError):
        cpu_allocator.allocate()
def test_block_allocator_free():
    """Free every allocated CPU block and check the free count and double free.

    Each block is allocated, then freed one by one while the free count is
    tracked; freeing an already-freed block must raise ``ValueError``.
    """
    block_size = 4
    num_cpu_blocks = 4
    cpu_allocator = UncachedBlockAllocator(Device.CPU, block_size,
                                           num_cpu_blocks)

    # Allocate all available cpu blocks.
    blocks: List[PhysicalTokenBlock] = []
    for _ in range(num_cpu_blocks):
        allocated = cpu_allocator.allocate()
        blocks.append(allocated)
        assert allocated not in cpu_allocator.free_blocks

    # The pool is fully drained at this point.
    assert cpu_allocator.get_num_free_blocks() == 0

    # Free all allocated cpu blocks; the free count grows by one each time.
    for expected_free, freed in enumerate(blocks, start=1):
        cpu_allocator.free(freed)
        assert freed in cpu_allocator.free_blocks
        assert cpu_allocator.get_num_free_blocks() == expected_free

        # NOTE(review): the flattened source does not show whether this
        # double-free check sits inside or after the loop; it is
        # reconstructed here as a per-block check -- confirm.
        with pytest.raises(ValueError):
            cpu_allocator.free(freed)
def test_allocate():
    """Check that ``can_allocate`` reports OK until the GPU pool is used up.

    Two scenarios are exercised: no watermark, where ``num_gpu_blocks``
    sequence groups can be allocated, and a watermark of one block, where
    only ``num_gpu_blocks - 1`` can.
    """
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4

    # Sizes are passed by keyword so the call cannot silently swap the GPU
    # and CPU counts; they are equal here, which would mask such a swap.
    block_manager = BlockSpaceManagerV1(block_size=block_size,
                                        num_gpu_blocks=num_gpu_blocks,
                                        num_cpu_blocks=num_cpu_blocks,
                                        watermark=0)

    # Allocate a sequence group for every available gpu block.
    for i in range(num_gpu_blocks):
        _, seq_group = create_dummy_prompt(str(i), block_size)
        assert block_manager.can_allocate(seq_group) == AllocStatus.OK
        block_manager.allocate(seq_group)
    # The pool is exhausted, so one more allocation must be refused.
    assert block_manager.can_allocate(seq_group) != AllocStatus.OK

    # Repeat with a watermark that reserves one gpu block.
    block_manager = BlockSpaceManagerV1(block_size=block_size,
                                        num_gpu_blocks=num_gpu_blocks,
                                        num_cpu_blocks=num_cpu_blocks,
                                        watermark=1 / num_gpu_blocks)
    for i in range(num_gpu_blocks - 1):
        _, seq_group = create_dummy_prompt(str(i), block_size)
        assert block_manager.can_allocate(seq_group) == AllocStatus.OK
        block_manager.allocate(seq_group)
    # Only the reserved block remains, so allocation must be refused.
    assert block_manager.can_allocate(seq_group) != AllocStatus.OK
def test_allocate_encoder_decoder():
    """Check ``can_allocate`` for encoder/decoder sequence groups.

    The test budgets ``block_req_per_seq_group`` (2) gpu blocks per sequence
    group. It runs once without a watermark and once with a watermark of one
    block, asserting that allocation is refused once the budget is spent.
    """
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    block_req_per_seq_group = 2

    # Sizes are passed by keyword so the call cannot silently swap the GPU
    # and CPU counts; they are equal here, which would mask such a swap.
    block_manager = BlockSpaceManagerV1(block_size=block_size,
                                        num_gpu_blocks=num_gpu_blocks,
                                        num_cpu_blocks=num_cpu_blocks,
                                        watermark=0)

    # Allocate sequence groups until all gpu blocks are in use.
    for i in range(num_gpu_blocks // block_req_per_seq_group):
        _, _, seq_group = create_dummy_prompt_encoder_decoder(
            str(i),
            decoder_prompt_length=block_size,
            encoder_prompt_length=block_size)
        assert block_manager.can_allocate(seq_group) == AllocStatus.OK
        block_manager.allocate(seq_group)
    assert block_manager.can_allocate(seq_group) != AllocStatus.OK

    # Repeat with a watermark that reserves one gpu block.
    block_manager = BlockSpaceManagerV1(block_size=block_size,
                                        num_gpu_blocks=num_gpu_blocks,
                                        num_cpu_blocks=num_cpu_blocks,
                                        watermark=1 / num_gpu_blocks)
    for i in range((num_gpu_blocks - 1) // block_req_per_seq_group):
        _, _, seq_group = create_dummy_prompt_encoder_decoder(
            str(i),
            decoder_prompt_length=block_size,
            encoder_prompt_length=block_size)
        assert block_manager.can_allocate(seq_group) == AllocStatus.OK
        block_manager.allocate(seq_group)
    assert block_manager.can_allocate(seq_group) != AllocStatus.OK
def test_allocate_encoder_decoder_fails_with_swa():
    """Check that encoder/decoder allocation is rejected under SWA.

    SWA is short for sliding window attention. Both ``can_allocate`` and
    ``allocate`` must raise ``NotImplementedError`` whose message equals
    ``STR_NOT_IMPL_ENC_DEC_SWA``.
    """
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4

    # Sizes are passed by keyword so the call cannot silently swap the GPU
    # and CPU counts; they are equal here, which would mask such a swap.
    block_manager = BlockSpaceManagerV1(block_size=block_size,
                                        num_gpu_blocks=num_gpu_blocks,
                                        num_cpu_blocks=num_cpu_blocks,
                                        watermark=0,
                                        sliding_window=5)  # swa

    _, _, seq_group = create_dummy_prompt_encoder_decoder(
        "0",
        decoder_prompt_length=block_size,
        encoder_prompt_length=block_size)

    # Assert that can_allocate() fails due to SWA
    with pytest.raises(NotImplementedError) as exc_info:
        block_manager.can_allocate(seq_group)
    assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_SWA

    # Assert that allocate() fails due to SWA
    with pytest.raises(NotImplementedError) as exc_info:
        block_manager.allocate(seq_group)
    assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_SWA
def test_allocate_encoder_decoder_fails_with_prefix_caching():
    """Check that encoder/decoder allocation is rejected with prefix caching.

    Both ``can_allocate`` and ``allocate`` must raise ``NotImplementedError``
    whose message equals ``STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE``.
    """
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4

    # Sizes are passed by keyword so the call cannot silently swap the GPU
    # and CPU counts; they are equal here, which would mask such a swap.
    block_manager = BlockSpaceManagerV1(block_size=block_size,
                                        num_gpu_blocks=num_gpu_blocks,
                                        num_cpu_blocks=num_cpu_blocks,
                                        watermark=0,
                                        enable_caching=True)  # Prefix cache

    _, _, seq_group = create_dummy_prompt_encoder_decoder(
        "0",
        decoder_prompt_length=block_size,
        encoder_prompt_length=block_size)

    # Assert that can_allocate() fails due to prefix caching
    with pytest.raises(NotImplementedError) as exc_info:
        block_manager.can_allocate(seq_group)
    assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE

    # Assert that allocate() fails due to prefix caching
    with pytest.raises(NotImplementedError) as exc_info:
        block_manager.allocate(seq_group)
    assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE
def test_append_slot_single_seq():
    """Check ``append_slots`` for a single sequence.

    With no new tokens, appending must not consume a gpu block. After
    ``block_size`` tokens are appended to the sequence, appending must
    consume exactly one gpu block. In both cases ``append_slots`` is
    expected to return a falsy value.
    """
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4

    # Sizes are passed by keyword so the call cannot silently swap the GPU
    # and CPU counts; they are equal here, which would mask such a swap.
    block_manager = BlockSpaceManagerV1(block_size=block_size,
                                        num_gpu_blocks=num_gpu_blocks,
                                        num_cpu_blocks=num_cpu_blocks,
                                        watermark=0)

    # Allocate single seq to gpu block.
    prompt, seq_group = create_dummy_prompt("1", block_size)
    block_manager.allocate(seq_group)

    # Nothing to append. Sequence has no new logical blocks.
    assert block_manager.can_append_slots(seq_group)
    before_blocks = block_manager.get_num_free_gpu_blocks()
    assert not block_manager.append_slots(prompt)
    after_blocks = block_manager.get_num_free_gpu_blocks()
    assert before_blocks == after_blocks

    # Add block_size number of new tokens and append slot.
    for i in range(block_size):
        token_id = i + 5
        prompt.append_token_id(token_id, {token_id: Logprob(0.0)})

    assert block_manager.can_append_slots(seq_group)
    before_blocks = block_manager.get_num_free_gpu_blocks()
    assert not block_manager.append_slots(prompt)
    after_blocks = block_manager.get_num_free_gpu_blocks()
    assert before_blocks - after_blocks == 1
def test_append_slot_cow():
    """Check that appending to a forked sequence schedules a copy-on-write.

    A three-token prompt is forked, a token is appended to the child, and
    ``append_slots`` must return a non-empty list of (source, destination)
    block pairs in which no block is copied onto itself, consuming exactly
    one free gpu block.
    """
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    block_manager = BlockSpaceManagerV1(block_size=block_size,
                                        num_cpu_blocks=num_cpu_blocks,
                                        num_gpu_blocks=num_gpu_blocks,
                                        watermark=0)

    # Allocate prompt to gpu block. There is one slot left in the block.
    prompt_inputs = {
        "prompt": "one two three",
        "prompt_token_ids": [1, 2, 3],
    }
    prompt = Sequence(seq_id=1, inputs=prompt_inputs, block_size=block_size)

    # Fork the sequence, such that a COW will be required when we append a
    # new token id.
    child = prompt.fork(new_seq_id=2)

    # Allocate space for the sequence group.
    seq_group = SequenceGroup(request_id="1",
                              seqs=[prompt, child],
                              arrival_time=time.time(),
                              sampling_params=SamplingParams())
    block_manager.allocate(seq_group)

    # Fork and append a new token id. We expect a COW to be scheduled.
    new_token = 4
    child.append_token_id(new_token, {new_token: Logprob(0.0)})
    block_manager.fork(prompt, child)

    assert block_manager.can_append_slots(seq_group)
    free_before = block_manager.get_num_free_gpu_blocks()

    cows = block_manager.append_slots(child)
    assert cows

    # Group destinations by source block and make sure no source block is
    # listed among its own destinations.
    destinations_by_source = defaultdict(list)
    for source, destination in cows:
        destinations_by_source[source].append(destination)
    assert all(source not in destinations
               for source, destinations in destinations_by_source.items())

    free_after = block_manager.get_num_free_gpu_blocks()
    assert free_before - free_after == 1
def test_fork():
    """Check that forking shares the block table until the child diverges.

    Right after ``fork`` the parent and child block tables must be equal;
    after a token is appended to the child and ``append_slots`` runs, the
    tables must differ.
    """
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4

    # Sizes are passed by keyword so the call cannot silently swap the GPU
    # and CPU counts; they are equal here, which would mask such a swap.
    block_manager = BlockSpaceManagerV1(block_size=block_size,
                                        num_gpu_blocks=num_gpu_blocks,
                                        num_cpu_blocks=num_cpu_blocks,
                                        watermark=0)

    prompt, seq_group = create_dummy_prompt("1",
                                            block_size - 1,
                                            block_size=block_size)
    block_manager.allocate(seq_group)

    # Fork prompt and copy block tables.
    child = prompt.fork(2)
    block_manager.fork(prompt, child)
    assert block_manager.get_block_table(
        prompt) == block_manager.get_block_table(child)

    token_id = 4
    # Append token to child. Block is shared so copy on write occurs.
    child.append_token_id(token_id, {token_id: Logprob(0.0)})
    block_manager.append_slots(child)
    assert block_manager.get_block_table(
        prompt) != block_manager.get_block_table(child)
def test_swap():
    """Check swap-out (GPU -> CPU) and swap-in (CPU -> GPU) bookkeeping.

    For each direction, the first elements of the returned mapping must
    equal the block table read just before the swap, and the free CPU/GPU
    block counts must shift by the number of blocks moved.
    """
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4

    # Sizes are passed by keyword so the call cannot silently swap the GPU
    # and CPU counts; they are equal here, which would mask such a swap.
    block_manager = BlockSpaceManagerV1(block_size=block_size,
                                        num_gpu_blocks=num_gpu_blocks,
                                        num_cpu_blocks=num_cpu_blocks,
                                        watermark=0)

    prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1)
    prompt.status = SequenceStatus.WAITING
    block_manager.allocate(seq_group)

    # Emulate a forward pass by appending a single token.
    # The block manager then knows how many unprocessed
    # tokens will be written in the next forward pass.
    token_id = 0
    prompt.status = SequenceStatus.RUNNING
    prompt.append_token_id(token_id, {token_id: Logprob(0.0)})

    # Swap seq group from GPU -> CPU.
    gpu_blocks = block_manager.get_block_table(prompt)
    assert block_manager.can_swap_out(seq_group)
    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
    mapping = block_manager.swap_out(seq_group)
    assert [x[0] for x in mapping] == gpu_blocks
    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
    assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
    assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
    prompt.status = SequenceStatus.SWAPPED

    # Swap seq group from CPU -> GPU.
    cpu_blocks = block_manager.get_block_table(prompt)
    assert block_manager.can_swap_in(seq_group) == AllocStatus.OK
    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
    mapping = block_manager.swap_in(seq_group)
    assert [x[0] for x in mapping] == cpu_blocks
    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
    assert before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks
    assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks)
def test_swap_encoder_decoder():
    """Check swap-out and swap-in bookkeeping for an encoder/decoder group.

    The blocks under test are the decoder block table followed by the cross
    block table. For each direction, the first elements of the returned
    mapping must equal that combined list, and the free CPU/GPU block counts
    must shift by its length.
    """
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4

    # Sizes are passed by keyword so the call cannot silently swap the GPU
    # and CPU counts; they are equal here, which would mask such a swap.
    block_manager = BlockSpaceManagerV1(block_size=block_size,
                                        num_gpu_blocks=num_gpu_blocks,
                                        num_cpu_blocks=num_cpu_blocks,
                                        watermark=0)

    decoder_prompt, encoder_prompt, seq_group = \
        create_dummy_prompt_encoder_decoder(
            "1",
            decoder_prompt_length=block_size,
            encoder_prompt_length=block_size)
    decoder_prompt.status = SequenceStatus.WAITING
    encoder_prompt.status = SequenceStatus.WAITING
    block_manager.allocate(seq_group)

    # Emulate a forward pass by appending a single token.
    # The block manager then knows how many unprocessed
    # tokens will be written in the next forward pass.
    token_id = 0
    decoder_prompt.status = SequenceStatus.RUNNING
    decoder_prompt.append_token_id(token_id, {token_id: Logprob(0.0)})

    # Swap encoder/decoder seq group from GPU -> CPU.
    decoder_gpu_blocks = block_manager.get_block_table(decoder_prompt)
    cross_gpu_blocks = block_manager.get_cross_block_table(seq_group)
    gpu_blocks = decoder_gpu_blocks + cross_gpu_blocks
    assert block_manager.can_swap_out(seq_group)
    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
    mapping = block_manager.swap_out(seq_group)
    assert [x[0] for x in mapping] == gpu_blocks
    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
    assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
    assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
    decoder_prompt.status = SequenceStatus.SWAPPED

    # Swap encoder/decoder seq group from CPU -> GPU.
    decoder_cpu_blocks = block_manager.get_block_table(decoder_prompt)
    cross_cpu_blocks = block_manager.get_cross_block_table(seq_group)
    cpu_blocks = decoder_cpu_blocks + cross_cpu_blocks
    assert block_manager.can_swap_in(seq_group) == AllocStatus.OK
    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
    mapping = block_manager.swap_in(seq_group)
    assert [x[0] for x in mapping] == cpu_blocks
    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
    assert before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks
    assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks)
def test_free():
    """Check that freeing a sequence returns its gpu blocks to the pool.

    After ``free`` the number of free gpu blocks must grow by the length of
    the sequence's block table, and looking the table up again must raise
    ``KeyError``.
    """
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4

    # Sizes are passed by keyword so the call cannot silently swap the GPU
    # and CPU counts; they are equal here, which would mask such a swap.
    block_manager = BlockSpaceManagerV1(block_size=block_size,
                                        num_gpu_blocks=num_gpu_blocks,
                                        num_cpu_blocks=num_cpu_blocks,
                                        watermark=0)

    prompt, seq_group = create_dummy_prompt("1", block_size)
    block_manager.allocate(seq_group)

    # Free allocated seq.
    prompt_blocks = len(block_manager.get_block_table(prompt))
    before_blocks = block_manager.get_num_free_gpu_blocks()
    block_manager.free(prompt)
    after_blocks = block_manager.get_num_free_gpu_blocks()
    assert after_blocks == before_blocks + prompt_blocks

    # Block table for freed seq is deleted.
    with pytest.raises(KeyError):
        block_manager.get_block_table(prompt)
def test_free_encoder_decoder():
    """Check that freeing an encoder/decoder group returns all its gpu blocks.

    The decoder blocks are released with ``free`` and the cross blocks with
    ``free_cross``; together they must account for the growth in free gpu
    blocks. Afterwards ``get_block_table`` must raise ``KeyError`` for both
    the decoder and the encoder sequence.
    """
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4

    # Sizes are passed by keyword so the call cannot silently swap the GPU
    # and CPU counts; they are equal here, which would mask such a swap.
    block_manager = BlockSpaceManagerV1(block_size=block_size,
                                        num_gpu_blocks=num_gpu_blocks,
                                        num_cpu_blocks=num_cpu_blocks,
                                        watermark=0)

    decoder_prompt, encoder_prompt, seq_group = \
        create_dummy_prompt_encoder_decoder(
            "1",
            decoder_prompt_length=block_size,
            encoder_prompt_length=block_size)
    block_manager.allocate(seq_group)

    # Free allocated seq.
    decoder_prompt_blocks = len(block_manager.get_block_table(decoder_prompt))
    encoder_prompt_blocks = len(block_manager.get_cross_block_table(seq_group))
    prompt_blocks = decoder_prompt_blocks + encoder_prompt_blocks
    before_blocks = block_manager.get_num_free_gpu_blocks()
    block_manager.free(decoder_prompt)
    block_manager.free_cross(seq_group)
    after_blocks = block_manager.get_num_free_gpu_blocks()
    assert after_blocks == before_blocks + prompt_blocks

    # No block table can be looked up for the freed decoder seq.
    with pytest.raises(KeyError):
        block_manager.get_block_table(decoder_prompt)

    # Nor can one be looked up for the encoder seq.
    with pytest.raises(KeyError):
        block_manager.get_block_table(encoder_prompt)
def test_reset():
    """Check that ``reset`` frees every allocated gpu block.

    All gpu blocks are consumed by allocating one sequence group per block;
    after ``reset`` the free count must return to its initial value.
    """
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4

    # Sizes are passed by keyword so the call cannot silently swap the GPU
    # and CPU counts; they are equal here, which would mask such a swap.
    block_manager = BlockSpaceManagerV1(block_size=block_size,
                                        num_gpu_blocks=num_gpu_blocks,
                                        num_cpu_blocks=num_cpu_blocks,
                                        watermark=0)

    # Allocate a seq group for every available gpu block.
    original_blocks = block_manager.get_num_free_gpu_blocks()
    for i in range(num_gpu_blocks):
        _, seq_group = create_dummy_prompt(str(i), block_size)
        block_manager.allocate(seq_group)
    assert block_manager.get_num_free_gpu_blocks() == 0

    # Resetting block manager frees all allocated blocks.
    block_manager.reset()
    assert block_manager.get_num_free_gpu_blocks() == original_blocks
def test_reset_encoder_decoder():
    """Check that ``reset`` frees every gpu block of encoder/decoder groups.

    The test budgets ``block_req_per_seq_group`` (2) gpu blocks per sequence
    group, allocates until the pool is empty, and then expects ``reset`` to
    restore the initial free count.
    """
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    block_req_per_seq_group = 2

    # Sizes are passed by keyword so the call cannot silently swap the GPU
    # and CPU counts; they are equal here, which would mask such a swap.
    block_manager = BlockSpaceManagerV1(block_size=block_size,
                                        num_gpu_blocks=num_gpu_blocks,
                                        num_cpu_blocks=num_cpu_blocks,
                                        watermark=0)

    # Allocate seq groups until all gpu blocks are in use.
    original_blocks = block_manager.get_num_free_gpu_blocks()
    for i in range(num_gpu_blocks // block_req_per_seq_group):
        # str(i) matches how the sibling tests build request ids.
        _, _, seq_group = create_dummy_prompt_encoder_decoder(
            str(i),
            decoder_prompt_length=block_size,
            encoder_prompt_length=block_size)
        block_manager.allocate(seq_group)
    assert block_manager.get_num_free_gpu_blocks() == 0

    # Resetting block manager frees all allocated blocks.
    block_manager.reset()
    assert block_manager.get_num_free_gpu_blocks() == original_blocks
def test_sliding_window_multi_seq():
    """
    Tests that memory allocation and deallocation is handled
    correctly with multiple sequences that exceed the sliding
    window's capacity.

    The free gpu block count is checked after each step: allocate, fork,
    append to the child, append to the parent, free the parent, free the
    child.
    """
    block_size = 1
    num_cpu_blocks = 8
    num_gpu_blocks = 8
    sliding_window = 2

    # Sizes are passed by keyword so the call cannot silently swap the GPU
    # and CPU counts; they are equal here, which would mask such a swap.
    block_manager = BlockSpaceManagerV1(block_size=block_size,
                                        num_gpu_blocks=num_gpu_blocks,
                                        num_cpu_blocks=num_cpu_blocks,
                                        sliding_window=sliding_window,
                                        watermark=0)

    assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks

    parent = Sequence(seq_id=1,
                      inputs={
                          "prompt": "one two three",
                          "prompt_token_ids": [0, 1, 2],
                      },
                      block_size=block_size)
    seq_group = SequenceGroup(request_id="1",
                              seqs=[parent],
                              arrival_time=time.time(),
                              sampling_params=SamplingParams(),
                              lora_request=None)
    block_manager.allocate(seq_group)

    # assert the number of blocks allocated is correct
    # the parent seq has len 3, but since sliding_window is 2,
    # we will use at most 2 blocks
    assert block_manager.get_num_free_gpu_blocks(
    ) == num_gpu_blocks - sliding_window

    # Fork prompt and copy block tables.
    child = parent.fork(2)
    block_manager.fork(parent, child)

    # assert the number of blocks allocated is correct
    # forking does not increase memory consumption
    assert block_manager.get_num_free_gpu_blocks(
    ) == num_gpu_blocks - sliding_window

    # assert both parent and child share all blocks
    assert block_manager.get_block_table(
        parent) == block_manager.get_block_table(child)

    token_id = 4
    # Append token to child. Block is shared so copy on write occurs.
    child.append_token_id(token_id, {token_id: Logprob(0.0)})
    block_manager.append_slots(child)

    # assert the number of blocks allocated is correct
    # we will use now one block more. Each seq will use 2 blocks,
    # but only one can be shared
    assert block_manager.get_num_free_gpu_blocks(
    ) == num_gpu_blocks - sliding_window - 1

    token_id = 5
    parent.append_token_id(token_id, {token_id: Logprob(0.0)})
    block_manager.append_slots(parent)

    # assert the number of blocks allocated is correct
    # no change, because both sequences are still just sharing one block
    assert block_manager.get_num_free_gpu_blocks(
    ) == num_gpu_blocks - sliding_window - 1

    block_table_parent = block_manager.get_block_table(parent)
    block_table_child = block_manager.get_block_table(child)

    assert block_table_parent != block_table_child

    # assert both blocks are sharing the second-last block
    assert block_table_parent[-2] == block_table_child[-2]

    # now let's clean up...
    block_manager.free(parent)

    # assert the number of blocks allocated is correct
    # We have freed one seq, reducing the ref count of two blocks by one.
    # One of the two was only used by the parent seq, so this is now free.
    # The child seq still consumes sliding_window blocks
    assert block_manager.get_num_free_gpu_blocks(
    ) == num_gpu_blocks - sliding_window

    # free all blocks
    block_manager.free(child)

    # assert all blocks are free now
    assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks
def test_mark_blocks_as_computed_with_prefix_cache_and_chunked_prefill():
    """When prefix cache and chunked prefill are enabled, the block manager
    should only mark a chunk of blocks as computed instead of all blocks.

    Two chunks are processed: 2.5 blocks' worth of tokens (2 blocks expected
    as computed), then 4.5 more blocks' worth (7 blocks expected in total).
    """
    block_size = 4
    num_cpu_blocks = 0
    num_gpu_blocks = 16
    block_manager = BlockSpaceManagerV1(block_size,
                                        num_gpu_blocks,
                                        num_cpu_blocks,
                                        watermark=0,
                                        enable_caching=True)

    # Prompt is one token short of filling every gpu block, i.e. it has
    # num_gpu_blocks - 1 full blocks plus one partial block.
    prompt_length = block_size * num_gpu_blocks - 1

    # Allocate (reserve) all blocks.
    _, seq_group = create_dummy_prompt("0",
                                       prompt_length,
                                       block_size=block_size)
    block_manager.allocate(seq_group)
    seq = seq_group.seqs[0]
    assert seq.n_blocks == num_gpu_blocks

    # 1st chunk: Compute 2 and half blocks. Should mark 2 blocks as computed.
    first_chunk_size = int(block_size * 2.5)
    block_manager.mark_blocks_as_computed(seq_group, first_chunk_size)
    blocks_after_first_chunk = block_manager.get_all_computed_blocks(seq)
    assert len(blocks_after_first_chunk) == 2

    # Actual computed tokens.
    seq.data.update_num_computed_tokens(first_chunk_size)

    # 2nd chunk: Complete 3rd block and additional 4 blocks.
    second_chunk_size = int(block_size * 4.5)
    block_manager.mark_blocks_as_computed(seq_group, second_chunk_size)
    blocks_after_second_chunk = block_manager.get_all_computed_blocks(seq)
    assert len(blocks_after_second_chunk) == 7
Prev
1
…
15
16
17
18
19
20
21
22
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment