Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ad385667
Commit
ad385667
authored
Oct 23, 2024
by
zhuwenwen
Browse files
Merge branch 'v0.6.3.post1-dev'
parents
be0967c1
903593d3
Changes
364
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1853 additions
and
1223 deletions
+1853
-1223
tests/async_engine/test_request_tracker.py
tests/async_engine/test_request_tracker.py
+14
-13
tests/basic_correctness/test_basic_correctness.py
tests/basic_correctness/test_basic_correctness.py
+92
-0
tests/basic_correctness/test_chunked_prefill.py
tests/basic_correctness/test_chunked_prefill.py
+194
-8
tests/basic_correctness/test_cpu_offload.py
tests/basic_correctness/test_cpu_offload.py
+0
-38
tests/basic_correctness/test_preemption.py
tests/basic_correctness/test_preemption.py
+25
-113
tests/compile/__init__.py
tests/compile/__init__.py
+0
-0
tests/compile/test_basic_correctness.py
tests/compile/test_basic_correctness.py
+48
-0
tests/compile/test_full_graph.py
tests/compile/test_full_graph.py
+20
-0
tests/compile/test_wrapper.py
tests/compile/test_wrapper.py
+59
-0
tests/compile/utils.py
tests/compile/utils.py
+98
-0
tests/conftest.py
tests/conftest.py
+526
-153
tests/core/block/e2e/test_correctness.py
tests/core/block/e2e/test_correctness.py
+43
-113
tests/core/block/e2e/test_correctness_sliding_window.py
tests/core/block/e2e/test_correctness_sliding_window.py
+3
-9
tests/core/block/test_block_manager.py
tests/core/block/test_block_manager.py
+127
-19
tests/core/block/test_naive_block.py
tests/core/block/test_naive_block.py
+43
-0
tests/core/block/test_prefix_caching_block.py
tests/core/block/test_prefix_caching_block.py
+112
-0
tests/core/test_block_manager.py
tests/core/test_block_manager.py
+0
-597
tests/core/test_chunked_prefill_scheduler.py
tests/core/test_chunked_prefill_scheduler.py
+152
-65
tests/core/test_num_computed_tokens_update.py
tests/core/test_num_computed_tokens_update.py
+80
-0
tests/core/test_scheduler.py
tests/core/test_scheduler.py
+217
-95
No files found.
Too many changes to show.
To preserve performance only
364 of 364+
files are displayed.
Plain diff
Email patch
tests/async_engine/test_request_tracker.py
View file @
ad385667
...
@@ -10,23 +10,23 @@ async def test_request_tracker():
...
@@ -10,23 +10,23 @@ async def test_request_tracker():
stream_1
=
tracker
.
add_request
(
"1"
)
stream_1
=
tracker
.
add_request
(
"1"
)
assert
tracker
.
new_requests_event
.
is_set
()
assert
tracker
.
new_requests_event
.
is_set
()
await
tracker
.
wait_for_new_requests
()
await
tracker
.
wait_for_new_requests
()
new
,
finish
ed
=
tracker
.
get_new_and_
finish
ed_requests
()
new
,
abort
ed
=
tracker
.
get_new_and_
abort
ed_requests
()
assert
not
tracker
.
new_requests_event
.
is_set
()
assert
not
tracker
.
new_requests_event
.
is_set
()
assert
len
(
new
)
==
1
assert
len
(
new
)
==
1
assert
new
[
0
][
"request_id"
]
==
"1"
assert
new
[
0
][
"request_id"
]
==
"1"
assert
not
finish
ed
assert
not
abort
ed
assert
not
stream_1
.
finished
assert
not
stream_1
.
finished
stream_2
=
tracker
.
add_request
(
"2"
)
stream_2
=
tracker
.
add_request
(
"2"
)
stream_3
=
tracker
.
add_request
(
"3"
)
stream_3
=
tracker
.
add_request
(
"3"
)
assert
tracker
.
new_requests_event
.
is_set
()
assert
tracker
.
new_requests_event
.
is_set
()
await
tracker
.
wait_for_new_requests
()
await
tracker
.
wait_for_new_requests
()
new
,
finish
ed
=
tracker
.
get_new_and_
finish
ed_requests
()
new
,
abort
ed
=
tracker
.
get_new_and_
abort
ed_requests
()
assert
not
tracker
.
new_requests_event
.
is_set
()
assert
not
tracker
.
new_requests_event
.
is_set
()
assert
len
(
new
)
==
2
assert
len
(
new
)
==
2
assert
new
[
0
][
"request_id"
]
==
"2"
assert
new
[
0
][
"request_id"
]
==
"2"
assert
new
[
1
][
"request_id"
]
==
"3"
assert
new
[
1
][
"request_id"
]
==
"3"
assert
not
finish
ed
assert
not
abort
ed
assert
not
stream_2
.
finished
assert
not
stream_2
.
finished
assert
not
stream_3
.
finished
assert
not
stream_3
.
finished
...
@@ -36,9 +36,9 @@ async def test_request_tracker():
...
@@ -36,9 +36,9 @@ async def test_request_tracker():
assert
not
tracker
.
new_requests_event
.
is_set
()
assert
not
tracker
.
new_requests_event
.
is_set
()
tracker
.
abort_request
(
"1"
)
tracker
.
abort_request
(
"1"
)
new
,
finish
ed
=
tracker
.
get_new_and_
finish
ed_requests
()
new
,
abort
ed
=
tracker
.
get_new_and_
abort
ed_requests
()
assert
len
(
finish
ed
)
==
1
assert
len
(
abort
ed
)
==
1
assert
"1"
in
finish
ed
assert
"1"
in
abort
ed
assert
not
new
assert
not
new
assert
stream_1
.
finished
assert
stream_1
.
finished
...
@@ -46,9 +46,11 @@ async def test_request_tracker():
...
@@ -46,9 +46,11 @@ async def test_request_tracker():
tracker
.
abort_request
(
"4"
)
tracker
.
abort_request
(
"4"
)
assert
tracker
.
new_requests_event
.
is_set
()
assert
tracker
.
new_requests_event
.
is_set
()
await
tracker
.
wait_for_new_requests
()
await
tracker
.
wait_for_new_requests
()
new
,
finished
=
tracker
.
get_new_and_finished_requests
()
new
,
aborted
=
tracker
.
get_new_and_aborted_requests
()
assert
len
(
finished
)
==
1
# aborted new requests will cancel each other out -
assert
"4"
in
finished
# there's no need for them to propagate into the
# engine
assert
not
aborted
assert
not
new
assert
not
new
assert
stream_4
.
finished
assert
stream_4
.
finished
...
@@ -57,10 +59,9 @@ async def test_request_tracker():
...
@@ -57,10 +59,9 @@ async def test_request_tracker():
tracker
.
process_request_output
(
tracker
.
process_request_output
(
RequestOutput
(
"2"
,
"output"
,
[],
[],
[],
finished
=
True
))
RequestOutput
(
"2"
,
"output"
,
[],
[],
[],
finished
=
True
))
await
tracker
.
wait_for_new_requests
()
await
tracker
.
wait_for_new_requests
()
new
,
finish
ed
=
tracker
.
get_new_and_
finish
ed_requests
()
new
,
abort
ed
=
tracker
.
get_new_and_
abort
ed_requests
()
assert
not
tracker
.
new_requests_event
.
is_set
()
assert
not
tracker
.
new_requests_event
.
is_set
()
assert
len
(
finished
)
==
1
assert
not
aborted
assert
"2"
in
finished
assert
len
(
new
)
==
1
assert
len
(
new
)
==
1
assert
new
[
0
][
"request_id"
]
==
"5"
assert
new
[
0
][
"request_id"
]
==
"5"
assert
stream_2
.
finished
assert
stream_2
.
finished
...
...
tests/basic_correctness/test_basic_correctness.py
View file @
ad385667
...
@@ -3,20 +3,27 @@
...
@@ -3,20 +3,27 @@
Run `pytest tests/basic_correctness/test_basic_correctness.py`.
Run `pytest tests/basic_correctness/test_basic_correctness.py`.
"""
"""
import
os
import
os
import
pickle
import
re
import
weakref
import
weakref
from
unittest.mock
import
patch
import
pytest
import
pytest
from
vllm
import
LLM
from
vllm
import
LLM
from
vllm.utils
import
is_hip
from
vllm.utils
import
is_hip
from
vllm.worker.model_runner
import
ModelInputForGPUWithSamplingMetadata
from
..models.utils
import
check_outputs_equal
from
..models.utils
import
check_outputs_equal
from
..utils
import
multi_gpu_test
MODELS
=
[
MODELS
=
[
"facebook/opt-125m"
,
"facebook/opt-125m"
,
"meta-llama/Llama-2-7b-hf"
,
"meta-llama/Llama-2-7b-hf"
,
]
]
TARGET_TEST_SUITE
=
os
.
environ
.
get
(
"TARGET_TEST_SUITE"
,
"L4"
)
def
test_vllm_gc_ed
():
def
test_vllm_gc_ed
():
"""Verify vllm instance is GC'ed when it is deleted"""
"""Verify vllm instance is GC'ed when it is deleted"""
...
@@ -64,3 +71,88 @@ def test_models(
...
@@ -64,3 +71,88 @@ def test_models(
name_0
=
"hf"
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
name_1
=
"vllm"
,
)
)
@
multi_gpu_test
(
num_gpus
=
2
)
@
pytest
.
mark
.
parametrize
(
"model, distributed_executor_backend, attention_backend, "
"test_suite"
,
[
(
"facebook/opt-125m"
,
"ray"
,
""
,
"L4"
),
(
"facebook/opt-125m"
,
"mp"
,
""
,
"L4"
),
(
"meta-llama/Llama-2-7b-hf"
,
"ray"
,
""
,
"L4"
),
(
"meta-llama/Llama-2-7b-hf"
,
"mp"
,
""
,
"L4"
),
(
"facebook/opt-125m"
,
"ray"
,
""
,
"A100"
),
(
"facebook/opt-125m"
,
"mp"
,
""
,
"A100"
),
(
"facebook/opt-125m"
,
"mp"
,
"FLASHINFER"
,
"A100"
),
(
"meta-llama/Meta-Llama-3-8B"
,
"ray"
,
"FLASHINFER"
,
"A100"
),
])
def
test_models_distributed
(
hf_runner
,
vllm_runner
,
example_prompts
,
model
:
str
,
distributed_executor_backend
:
str
,
attention_backend
:
str
,
test_suite
:
str
,
)
->
None
:
if
test_suite
!=
TARGET_TEST_SUITE
:
pytest
.
skip
(
f
"Skip test for
{
test_suite
}
"
)
if
model
==
"meta-llama/Llama-2-7b-hf"
and
distributed_executor_backend
==
"ray"
and
attention_backend
==
""
and
test_suite
==
"L4"
:
# noqa
# test ray adag
os
.
environ
[
'VLLM_USE_RAY_SPMD_WORKER'
]
=
"1"
os
.
environ
[
'VLLM_USE_RAY_COMPILED_DAG'
]
=
"1"
if
attention_backend
:
os
.
environ
[
"VLLM_ATTENTION_BACKEND"
]
=
attention_backend
dtype
=
"half"
max_tokens
=
5
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
with
vllm_runner
(
model
,
dtype
=
dtype
,
tensor_parallel_size
=
2
,
distributed_executor_backend
=
distributed_executor_backend
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
check_outputs_equal
(
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
vllm_outputs
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
def
test_model_with_failure
(
vllm_runner
)
->
None
:
try
:
with
patch
(
"vllm.model_executor.models.opt.OPTForCausalLM.forward"
,
side_effect
=
ValueError
()):
with
pytest
.
raises
(
ValueError
)
as
exc_info
:
vllm_runner
(
"facebook/opt-125m"
,
dtype
=
"half"
,
enforce_eager
=
False
,
gpu_memory_utilization
=
0.7
)
matches
=
re
.
search
(
r
"input dumped to (.+).pkl"
,
str
(
exc_info
.
value
))
assert
matches
is
not
None
filename
=
f
"
{
matches
.
group
(
1
)
}
.pkl"
with
open
(
filename
,
"rb"
)
as
filep
:
inputs
=
pickle
.
load
(
filep
)
if
any
(
key
not
in
inputs
for
key
in
(
"arg_1"
,
"arg_2"
,
"arg_3"
)):
raise
AssertionError
(
"Missing keys in dumped inputs. Dumped keys: "
f
"
{
list
(
inputs
.
keys
())
}
"
)
assert
isinstance
(
inputs
[
"arg_1"
],
ModelInputForGPUWithSamplingMetadata
)
finally
:
os
.
remove
(
filename
)
tests/basic_correctness/test_chunked_prefill.py
View file @
ad385667
...
@@ -6,9 +6,13 @@ prefill requests are chunked.
...
@@ -6,9 +6,13 @@ prefill requests are chunked.
Run `pytest tests/models/test_chunked_prefill.py`.
Run `pytest tests/models/test_chunked_prefill.py`.
"""
"""
import
os
from
contextlib
import
nullcontext
import
pytest
import
pytest
from
..models.utils
import
check_outputs_equal
from
..models.utils
import
check_logprobs_close
,
check_outputs_equal
from
..utils
import
multi_gpu_test
MODELS
=
[
MODELS
=
[
"facebook/opt-125m"
,
"facebook/opt-125m"
,
...
@@ -35,12 +39,12 @@ def test_models(
...
@@ -35,12 +39,12 @@ def test_models(
enforce_eager
:
bool
,
enforce_eager
:
bool
,
tensor_parallel_size
:
int
,
tensor_parallel_size
:
int
,
)
->
None
:
)
->
None
:
max_num_seqs
=
min
(
chunked_prefill_token_size
,
256
)
"""
enable_chunked_prefill
=
False
Checks exact match decode between huggingface model and vllm runner with
max_num_batched_tokens
=
None
chunked prefill.
if
chunked_prefill_token_size
!=
-
1
:
"""
enable_
chunked_prefill
=
Tru
e
max_num_seqs
=
chunked_prefill
_token_siz
e
max_num_batched_tokens
=
chunked_prefill_token_size
max_num_batched_tokens
=
chunked_prefill_token_size
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
...
@@ -49,7 +53,7 @@ def test_models(
...
@@ -49,7 +53,7 @@ def test_models(
model
,
model
,
dtype
=
dtype
,
dtype
=
dtype
,
max_num_batched_tokens
=
max_num_batched_tokens
,
max_num_batched_tokens
=
max_num_batched_tokens
,
enable_chunked_prefill
=
enable_chunked_prefill
,
enable_chunked_prefill
=
True
,
tensor_parallel_size
=
tensor_parallel_size
,
tensor_parallel_size
=
tensor_parallel_size
,
enforce_eager
=
enforce_eager
,
enforce_eager
=
enforce_eager
,
max_num_seqs
=
max_num_seqs
,
max_num_seqs
=
max_num_seqs
,
...
@@ -62,3 +66,185 @@ def test_models(
...
@@ -62,3 +66,185 @@ def test_models(
name_0
=
"hf"
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
name_1
=
"vllm"
,
)
)
@
multi_gpu_test
(
num_gpus
=
2
)
@
pytest
.
mark
.
parametrize
(
"distributed_executor_backend"
,
[
"ray"
,
"mp"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
def
test_models_distributed
(
hf_runner
,
vllm_runner
,
example_prompts
,
model
:
str
,
distributed_executor_backend
:
str
,
)
->
None
:
if
(
model
==
"meta-llama/Llama-2-7b-hf"
and
distributed_executor_backend
==
"ray"
):
# test ray adag
os
.
environ
[
'VLLM_USE_RAY_SPMD_WORKER'
]
=
"1"
os
.
environ
[
'VLLM_USE_RAY_COMPILED_DAG'
]
=
"1"
dtype
=
"half"
max_tokens
=
5
chunked_prefill_token_size
=
16
# Add a chunked prefill config.
max_num_seqs
=
min
(
chunked_prefill_token_size
,
256
)
assert
chunked_prefill_token_size
!=
-
1
enable_chunked_prefill
=
True
max_num_batched_tokens
=
chunked_prefill_token_size
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
with
vllm_runner
(
model
,
dtype
=
dtype
,
tensor_parallel_size
=
2
,
max_num_seqs
=
max_num_seqs
,
enable_chunked_prefill
=
enable_chunked_prefill
,
max_num_batched_tokens
=
max_num_batched_tokens
,
distributed_executor_backend
=
distributed_executor_backend
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
check_outputs_equal
(
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
vllm_outputs
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
@
pytest
.
mark
.
parametrize
(
"kv_cache_dtype,model"
,
[(
"fp8_e4m3"
,
"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
)])
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
4
])
@
pytest
.
mark
.
parametrize
(
"chunked_prefill_token_size"
,
[
4
,
16
])
@
pytest
.
mark
.
parametrize
(
"enforce_eager"
,
[
False
,
True
])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@
pytest
.
mark
.
parametrize
(
"tensor_parallel_size"
,
[
1
])
# Due to low-precision numerical divergence, this test is too sensitive to
# the async postprocessor
@
pytest
.
mark
.
parametrize
(
"disable_async_output_proc"
,
[
True
])
def
test_models_with_fp8_kv_cache
(
vllm_runner
,
example_prompts
,
kv_cache_dtype
:
str
,
model
:
str
,
max_tokens
:
int
,
chunked_prefill_token_size
:
int
,
enforce_eager
:
bool
,
tensor_parallel_size
:
int
,
disable_async_output_proc
:
bool
,
)
->
None
:
"""
Check output logprobs match between no_chunked_prefill and chunked_prefill
with fp8 kv cache. General fp8 kv-cache tests are covered in test_fp8.py,
so here we only check chunked prefill.
"""
NUM_LOG_PROBS
=
8
max_num_seqs
=
chunked_prefill_token_size
max_num_batched_tokens
=
chunked_prefill_token_size
with
vllm_runner
(
model
,
tensor_parallel_size
=
tensor_parallel_size
,
enforce_eager
=
enforce_eager
,
max_num_seqs
=
max_num_seqs
,
kv_cache_dtype
=
kv_cache_dtype
,
disable_async_output_proc
=
disable_async_output_proc
,
)
as
vllm_model
:
no_chunked_prefill_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
)
with
vllm_runner
(
model
,
max_num_batched_tokens
=
max_num_batched_tokens
,
enable_chunked_prefill
=
True
,
tensor_parallel_size
=
tensor_parallel_size
,
enforce_eager
=
enforce_eager
,
max_num_seqs
=
max_num_seqs
,
kv_cache_dtype
=
kv_cache_dtype
,
disable_async_output_proc
=
disable_async_output_proc
,
)
as
vllm_model
:
chunked_prefill_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
)
check_logprobs_close
(
outputs_0_lst
=
no_chunked_prefill_outputs
,
outputs_1_lst
=
chunked_prefill_outputs
,
name_0
=
"no_chunked_prefill"
,
name_1
=
"chunked_prefill"
,
)
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
16
])
@
pytest
.
mark
.
parametrize
(
"enforce_eager"
,
[
False
])
@
pytest
.
mark
.
parametrize
(
"chunk_size"
,
[
30
,
32
])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@
pytest
.
mark
.
parametrize
(
"tensor_parallel_size"
,
[
1
])
def
test_with_prefix_caching
(
vllm_runner
,
max_tokens
:
int
,
enforce_eager
:
bool
,
chunk_size
:
int
,
tensor_parallel_size
:
int
,
)
->
None
:
"""
Checks exact match decode with and without prefix caching
with chunked prefill enabled.
"""
model
=
"meta-llama/Llama-2-7b-chat-hf"
# The common prompt has 142 tokens with Llama-2 tokenizer.
common_prompt
=
"You are a helpful AI assistant "
*
20
unique_prompts
=
[
"Question"
,
# Warmup
"Question"
,
# Fully cached
"Another question"
,
# Partial cached
]
full_prompts
=
[
f
"
{
common_prompt
}
\n
{
p
}
"
for
p
in
unique_prompts
]
max_num_batched_tokens
=
max_num_seqs
=
chunk_size
outputs
=
{}
# type: ignore
check_result
=
True
for
enable
in
(
True
,
False
):
with
vllm_runner
(
model
,
dtype
=
"half"
,
max_num_batched_tokens
=
max_num_batched_tokens
,
enable_chunked_prefill
=
True
,
enable_prefix_caching
=
enable
,
tensor_parallel_size
=
tensor_parallel_size
,
enforce_eager
=
enforce_eager
,
max_num_seqs
=
max_num_seqs
,
)
as
vllm_model
:
# It should fail when prefix caching is enable and chunk
# size is not a multiple of block size (16).
should_fail
=
chunk_size
%
16
!=
0
and
enable
check_result
&=
not
should_fail
outputs
[
enable
]
=
[]
# Send the request one-by-one to ensure the cache is populated.
with
pytest
.
raises
(
ValueError
)
if
should_fail
else
nullcontext
():
for
prompt
in
full_prompts
:
outputs
[
enable
]
+=
vllm_model
.
generate_greedy
([
prompt
],
max_tokens
)
# Check results only if we did not expect a failure.
if
check_result
:
check_outputs_equal
(
outputs_0_lst
=
outputs
[
False
],
outputs_1_lst
=
outputs
[
True
],
name_0
=
"w/o prefix caching"
,
name_1
=
"with prefix caching"
,
)
tests/basic_correctness/test_cpu_offload.py
View file @
ad385667
import
pytest
from
tests.quantization.utils
import
is_quant_method_supported
from
..utils
import
compare_two_settings
from
..utils
import
compare_two_settings
def
test_cpu_offload
():
def
test_cpu_offload
():
compare_two_settings
(
"meta-llama/Llama-2-7b-hf"
,
[],
compare_two_settings
(
"meta-llama/Llama-2-7b-hf"
,
[],
[
"--cpu-offload-gb"
,
"4"
])
[
"--cpu-offload-gb"
,
"4"
])
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
),
reason
=
"fp8 is not supported on this GPU type."
)
def
test_cpu_offload_fp8
():
# Test quantization of an unquantized checkpoint
compare_two_settings
(
"meta-llama/Meta-Llama-3-8B-Instruct"
,
[
"--quantization"
,
"fp8"
],
[
"--quantization"
,
"fp8"
,
"--cpu-offload-gb"
,
"2"
])
# Test loading a quantized checkpoint
compare_two_settings
(
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
,
[],
[
"--cpu-offload-gb"
,
"2"
])
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"awq"
),
reason
=
"awq is not supported on this GPU type."
)
def
test_cpu_offload_awq
():
compare_two_settings
(
"casperhansen/llama-3-8b-instruct-awq"
,
[],
[
"--cpu-offload-gb"
,
"2"
])
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gptq_marlin"
),
reason
=
"gptq_marlin is not supported on this GPU type."
)
def
test_cpu_offload_compressed_tensors
():
# Test wNa16
compare_two_settings
(
"nm-testing/tinyllama-oneshot-w4a16-channel-v2"
,
[],
[
"--cpu-offload-gb"
,
"1"
])
# Test w4a16_marlin24
compare_two_settings
(
"nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
,
[],
[
"--cpu-offload-gb"
,
"1"
])
# Test w8a8
compare_two_settings
(
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
,
[],
[
"--cpu-offload-gb"
,
"1"
])
tests/basic_correctness/test_preemption.py
View file @
ad385667
...
@@ -8,6 +8,7 @@ pytest tests/basic_correctness/test_preemption.py`.
...
@@ -8,6 +8,7 @@ pytest tests/basic_correctness/test_preemption.py`.
import
pytest
import
pytest
from
prometheus_client
import
REGISTRY
from
prometheus_client
import
REGISTRY
import
vllm.envs
as
envs
from
vllm
import
SamplingParams
from
vllm
import
SamplingParams
from
vllm.core.scheduler
import
(
ARTIFICIAL_PREEMPTION_MAX_CNT
,
from
vllm.core.scheduler
import
(
ARTIFICIAL_PREEMPTION_MAX_CNT
,
ENABLE_ARTIFICIAL_PREEMPT
)
ENABLE_ARTIFICIAL_PREEMPT
)
...
@@ -18,10 +19,20 @@ MODELS = [
...
@@ -18,10 +19,20 @@ MODELS = [
"facebook/opt-125m"
,
"facebook/opt-125m"
,
]
]
assert
ENABLE_ARTIFICIAL_PREEMPT
is
True
,
(
"Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
@
pytest
.
fixture
(
scope
=
"module"
,
autouse
=
True
)
"`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
def
check_settings
():
"tests/basic_correctness/test_preemption.py`"
)
assert
ENABLE_ARTIFICIAL_PREEMPT
is
True
,
(
"Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1."
"`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 "
"pytest tests/basic_correctness/test_preemption.py`"
)
@
pytest
.
fixture
def
worker_use_ray
()
->
bool
:
# When SPMD worker is used, use ray_use_worker=True
# to test delta input optimization works with preemption.
return
envs
.
VLLM_USE_RAY_SPMD_WORKER
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
...
@@ -36,6 +47,7 @@ def test_chunked_prefill_recompute(
...
@@ -36,6 +47,7 @@ def test_chunked_prefill_recompute(
dtype
:
str
,
dtype
:
str
,
max_tokens
:
int
,
max_tokens
:
int
,
chunked_prefill_token_size
:
int
,
chunked_prefill_token_size
:
int
,
worker_use_ray
:
bool
,
)
->
None
:
)
->
None
:
"""Ensure that chunked prefill works with preemption."""
"""Ensure that chunked prefill works with preemption."""
max_num_seqs
=
min
(
chunked_prefill_token_size
,
256
)
max_num_seqs
=
min
(
chunked_prefill_token_size
,
256
)
...
@@ -54,6 +66,8 @@ def test_chunked_prefill_recompute(
...
@@ -54,6 +66,8 @@ def test_chunked_prefill_recompute(
max_num_batched_tokens
=
max_num_batched_tokens
,
max_num_batched_tokens
=
max_num_batched_tokens
,
enable_chunked_prefill
=
enable_chunked_prefill
,
enable_chunked_prefill
=
enable_chunked_prefill
,
max_num_seqs
=
max_num_seqs
,
max_num_seqs
=
max_num_seqs
,
worker_use_ray
=
worker_use_ray
,
disable_log_stats
=
False
,
)
as
vllm_model
:
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
...
@@ -69,8 +83,7 @@ def test_chunked_prefill_recompute(
...
@@ -69,8 +83,7 @@ def test_chunked_prefill_recompute(
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
# @pytest.mark.parametrize("dtype", ["float"])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
96
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
96
])
def
test_preemption
(
def
test_preemption
(
caplog_vllm
,
caplog_vllm
,
...
@@ -80,6 +93,7 @@ def test_preemption(
...
@@ -80,6 +93,7 @@ def test_preemption(
model
:
str
,
model
:
str
,
dtype
:
str
,
dtype
:
str
,
max_tokens
:
int
,
max_tokens
:
int
,
worker_use_ray
:
bool
,
)
->
None
:
)
->
None
:
"""By default, recompute preemption is enabled"""
"""By default, recompute preemption is enabled"""
...
@@ -90,6 +104,7 @@ def test_preemption(
...
@@ -90,6 +104,7 @@ def test_preemption(
model
,
model
,
dtype
=
dtype
,
dtype
=
dtype
,
disable_log_stats
=
False
,
disable_log_stats
=
False
,
worker_use_ray
=
worker_use_ray
,
)
as
vllm_model
:
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
...
@@ -121,112 +136,7 @@ def test_preemption(
...
@@ -121,112 +136,7 @@ def test_preemption(
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
# @pytest.mark.parametrize("dtype", ["float"])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
96
])
@
pytest
.
mark
.
parametrize
(
"beam_width"
,
[
4
])
def
test_swap
(
caplog_vllm
,
hf_runner
,
vllm_runner
,
example_prompts
,
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
beam_width
:
int
,
)
->
None
:
"""Use beam search enables swapping."""
example_prompts
=
example_prompts
[:
1
]
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_beam_search
(
example_prompts
,
beam_width
,
max_tokens
)
with
vllm_runner
(
model
,
dtype
=
dtype
,
swap_space
=
10
,
disable_log_stats
=
False
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_beam_search
(
example_prompts
,
beam_width
,
max_tokens
)
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
<
ARTIFICIAL_PREEMPTION_MAX_CNT
)
total_preemption
=
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
num_cumulative_preemption
)
for
i
in
range
(
len
(
example_prompts
)):
hf_output_ids
,
_
=
hf_outputs
[
i
]
vllm_output_ids
,
_
=
vllm_outputs
[
i
]
assert
len
(
hf_output_ids
)
==
len
(
vllm_output_ids
)
for
j
in
range
(
len
(
hf_output_ids
)):
assert
hf_output_ids
[
j
]
==
vllm_output_ids
[
j
],
(
f
"Test
{
i
}
output
{
j
}
:
\n
HF:
{
hf_output_ids
}
\n
"
f
"vLLM:
{
vllm_output_ids
}
"
)
assert
(
"is preempted by PreemptionMode.SWAP mode because there "
"is not enough KV cache space."
in
caplog_vllm
.
text
)
# Ensure the count bucket of request-level histogram metrics matches
# the number of requests as a simple sanity check to ensure metrics are
# generated
preemption_metrics
=
None
for
m
in
REGISTRY
.
collect
():
if
m
.
name
==
"vllm:num_preemptions"
:
preemption_metrics
=
m
assert
preemption_metrics
is
not
None
total_recorded_preemption
=
0
for
sample
in
preemption_metrics
.
samples
:
total_recorded_preemption
+=
sample
.
value
assert
total_preemption
==
total_recorded_preemption
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
# @pytest.mark.parametrize("dtype", ["float"])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
96
])
@
pytest
.
mark
.
parametrize
(
"beam_width"
,
[
4
])
def
test_swap_infeasible
(
vllm_runner
,
example_prompts
,
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
beam_width
:
int
,
)
->
None
:
"""Verify infeasible swap request will be ignored."""
BLOCK_SIZE
=
16
prefill_blocks
=
2
decode_blocks
=
max_tokens
//
BLOCK_SIZE
example_prompts
=
example_prompts
[:
1
]
with
vllm_runner
(
model
,
dtype
=
dtype
,
swap_space
=
10
,
block_size
=
BLOCK_SIZE
,
# Since beam search have more than 1 sequence, prefill +
# decode blocks are not enough to finish.
num_gpu_blocks_override
=
prefill_blocks
+
decode_blocks
,
max_model_len
=
(
prefill_blocks
+
decode_blocks
)
*
BLOCK_SIZE
,
)
as
vllm_model
:
sampling_params
=
SamplingParams
(
n
=
beam_width
,
use_beam_search
=
True
,
temperature
=
0.0
,
max_tokens
=
max_tokens
,
ignore_eos
=
True
)
req_outputs
=
vllm_model
.
model
.
generate
(
example_prompts
,
sampling_params
=
sampling_params
,
)
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
<
ARTIFICIAL_PREEMPTION_MAX_CNT
)
# Verify the request is ignored and not hang.
assert
req_outputs
[
0
].
outputs
[
0
].
finish_reason
==
"length"
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
# @pytest.mark.parametrize("dtype", ["float"])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
96
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
96
])
def
test_preemption_infeasible
(
def
test_preemption_infeasible
(
vllm_runner
,
vllm_runner
,
...
@@ -234,6 +144,7 @@ def test_preemption_infeasible(
...
@@ -234,6 +144,7 @@ def test_preemption_infeasible(
model
:
str
,
model
:
str
,
dtype
:
str
,
dtype
:
str
,
max_tokens
:
int
,
max_tokens
:
int
,
worker_use_ray
:
bool
,
)
->
None
:
)
->
None
:
"""Verify infeasible preemption request will be ignored."""
"""Verify infeasible preemption request will be ignored."""
BLOCK_SIZE
=
16
BLOCK_SIZE
=
16
...
@@ -248,6 +159,7 @@ def test_preemption_infeasible(
...
@@ -248,6 +159,7 @@ def test_preemption_infeasible(
# ignored instead of hanging forever.
# ignored instead of hanging forever.
num_gpu_blocks_override
=
prefill_blocks
+
decode_blocks
//
2
,
num_gpu_blocks_override
=
prefill_blocks
+
decode_blocks
//
2
,
max_model_len
=
((
prefill_blocks
+
decode_blocks
//
2
)
*
BLOCK_SIZE
),
max_model_len
=
((
prefill_blocks
+
decode_blocks
//
2
)
*
BLOCK_SIZE
),
worker_use_ray
=
worker_use_ray
,
)
as
vllm_model
:
)
as
vllm_model
:
sampling_params
=
SamplingParams
(
max_tokens
=
max_tokens
,
sampling_params
=
SamplingParams
(
max_tokens
=
max_tokens
,
ignore_eos
=
True
)
ignore_eos
=
True
)
...
@@ -263,4 +175,4 @@ def test_preemption_infeasible(
...
@@ -263,4 +175,4 @@ def test_preemption_infeasible(
for
req_output
in
req_outputs
:
for
req_output
in
req_outputs
:
outputs
=
req_output
.
outputs
outputs
=
req_output
.
outputs
assert
len
(
outputs
)
==
1
assert
len
(
outputs
)
==
1
assert
outputs
[
0
].
finish_reason
==
"length"
assert
outputs
[
0
].
finish_reason
==
"length"
\ No newline at end of file
vllm/model_executor/layers/ops
/__init__.py
→
tests/compile
/__init__.py
View file @
ad385667
File moved
tests/compile/test_basic_correctness.py
0 → 100644
View file @
ad385667
from
typing
import
Dict
,
List
,
Optional
import
pytest
from
vllm.compilation.levels
import
CompilationLevel
from
vllm.utils
import
cuda_device_count_stateless
from
..utils
import
compare_all_settings
# we cannot afford testing the full Catesian product
# of all models and all levels
@
pytest
.
mark
.
parametrize
(
"model, model_args, pp_size, tp_size, attn_backend, method, fullgraph"
,
[
(
"meta-llama/Meta-Llama-3-8B"
,
[],
2
,
2
,
"FLASH_ATTN"
,
"generate"
,
True
),
(
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples"
,
[
"--quantization"
,
"compressed-tensors"
],
1
,
1
,
"FLASH_ATTN"
,
"generate"
,
True
),
(
"google/gemma-2-2b-it"
,
[],
1
,
2
,
"FLASHINFER"
,
"generate"
,
True
),
# TODO: add multi-modality test for llava
(
"llava-hf/llava-1.5-7b-hf"
,
[],
2
,
1
,
"FLASHINFER"
,
"generate"
,
False
)
])
def
test_compile_correctness
(
model
,
model_args
,
pp_size
,
tp_size
,
attn_backend
,
method
,
fullgraph
):
# this test is run under multiple suits, with different GPUs.
# make sure we only run the test with correct CUDA devices.
# don't use "<", as it will duplicate the tests.
if
cuda_device_count_stateless
()
!=
pp_size
*
tp_size
:
pytest
.
skip
(
"Not correct CUDA devices for the test."
)
import
os
os
.
environ
[
"VLLM_ATTENTION_BACKEND"
]
=
attn_backend
if
not
fullgraph
:
os
.
environ
[
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"
]
=
"0"
all_args
=
[[
"--enforce-eager"
]
+
model_args
+
[
"--max_model_len"
,
"1024"
]
+
[
"-pp"
,
str
(
pp_size
)]
+
[
"-tp"
,
str
(
tp_size
)]]
*
3
# don't test VLLM_TORCH_COMPILE_LEVEL == 3 case
# inductor will change the output, so we cannot compare them.
all_envs
:
List
[
Optional
[
Dict
[
str
,
str
]]]
=
[{
"VLLM_TORCH_COMPILE_LEVEL"
:
str
(
level
)
}
for
level
in
[
CompilationLevel
.
NO_COMPILATION
,
CompilationLevel
.
DYNAMO_AS_IS
,
CompilationLevel
.
DYNAMO_ONCE
,
]]
compare_all_settings
(
model
,
all_args
,
all_envs
,
method
=
method
)
tests/compile/test_full_graph.py
0 → 100644
View file @
ad385667
import
pytest
from
vllm.compilation.levels
import
CompilationLevel
from
..utils
import
fork_new_process_for_each_test
from
.utils
import
TEST_MODELS
,
check_full_graph_support
@pytest.mark.parametrize("model_info", TEST_MODELS)
@pytest.mark.parametrize(
    "optimization_level",
    [CompilationLevel.DYNAMO_ONCE, CompilationLevel.INDUCTOR])
@fork_new_process_for_each_test
def test_full_graph(model_info, optimization_level):
    """Run the full-graph capture check for one model / level pair.

    ``model_info`` is an entry of ``TEST_MODELS``: element 0 is the model
    name and element 1 the keyword arguments for that model.
    """
    check_full_graph_support(
        model_info[0],
        model_info[1],
        optimization_level,
        tp_size=1,
    )
tests/compile/test_wrapper.py
0 → 100644
View file @
ad385667
from
typing
import
Optional
import
torch
from
vllm.compilation.wrapper
import
TorchCompileWrapperWithCustomDispatcher
class MyMod(torch.nn.Module):
    """Toy module whose computation depends on whether a cache is given."""

    def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
        """Return ``x * 2`` without a cache, ``x + cache`` with one."""
        if cache is None:
            return x * 2
        return x + cache
class MyWrapper(TorchCompileWrapperWithCustomDispatcher):
    """Wrapper that compiles ``forward`` and then dispatches by hand.

    Calls go through the ``torch.compile``-d callable until two compiled
    code objects have been recorded; after that the matching code object
    is selected explicitly from whether a cache tensor was passed.
    """

    def __init__(self, model):
        self.model = model
        # ``self.forward`` is what gets compiled; the "eager" backend is used.
        super().__init__(torch.compile(self.forward, backend="eager"))

    def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
        # this is the function to be compiled
        return self.model(x, cache)

    def __call__(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
        # let torch.compile compile twice
        if len(self.compiled_codes) != 2:
            return self.compiled_callable(x, cache)

        # Index 0 is used for calls without a cache, index 1 for calls
        # with one.
        if cache is None:
            dispatch_id = 0
        else:
            dispatch_id = 1
        with self.dispatch_to_code(dispatch_id):
            return self.forward(x, cache)
def test_torch_compile_wrapper():
    """Three wrappers around one module each end up with two compiled codes."""
    mod = MyMod()
    created = []
    for _ in range(3):
        torch._dynamo.reset()
        wrapper = MyWrapper(mod)
        created.append(wrapper)

        x = torch.tensor([1])
        # profile run, compile
        wrapper(x, None)

        # create a cache tensor
        cache = torch.tensor([2])
        # warm up with cache, recompile
        wrapper(x, cache)

        # for new input, dispatch to the compiled code directly
        new_x = torch.tensor([3])
        # dispatch to the first compiled code
        without_cache = wrapper(new_x, None)
        assert without_cache.item() == 6
        # dispatch to the second compiled code
        with_cache = wrapper(new_x, cache)
        assert with_cache.item() == 5

    for built in created:
        # make sure they have independent compiled codes
        assert len(built.compiled_codes) == 2
tests/compile/utils.py
0 → 100644
View file @
ad385667
import
os
import
torch
from
tests.quantization.utils
import
is_quant_method_supported
from
vllm
import
LLM
,
SamplingParams
from
vllm.compilation.levels
import
CompilationLevel
from
vllm.utils
import
is_hip
# Each entry is a ``(model name, keyword arguments)`` pair. The keyword
# arguments are expanded into the ``LLM(...)`` constructor by
# ``check_full_graph_support`` and may carry a "quantization" key that
# decides whether a model/level combination is skipped there.
TEST_MODELS = [
    ("facebook/opt-125m", {}),
    ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
        "dtype": torch.float16,
        "quantization": "compressed-tensors"
    }),
    ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", {
        "dtype": torch.float16,
        "quantization": "fp8"
    }),
    ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", {
        "quantization": "compressed-tensors"
    }),
    ("meta-llama/Meta-Llama-3-8B", {}),
]

# The remaining models are appended only when the matching quantization
# method is reported as supported by ``is_quant_method_supported``.

# TODO: enable in pytorch 2.5
# Deliberately disabled: ``False and ...`` short-circuits, so the support
# check is not even evaluated.
if False and is_quant_method_supported("aqlm"):  # noqa: SIM223
    TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
        "quantization": "aqlm"
    }))

# TODO: enable in pytorch 2.5
# Deliberately disabled in the same way as the AQLM entry above.
if False and is_quant_method_supported("gguf"):  # noqa: SIM223
    TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
        "quantization": "gguf"
    }))

if is_quant_method_supported("gptq"):
    TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
        "quantization": "gptq"
    }))

if is_quant_method_supported("gptq_marlin"):
    TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
        "quantization": "gptq_marlin"
    }))

if is_quant_method_supported("gptq_marlin_24"):
    TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
        "quantization": "gptq_marlin_24"
    }))

if is_quant_method_supported("marlin"):
    TEST_MODELS.append(("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
        "quantization": "marlin"
    }))

# The AWQ model is additionally excluded whenever ``is_hip()`` is true.
# NOTE(review): the support check uses "awq" while the model kwargs use
# "AWQ" -- presumably the quantization name is case-insensitive; confirm.
if not is_hip() and is_quant_method_supported("awq"):
    TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
        "quantization": "AWQ"
    }))
def check_full_graph_support(model,
                             model_kwargs,
                             optimization_level,
                             tp_size=1):
    """Generate a few greedy completions to prove full-graph capture works.

    Args:
        model: Model name passed to ``LLM``.
        model_kwargs: Extra keyword arguments for ``LLM``; its optional
            "quantization" entry decides whether the combination is skipped.
        optimization_level: Compilation level, exported as
            ``VLLM_TORCH_COMPILE_LEVEL``.
        tp_size: Tensor-parallel size passed to ``LLM``.

    Returns:
        None. The function returns early, without building a model and
        without touching the environment, for quantization methods that
        are skipped at ``CompilationLevel.INDUCTOR`` and above.
    """
    # Inductor doesn't support fp8/gptq_marlin/gptq_marlin_24 yet.
    # Decide this before exporting anything, so a skipped combination does
    # not leave compile settings behind in os.environ.
    quantization = model_kwargs.get("quantization")
    unsupported_with_inductor = ("fp8", "gptq_marlin", "gptq_marlin_24")
    if (quantization in unsupported_with_inductor
            and optimization_level >= CompilationLevel.INDUCTOR):
        return

    # make sure these models can be captured in full graph mode
    os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(optimization_level)
    os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"

    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    # temperature=0 is used for all prompts.
    sampling_params = SamplingParams(temperature=0)
    llm = LLM(model=model,
              enforce_eager=True,
              tensor_parallel_size=tp_size,
              disable_custom_all_reduce=True,
              **model_kwargs)

    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
tests/conftest.py
View file @
ad385667
import
contextlib
import
contextlib
import
gc
import
gc
import
json
import
os
import
os
import
sys
import
sys
import
tempfile
from
collections
import
UserList
from
collections
import
UserList
from
typing
import
Any
,
Dict
,
List
,
Optional
,
Tuple
,
TypedDict
,
TypeVar
,
Union
from
enum
import
Enum
from
typing
import
(
Any
,
Callable
,
Dict
,
List
,
Optional
,
Tuple
,
Type
,
TypedDict
,
TypeVar
,
Union
)
import
numpy
as
np
import
pytest
import
pytest
import
torch
import
torch
import
torch.nn
as
nn
import
torch.nn
as
nn
import
torch.nn.functional
as
F
import
torch.nn.functional
as
F
from
huggingface_hub
import
snapshot_download
from
PIL
import
Image
from
PIL
import
Image
from
transformers
import
(
AutoModelForCausalLM
,
AutoModelForVision2Seq
,
from
transformers
import
(
AutoModelForCausalLM
,
AutoTokenizer
,
BatchEncoding
,
AutoTokenizer
,
BatchEncoding
,
BatchFeature
)
BatchFeature
)
from
transformers.models.auto.auto_factory
import
_BaseAutoModelClass
from
tests.models.utils
import
(
TokensTextLogprobs
,
TokensTextLogprobsPromptLogprobs
)
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.video
import
VideoAsset
from
vllm.config
import
TokenizerPoolConfig
from
vllm.config
import
TokenizerPoolConfig
from
vllm.connections
import
global_http_connection
from
vllm.connections
import
global_http_connection
from
vllm.distributed
import
(
destroy_distributed_environment
,
from
vllm.distributed
import
(
destroy_distributed_environment
,
destroy_model_parallel
)
destroy_model_parallel
,
from
vllm.inputs
import
TextPrompt
init_distributed_environment
,
initialize_model_parallel
)
from
vllm.inputs
import
(
ExplicitEncoderDecoderPrompt
,
TextPrompt
,
to_enc_dec_tuple_list
,
zip_enc_dec_prompts
)
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.sequence
import
SampleLogprobs
from
vllm.outputs
import
RequestOutput
from
vllm.sampling_params
import
BeamSearchParams
from
vllm.utils
import
(
STR_DTYPE_TO_TORCH_DTYPE
,
cuda_device_count_stateless
,
from
vllm.utils
import
(
STR_DTYPE_TO_TORCH_DTYPE
,
cuda_device_count_stateless
,
is_cpu
)
identity
,
is_cpu
)
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -31,6 +45,11 @@ _TEST_DIR = os.path.dirname(__file__)
...
@@ -31,6 +45,11 @@ _TEST_DIR = os.path.dirname(__file__)
_TEST_PROMPTS
=
[
os
.
path
.
join
(
_TEST_DIR
,
"prompts"
,
"example.txt"
)]
_TEST_PROMPTS
=
[
os
.
path
.
join
(
_TEST_DIR
,
"prompts"
,
"example.txt"
)]
_LONG_PROMPTS
=
[
os
.
path
.
join
(
_TEST_DIR
,
"prompts"
,
"summary.txt"
)]
_LONG_PROMPTS
=
[
os
.
path
.
join
(
_TEST_DIR
,
"prompts"
,
"summary.txt"
)]
PromptImageInput
=
Union
[
List
[
Image
.
Image
],
List
[
List
[
Image
.
Image
]]]
PromptAudioInput
=
Union
[
List
[
Tuple
[
np
.
ndarray
,
int
]],
List
[
List
[
Tuple
[
np
.
ndarray
,
int
]]]]
PromptVideoInput
=
Union
[
List
[
np
.
ndarray
],
List
[
List
[
np
.
ndarray
]]]
def
_read_prompts
(
filename
:
str
)
->
List
[
str
]:
def
_read_prompts
(
filename
:
str
)
->
List
[
str
]:
with
open
(
filename
,
"r"
)
as
f
:
with
open
(
filename
,
"r"
)
as
f
:
...
@@ -71,8 +90,35 @@ class _ImageAssets(_ImageAssetsBase):
...
@@ -71,8 +90,35 @@ class _ImageAssets(_ImageAssetsBase):
return
[
prompts
[
"stop_sign"
],
prompts
[
"cherry_blossom"
]]
return
[
prompts
[
"stop_sign"
],
prompts
[
"cherry_blossom"
]]
class _VideoAssetPrompts(TypedDict):
    """Mapping from video asset name to the prompt text used with it."""
    # Prompt for the ``sample_demo_1`` video asset.
    sample_demo_1: str
# Pick the base class for the video asset list: the parametrized
# ``UserList[VideoAsset]`` where the interpreter allows it, the plain
# ``UserList`` otherwise.
if sys.version_info < (3, 9):
    # UserList cannot be subscripted
    class _VideoAssetsBase(UserList):
        pass
else:

    class _VideoAssetsBase(UserList[VideoAsset]):
        pass
class _VideoAssets(_VideoAssetsBase):
    """List of the video assets available to the tests."""

    def __init__(self) -> None:
        # The list currently holds a single asset, "sample_demo_1.mp4".
        super().__init__([
            VideoAsset("sample_demo_1.mp4"),
        ])

    def prompts(self, prompts: _VideoAssetPrompts) -> List[str]:
        """Return the given prompts as a list with one entry per asset.

        The result contains only ``prompts["sample_demo_1"]``, matching
        the single asset stored by ``__init__``.
        """
        return [prompts["sample_demo_1"]]
IMAGE_ASSETS
=
_ImageAssets
()
IMAGE_ASSETS
=
_ImageAssets
()
"""Singleton instance of :class:`_ImageAssets`."""
"""Singleton instance of :class:`_ImageAssets`."""
VIDEO_ASSETS
=
_VideoAssets
()
"""Singleton instance of :class:`_VideoAssets`."""
@
pytest
.
fixture
(
autouse
=
True
)
@
pytest
.
fixture
(
autouse
=
True
)
...
@@ -82,6 +128,21 @@ def init_test_http_connection():
...
@@ -82,6 +128,21 @@ def init_test_http_connection():
global_http_connection
.
reuse_client
=
False
global_http_connection
.
reuse_client
=
False
@pytest.fixture
def dist_init():
    """Set up a single-process distributed environment for one test.

    Initializes the distributed environment (world size 1, rank 0, "nccl"
    backend) through a ``file://`` init method backed by a temporary file,
    initializes model parallelism with sizes ``(1, 1)``, yields to the
    test and finally calls ``cleanup()``.
    """
    # mkstemp() returns an already-open descriptor together with the path.
    # Only the path is needed here, so close the descriptor immediately
    # instead of leaking one per test.
    fd, temp_file = tempfile.mkstemp()
    os.close(fd)
    try:
        init_distributed_environment(
            world_size=1,
            rank=0,
            distributed_init_method=f"file://{temp_file}",
            local_rank=0,
            backend="nccl",
        )
        initialize_model_parallel(1, 1)
        yield
        cleanup()
    finally:
        # Best-effort removal of the rendezvous file; it may already be
        # gone, and a failure to delete it must not fail the test.
        with contextlib.suppress(OSError):
            os.remove(temp_file)
def
cleanup
():
def
cleanup
():
destroy_model_parallel
()
destroy_model_parallel
()
destroy_distributed_environment
()
destroy_distributed_environment
()
...
@@ -99,10 +160,7 @@ def should_do_global_cleanup_after_test(request) -> bool:
...
@@ -99,10 +160,7 @@ def should_do_global_cleanup_after_test(request) -> bool:
to initialize torch.
to initialize torch.
"""
"""
if
request
.
node
.
get_closest_marker
(
"skip_global_cleanup"
):
return
not
request
.
node
.
get_closest_marker
(
"skip_global_cleanup"
)
return
False
return
True
@
pytest
.
fixture
(
autouse
=
True
)
@
pytest
.
fixture
(
autouse
=
True
)
...
@@ -112,6 +170,12 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
...
@@ -112,6 +170,12 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
cleanup
()
cleanup
()
@pytest.fixture(autouse=True)
def dynamo_reset():
    """Call ``torch._dynamo.reset()`` after every test.

    The fixture is ``autouse``, so it applies without being requested;
    the reset runs in the teardown phase (after the ``yield``).
    """
    yield
    torch._dynamo.reset()
@
pytest
.
fixture
@
pytest
.
fixture
def
example_prompts
()
->
List
[
str
]:
def
example_prompts
()
->
List
[
str
]:
prompts
=
[]
prompts
=
[]
...
@@ -120,6 +184,46 @@ def example_prompts() -> List[str]:
...
@@ -120,6 +184,46 @@ def example_prompts() -> List[str]:
return
prompts
return
prompts
class DecoderPromptType(Enum):
    """For encoder/decoder models only.

    Used as the key of the dictionary returned by the
    ``example_encoder_decoder_prompts`` fixture.
    """
    # In that fixture: decoder prompts are the encoder prompts in reverse.
    CUSTOM = 1
    # In that fixture: every decoder prompt is ``None``.
    NONE = 2
    # In that fixture: every decoder prompt is the empty string.
    EMPTY_STR = 3
@pytest.fixture
def example_encoder_decoder_prompts(
) -> Dict[DecoderPromptType, List[ExplicitEncoderDecoderPrompt]]:
    """Build encoder/decoder prompt pairs for each decoder prompt type.

    The encoder prompts are read from every file in ``_TEST_PROMPTS``.
    They are then zipped with three different decoder prompt lists of the
    same length:

    * ``DecoderPromptType.NONE``: all ``None``
    * ``DecoderPromptType.EMPTY_STR``: all empty strings
    * ``DecoderPromptType.CUSTOM``: the encoder prompts in reverse order

    Returns:
        A dictionary mapping each decoder prompt type to the result of
        ``zip_enc_dec_prompts`` for that pairing.
    """
    encoder_prompts = [
        prompt for path in _TEST_PROMPTS for prompt in _read_prompts(path)
    ]
    num_prompts = len(encoder_prompts)

    # Insertion order (NONE, EMPTY_STR, CUSTOM) is kept on purpose so the
    # resulting dictionary iterates in the same order as before.
    decoder_prompts_by_type = {
        DecoderPromptType.NONE: [None] * num_prompts,
        DecoderPromptType.EMPTY_STR: [""] * num_prompts,
        DecoderPromptType.CUSTOM: list(reversed(encoder_prompts)),
    }

    return {
        prompt_type: zip_enc_dec_prompts(encoder_prompts, decoder_prompts)
        for prompt_type, decoder_prompts in decoder_prompts_by_type.items()
    }
@
pytest
.
fixture
@
pytest
.
fixture
def
example_long_prompts
()
->
List
[
str
]:
def
example_long_prompts
()
->
List
[
str
]:
prompts
=
[]
prompts
=
[]
...
@@ -133,16 +237,24 @@ def image_assets() -> _ImageAssets:
...
@@ -133,16 +237,24 @@ def image_assets() -> _ImageAssets:
return
IMAGE_ASSETS
return
IMAGE_ASSETS
@pytest.fixture(scope="session")
def video_assets() -> _VideoAssets:
    """Session-scoped fixture returning the module-level ``VIDEO_ASSETS``."""
    return VIDEO_ASSETS
_T
=
TypeVar
(
"_T"
,
nn
.
Module
,
torch
.
Tensor
,
BatchEncoding
,
BatchFeature
)
_T
=
TypeVar
(
"_T"
,
nn
.
Module
,
torch
.
Tensor
,
BatchEncoding
,
BatchFeature
)
class
HfRunner
:
class
HfRunner
:
def
wrap_device
(
self
,
input
:
_T
)
->
_T
:
def
wrap_device
(
self
,
input
:
_T
,
device
:
Optional
[
str
]
=
None
)
->
_T
:
if
not
is_cpu
():
if
device
is
None
:
return
input
.
to
(
"cuda"
)
return
self
.
wrap_device
(
input
,
"cpu"
if
is_cpu
()
else
"cuda"
)
else
:
return
input
.
to
(
"cpu"
)
if
hasattr
(
input
,
"device"
)
and
input
.
device
.
type
==
device
:
return
input
return
input
.
to
(
device
)
def
__init__
(
def
__init__
(
self
,
self
,
...
@@ -150,27 +262,25 @@ class HfRunner:
...
@@ -150,27 +262,25 @@ class HfRunner:
dtype
:
str
=
"half"
,
dtype
:
str
=
"half"
,
*
,
*
,
model_kwargs
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
model_kwargs
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
is_embedding_model
:
bool
=
False
,
is_sentence_transformer
:
bool
=
False
,
is_vision_model
:
bool
=
False
,
auto_cls
:
Type
[
_BaseAutoModelClass
]
=
AutoModelForCausalLM
,
postprocess_inputs
:
Callable
[[
BatchEncoding
],
BatchEncoding
]
=
identity
,
)
->
None
:
)
->
None
:
torch_dtype
=
STR_DTYPE_TO_TORCH_DTYPE
[
dtype
]
torch_dtype
=
STR_DTYPE_TO_TORCH_DTYPE
[
dtype
]
self
.
model_name
=
model_name
self
.
model_name
=
model_name
if
is_
embedding_model
:
if
is_
sentence_transformer
:
# Lazy init required for AMD CI
# Lazy init required for AMD CI
from
sentence_transformers
import
SentenceTransformer
from
sentence_transformers
import
SentenceTransformer
self
.
model
=
self
.
wrap_device
(
self
.
model
=
self
.
wrap_device
(
SentenceTransformer
(
SentenceTransformer
(
model_name
,
model_name
,
device
=
"cpu"
,
device
=
"cpu"
,
trust_remote_code
=
True
,
).
to
(
dtype
=
torch_dtype
))
).
to
(
dtype
=
torch_dtype
))
else
:
else
:
if
is_vision_model
:
auto_cls
=
AutoModelForVision2Seq
else
:
auto_cls
=
AutoModelForCausalLM
model_kwargs
=
model_kwargs
if
model_kwargs
is
not
None
else
{}
model_kwargs
=
model_kwargs
if
model_kwargs
is
not
None
else
{}
self
.
model
=
self
.
wrap_device
(
self
.
model
=
self
.
wrap_device
(
auto_cls
.
from_pretrained
(
auto_cls
.
from_pretrained
(
...
@@ -186,31 +296,34 @@ class HfRunner:
...
@@ -186,31 +296,34 @@ class HfRunner:
trust_remote_code
=
True
,
trust_remote_code
=
True
,
)
)
try
:
# don't put this import at the top level
# don't put this import at the top level
# it will call torch.cuda.device_count()
# it will call torch.cuda.device_count()
from
transformers
import
AutoProcessor
# noqa: F401
from
transformers
import
AutoProcessor
# noqa: F401
self
.
processor
=
AutoProcessor
.
from_pretrained
(
self
.
processor
=
AutoProcessor
.
from_pretrained
(
model_name
,
model_name
,
torch_dtype
=
torch_dtype
,
torch_dtype
=
torch_dtype
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
)
)
except
Exception
:
logger
.
warning
(
"Unable to auto-load processor from HuggingFace for "
"model %s. Using tokenizer instead."
,
model_name
)
self
.
processor
=
self
.
tokenizer
def
generate
(
self
.
postprocess_inputs
=
postprocess_inputs
def
get_inputs
(
self
,
self
,
prompts
:
List
[
str
],
prompts
:
List
[
str
],
images
:
Optional
[
List
[
Image
.
Image
]]
=
None
,
images
:
Optional
[
PromptImageInput
]
=
None
,
**
kwargs
:
Any
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
)
->
List
[
Tuple
[
List
[
List
[
int
]],
List
[
str
]]]:
audios
:
Optional
[
PromptAudioInput
]
=
None
,
if
images
:
)
->
List
[
BatchEncoding
]:
if
images
is
not
None
:
assert
len
(
prompts
)
==
len
(
images
)
assert
len
(
prompts
)
==
len
(
images
)
outputs
:
List
[
Tuple
[
List
[
List
[
int
]],
List
[
str
]]]
=
[]
if
videos
is
not
None
:
assert
len
(
prompts
)
==
len
(
videos
)
if
audios
is
not
None
:
assert
len
(
prompts
)
==
len
(
audios
)
all_inputs
:
List
[
BatchEncoding
]
=
[]
for
i
,
prompt
in
enumerate
(
prompts
):
for
i
,
prompt
in
enumerate
(
prompts
):
processor_kwargs
:
Dict
[
str
,
Any
]
=
{
processor_kwargs
:
Dict
[
str
,
Any
]
=
{
"text"
:
prompt
,
"text"
:
prompt
,
...
@@ -218,11 +331,37 @@ class HfRunner:
...
@@ -218,11 +331,37 @@ class HfRunner:
}
}
if
images
is
not
None
and
images
[
i
]
is
not
None
:
if
images
is
not
None
and
images
[
i
]
is
not
None
:
processor_kwargs
[
"images"
]
=
images
[
i
]
processor_kwargs
[
"images"
]
=
images
[
i
]
if
videos
is
not
None
and
videos
[
i
]
is
not
None
:
processor_kwargs
[
"videos"
]
=
videos
[
i
]
if
audios
is
not
None
and
audios
[
i
]
is
not
None
:
audio
,
sr
=
audios
[
i
]
processor_kwargs
[
"audio"
]
=
audio
processor_kwargs
[
"sampling_rate"
]
=
sr
inputs
=
self
.
processor
(
**
processor_kwargs
)
inputs
=
self
.
processor
(
**
processor_kwargs
)
inputs
=
self
.
postprocess_inputs
(
inputs
)
all_inputs
.
append
(
inputs
)
return
all_inputs
def
generate
(
self
,
prompts
:
List
[
str
],
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
List
[
np
.
ndarray
]]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
**
kwargs
:
Any
,
)
->
List
[
Tuple
[
List
[
List
[
int
]],
List
[
str
]]]:
all_inputs
=
self
.
get_inputs
(
prompts
,
images
=
images
,
videos
=
videos
,
audios
=
audios
)
outputs
:
List
[
Tuple
[
List
[
List
[
int
]],
List
[
str
]]]
=
[]
for
inputs
in
all_inputs
:
output_ids
=
self
.
model
.
generate
(
output_ids
=
self
.
model
.
generate
(
**
self
.
wrap_device
(
inputs
),
**
self
.
wrap_device
(
inputs
,
device
=
self
.
model
.
device
.
type
),
use_cache
=
True
,
use_cache
=
True
,
**
kwargs
,
**
kwargs
,
)
)
...
@@ -239,13 +378,17 @@ class HfRunner:
...
@@ -239,13 +378,17 @@ class HfRunner:
self
,
self
,
prompts
:
List
[
str
],
prompts
:
List
[
str
],
max_tokens
:
int
,
max_tokens
:
int
,
images
:
Optional
[
List
[
Image
.
Image
]]
=
None
,
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
List
[
np
.
ndarray
]]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
**
kwargs
:
Any
,
**
kwargs
:
Any
,
)
->
List
[
Tuple
[
List
[
int
],
str
]]:
)
->
List
[
Tuple
[
List
[
int
],
str
]]:
outputs
=
self
.
generate
(
prompts
,
outputs
=
self
.
generate
(
prompts
,
do_sample
=
False
,
do_sample
=
False
,
max_new_tokens
=
max_tokens
,
max_new_tokens
=
max_tokens
,
images
=
images
,
images
=
images
,
videos
=
videos
,
audios
=
audios
,
**
kwargs
)
**
kwargs
)
return
[(
output_ids
[
0
],
output_str
[
0
])
return
[(
output_ids
[
0
],
output_str
[
0
])
...
@@ -276,22 +419,20 @@ class HfRunner:
...
@@ -276,22 +419,20 @@ class HfRunner:
self
,
self
,
prompts
:
List
[
str
],
prompts
:
List
[
str
],
max_tokens
:
int
,
max_tokens
:
int
,
images
:
Optional
[
List
[
Image
.
Image
]]
=
None
,
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
List
[
np
.
ndarray
]]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
**
kwargs
:
Any
,
**
kwargs
:
Any
,
)
->
List
[
List
[
torch
.
Tensor
]]:
)
->
List
[
List
[
torch
.
Tensor
]]:
all_logprobs
:
List
[
List
[
torch
.
Tensor
]]
=
[]
all_inputs
=
self
.
get_inputs
(
prompts
,
for
i
,
prompt
in
enumerate
(
prompts
):
images
=
images
,
processor_kwargs
:
Dict
[
str
,
Any
]
=
{
videos
=
videos
,
"text"
:
prompt
,
audios
=
audios
)
"return_tensors"
:
"pt"
,
}
if
images
is
not
None
and
images
[
i
]
is
not
None
:
processor_kwargs
[
"images"
]
=
images
[
i
]
inputs
=
self
.
processor
(
**
processor_kwargs
)
all_logprobs
:
List
[
List
[
torch
.
Tensor
]]
=
[]
for
inputs
in
all_inputs
:
output
=
self
.
model
.
generate
(
output
=
self
.
model
.
generate
(
**
self
.
wrap_device
(
inputs
),
**
self
.
wrap_device
(
inputs
,
device
=
self
.
model
.
device
.
type
),
use_cache
=
True
,
use_cache
=
True
,
do_sample
=
False
,
do_sample
=
False
,
max_new_tokens
=
max_tokens
,
max_new_tokens
=
max_tokens
,
...
@@ -299,45 +440,140 @@ class HfRunner:
...
@@ -299,45 +440,140 @@ class HfRunner:
return_dict_in_generate
=
True
,
return_dict_in_generate
=
True
,
**
kwargs
,
**
kwargs
,
)
)
seq_logprobs
:
List
[
torch
.
Tensor
]
=
[]
seq_logprobs
=
self
.
_hidden_states_to_seq_logprobs
(
for
hidden_states
in
output
.
hidden_states
:
output
.
hidden_states
)
last_hidden_states
=
hidden_states
[
-
1
][
0
]
logits
=
torch
.
matmul
(
last_hidden_states
,
self
.
model
.
get_output_embeddings
().
weight
.
t
(),
)
if
self
.
model
.
get_output_embeddings
().
bias
is
not
None
:
logits
+=
self
.
model
.
get_output_embeddings
(
).
bias
.
unsqueeze
(
0
)
logprobs
=
F
.
log_softmax
(
logits
,
dim
=-
1
,
dtype
=
torch
.
float32
)
seq_logprobs
.
append
(
logprobs
)
all_logprobs
.
append
(
seq_logprobs
)
all_logprobs
.
append
(
seq_logprobs
)
return
all_logprobs
return
all_logprobs
def
_hidden_states_to_seq_logprobs
(
self
,
hidden_states
:
Tuple
[
Tuple
[
torch
.
Tensor
,
...],
...],
)
->
List
[
torch
.
Tensor
]:
output_embeddings
=
self
.
model
.
get_output_embeddings
()
seq_logprobs
:
List
[
torch
.
Tensor
]
=
[]
for
_
,
hidden_state
in
enumerate
(
hidden_states
):
last_hidden_states
=
hidden_state
[
-
1
][
0
]
logits
=
torch
.
matmul
(
last_hidden_states
.
to
(
output_embeddings
.
weight
.
device
),
output_embeddings
.
weight
.
t
(),
)
if
getattr
(
output_embeddings
,
"bias"
,
None
)
is
not
None
:
logits
+=
output_embeddings
.
bias
.
unsqueeze
(
0
)
logprobs
=
F
.
log_softmax
(
logits
,
dim
=-
1
,
dtype
=
torch
.
float32
)
seq_logprobs
.
append
(
logprobs
)
return
seq_logprobs
def
_hidden_states_to_logprobs
(
self
,
hidden_states
:
Tuple
[
Tuple
[
torch
.
Tensor
,
...],
...],
num_logprobs
:
int
,
)
->
Tuple
[
List
[
Dict
[
int
,
float
]],
int
]:
seq_logprobs
=
self
.
_hidden_states_to_seq_logprobs
(
hidden_states
)
output_len
=
len
(
hidden_states
)
# convert to dict
seq_logprobs_lst
:
List
[
Dict
[
int
,
float
]]
=
[]
for
tok_idx
,
tok_logprobs
in
enumerate
(
seq_logprobs
):
# drop prompt logprobs
if
tok_idx
==
0
:
tok_logprobs
=
tok_logprobs
[
-
1
,
:].
reshape
(
1
,
-
1
)
topk
=
tok_logprobs
.
topk
(
num_logprobs
)
tok_logprobs_dct
=
{}
for
token_id
,
logprob
in
zip
(
topk
.
indices
[
0
],
topk
.
values
[
0
]):
tok_logprobs_dct
[
token_id
.
item
()]
=
logprob
.
item
()
seq_logprobs_lst
.
append
(
tok_logprobs_dct
)
return
(
seq_logprobs_lst
,
output_len
,
)
def
generate_greedy_logprobs_limit
(
def
generate_greedy_logprobs_limit
(
self
,
self
,
prompts
:
List
[
str
],
prompts
:
List
[
str
],
max_tokens
:
int
,
max_tokens
:
int
,
num_logprobs
:
int
,
num_logprobs
:
int
,
images
:
Optional
[
List
[
Image
.
Image
]]
=
None
,
images
:
Optional
[
PromptImageInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
videos
:
Optional
[
List
[
np
.
ndarray
]]
=
None
,
**
kwargs
:
Any
,
**
kwargs
:
Any
,
)
->
List
[
Tuple
[
List
[
int
],
str
,
List
[
Dict
[
int
,
float
]]]]:
)
->
List
[
TokensTextLogprobs
]:
all_inputs
=
self
.
get_inputs
(
prompts
,
images
=
images
,
videos
=
videos
,
audios
=
audios
)
all_logprobs
:
List
[
List
[
Dict
[
int
,
float
]]]
=
[]
all_logprobs
:
List
[
List
[
Dict
[
int
,
float
]]]
=
[]
all_output_ids
:
List
[
List
[
int
]]
=
[]
all_output_ids
:
List
[
List
[
int
]]
=
[]
all_output_strs
:
List
[
str
]
=
[]
all_output_strs
:
List
[
str
]
=
[]
for
i
,
prompt
in
enumerate
(
prompts
):
for
inputs
in
all_inputs
:
processor_kwargs
:
Dict
[
str
,
Any
]
=
{
output
=
self
.
model
.
generate
(
"text"
:
prompt
,
**
self
.
wrap_device
(
inputs
,
device
=
self
.
model
.
device
.
type
),
"return_tensors"
:
"pt"
,
use_cache
=
True
,
}
do_sample
=
False
,
if
images
is
not
None
and
images
[
i
]
is
not
None
:
max_new_tokens
=
max_tokens
,
processor_kwargs
[
"images"
]
=
images
[
i
]
output_hidden_states
=
True
,
return_dict_in_generate
=
True
,
**
kwargs
,
)
inputs
=
self
.
processor
(
**
processor_kwargs
)
(
seq_logprobs_lst
,
output_len
,
)
=
self
.
_hidden_states_to_logprobs
(
output
.
hidden_states
,
num_logprobs
)
all_logprobs
.
append
(
seq_logprobs_lst
)
seq_ids
=
output
.
sequences
[
0
]
output_len
=
len
(
seq_logprobs_lst
)
output_ids
=
seq_ids
[
-
output_len
:]
all_output_ids
.
append
(
output_ids
.
tolist
())
all_output_strs
.
append
(
self
.
tokenizer
.
decode
(
output_ids
))
outputs
=
zip
(
all_output_ids
,
all_output_strs
,
all_logprobs
)
return
[(
output_ids
,
output_str
,
output_logprobs
)
for
output_ids
,
output_str
,
output_logprobs
in
outputs
]
def
generate_encoder_decoder_greedy_logprobs_limit
(
self
,
encoder_decoder_prompts
:
List
[
ExplicitEncoderDecoderPrompt
[
str
,
str
]],
max_tokens
:
int
,
num_logprobs
:
int
,
**
kwargs
:
Any
,
)
->
List
[
TokensTextLogprobs
]:
'''
Greedy logprobs generation for vLLM encoder/decoder models
'''
all_logprobs
:
List
[
List
[
Dict
[
int
,
float
]]]
=
[]
all_output_ids
:
List
[
List
[
int
]]
=
[]
all_output_strs
:
List
[
str
]
=
[]
for
(
encoder_prompt
,
decoder_prompt
)
in
to_enc_dec_tuple_list
(
encoder_decoder_prompts
):
encoder_input_ids
=
self
.
wrap_device
(
self
.
tokenizer
(
encoder_prompt
,
return_tensors
=
"pt"
).
input_ids
,
device
=
self
.
model
.
device
.
type
,
)
if
decoder_prompt
is
None
:
decoder_input_ids
=
None
else
:
decoder_input_ids
=
self
.
wrap_device
(
self
.
tokenizer
(
decoder_prompt
,
return_tensors
=
"pt"
).
input_ids
,
device
=
self
.
model
.
device
.
type
,
)
output
=
self
.
model
.
generate
(
output
=
self
.
model
.
generate
(
**
self
.
wrap_device
(
inputs
),
encoder_input_ids
,
decoder_input_ids
=
decoder_input_ids
,
use_cache
=
True
,
use_cache
=
True
,
do_sample
=
False
,
do_sample
=
False
,
max_new_tokens
=
max_tokens
,
max_new_tokens
=
max_tokens
,
...
@@ -346,37 +582,14 @@ class HfRunner:
...
@@ -346,37 +582,14 @@ class HfRunner:
**
kwargs
,
**
kwargs
,
)
)
seq_logprobs
:
List
[
torch
.
Tensor
]
=
[]
(
for
_
,
hidden_states
in
enumerate
(
output
.
hidden_states
):
seq_logprobs_lst
,
last_hidden_states
=
hidden_states
[
-
1
][
0
]
output_len
,
logits
=
torch
.
matmul
(
)
=
self
.
_hidden_states_to_logprobs
(
output
.
decoder_hidden_states
,
last_hidden_states
,
num_logprobs
)
self
.
model
.
get_output_embeddings
().
weight
.
t
(),
)
if
getattr
(
self
.
model
.
get_output_embeddings
(),
"bias"
,
None
)
is
not
None
:
logits
+=
self
.
model
.
get_output_embeddings
(
).
bias
.
unsqueeze
(
0
)
logprobs
=
F
.
log_softmax
(
logits
,
dim
=-
1
,
dtype
=
torch
.
float32
)
seq_logprobs
.
append
(
logprobs
)
# convert to dict
seq_logprobs_lst
:
List
[
Dict
[
int
,
float
]]
=
[]
for
tok_idx
,
tok_logprobs
in
enumerate
(
seq_logprobs
):
# drop prompt logprobs
if
tok_idx
==
0
:
tok_logprobs
=
tok_logprobs
[
-
1
,
:].
reshape
(
1
,
-
1
)
topk
=
tok_logprobs
.
topk
(
num_logprobs
)
tok_logprobs_dct
=
{}
for
token_id
,
logprob
in
zip
(
topk
.
indices
[
0
],
topk
.
values
[
0
]):
tok_logprobs_dct
[
token_id
.
item
()]
=
logprob
.
item
()
seq_logprobs_lst
.
append
(
tok_logprobs_dct
)
all_logprobs
.
append
(
seq_logprobs_lst
)
all_logprobs
.
append
(
seq_logprobs_lst
)
seq_ids
=
output
.
sequences
[
0
]
seq_ids
=
output
.
sequences
[
0
]
output_len
=
len
(
seq_logprobs_lst
)
output_ids
=
seq_ids
[
-
output_len
:]
output_ids
=
seq_ids
[
-
output_len
:]
all_output_ids
.
append
(
output_ids
.
tolist
())
all_output_ids
.
append
(
output_ids
.
tolist
())
all_output_strs
.
append
(
self
.
tokenizer
.
decode
(
output_ids
))
all_output_strs
.
append
(
self
.
tokenizer
.
decode
(
output_ids
))
...
@@ -416,7 +629,7 @@ class VllmRunner:
...
@@ -416,7 +629,7 @@ class VllmRunner:
block_size
:
int
=
16
,
block_size
:
int
=
16
,
enable_chunked_prefill
:
bool
=
False
,
enable_chunked_prefill
:
bool
=
False
,
swap_space
:
int
=
4
,
swap_space
:
int
=
4
,
enforce_eager
:
bool
=
False
,
enforce_eager
:
Optional
[
bool
]
=
False
,
**
kwargs
,
**
kwargs
,
)
->
None
:
)
->
None
:
self
.
model
=
LLM
(
self
.
model
=
LLM
(
...
@@ -434,20 +647,50 @@ class VllmRunner:
...
@@ -434,20 +647,50 @@ class VllmRunner:
**
kwargs
,
**
kwargs
,
)
)
def
ge
nerate
(
def
ge
t_inputs
(
self
,
self
,
prompts
:
List
[
str
],
prompts
:
List
[
str
],
sampling_params
:
SamplingParams
,
images
:
Optional
[
PromptImageInput
]
=
None
,
images
:
Optional
[
List
[
Image
.
Image
]]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
)
->
List
[
Tuple
[
List
[
List
[
int
]],
List
[
str
]]]:
audios
:
Optional
[
PromptAudioInput
]
=
None
,
)
->
List
[
TextPrompt
]:
if
images
is
not
None
:
if
images
is
not
None
:
assert
len
(
prompts
)
==
len
(
images
)
assert
len
(
prompts
)
==
len
(
images
)
if
videos
is
not
None
:
assert
len
(
prompts
)
==
len
(
videos
)
if
audios
is
not
None
:
assert
len
(
prompts
)
==
len
(
audios
)
inputs
=
[
TextPrompt
(
prompt
=
prompt
)
for
prompt
in
prompts
]
inputs
=
[
TextPrompt
(
prompt
=
prompt
)
for
prompt
in
prompts
]
if
images
is
not
None
:
if
images
is
not
None
:
for
i
,
image
in
enumerate
(
images
):
for
i
,
image
in
enumerate
(
images
):
inputs
[
i
][
"multi_modal_data"
]
=
{
"image"
:
image
}
inputs
[
i
][
"multi_modal_data"
]
=
{
"image"
:
image
}
if
videos
is
not
None
:
for
i
,
video
in
enumerate
(
videos
):
inputs
[
i
][
"multi_modal_data"
]
=
{
"video"
:
video
}
if
audios
is
not
None
:
for
i
,
audio
in
enumerate
(
audios
):
inputs
[
i
][
"multi_modal_data"
]
=
{
"audio"
:
audio
}
return
inputs
def
generate
(
self
,
prompts
:
List
[
str
],
sampling_params
:
SamplingParams
,
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
)
->
List
[
Tuple
[
List
[
List
[
int
]],
List
[
str
]]]:
inputs
=
self
.
get_inputs
(
prompts
,
images
=
images
,
videos
=
videos
,
audios
=
audios
)
req_outputs
=
self
.
model
.
generate
(
inputs
,
req_outputs
=
self
.
model
.
generate
(
inputs
,
sampling_params
=
sampling_params
)
sampling_params
=
sampling_params
)
...
@@ -465,41 +708,79 @@ class VllmRunner:
...
@@ -465,41 +708,79 @@ class VllmRunner:
outputs
.
append
((
req_sample_output_ids
,
req_sample_output_strs
))
outputs
.
append
((
req_sample_output_ids
,
req_sample_output_strs
))
return
outputs
return
outputs
@
staticmethod
def
_final_steps_generate_w_logprobs
(
req_outputs
:
List
[
RequestOutput
],
)
->
List
[
TokensTextLogprobsPromptLogprobs
]:
outputs
:
List
[
TokensTextLogprobsPromptLogprobs
]
=
[]
for
req_output
in
req_outputs
:
assert
len
(
req_output
.
outputs
)
>
0
for
sample
in
req_output
.
outputs
:
output_str
=
sample
.
text
output_ids
=
list
(
sample
.
token_ids
)
output_logprobs
=
sample
.
logprobs
outputs
.
append
((
output_ids
,
output_str
,
output_logprobs
,
req_output
.
prompt_logprobs
))
return
outputs
def
generate_w_logprobs
(
def
generate_w_logprobs
(
self
,
self
,
prompts
:
List
[
str
],
prompts
:
List
[
str
],
sampling_params
:
SamplingParams
,
sampling_params
:
SamplingParams
,
images
:
Optional
[
List
[
Image
.
Image
]]
=
None
,
images
:
Optional
[
PromptImageInput
]
=
None
,
)
->
List
[
Tuple
[
List
[
int
],
str
,
Optional
[
SampleLogprobs
]]]:
audios
:
Optional
[
PromptAudioInput
]
=
None
,
assert
sampling_params
.
logprobs
is
not
None
videos
:
Optional
[
PromptVideoInput
]
=
None
,
)
->
Union
[
List
[
TokensTextLogprobs
],
List
[
TokensTextLogprobsPromptLogprobs
]]:
inputs
=
self
.
get_inputs
(
prompts
,
images
=
images
,
videos
=
videos
,
audios
=
audios
)
if
images
is
not
None
:
req_outputs
=
self
.
model
.
generate
(
inputs
,
assert
len
(
prompts
)
==
len
(
image
s
)
sampling_params
=
sampling_param
s
)
inputs
=
[
TextPrompt
(
prompt
=
prompt
)
for
prompt
in
prompts
]
toks_str_logsprobs_prompt_logprobs
=
(
if
images
is
not
None
:
self
.
_final_steps_generate_w_logprobs
(
req_outputs
))
for
i
,
image
in
enumerate
(
images
):
# Omit prompt logprobs if not required by sampling params
inputs
[
i
][
"multi_modal_data"
]
=
{
"image"
:
image
}
return
([
x
[
0
:
-
1
]
for
x
in
toks_str_logsprobs_prompt_logprobs
]
if
sampling_params
.
prompt_logprobs
is
None
else
toks_str_logsprobs_prompt_logprobs
)
req_outputs
=
self
.
model
.
generate
(
inputs
,
def
generate_encoder_decoder_w_logprobs
(
self
,
encoder_decoder_prompts
:
List
[
ExplicitEncoderDecoderPrompt
[
str
,
str
]],
sampling_params
:
SamplingParams
,
)
->
Union
[
List
[
TokensTextLogprobs
],
List
[
TokensTextLogprobsPromptLogprobs
]]:
'''
Logprobs generation for vLLM encoder/decoder models
'''
assert
sampling_params
.
logprobs
is
not
None
req_outputs
=
self
.
model
.
generate
(
encoder_decoder_prompts
,
sampling_params
=
sampling_params
)
sampling_params
=
sampling_params
)
outputs
:
List
[
Tuple
[
List
[
int
],
str
,
Optional
[
SampleLogprobs
]]]
=
[]
toks_str_logsprobs_prompt_logprobs
=
(
for
req_output
in
req_outputs
:
self
.
_final_steps_generate_w_logprobs
(
req_outputs
))
for
sample
in
req_output
.
outputs
:
# Omit prompt logprobs if not required by sampling params
output_str
=
sample
.
text
return
([
x
[
0
:
-
1
]
for
x
in
toks_str_logsprobs_prompt_logprobs
]
output_ids
=
sample
.
token_ids
if
sampling_params
.
prompt_logprobs
is
None
else
output_logprobs
=
sample
.
logprobs
toks_str_logsprobs_prompt_logprobs
)
outputs
.
append
((
output_ids
,
output_str
,
output_logprobs
))
return
outputs
def
generate_greedy
(
def
generate_greedy
(
self
,
self
,
prompts
:
List
[
str
],
prompts
:
List
[
str
],
max_tokens
:
int
,
max_tokens
:
int
,
images
:
Optional
[
List
[
Image
.
Image
]]
=
None
,
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
)
->
List
[
Tuple
[
List
[
int
],
str
]]:
)
->
List
[
Tuple
[
List
[
int
],
str
]]:
greedy_params
=
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
max_tokens
)
greedy_params
=
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
max_tokens
)
outputs
=
self
.
generate
(
prompts
,
greedy_params
,
images
=
images
)
outputs
=
self
.
generate
(
prompts
,
greedy_params
,
images
=
images
,
videos
=
videos
,
audios
=
audios
)
return
[(
output_ids
[
0
],
output_str
[
0
])
return
[(
output_ids
[
0
],
output_str
[
0
])
for
output_ids
,
output_str
in
outputs
]
for
output_ids
,
output_str
in
outputs
]
...
@@ -508,33 +789,62 @@ class VllmRunner:
...
@@ -508,33 +789,62 @@ class VllmRunner:
prompts
:
List
[
str
],
prompts
:
List
[
str
],
max_tokens
:
int
,
max_tokens
:
int
,
num_logprobs
:
int
,
num_logprobs
:
int
,
images
:
Optional
[
Union
[
List
[
Image
.
Image
],
num_prompt_logprobs
:
Optional
[
int
]
=
None
,
List
[
List
[
Image
.
Image
]]]]
=
None
,
images
:
Optional
[
PromptImageInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
stop_token_ids
:
Optional
[
List
[
int
]]
=
None
,
stop_token_ids
:
Optional
[
List
[
int
]]
=
None
,
)
->
List
[
Tuple
[
List
[
int
],
str
,
Optional
[
SampleLogprobs
]]]:
)
->
Union
[
List
[
TokensTextLogprobs
],
greedy_logprobs_params
=
SamplingParams
(
temperature
=
0.0
,
List
[
TokensTextLogprobsPromptLogprobs
]]:
max_tokens
=
max_tokens
,
greedy_logprobs_params
=
SamplingParams
(
logprobs
=
num_logprobs
,
temperature
=
0.0
,
stop_token_ids
=
stop_token_ids
)
max_tokens
=
max_tokens
,
outputs
=
self
.
generate_w_logprobs
(
prompts
,
logprobs
=
num_logprobs
,
greedy_logprobs_params
,
prompt_logprobs
=
num_prompt_logprobs
,
images
=
images
)
stop_token_ids
=
stop_token_ids
)
return
self
.
generate_w_logprobs
(
prompts
,
greedy_logprobs_params
,
images
=
images
,
audios
=
audios
,
videos
=
videos
)
def
generate_encoder_decoder_greedy_logprobs
(
self
,
encoder_decoder_prompts
:
List
[
ExplicitEncoderDecoderPrompt
[
str
,
str
]],
max_tokens
:
int
,
num_logprobs
:
int
,
num_prompt_logprobs
:
Optional
[
int
]
=
None
,
)
->
Union
[
List
[
TokensTextLogprobs
],
List
[
TokensTextLogprobsPromptLogprobs
]]:
greedy_logprobs_params
=
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
max_tokens
,
logprobs
=
num_logprobs
,
prompt_logprobs
=
(
num_prompt_logprobs
),
)
'''
Greedy logprobs generation for vLLM encoder/decoder models
'''
return
[(
output_ids
,
output_str
,
output
_logprobs
)
return
self
.
generate_encoder_decoder_w
_logprobs
(
for
output_ids
,
output_str
,
output_logprobs
in
outputs
]
encoder_decoder_prompts
,
greedy_logprobs_params
)
def
generate_beam_search
(
def
generate_beam_search
(
self
,
self
,
prompts
:
List
[
str
],
prompts
:
Union
[
List
[
str
],
List
[
List
[
int
]]],
beam_width
:
int
,
beam_width
:
int
,
max_tokens
:
int
,
max_tokens
:
int
,
)
->
List
[
Tuple
[
List
[
List
[
int
]],
List
[
str
]]]:
)
->
List
[
Tuple
[
List
[
List
[
int
]],
List
[
str
]]]:
beam_search_params
=
SamplingParams
(
n
=
beam_width
,
outputs
=
self
.
model
.
beam_search
(
use_beam_search
=
True
,
prompts
,
temperature
=
0.0
,
BeamSearchParams
(
beam_width
=
beam_width
,
max_tokens
=
max_tokens
))
max_tokens
=
max_tokens
)
returned_outputs
=
[]
outputs
=
self
.
generate
(
prompts
,
beam_search_params
)
for
output
in
outputs
:
return
outputs
token_ids
=
[
x
.
tokens
for
x
in
output
.
sequences
]
texts
=
[
x
.
text
for
x
in
output
.
sequences
]
returned_outputs
.
append
((
token_ids
,
texts
))
return
returned_outputs
def
encode
(
self
,
prompts
:
List
[
str
])
->
List
[
List
[
float
]]:
def
encode
(
self
,
prompts
:
List
[
str
])
->
List
[
List
[
float
]]:
req_outputs
=
self
.
model
.
encode
(
prompts
)
req_outputs
=
self
.
model
.
encode
(
prompts
)
...
@@ -593,3 +903,66 @@ def num_gpus_available():
...
@@ -593,3 +903,66 @@ def num_gpus_available():
in current process."""
in current process."""
return
cuda_device_count_stateless
()
return
cuda_device_count_stateless
()
temp_dir
=
tempfile
.
gettempdir
()
_dummy_opt_path
=
os
.
path
.
join
(
temp_dir
,
"dummy_opt"
)
_dummy_llava_path
=
os
.
path
.
join
(
temp_dir
,
"dummy_llava"
)
_dummy_gemma2_embedding_path
=
os
.
path
.
join
(
temp_dir
,
"dummy_gemma2_embedding"
)
@
pytest
.
fixture
def
dummy_opt_path
():
json_path
=
os
.
path
.
join
(
_dummy_opt_path
,
"config.json"
)
if
not
os
.
path
.
exists
(
_dummy_opt_path
):
snapshot_download
(
repo_id
=
"facebook/opt-125m"
,
local_dir
=
_dummy_opt_path
,
ignore_patterns
=
[
"*.bin"
,
"*.bin.index.json"
,
"*.pt"
,
"*.h5"
,
"*.msgpack"
])
assert
os
.
path
.
exists
(
json_path
)
with
open
(
json_path
,
"r"
)
as
f
:
config
=
json
.
load
(
f
)
config
[
"architectures"
]
=
[
"MyOPTForCausalLM"
]
with
open
(
json_path
,
"w"
)
as
f
:
json
.
dump
(
config
,
f
)
return
_dummy_opt_path
@
pytest
.
fixture
def
dummy_llava_path
():
json_path
=
os
.
path
.
join
(
_dummy_llava_path
,
"config.json"
)
if
not
os
.
path
.
exists
(
_dummy_llava_path
):
snapshot_download
(
repo_id
=
"llava-hf/llava-1.5-7b-hf"
,
local_dir
=
_dummy_llava_path
,
ignore_patterns
=
[
"*.bin"
,
"*.bin.index.json"
,
"*.pt"
,
"*.h5"
,
"*.msgpack"
])
assert
os
.
path
.
exists
(
json_path
)
with
open
(
json_path
,
"r"
)
as
f
:
config
=
json
.
load
(
f
)
config
[
"architectures"
]
=
[
"MyLlava"
]
with
open
(
json_path
,
"w"
)
as
f
:
json
.
dump
(
config
,
f
)
return
_dummy_llava_path
@
pytest
.
fixture
def
dummy_gemma2_embedding_path
():
json_path
=
os
.
path
.
join
(
_dummy_gemma2_embedding_path
,
"config.json"
)
if
not
os
.
path
.
exists
(
_dummy_gemma2_embedding_path
):
snapshot_download
(
repo_id
=
"BAAI/bge-multilingual-gemma2"
,
local_dir
=
_dummy_gemma2_embedding_path
,
ignore_patterns
=
[
"*.bin"
,
"*.bin.index.json"
,
"*.pt"
,
"*.h5"
,
"*.msgpack"
])
assert
os
.
path
.
exists
(
json_path
)
with
open
(
json_path
,
"r"
)
as
f
:
config
=
json
.
load
(
f
)
config
[
"architectures"
]
=
[
"MyGemma2Embedding"
]
with
open
(
json_path
,
"w"
)
as
f
:
json
.
dump
(
config
,
f
)
return
_dummy_gemma2_embedding_path
tests/core/block/e2e/test_correctness.py
View file @
ad385667
...
@@ -21,32 +21,32 @@ from .conftest import get_token_ids_from_llm_generator
...
@@ -21,32 +21,32 @@ from .conftest import get_token_ids_from_llm_generator
"num_gpu_blocks_override"
:
5
*
(
64
+
1
),
"num_gpu_blocks_override"
:
5
*
(
64
+
1
),
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
"use_v2_block_manager"
:
False
}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
"use_v2_block_manager"
:
True
,
"preemption_mode"
:
"swap"
"preemption_mode"
:
"swap"
},
{
},
{
"use_v2_block_manager"
:
True
,
"preemption_mode"
:
"recompute"
"preemption_mode"
:
"recompute"
}])
}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
10
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
10
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_v1_v2_greedy_equality_with_preemption
(
baseline_llm_generator
,
def
test_block_manager_with_preemption
(
baseline_llm_generator
,
test_llm_generator
,
batch_size
):
test_llm_generator
,
batch_size
):
"""Verify block manager v2 produces same outputs as block manager v1, even
"""Verify block manager produces same outputs even when there is preemption.
when there is preemption.
This constructs two LLM, each with limited number of GPU blocks. The limit
This constructs two LLM, each with limited number of GPU blocks. The limit
is decided such that as the sequences in the batch grow, sequences must be
is decided such that as the sequences in the batch grow, sequences must be
preempted and removed from cache.
preempted and removed from cache.
If the output token ids are equivalent, then we have confidence that the KV
If the output token ids are equivalent, then we have confidence that the KV
cache is not corrupted
in the v2 block manager
.
cache is not corrupted.
NOTE: We want a significant number of generated tokens so that any incorrect
NOTE: We want a significant number of generated tokens so that any incorrect
KV mapping has time to build up error.
KV mapping has time to build up error.
NOTE(Kuntai): Though we have removed block manager v1, this test is still
useful as it asserts the behavior of block manager v2 (now it is called
SelfAttnBlockSpaceManager) is the same when swapping / preemption, so we
keep this test.
"""
"""
output_len
=
1024
output_len
=
1024
temperature
=
0.0
temperature
=
0.0
...
@@ -70,78 +70,9 @@ def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,
...
@@ -70,78 +70,9 @@ def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,
temperature
=
temperature
,
temperature
=
temperature
,
)
)
print
(
'Getting token ids from block manager v1'
)
baseline_token_ids
=
get_token_ids_from_llm_generator
(
baseline_token_ids
=
get_token_ids_from_llm_generator
(
baseline_llm_generator
,
prompts
,
sampling_params
)
baseline_llm_generator
,
prompts
,
sampling_params
)
print
(
'Getting token ids from block manager v2'
)
test_token_ids
=
get_token_ids_from_llm_generator
(
test_llm_generator
,
prompts
,
sampling_params
)
for
expected_token_ids
,
actual_token_ids
in
zip
(
baseline_token_ids
,
test_token_ids
):
assert
expected_token_ids
==
actual_token_ids
assert
baseline_token_ids
==
test_token_ids
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
# Use a small model for a fast test.
"model"
:
"facebook/opt-125m"
,
# skip cuda graph creation for fast test.
"enforce_eager"
:
True
,
# Use a large block size to trigger more copy-on-writes.
"block_size"
:
32
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{
"use_v2_block_manager"
:
False
}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
"use_v2_block_manager"
:
True
,
"preemption_mode"
:
"swap"
},
{
"use_v2_block_manager"
:
True
,
"preemption_mode"
:
"recompute"
}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
10
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_v1_v2_greedy_equality_with_cow
(
baseline_llm_generator
,
test_llm_generator
,
batch_size
):
"""Verify beam search equality with block manager v1 and v2.
This requires copy-on-writes; if the v1 and v2 output is the same, then
we have some confidence cow is working.
"""
output_len
=
128
temperature
=
0.0
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
prompts
=
[
prompt
for
prompt
,
_
in
zip
(
cycle
(
prompts
),
range
(
batch_size
))]
sampling_params
=
SamplingParams
(
max_tokens
=
output_len
,
ignore_eos
=
True
,
temperature
=
temperature
,
use_beam_search
=
True
,
best_of
=
2
,
)
print
(
'Getting token ids from block manager v1'
)
baseline_token_ids
=
get_token_ids_from_llm_generator
(
baseline_llm_generator
,
prompts
,
sampling_params
)
print
(
'Getting token ids from block manager v2'
)
test_token_ids
=
get_token_ids_from_llm_generator
(
test_llm_generator
,
test_token_ids
=
get_token_ids_from_llm_generator
(
test_llm_generator
,
prompts
,
sampling_params
)
prompts
,
sampling_params
)
...
@@ -164,9 +95,6 @@ def test_v1_v2_greedy_equality_with_cow(baseline_llm_generator,
...
@@ -164,9 +95,6 @@ def test_v1_v2_greedy_equality_with_cow(baseline_llm_generator,
# skip cuda graph creation for fast test.
# skip cuda graph creation for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# Lookahead scheduling only supported in v2 block manager.
"use_v2_block_manager"
:
True
,
}])
}])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
"per_test_common_llm_kwargs"
,
...
@@ -261,32 +189,39 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
...
@@ -261,32 +189,39 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
# skip cuda graph creation for fast test.
# skip cuda graph creation for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
"enable_chunked_prefill"
:
True
,
"enable_chunked_prefill"
:
True
,
"max_num_batched_tokens"
:
2
,
"max_num_seqs"
:
2
,
},
},
])
])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{
"block_size"
:
8
,
"max_num_batched_tokens"
:
2
,
"max_num_seqs"
:
2
,
},
{
"block_size"
:
8
,
"max_num_batched_tokens"
:
3
,
"max_num_seqs"
:
2
,
},
{
"block_size"
:
8
,
"max_num_batched_tokens"
:
256
,
"max_num_seqs"
:
10
,
}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[
{
{},
"use_v2_block_manager"
:
False
,
},
])
])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
{
{
"use_v2_block_manager"
:
True
,
"num_lookahead_slots"
:
0
,
"num_lookahead_slots"
:
0
,
},
},
{
{
"use_v2_block_manager"
:
True
,
"num_lookahead_slots"
:
5
,
"num_lookahead_slots"
:
5
,
},
},
])
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
4
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
4
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_chunked_prefill_block_manager
_v2
(
baseline_llm_generator
,
def
test_chunked_prefill_block_manager
(
baseline_llm_generator
,
test_llm_generator
,
batch_size
):
test_llm_generator
,
batch_size
):
"""Verify that chunked prefill works with
BlockManagerV2, with and without
"""Verify that chunked prefill works with
SelfAttnBlockSpaceManager,
lookahead scheduling.
with and without
lookahead scheduling.
"""
"""
output_len
=
32
output_len
=
32
temperature
=
0.0
temperature
=
0.0
...
@@ -294,6 +229,7 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
...
@@ -294,6 +229,7 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
prompts
=
[
prompts
=
[
"Hello, my name is"
,
"Hello, my name is"
,
"The president of the United States is"
,
"The president of the United States is"
,
(
"1 + "
*
50
)
+
" 1 = "
,
# Longer prompt.
"The capital of France is"
,
"The capital of France is"
,
"The future of AI is"
,
"The future of AI is"
,
]
]
...
@@ -306,11 +242,11 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
...
@@ -306,11 +242,11 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
temperature
=
temperature
,
temperature
=
temperature
,
)
)
print
(
'Getting token ids with BlockManager
V1
'
)
print
(
'Getting token ids with BlockManager'
)
baseline_token_ids
=
get_token_ids_from_llm_generator
(
baseline_token_ids
=
get_token_ids_from_llm_generator
(
baseline_llm_generator
,
prompts
,
sampling_params
)
baseline_llm_generator
,
prompts
,
sampling_params
)
print
(
'Getting token ids with BlockManager
V2
'
)
print
(
'Getting token ids with BlockManager
, with lookahead slots.
'
)
test_token_ids
=
get_token_ids_from_llm_generator
(
test_llm_generator
,
test_token_ids
=
get_token_ids_from_llm_generator
(
test_llm_generator
,
prompts
,
sampling_params
)
prompts
,
sampling_params
)
...
@@ -338,32 +274,32 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
...
@@ -338,32 +274,32 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
"enable_prefix_caching"
:
True
,
"enable_prefix_caching"
:
True
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
"use_v2_block_manager"
:
False
}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
"use_v2_block_manager"
:
True
,
"preemption_mode"
:
"swap"
"preemption_mode"
:
"swap"
},
{
},
{
"use_v2_block_manager"
:
True
,
"preemption_mode"
:
"recompute"
"preemption_mode"
:
"recompute"
}])
}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
10
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
10
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_
v1_v2_greedy_equality
_prefix_caching_enabled_with_preemption
(
def
test_
block_manager
_prefix_caching_enabled_with_preemption
(
baseline_llm_generator
,
test_llm_generator
,
batch_size
):
baseline_llm_generator
,
test_llm_generator
,
batch_size
):
"""Verify block manager v2 produces same outputs as block manager v1, even
"""Verify block manager produces same outputs even when there is preemption.
when there is preemption.
This constructs two LLM, each with limited number of GPU blocks. The limit
This constructs two LLM, each with limited number of GPU blocks. The limit
is decided such that as the sequences in the batch grow, sequences must be
is decided such that as the sequences in the batch grow, sequences must be
preempted and removed from cache.
preempted and removed from cache.
If the output token ids are equivalent, then we have confidence that the KV
If the output token ids are equivalent, then we have confidence that the KV
cache is not corrupted
in the v2 block manager
.
cache is not corrupted.
NOTE: We want a significant number of generated tokens so that any incorrect
NOTE: We want a significant number of generated tokens so that any incorrect
KV mapping has time to build up error.
KV mapping has time to build up error.
NOTE(Kuntai): Though we have removed block manager v1, this test is still
useful as it asserts the behavior of block manager v2 (now it is called
SelfAttnBlockSpaceManager) is the same when swapping / preemption, so we
keep this test.
"""
"""
output_len
=
1024
output_len
=
1024
temperature
=
0.0
temperature
=
0.0
...
@@ -387,11 +323,11 @@ def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption(
...
@@ -387,11 +323,11 @@ def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption(
temperature
=
temperature
,
temperature
=
temperature
,
)
)
print
(
'Getting token ids from block manager
v1
'
)
print
(
'Getting token ids from block manager'
)
baseline_token_ids
=
get_token_ids_from_llm_generator
(
baseline_token_ids
=
get_token_ids_from_llm_generator
(
baseline_llm_generator
,
prompts
,
sampling_params
)
baseline_llm_generator
,
prompts
,
sampling_params
)
print
(
'Getting token ids from block manager
v2
'
)
print
(
'Getting token ids from block manager
, with preemption
'
)
test_token_ids
=
get_token_ids_from_llm_generator
(
test_llm_generator
,
test_token_ids
=
get_token_ids_from_llm_generator
(
test_llm_generator
,
prompts
,
sampling_params
)
prompts
,
sampling_params
)
...
@@ -414,9 +350,6 @@ def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption(
...
@@ -414,9 +350,6 @@ def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption(
# Allow only 5 sequences of ~1024 tokens in worst case.
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size"
:
16
,
"block_size"
:
16
,
"num_gpu_blocks_override"
:
5
*
(
64
+
1
),
"num_gpu_blocks_override"
:
5
*
(
64
+
1
),
# Test APC in v2 block
"use_v2_block_manager"
:
True
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{
...
@@ -492,9 +425,6 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
...
@@ -492,9 +425,6 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
"max_model_len"
:
48
,
"max_model_len"
:
48
,
"block_size"
:
16
,
"block_size"
:
16
,
"num_gpu_blocks_override"
:
3
,
"num_gpu_blocks_override"
:
3
,
# Test APC in v2 block
"use_v2_block_manager"
:
True
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{
...
...
tests/core/block/e2e/test_correctness_sliding_window.py
View file @
ad385667
...
@@ -24,10 +24,8 @@ BLOCK_SIZE = 16
...
@@ -24,10 +24,8 @@ BLOCK_SIZE = 16
"num_gpu_blocks_override"
:
100000
//
BLOCK_SIZE
,
"num_gpu_blocks_override"
:
100000
//
BLOCK_SIZE
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
"use_v2_block_manager"
:
False
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{}])
}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
"use_v2_block_manager"
:
True
}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_sliding_window_retrival
(
baseline_llm_generator
,
test_llm_generator
,
def
test_sliding_window_retrival
(
baseline_llm_generator
,
test_llm_generator
,
...
@@ -48,7 +46,6 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
...
@@ -48,7 +46,6 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
prompts
,
answer
,
indices
=
prep_prompts
(
batch_size
)
prompts
,
answer
,
indices
=
prep_prompts
(
batch_size
)
print
(
'Getting token ids from block manager v1'
)
baseline_texts
=
get_text_from_llm_generator
(
baseline_llm_generator
,
baseline_texts
=
get_text_from_llm_generator
(
baseline_llm_generator
,
prompts
,
prompts
,
sampling_params
,
sampling_params
,
...
@@ -84,10 +81,7 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
...
@@ -84,10 +81,7 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
"num_gpu_blocks_override"
:
100000
//
BLOCK_SIZE
,
"num_gpu_blocks_override"
:
100000
//
BLOCK_SIZE
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
"enable_chunked_prefill"
:
True
}])
"use_v2_block_manager"
:
True
,
"enable_chunked_prefill"
:
True
}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_sliding_window_chunked_prefill
(
test_llm_generator
,
batch_size
,
seed
):
def
test_sliding_window_chunked_prefill
(
test_llm_generator
,
batch_size
,
seed
):
...
...
tests/core/block/test_block_manager
_v2
.py
→
tests/core/block/test_block_manager.py
View file @
ad385667
...
@@ -2,7 +2,7 @@ import pytest
...
@@ -2,7 +2,7 @@ import pytest
from
vllm.core.block.utils
import
(
STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE
,
from
vllm.core.block.utils
import
(
STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE
,
STR_NOT_IMPL_ENC_DEC_SWA
)
STR_NOT_IMPL_ENC_DEC_SWA
)
from
vllm.core.block_manager
_v2
import
BlockSpaceManager
V2
from
vllm.core.block_manager
import
SelfAttn
BlockSpaceManager
from
vllm.core.interfaces
import
AllocStatus
from
vllm.core.interfaces
import
AllocStatus
from
vllm.sequence
import
Logprob
,
SequenceStatus
from
vllm.sequence
import
Logprob
,
SequenceStatus
from
vllm.utils
import
chunk_list
from
vllm.utils
import
chunk_list
...
@@ -17,7 +17,7 @@ from ..utils import (create_dummy_prompt, create_seq_group,
...
@@ -17,7 +17,7 @@ from ..utils import (create_dummy_prompt, create_seq_group,
@
pytest
.
mark
.
parametrize
(
"watermark"
,
[
0.0
,
0.5
])
@
pytest
.
mark
.
parametrize
(
"watermark"
,
[
0.0
,
0.5
])
def
test_can_allocate_seq_group
(
block_size
:
int
,
num_seqs_per_group
:
int
,
def
test_can_allocate_seq_group
(
block_size
:
int
,
num_seqs_per_group
:
int
,
num_gpu_blocks
:
int
,
watermark
:
float
):
num_gpu_blocks
:
int
,
watermark
:
float
):
block_manager
=
BlockSpaceManager
V2
(
block_manager
=
SelfAttn
BlockSpaceManager
(
block_size
=
block_size
,
block_size
=
block_size
,
num_gpu_blocks
=
num_gpu_blocks
,
num_gpu_blocks
=
num_gpu_blocks
,
num_cpu_blocks
=
1024
,
num_cpu_blocks
=
1024
,
...
@@ -63,7 +63,7 @@ def test_can_allocate_seq_group_encoder_decoder(block_size: int,
...
@@ -63,7 +63,7 @@ def test_can_allocate_seq_group_encoder_decoder(block_size: int,
num_seqs_per_group
:
int
,
num_seqs_per_group
:
int
,
num_gpu_blocks
:
int
,
num_gpu_blocks
:
int
,
watermark
:
float
):
watermark
:
float
):
block_manager
=
BlockSpaceManager
V2
(
block_manager
=
SelfAttn
BlockSpaceManager
(
block_size
=
block_size
,
block_size
=
block_size
,
num_gpu_blocks
=
num_gpu_blocks
,
num_gpu_blocks
=
num_gpu_blocks
,
num_cpu_blocks
=
1024
,
num_cpu_blocks
=
1024
,
...
@@ -117,16 +117,16 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int,
...
@@ -117,16 +117,16 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int,
'''
'''
SWA short for Sliding Window Attention.
SWA short for Sliding Window Attention.
At time of writing block manager
v2
does not support SWA.
At time of writing block manager does not support SWA.
However even when SWA is implemented for block manager
v2
,
However even when SWA is implemented for block manager,
there will still most likely be a separate workstream required
there will still most likely be a separate workstream required
to enable SWA for encoder/decoder models.
to enable SWA for encoder/decoder models.
Therefore this test enforces that one of the following cases
Therefore this test enforces that one of the following cases
hold true:
hold true:
1. Block manager
v2
does not support SWA at all (true at time of writing)
1. Block manager does not support SWA at all (true at time of writing)
2. Block manager
v2
fails with NotImplementError when SWA is enabled
2. Block manager fails with NotImplementError when SWA is enabled
AND a SequenceGroup with an encoder sequence (i.e. in support of an
AND a SequenceGroup with an encoder sequence (i.e. in support of an
encoder/decoder model) is passed into can_allocate() as an argument
encoder/decoder model) is passed into can_allocate() as an argument
...
@@ -135,7 +135,7 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int,
...
@@ -135,7 +135,7 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int,
'''
'''
with
pytest
.
raises
((
NotImplementedError
,
AssertionError
))
as
exc_info
:
with
pytest
.
raises
((
NotImplementedError
,
AssertionError
))
as
exc_info
:
block_manager
=
BlockSpaceManager
V2
(
block_manager
=
SelfAttn
BlockSpaceManager
(
block_size
=
block_size
,
block_size
=
block_size
,
num_gpu_blocks
=
num_gpu_blocks
,
num_gpu_blocks
=
num_gpu_blocks
,
num_cpu_blocks
=
1024
,
num_cpu_blocks
=
1024
,
...
@@ -158,7 +158,7 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int,
...
@@ -158,7 +158,7 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int,
block_manager
.
can_allocate
(
seq_group
)
block_manager
.
can_allocate
(
seq_group
)
# Assert that either
# Assert that either
# 1. Block manager
v2
constructor fails with assertion that sliding window
# 1. Block manager constructor fails with assertion that sliding window
# is not yet supported (most likely near-term outcome at time of
# is not yet supported (most likely near-term outcome at time of
# writing), or
# writing), or
# 2. can_allocate() fails with NotImplementedError due to combination of
# 2. can_allocate() fails with NotImplementedError due to combination of
...
@@ -177,7 +177,7 @@ def test_can_allocate_encoder_decoder_fails_with_prefix_cache(
...
@@ -177,7 +177,7 @@ def test_can_allocate_encoder_decoder_fails_with_prefix_cache(
block_size
:
int
,
num_seqs_per_group
:
int
,
num_gpu_blocks
:
int
,
block_size
:
int
,
num_seqs_per_group
:
int
,
num_gpu_blocks
:
int
,
watermark
:
float
):
watermark
:
float
):
block_manager
=
BlockSpaceManager
V2
(
block_manager
=
SelfAttn
BlockSpaceManager
(
block_size
=
block_size
,
block_size
=
block_size
,
num_gpu_blocks
=
num_gpu_blocks
,
num_gpu_blocks
=
num_gpu_blocks
,
num_cpu_blocks
=
1024
,
num_cpu_blocks
=
1024
,
...
@@ -217,7 +217,7 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append,
...
@@ -217,7 +217,7 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append,
num_gpu_blocks
=
1024
num_gpu_blocks
=
1024
watermark
=
0.1
watermark
=
0.1
block_manager
=
BlockSpaceManager
V2
(
block_manager
=
SelfAttn
BlockSpaceManager
(
block_size
=
block_size
,
block_size
=
block_size
,
num_gpu_blocks
=
num_gpu_blocks
,
num_gpu_blocks
=
num_gpu_blocks
,
num_cpu_blocks
=
0
,
num_cpu_blocks
=
0
,
...
@@ -269,14 +269,15 @@ def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots,
...
@@ -269,14 +269,15 @@ def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots,
"""Verify blocks number on src/desc device is correct after swapping in/out
"""Verify blocks number on src/desc device is correct after swapping in/out
sequence group (not missing or extra blocks).
sequence group (not missing or extra blocks).
"""
"""
block_manager
=
BlockSpaceManager
V2
(
block_size
,
block_manager
=
SelfAttn
BlockSpaceManager
(
block_size
,
num_cpu_blocks
,
num_cpu_blocks
,
num_gpu_blocks
,
num_gpu_blocks
,
watermark
=
0
,
watermark
=
0
,
enable_caching
=
enable_caching
)
enable_caching
=
enable_caching
)
prompt
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
block_size
-
1
)
prompt
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
block_size
-
1
)
prompt
.
status
=
SequenceStatus
.
WAITING
prompt
.
status
=
SequenceStatus
.
WAITING
block_manager
.
allocate
(
seq_group
)
block_manager
.
allocate
(
seq_group
)
# Emulate a forward pass by appending a single token.
# Emulate a forward pass by appending a single token.
# The block manager then knows how many unprocessed
# The block manager then knows how many unprocessed
# tokens will be written in the next forward pass.
# tokens will be written in the next forward pass.
...
@@ -311,6 +312,114 @@ def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots,
...
@@ -311,6 +312,114 @@ def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots,
assert
before_gpu_blocks
==
after_gpu_blocks
+
len
(
cpu_blocks
)
assert
before_gpu_blocks
==
after_gpu_blocks
+
len
(
cpu_blocks
)
@pytest.mark.parametrize("block_size", [8])
@pytest.mark.parametrize("num_gpu_blocks", [4])
@pytest.mark.parametrize("num_lookahead_slots", [3, 8, 10])
@pytest.mark.parametrize("enable_caching", [True, False])
def test_can_swap(block_size, num_gpu_blocks, num_lookahead_slots,
                  enable_caching):
    """Check the block manager's swap-out / swap-in feasibility answers.

    One sequence group is allocated and swapped out to CPU. ``can_swap_in``
    is then queried twice: right after the swap-out, and again after a
    second sequence group has been allocated on the GPU.
    """
    # The CPU pool is sized identically to the GPU pool.
    num_cpu_blocks = num_gpu_blocks
    manager = SelfAttnBlockSpaceManager(block_size,
                                        num_cpu_blocks,
                                        num_gpu_blocks,
                                        watermark=0,
                                        enable_caching=enable_caching)

    first_prompt_len = (num_gpu_blocks - 1) * block_size - 1
    prompt, seq_group = create_dummy_prompt("1",
                                            prompt_length=first_prompt_len)
    prompt.status = SequenceStatus.WAITING
    manager.allocate(seq_group)
    prompt.status = SequenceStatus.RUNNING

    # --- GPU -> CPU -------------------------------------------------------
    gpu_blocks = manager.get_block_table(prompt)
    assert manager.can_swap_out(seq_group)

    free_cpu_before = manager.get_num_free_cpu_blocks()
    free_gpu_before = manager.get_num_free_gpu_blocks()
    mapping = manager.swap_out(seq_group)
    swapped_sources = [src for src, _ in mapping]
    assert swapped_sources == gpu_blocks

    free_cpu_after = manager.get_num_free_cpu_blocks()
    free_gpu_after = manager.get_num_free_gpu_blocks()
    moved = len(gpu_blocks)
    assert free_cpu_before == free_cpu_after + moved
    assert free_gpu_before + moved == free_gpu_after
    prompt.status = SequenceStatus.SWAPPED

    # Lookahead sizes above one block are expected to be reported as NEVER
    # in both queries below.
    lookahead_within_block = num_lookahead_slots <= block_size

    # Right after the swap-out there are still enough free blocks to bring
    # the sequence group back.
    expected = (AllocStatus.OK
                if lookahead_within_block else AllocStatus.NEVER)
    assert manager.can_swap_in(seq_group, num_lookahead_slots) == expected

    # Occupy GPU blocks with a second prompt (distinct token ids) so that
    # the first prompt can no longer be swapped in immediately.
    prompt2_len = 2 * block_size - 1
    other_tokens = [10000 + offset for offset in range(prompt2_len)]
    prompt2, seq_group2 = create_dummy_prompt("2",
                                              prompt_length=prompt2_len,
                                              prompt_tokens=other_tokens)
    prompt2.status = SequenceStatus.WAITING
    manager.allocate(seq_group2)

    # --- CPU -> GPU -------------------------------------------------------
    expected = (AllocStatus.LATER
                if lookahead_within_block else AllocStatus.NEVER)
    assert manager.can_swap_in(seq_group, num_lookahead_slots) == expected
@pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10])
@pytest.mark.parametrize("enable_caching", [False, True])
def test_swap_in_infeasible(num_lookahead_slots, enable_caching):
    """Swap-in must be refused when the GPU cannot hold the sequence.

    The required capacity counts the prompt tokens, the one token appended
    after allocation, and the requested lookahead slots.
    """
    block_size = 8
    num_cpu_blocks = 1
    num_gpu_blocks = 1
    manager = SelfAttnBlockSpaceManager(block_size,
                                        num_cpu_blocks,
                                        num_gpu_blocks,
                                        watermark=0,
                                        enable_caching=enable_caching)

    prompt_length = block_size - 3
    assert prompt_length > 0
    prompt, seq_group = create_dummy_prompt("1", prompt_length=prompt_length)
    prompt.status = SequenceStatus.WAITING
    manager.allocate(seq_group)

    # Emulate one forward pass: append a single token so the block manager
    # knows how many unprocessed tokens the next pass will write.
    token_id = 0
    prompt.status = SequenceStatus.RUNNING
    prompt.append_token_id(token_id, {token_id: Logprob(0.0)})

    # GPU -> CPU.
    assert manager.can_swap_out(seq_group)
    manager.swap_out(seq_group)
    prompt.status = SequenceStatus.SWAPPED

    # CPU -> GPU. Exactly one token is unseen; if prompt + unseen +
    # lookahead exceeds the total GPU capacity the swap can never succeed.
    num_unseen_tokens = 1
    tokens_needed = num_lookahead_slots + num_unseen_tokens + prompt_length
    gpu_capacity = block_size * num_gpu_blocks
    expected = (AllocStatus.OK
                if tokens_needed <= gpu_capacity else AllocStatus.NEVER)
    assert manager.can_swap_in(seq_group, num_lookahead_slots) == expected
# TODO(cade/kaiyang): add comprehensive tests for swapping at allocator level.
# TODO(cade/kaiyang): add comprehensive tests for swapping at allocator level.
...
@@ -326,7 +435,7 @@ def test_sliding_window(block_size, prompt_len, num_slots_to_append,
...
@@ -326,7 +435,7 @@ def test_sliding_window(block_size, prompt_len, num_slots_to_append,
num_gpu_blocks
=
1024
num_gpu_blocks
=
1024
watermark
=
0.1
watermark
=
0.1
block_manager
=
BlockSpaceManager
V2
(
block_manager
=
SelfAttn
BlockSpaceManager
(
block_size
=
block_size
,
block_size
=
block_size
,
num_gpu_blocks
=
num_gpu_blocks
,
num_gpu_blocks
=
num_gpu_blocks
,
num_cpu_blocks
=
0
,
num_cpu_blocks
=
0
,
...
@@ -338,7 +447,6 @@ def test_sliding_window(block_size, prompt_len, num_slots_to_append,
...
@@ -338,7 +447,6 @@ def test_sliding_window(block_size, prompt_len, num_slots_to_append,
if
max_n
is
None
:
if
max_n
is
None
:
max_n
=
min_n
max_n
=
min_n
used
=
num_gpu_blocks
-
block_manager
.
get_num_free_gpu_blocks
()
used
=
num_gpu_blocks
-
block_manager
.
get_num_free_gpu_blocks
()
#print("check", min_n, used, max_n)
assert
min_n
<=
used
assert
min_n
<=
used
assert
used
<=
max_n
assert
used
<=
max_n
...
@@ -367,7 +475,7 @@ def test_sliding_window(block_size, prompt_len, num_slots_to_append,
...
@@ -367,7 +475,7 @@ def test_sliding_window(block_size, prompt_len, num_slots_to_append,
seq
.
data
.
update_num_computed_tokens
(
prompt_len
)
seq
.
data
.
update_num_computed_tokens
(
prompt_len
)
check_used
(
num_blocks
(
prompt_len
))
check_used
(
num_blocks
(
prompt_len
))
# this is how we compute it in BlockSpaceManager
V2
.__init__
# this is how we compute it in
SelfAttn
BlockSpaceManager.__init__
sliding_blocks
=
(
sliding_window
//
block_size
)
+
2
sliding_blocks
=
(
sliding_window
//
block_size
)
+
2
# plus one block for null block
# plus one block for null block
sliding_blocks
+=
1
sliding_blocks
+=
1
...
...
tests/core/block/test_naive_block.py
View file @
ad385667
...
@@ -100,3 +100,46 @@ class TestNaiveBlockAllocator:
...
@@ -100,3 +100,46 @@ class TestNaiveBlockAllocator:
for
i
,
block
in
enumerate
(
blocks
):
for
i
,
block
in
enumerate
(
blocks
):
assert
allocator
.
get_num_free_blocks
()
==
i
assert
allocator
.
get_num_free_blocks
()
==
i
allocator
.
free
(
block
)
allocator
.
free
(
block
)
@staticmethod
@pytest.mark.parametrize("num_blocks", [4])
@pytest.mark.parametrize("block_size", [8])
def test_naive_block_get_num_full_blocks_touched(num_blocks, block_size):
    """Check ``get_num_full_blocks_touched`` on the naive allocator.

    Blocks are created in a source allocator and counted by a separate
    destination allocator; only full blocks are expected to be counted.
    """
    allocator_src = NaiveBlockAllocator(create_block=NaiveBlock,
                                        num_blocks=num_blocks,
                                        block_size=block_size)
    allocator_dst = NaiveBlockAllocator(create_block=NaiveBlock,
                                        num_blocks=num_blocks,
                                        block_size=block_size)

    def touched():
        # Ask the destination allocator about the current source blocks.
        return allocator_dst.get_num_full_blocks_touched(src_blocks)

    # Build num_blocks - 1 full immutable blocks in the source allocator.
    make_full_block = TestNaiveBlockAllocator.create_allocate_lambda(
        "immutable",
        allocator_src,
        prev_block=None,
        token_ids=list(range(block_size)))
    src_blocks = [make_full_block() for _ in range(num_blocks - 1)]

    # Every block created so far is full, so all of them are counted.
    assert touched() == num_blocks - 1

    # Add one mutable block holding a single token: it is not full and
    # therefore must not change the count.
    make_partial_block = TestNaiveBlockAllocator.create_allocate_lambda(
        "mutable", allocator_src, prev_block=src_blocks[-1], token_ids=[])
    src_blocks.append(make_partial_block())
    src_blocks[-1].append_token_ids([0])
    assert touched() == num_blocks - 1

    # Fill the last source block completely; it is now counted as well.
    src_blocks[-1].append_token_ids([0] * (block_size - 1))
    assert touched() == num_blocks
tests/core/block/test_prefix_caching_block.py
View file @
ad385667
...
@@ -315,6 +315,61 @@ class TestPrefixCachingBlockAllocator:
...
@@ -315,6 +315,61 @@ class TestPrefixCachingBlockAllocator:
i
)
i
)
allocator
.
free
(
block
)
allocator
.
free
(
block
)
@staticmethod
@pytest.mark.parametrize("num_blocks", [4])
@pytest.mark.parametrize("block_size", [8])
def test_prefix_caching_block_get_num_full_blocks_touched(
        num_blocks, block_size):
    """Check ``get_num_full_blocks_touched`` when prefixes are cached.

    The same token chain is materialised in a destination and a source
    allocator; the destination is then asked how many of the source blocks
    it would have to touch.
    """
    allocator_src = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                                block_size=block_size)
    allocator_dst = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                                block_size=block_size)

    # Token ids filling every block except the last one.
    token_ids = list(range((num_blocks - 1) * block_size))

    # Chain of cacheable blocks in the destination allocator.
    cached_blocks = TestPrefixCachingBlockAllocator.create_immutable_chain(
        block_size=block_size,
        token_ids=token_ids,
        allocator=allocator_dst,
    )

    # Identical chain in the source allocator.
    blocks_to_swap_in = \
        TestPrefixCachingBlockAllocator.create_immutable_chain(
            block_size=block_size,
            token_ids=token_ids,
            allocator=allocator_src,
        )

    def touched():
        # Number of source blocks the destination allocator would touch.
        return allocator_dst.get_num_full_blocks_touched(blocks_to_swap_in)

    # Everything is already cached in the destination.
    assert touched() == 0

    # Free the first destination block: it becomes dangling, so swapping
    # in has to reclaim that one block.
    allocator_dst.free(cached_blocks[0])
    assert touched() == 1

    # Append a non-full block to the source chain; the count is unchanged.
    non_full_block = allocator_src.allocate_mutable_block(
        blocks_to_swap_in[-1])
    non_full_block.append_token_ids([0])
    blocks_to_swap_in.append(non_full_block)
    assert touched() == 1

    # Fill that mutable block. It is not cached in the destination, so it
    # is now touched too.
    non_full_block.append_token_ids([0] * (block_size - 1))
    assert touched() == 2
@
staticmethod
@
staticmethod
@
pytest
.
mark
.
parametrize
(
"num_blocks"
,
[
1024
])
@
pytest
.
mark
.
parametrize
(
"num_blocks"
,
[
1024
])
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
16
])
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
16
])
...
@@ -628,6 +683,63 @@ class TestPrefixCachingBlockAllocator:
...
@@ -628,6 +683,63 @@ class TestPrefixCachingBlockAllocator:
assert
new_block
[
0
].
block_id
==
last_block_id
assert
new_block
[
0
].
block_id
==
last_block_id
# Test case for cache metrics
@staticmethod
def test_metric():
    """Track the prefix-cache hit rate across repeated identical allocations."""
    block_size = 16
    allocator = PrefixCachingBlockAllocator(num_blocks=4,
                                            block_size=block_size)
    token_ids = list(range(block_size))

    def allocate_same_block():
        # Every call requests the same root block content.
        allocator.allocate_immutable_block(prev_block=None,
                                           token_ids=token_ids)

    # No query yet (0/0).
    assert allocator.get_prefix_cache_hit_rate() == 0.0

    # First allocation: 0/1 hit rate.
    allocate_same_block()
    assert allocator.get_prefix_cache_hit_rate() == 0.0

    # Second allocation of the same content: 1/2 hit rate.
    allocate_same_block()
    assert allocator.get_prefix_cache_hit_rate() == 0.5

    # Many further allocations push the hit rate towards 1.
    for _ in range(2, 1005):
        allocate_same_block()
    assert allocator.get_prefix_cache_hit_rate() > 0.99
# Test case for marking cache hit blocks as computed right after
# a batch of prefill sequences are scheduled.
@staticmethod
def test_touch_block():
    """Blocks shared by a prefill batch count as computed only after marking."""
    block_size = 16
    common_blocks = 4
    allocator = PrefixCachingBlockAllocator(num_blocks=8,
                                            block_size=block_size)
    common_token_ids = list(range(block_size * common_blocks))

    def computed_ids(ids):
        # Query the computed blocks among `ids`, last block included.
        return allocator.get_computed_block_ids([],
                                                ids,
                                                skip_last_block_id=False)

    # Mimic allocating the same block chain (i.e. a common prefix) for a
    # batch of 3 different prefill sequences.
    for _ in range(3):
        chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
            block_size=block_size,
            token_ids=common_token_ids,
            allocator=allocator,
        )
        block_ids = [blk.block_id for blk in chain]
        # The allocated blocks should be marked as touched
        # but not computed.
        assert len(computed_ids(block_ids)) == 0

    allocator.mark_blocks_as_computed([])
    assert len(computed_ids(block_ids)) == common_blocks
@
staticmethod
@
staticmethod
def
create_immutable_chain
(
def
create_immutable_chain
(
block_size
:
int
,
block_size
:
int
,
...
...
tests/core/test_block_manager.py
deleted
100644 → 0
View file @
be0967c1
import
time
from
collections
import
defaultdict
from
typing
import
List
import
pytest
from
vllm
import
SamplingParams
from
vllm.block
import
PhysicalTokenBlock
from
vllm.core.block.utils
import
(
STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE
,
STR_NOT_IMPL_ENC_DEC_SWA
)
from
vllm.core.block_manager_v1
import
(
BlockSpaceManagerV1
,
UncachedBlockAllocator
)
from
vllm.core.interfaces
import
AllocStatus
from
vllm.sequence
import
Logprob
,
Sequence
,
SequenceGroup
,
SequenceStatus
from
vllm.utils
import
Device
from
.utils
import
create_dummy_prompt
,
create_dummy_prompt_encoder_decoder
def test_block_allocator_allocate():
    """Drain an uncached CPU allocator and check that exhaustion raises."""
    block_size = 4
    num_cpu_blocks = 4
    cpu_allocator = UncachedBlockAllocator(Device.CPU, block_size,
                                           num_cpu_blocks)

    # The allocator starts with every block free.
    assert cpu_allocator.get_num_free_blocks() == num_cpu_blocks

    # Take the blocks one by one, counting the free pool down to zero.
    for remaining in range(num_cpu_blocks - 1, -1, -1):
        block = cpu_allocator.allocate()
        assert block not in cpu_allocator.free_blocks
        assert cpu_allocator.get_num_free_blocks() == remaining

    # Nothing is left to hand out.
    with pytest.raises(ValueError):
        cpu_allocator.allocate()
def test_block_allocator_free():
    """Free every allocated CPU block and check that a double free raises."""
    block_size = 4
    num_cpu_blocks = 4
    cpu_allocator = UncachedBlockAllocator(Device.CPU, block_size,
                                           num_cpu_blocks)

    # Allocate all available cpu blocks.
    blocks: List[PhysicalTokenBlock] = []
    for _ in range(num_cpu_blocks):
        allocated = cpu_allocator.allocate()
        blocks.append(allocated)
        assert allocated not in cpu_allocator.free_blocks

    # With everything allocated the free pool is empty.
    assert cpu_allocator.get_num_free_blocks() == 0

    # Return the blocks one at a time; the free count grows by one each time.
    for freed_so_far, block in enumerate(blocks, start=1):
        cpu_allocator.free(block)
        assert block in cpu_allocator.free_blocks
        assert cpu_allocator.get_num_free_blocks() == freed_so_far

        # Freeing an already-free block is rejected.
        with pytest.raises(ValueError):
            cpu_allocator.free(block)
def test_allocate():
    """Fill the GPU pool with one-block prompts, with and without a watermark."""
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4

    # (watermark, number of one-block prompts that must fit).
    # The second scenario uses the watermark to reserve one gpu block.
    scenarios = (
        (0, num_gpu_blocks),
        (1 / num_gpu_blocks, num_gpu_blocks - 1),
    )
    for watermark, capacity in scenarios:
        block_manager = BlockSpaceManagerV1(block_size,
                                            num_cpu_blocks,
                                            num_gpu_blocks,
                                            watermark=watermark)

        # Allocate sequence groups until the usable gpu blocks are taken.
        for i in range(capacity):
            _, seq_group = create_dummy_prompt(str(i), block_size)
            assert block_manager.can_allocate(seq_group) == AllocStatus.OK
            block_manager.allocate(seq_group)

        # One more group of the same size no longer fits.
        assert block_manager.can_allocate(seq_group) != AllocStatus.OK
def test_allocate_encoder_decoder():
    """Fill the GPU pool with encoder/decoder groups, with and without a watermark."""
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    # Each group carries a block_size decoder prompt and a block_size
    # encoder prompt, i.e. two blocks per sequence group.
    block_req_per_seq_group = 2

    # (watermark, number of sequence groups that must fit).
    # The second scenario uses the watermark to reserve one gpu block.
    scenarios = (
        (0, num_gpu_blocks // block_req_per_seq_group),
        (1 / num_gpu_blocks,
         (num_gpu_blocks - 1) // block_req_per_seq_group),
    )
    for watermark, num_groups in scenarios:
        block_manager = BlockSpaceManagerV1(block_size,
                                            num_cpu_blocks,
                                            num_gpu_blocks,
                                            watermark=watermark)

        for i in range(num_groups):
            _, _, seq_group = create_dummy_prompt_encoder_decoder(
                str(i),
                decoder_prompt_length=block_size,
                encoder_prompt_length=block_size)
            assert block_manager.can_allocate(seq_group) == AllocStatus.OK
            block_manager.allocate(seq_group)

        # One more group of the same size no longer fits.
        assert block_manager.can_allocate(seq_group) != AllocStatus.OK
def test_allocate_encoder_decoder_fails_with_swa():
    """Encoder/decoder groups are rejected under sliding window attention (SWA)."""
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4

    # Block manager configured with a sliding window.
    block_manager = BlockSpaceManagerV1(block_size,
                                        num_cpu_blocks,
                                        num_gpu_blocks,
                                        watermark=0,
                                        sliding_window=5)

    _, _, seq_group = create_dummy_prompt_encoder_decoder(
        "0",
        decoder_prompt_length=block_size,
        encoder_prompt_length=block_size)

    # can_allocate() first, then allocate(): both must fail with the
    # dedicated SWA "not implemented" message.
    for operation in (block_manager.can_allocate, block_manager.allocate):
        with pytest.raises(NotImplementedError) as exc_info:
            operation(seq_group)
        assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_SWA
def test_allocate_encoder_decoder_fails_with_prefix_caching():
    """Encoder/decoder groups are rejected when prefix caching is enabled."""
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4

    # Block manager configured with prefix caching.
    block_manager = BlockSpaceManagerV1(block_size,
                                        num_cpu_blocks,
                                        num_gpu_blocks,
                                        watermark=0,
                                        enable_caching=True)

    _, _, seq_group = create_dummy_prompt_encoder_decoder(
        "0",
        decoder_prompt_length=block_size,
        encoder_prompt_length=block_size)

    # can_allocate() first, then allocate(): both must fail with the
    # dedicated prefix-cache "not implemented" message.
    for operation in (block_manager.can_allocate, block_manager.allocate):
        with pytest.raises(NotImplementedError) as exc_info:
            operation(seq_group)
        assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE
def test_append_slot_single_seq():
    """Appending slots takes a new GPU block only once new tokens exist."""
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    manager = BlockSpaceManagerV1(block_size,
                                  num_cpu_blocks,
                                  num_gpu_blocks,
                                  watermark=0)

    # Allocate a single sequence whose prompt is block_size tokens long.
    prompt, seq_group = create_dummy_prompt("1", block_size)
    manager.allocate(seq_group)

    # Nothing to append yet: the sequence has no new logical blocks, so no
    # copy-on-write result is returned and no block is consumed.
    assert manager.can_append_slots(seq_group)
    free_before = manager.get_num_free_gpu_blocks()
    assert not manager.append_slots(prompt)
    free_after = manager.get_num_free_gpu_blocks()
    assert free_before == free_after

    # Add block_size new tokens (ids 5, 6, ...) and append a slot.
    for token_id in range(5, 5 + block_size):
        prompt.append_token_id(token_id, {token_id: Logprob(0.0)})

    assert manager.can_append_slots(seq_group)
    free_before = manager.get_num_free_gpu_blocks()
    assert not manager.append_slots(prompt)
    free_after = manager.get_num_free_gpu_blocks()
    # Exactly one additional block was taken from the free pool.
    assert free_before - free_after == 1
def test_append_slot_cow():
    """Appending to a forked child is expected to schedule a copy-on-write."""
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    manager = BlockSpaceManagerV1(block_size=block_size,
                                  num_cpu_blocks=num_cpu_blocks,
                                  num_gpu_blocks=num_gpu_blocks,
                                  watermark=0)

    # Three prompt tokens in a four-slot block: one slot stays free.
    prompt_inputs = {
        "prompt": "one two three",
        "prompt_token_ids": [1, 2, 3],
    }
    prompt = Sequence(seq_id=1, inputs=prompt_inputs, block_size=block_size)

    # Fork the sequence so that appending a new token id requires a COW.
    child = prompt.fork(new_seq_id=2)

    # Allocate space for the group holding both sequences.
    seq_group = SequenceGroup(request_id="1",
                              seqs=[prompt, child],
                              arrival_time=time.time(),
                              sampling_params=SamplingParams())
    manager.allocate(seq_group)

    # Append a token to the child, then fork the block tables.
    token_id = 4
    child.append_token_id(token_id, {token_id: Logprob(0.0)})
    manager.fork(prompt, child)

    assert manager.can_append_slots(seq_group)
    free_before = manager.get_num_free_gpu_blocks()
    cows = manager.append_slots(child)
    assert cows

    # Group copy destinations by source; a block must never be copied
    # onto itself.
    dsts_by_src = {}
    for src_block, dst_block in cows:
        dsts_by_src.setdefault(src_block, []).append(dst_block)
    for src_block, dst_blocks in dsts_by_src.items():
        assert src_block not in dst_blocks

    free_after = manager.get_num_free_gpu_blocks()
    # The copy consumed exactly one free block.
    assert free_before - free_after == 1
def test_fork():
    """A forked child shares the parent's block table until it diverges."""
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    manager = BlockSpaceManagerV1(block_size,
                                  num_cpu_blocks,
                                  num_gpu_blocks,
                                  watermark=0)

    prompt, seq_group = create_dummy_prompt("1",
                                            block_size - 1,
                                            block_size=block_size)
    manager.allocate(seq_group)

    # Fork the prompt and copy its block table to the child.
    child = prompt.fork(2)
    manager.fork(prompt, child)
    parent_table = manager.get_block_table(prompt)
    child_table = manager.get_block_table(child)
    assert parent_table == child_table

    # Append a token to the child. The block is shared, so the tables are
    # expected to differ afterwards (copy on write).
    token_id = 4
    child.append_token_id(token_id, {token_id: Logprob(0.0)})
    manager.append_slots(child)
    parent_table = manager.get_block_table(prompt)
    child_table = manager.get_block_table(child)
    assert parent_table != child_table
def test_swap():
    """Swap a sequence group out to CPU and back, checking block accounting."""
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    manager = BlockSpaceManagerV1(block_size,
                                  num_cpu_blocks,
                                  num_gpu_blocks,
                                  watermark=0)

    prompt, seq_group = create_dummy_prompt("1",
                                            prompt_length=block_size - 1)
    prompt.status = SequenceStatus.WAITING
    manager.allocate(seq_group)

    # Emulate a forward pass by appending a single token, so the block
    # manager knows how many unprocessed tokens the next pass will write.
    token_id = 0
    prompt.status = SequenceStatus.RUNNING
    prompt.append_token_id(token_id, {token_id: Logprob(0.0)})

    # --- GPU -> CPU -------------------------------------------------------
    gpu_blocks = manager.get_block_table(prompt)
    assert manager.can_swap_out(seq_group)
    free_cpu_before = manager.get_num_free_cpu_blocks()
    free_gpu_before = manager.get_num_free_gpu_blocks()
    mapping = manager.swap_out(seq_group)
    # The sources of the swap mapping are exactly the old gpu blocks.
    assert [pair[0] for pair in mapping] == gpu_blocks
    free_cpu_after = manager.get_num_free_cpu_blocks()
    free_gpu_after = manager.get_num_free_gpu_blocks()
    moved = len(gpu_blocks)
    assert free_cpu_before == free_cpu_after + moved
    assert free_gpu_before + moved == free_gpu_after
    prompt.status = SequenceStatus.SWAPPED

    # --- CPU -> GPU -------------------------------------------------------
    cpu_blocks = manager.get_block_table(prompt)
    assert manager.can_swap_in(seq_group) == AllocStatus.OK
    free_cpu_before = manager.get_num_free_cpu_blocks()
    free_gpu_before = manager.get_num_free_gpu_blocks()
    mapping = manager.swap_in(seq_group)
    # The sources of the swap mapping are exactly the cpu blocks.
    assert [pair[0] for pair in mapping] == cpu_blocks
    free_cpu_after = manager.get_num_free_cpu_blocks()
    free_gpu_after = manager.get_num_free_gpu_blocks()
    moved = len(cpu_blocks)
    assert free_cpu_before + moved == free_cpu_after
    assert free_gpu_before == free_gpu_after + moved
def test_swap_encoder_decoder():
    """Swap an encoder/decoder group out and in, covering the cross block table."""
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    manager = BlockSpaceManagerV1(block_size,
                                  num_cpu_blocks,
                                  num_gpu_blocks,
                                  watermark=0)

    decoder_prompt, encoder_prompt, seq_group = \
        create_dummy_prompt_encoder_decoder(
            "1",
            decoder_prompt_length=block_size,
            encoder_prompt_length=block_size)
    decoder_prompt.status = SequenceStatus.WAITING
    encoder_prompt.status = SequenceStatus.WAITING
    manager.allocate(seq_group)

    # Emulate a forward pass by appending a single token, so the block
    # manager knows how many unprocessed tokens the next pass will write.
    token_id = 0
    decoder_prompt.status = SequenceStatus.RUNNING
    decoder_prompt.append_token_id(token_id, {token_id: Logprob(0.0)})

    # --- GPU -> CPU -------------------------------------------------------
    decoder_gpu_blocks = manager.get_block_table(decoder_prompt)
    cross_gpu_blocks = manager.get_cross_block_table(seq_group)
    # Decoder blocks first, cross-attention blocks second.
    gpu_blocks = decoder_gpu_blocks + cross_gpu_blocks
    assert manager.can_swap_out(seq_group)
    free_cpu_before = manager.get_num_free_cpu_blocks()
    free_gpu_before = manager.get_num_free_gpu_blocks()
    mapping = manager.swap_out(seq_group)
    assert [pair[0] for pair in mapping] == gpu_blocks
    free_cpu_after = manager.get_num_free_cpu_blocks()
    free_gpu_after = manager.get_num_free_gpu_blocks()
    moved = len(gpu_blocks)
    assert free_cpu_before == free_cpu_after + moved
    assert free_gpu_before + moved == free_gpu_after
    decoder_prompt.status = SequenceStatus.SWAPPED

    # --- CPU -> GPU -------------------------------------------------------
    decoder_cpu_blocks = manager.get_block_table(decoder_prompt)
    cross_cpu_blocks = manager.get_cross_block_table(seq_group)
    cpu_blocks = decoder_cpu_blocks + cross_cpu_blocks
    assert manager.can_swap_in(seq_group) == AllocStatus.OK
    free_cpu_before = manager.get_num_free_cpu_blocks()
    free_gpu_before = manager.get_num_free_gpu_blocks()
    mapping = manager.swap_in(seq_group)
    assert [pair[0] for pair in mapping] == cpu_blocks
    free_cpu_after = manager.get_num_free_cpu_blocks()
    free_gpu_after = manager.get_num_free_gpu_blocks()
    moved = len(cpu_blocks)
    assert free_cpu_before + moved == free_cpu_after
    assert free_gpu_before == free_gpu_after + moved
def test_free():
    """Freeing a sequence returns its blocks and drops its block table."""
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    manager = BlockSpaceManagerV1(block_size,
                                  num_cpu_blocks,
                                  num_gpu_blocks,
                                  watermark=0)

    prompt, seq_group = create_dummy_prompt("1", block_size)
    manager.allocate(seq_group)

    # Free the allocated sequence and compare the free-block counts.
    num_prompt_blocks = len(manager.get_block_table(prompt))
    free_before = manager.get_num_free_gpu_blocks()
    manager.free(prompt)
    free_after = manager.get_num_free_gpu_blocks()
    assert free_after == free_before + num_prompt_blocks

    # The block table of the freed sequence no longer exists.
    with pytest.raises(KeyError):
        manager.get_block_table(prompt)
def test_free_encoder_decoder():
    """Freeing decoder and cross blocks returns all of them to the GPU pool."""
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    manager = BlockSpaceManagerV1(block_size,
                                  num_cpu_blocks,
                                  num_gpu_blocks,
                                  watermark=0)

    decoder_prompt, encoder_prompt, seq_group = \
        create_dummy_prompt_encoder_decoder(
            "1",
            decoder_prompt_length=block_size,
            encoder_prompt_length=block_size)
    manager.allocate(seq_group)

    # Count the blocks held by the decoder table and the cross table.
    num_decoder_blocks = len(manager.get_block_table(decoder_prompt))
    num_encoder_blocks = len(manager.get_cross_block_table(seq_group))
    num_prompt_blocks = num_decoder_blocks + num_encoder_blocks

    # Free both and compare the free-block counts.
    free_before = manager.get_num_free_gpu_blocks()
    manager.free(decoder_prompt)
    manager.free_cross(seq_group)
    free_after = manager.get_num_free_gpu_blocks()
    assert free_after == free_before + num_prompt_blocks

    # Looking up the block table of either freed sequence must fail;
    # the decoder is checked first, then the encoder.
    for freed_seq in (decoder_prompt, encoder_prompt):
        with pytest.raises(KeyError):
            manager.get_block_table(freed_seq)
def test_reset():
    """Resetting the block manager frees every allocated GPU block."""
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    manager = BlockSpaceManagerV1(block_size,
                                  num_cpu_blocks,
                                  num_gpu_blocks,
                                  watermark=0)

    # Remember the initial free count, then occupy every gpu block with a
    # one-block prompt.
    initial_free = manager.get_num_free_gpu_blocks()
    for request_index in range(num_gpu_blocks):
        _, seq_group = create_dummy_prompt(str(request_index), block_size)
        manager.allocate(seq_group)
    assert manager.get_num_free_gpu_blocks() == 0

    # After a reset the free count is back to its initial value.
    manager.reset()
    assert manager.get_num_free_gpu_blocks() == initial_free
def test_reset_encoder_decoder():
    """Resetting frees every GPU block held by encoder/decoder groups."""
    block_size = 4
    num_cpu_blocks = 4
    num_gpu_blocks = 4
    # Each group carries a block_size decoder prompt and a block_size
    # encoder prompt, i.e. two blocks per sequence group.
    block_req_per_seq_group = 2
    manager = BlockSpaceManagerV1(block_size,
                                  num_cpu_blocks,
                                  num_gpu_blocks,
                                  watermark=0)

    # Remember the initial free count, then occupy every gpu block.
    initial_free = manager.get_num_free_gpu_blocks()
    num_groups = num_gpu_blocks // block_req_per_seq_group
    for i in range(num_groups):
        _, _, seq_group = create_dummy_prompt_encoder_decoder(
            f"{i}",
            decoder_prompt_length=block_size,
            encoder_prompt_length=block_size)
        manager.allocate(seq_group)
    assert manager.get_num_free_gpu_blocks() == 0

    # After a reset the free count is back to its initial value.
    manager.reset()
    assert manager.get_num_free_gpu_blocks() == initial_free
def test_sliding_window_multi_seq():
    """
    Check block accounting across allocate / fork / append / free for two
    sequences that are longer than the sliding window.

    A three-token parent is allocated with ``block_size=1`` and
    ``sliding_window=2``, forked into a child, and each sequence then gets
    one extra token. The number of GPU blocks in use is asserted after
    every step, and all blocks must be free once both sequences are freed.
    """
    block_size = 1
    num_cpu_blocks = 8
    num_gpu_blocks = 8
    sliding_window = 2
    # NOTE(review): the positional order used here is (block_size, cpu, gpu);
    # confirm it matches the constructor's parameter order. Both counts are
    # equal, so this test behaves the same either way.
    manager = BlockSpaceManagerV1(block_size,
                                  num_cpu_blocks,
                                  num_gpu_blocks,
                                  sliding_window=sliding_window,
                                  watermark=0)

    def blocks_in_use():
        # GPU blocks currently taken, derived from the free-block counter.
        return num_gpu_blocks - manager.get_num_free_gpu_blocks()

    def push_token(seq, new_token):
        # Append one token to ``seq`` and let the manager provide its slot.
        seq.append_token_id(new_token, {new_token: Logprob(0.0)})
        manager.append_slots(seq)

    # Nothing has been allocated yet.
    assert blocks_in_use() == 0

    prompt_inputs = {
        "prompt": "one two three",
        "prompt_token_ids": [0, 1, 2],
    }
    parent = Sequence(seq_id=1, inputs=prompt_inputs, block_size=block_size)
    group = SequenceGroup(request_id="1",
                          seqs=[parent],
                          arrival_time=time.time(),
                          sampling_params=SamplingParams(),
                          lora_request=None)
    manager.allocate(group)

    # The parent has 3 tokens, but with a sliding window of 2 at most
    # 2 blocks are used.
    assert blocks_in_use() == sliding_window

    # Fork the prompt and copy the block tables.
    child = parent.fork(2)
    manager.fork(parent, child)

    # Forking does not increase memory consumption.
    assert blocks_in_use() == sliding_window

    # Parent and child share all blocks right after the fork.
    parent_table_after_fork = manager.get_block_table(parent)
    child_table_after_fork = manager.get_block_table(child)
    assert parent_table_after_fork == child_table_after_fork

    # Append a token to the child. The block is shared, so copy-on-write
    # occurs: one more block is used. Each sequence uses 2 blocks, but only
    # one of them can be shared.
    push_token(child, 4)
    assert blocks_in_use() == sliding_window + 1

    # Appending to the parent changes nothing, because both sequences are
    # still sharing just one block.
    push_token(parent, 5)
    assert blocks_in_use() == sliding_window + 1

    parent_table = manager.get_block_table(parent)
    child_table = manager.get_block_table(child)
    assert parent_table != child_table

    # Both sequences share the second-to-last block.
    assert parent_table[-2] == child_table[-2]

    # Clean up. Freeing the parent lowers the ref count of two blocks by
    # one. One of those was used only by the parent and becomes free; the
    # child still consumes sliding_window blocks.
    manager.free(parent)
    assert blocks_in_use() == sliding_window

    # Freeing the child releases everything that is left.
    manager.free(child)
    assert blocks_in_use() == 0
tests/core/test_chunked_prefill_scheduler.py
View file @
ad385667
...
@@ -21,7 +21,7 @@ def append_new_token(seq_group, token_id: int):
...
@@ -21,7 +21,7 @@ def append_new_token(seq_group, token_id: int):
def
schedule_and_update_computed_tokens
(
scheduler
):
def
schedule_and_update_computed_tokens
(
scheduler
):
metas
,
out
=
scheduler
.
schedule
()
metas
,
out
,
_
=
scheduler
.
schedule
()
for
s
,
meta
in
zip
(
out
.
scheduled_seq_groups
,
metas
):
for
s
,
meta
in
zip
(
out
.
scheduled_seq_groups
,
metas
):
s
.
seq_group
.
update_num_computed_tokens
(
meta
.
token_chunk_size
)
s
.
seq_group
.
update_num_computed_tokens
(
meta
.
token_chunk_size
)
return
metas
,
out
return
metas
,
out
...
@@ -45,7 +45,9 @@ def test_simple():
...
@@ -45,7 +45,9 @@ def test_simple():
# Add seq groups to scheduler.
# Add seq groups to scheduler.
for
i
in
range
(
num_seq_group
):
for
i
in
range
(
num_seq_group
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
block_size
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
block_size
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
running
.
append
(
seq_group
)
running
.
append
(
seq_group
)
...
@@ -75,24 +77,29 @@ def test_chunk():
...
@@ -75,24 +77,29 @@ def test_chunk():
max_seqs
=
60
max_seqs
=
60
max_model_len
=
80
max_model_len
=
80
max_num_batched_tokens
=
64
max_num_batched_tokens
=
64
scheduler_config
=
SchedulerConfig
(
max_num_batched_tokens
,
scheduler_config
=
SchedulerConfig
(
max_seqs
,
max_num_batched_tokens
,
max_model_len
,
max_seqs
,
enable_chunked_prefill
=
True
)
max_model_len
,
enable_chunked_prefill
=
True
,
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
32
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
32
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
List
[
SequenceGroup
]
=
[]
running
:
List
[
SequenceGroup
]
=
[]
# Add seq groups to scheduler.
# Add seq groups to scheduler.
for
i
in
range
(
2
):
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
running
.
append
(
seq_group
)
running
.
append
(
seq_group
)
# Verify the second request is chunked.
# Verify the second request is chunked.
seq_group_meta
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
seq_group_meta
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
print
()
assert
set
(
get_sequence_groups
(
out
))
==
set
(
running
)
assert
set
(
get_sequence_groups
(
out
))
==
set
(
running
)
assert
seq_group_meta
[
0
].
token_chunk_size
==
60
assert
seq_group_meta
[
0
].
token_chunk_size
==
60
# Verify it is chunked.
# Verify it is chunked.
...
@@ -118,19 +125,23 @@ def test_complex():
...
@@ -118,19 +125,23 @@ def test_complex():
max_seqs
=
60
max_seqs
=
60
max_model_len
=
80
max_model_len
=
80
max_num_batched_tokens
=
64
max_num_batched_tokens
=
64
scheduler_config
=
SchedulerConfig
(
max_num_batched_tokens
,
scheduler_config
=
SchedulerConfig
(
max_seqs
,
max_num_batched_tokens
,
max_model_len
,
max_seqs
,
enable_chunked_prefill
=
True
)
max_model_len
,
enable_chunked_prefill
=
True
,
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
64
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
64
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
List
[
SequenceGroup
]
=
[]
running
:
List
[
SequenceGroup
]
=
[]
# Add seq groups to scheduler.
# Add seq groups to scheduler.
for
i
in
range
(
2
):
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
running
.
append
(
seq_group
)
running
.
append
(
seq_group
)
assert
seq_group
.
is_prefill
()
assert
seq_group
.
is_prefill
()
...
@@ -151,7 +162,9 @@ def test_complex():
...
@@ -151,7 +162,9 @@ def test_complex():
# Add 2 more requests.
# Add 2 more requests.
for
i
in
range
(
2
,
4
):
for
i
in
range
(
2
,
4
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
running
.
append
(
seq_group
)
running
.
append
(
seq_group
)
...
@@ -180,12 +193,14 @@ def test_maximal_decoding():
...
@@ -180,12 +193,14 @@ def test_maximal_decoding():
"""Verify decoding requests are prioritized."""
"""Verify decoding requests are prioritized."""
block_size
=
4
block_size
=
4
max_seqs
=
2
max_seqs
=
2
max_model_len
=
2
max_model_len
=
8
max_num_batched_tokens
=
2
max_num_batched_tokens
=
2
scheduler_config
=
SchedulerConfig
(
max_num_batched_tokens
,
scheduler_config
=
SchedulerConfig
(
max_seqs
,
max_num_batched_tokens
,
max_model_len
,
max_seqs
,
enable_chunked_prefill
=
True
)
max_model_len
,
enable_chunked_prefill
=
True
,
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
...
@@ -194,7 +209,9 @@ def test_maximal_decoding():
...
@@ -194,7 +209,9 @@ def test_maximal_decoding():
# Add seq groups to scheduler.
# Add seq groups to scheduler.
for
i
in
range
(
2
):
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
2
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
running
.
append
(
seq_group
)
running
.
append
(
seq_group
)
assert
seq_group
.
is_prefill
()
assert
seq_group
.
is_prefill
()
...
@@ -211,7 +228,9 @@ def test_maximal_decoding():
...
@@ -211,7 +228,9 @@ def test_maximal_decoding():
append_new_token
(
running
[
0
],
1
)
append_new_token
(
running
[
0
],
1
)
# Create one more seq_group.
# Create one more seq_group.
_
,
seq_group
=
create_dummy_prompt
(
"3"
,
prompt_length
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
"3"
,
prompt_length
=
2
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
running
.
append
(
seq_group
)
running
.
append
(
seq_group
)
assert
seq_group
.
is_prefill
()
assert
seq_group
.
is_prefill
()
...
@@ -269,17 +288,21 @@ def test_prompt_limit():
...
@@ -269,17 +288,21 @@ def test_prompt_limit():
max_seqs
=
32
max_seqs
=
32
max_model_len
=
64
max_model_len
=
64
max_num_batched_tokens
=
32
max_num_batched_tokens
=
32
scheduler_config
=
SchedulerConfig
(
max_num_batched_tokens
,
scheduler_config
=
SchedulerConfig
(
max_seqs
,
max_num_batched_tokens
,
max_model_len
,
max_seqs
,
enable_chunked_prefill
=
True
)
max_model_len
,
enable_chunked_prefill
=
True
,
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
16
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
16
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
List
[
SequenceGroup
]
=
[]
running
:
List
[
SequenceGroup
]
=
[]
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
48
)
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
48
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
running
.
append
(
seq_group
)
running
.
append
(
seq_group
)
assert
seq_group
.
is_prefill
()
assert
seq_group
.
is_prefill
()
...
@@ -303,12 +326,13 @@ def test_prompt_limit_exceed():
...
@@ -303,12 +326,13 @@ def test_prompt_limit_exceed():
max_model_len
,
max_model_len
,
enable_chunked_prefill
=
True
)
enable_chunked_prefill
=
True
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
16
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
16
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
List
[
SequenceGroup
]
=
[]
running
:
List
[
SequenceGroup
]
=
[]
_
,
seq_group
=
create_dummy_prompt
(
"2"
,
_
,
seq_group
=
create_dummy_prompt
(
"2"
,
prompt_length
=
48
)
prompt_length
=
48
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
running
.
append
(
seq_group
)
running
.
append
(
seq_group
)
assert
seq_group
.
is_prefill
()
assert
seq_group
.
is_prefill
()
...
@@ -323,16 +347,21 @@ def test_swap():
...
@@ -323,16 +347,21 @@ def test_swap():
max_seqs
=
30
max_seqs
=
30
max_model_len
=
200
max_model_len
=
200
max_num_batched_tokens
=
30
max_num_batched_tokens
=
30
scheduler_config
=
SchedulerConfig
(
max_num_batched_tokens
,
scheduler_config
=
SchedulerConfig
(
max_seqs
,
max_num_batched_tokens
,
max_model_len
,
max_seqs
,
enable_chunked_prefill
=
True
)
max_model_len
,
enable_chunked_prefill
=
True
,
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
16
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
16
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
_
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
_
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
# The request is chunked.
# The request is chunked.
...
@@ -374,16 +403,21 @@ def test_running_prefill_prioritized_over_swap():
...
@@ -374,16 +403,21 @@ def test_running_prefill_prioritized_over_swap():
max_seqs
=
30
max_seqs
=
30
max_model_len
=
200
max_model_len
=
200
max_num_batched_tokens
=
30
max_num_batched_tokens
=
30
scheduler_config
=
SchedulerConfig
(
max_num_batched_tokens
,
scheduler_config
=
SchedulerConfig
(
max_seqs
,
max_num_batched_tokens
,
max_model_len
,
max_seqs
,
enable_chunked_prefill
=
True
)
max_model_len
,
enable_chunked_prefill
=
True
,
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
32
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
32
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
_
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
_
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
# The request is chunked.
# The request is chunked.
...
@@ -413,7 +447,9 @@ def test_running_prefill_prioritized_over_swap():
...
@@ -413,7 +447,9 @@ def test_running_prefill_prioritized_over_swap():
scheduler
.
block_manager
.
can_swap_in
=
MagicMock
()
scheduler
.
block_manager
.
can_swap_in
=
MagicMock
()
scheduler
.
block_manager
.
can_swap_in
.
return_value
=
AllocStatus
.
LATER
scheduler
.
block_manager
.
can_swap_in
.
return_value
=
AllocStatus
.
LATER
_
,
seq_group2
=
create_dummy_prompt
(
"2"
,
prompt_length
=
60
)
_
,
seq_group2
=
create_dummy_prompt
(
"2"
,
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group2
)
scheduler
.
add_seq_group
(
seq_group2
)
_
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
_
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
assert
len
(
out
.
scheduled_seq_groups
)
==
1
assert
len
(
out
.
scheduled_seq_groups
)
==
1
...
@@ -461,16 +497,20 @@ def test_chunked_prefill_preempt():
...
@@ -461,16 +497,20 @@ def test_chunked_prefill_preempt():
max_seqs
=
30
max_seqs
=
30
max_model_len
=
200
max_model_len
=
200
max_num_batched_tokens
=
30
max_num_batched_tokens
=
30
scheduler_config
=
SchedulerConfig
(
max_num_batched_tokens
,
scheduler_config
=
SchedulerConfig
(
max_seqs
,
max_num_batched_tokens
,
max_model_len
,
max_seqs
,
enable_chunked_prefill
=
True
)
max_model_len
,
enable_chunked_prefill
=
True
,
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
16
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
16
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
_
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
_
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
# The request is chunked.
# The request is chunked.
...
@@ -522,17 +562,21 @@ def test_chunked_prefill_max_seqs():
...
@@ -522,17 +562,21 @@ def test_chunked_prefill_max_seqs():
max_seqs
=
2
max_seqs
=
2
max_model_len
=
80
max_model_len
=
80
max_num_batched_tokens
=
64
max_num_batched_tokens
=
64
scheduler_config
=
SchedulerConfig
(
max_num_batched_tokens
,
scheduler_config
=
SchedulerConfig
(
max_seqs
,
max_num_batched_tokens
,
max_model_len
,
max_seqs
,
enable_chunked_prefill
=
True
)
max_model_len
,
enable_chunked_prefill
=
True
,
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
12
8
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
12
8
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
List
[
SequenceGroup
]
=
[]
running
:
List
[
SequenceGroup
]
=
[]
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
65
)
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
65
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
running
.
append
(
seq_group
)
running
.
append
(
seq_group
)
# The first prefill is chunked.
# The first prefill is chunked.
...
@@ -542,7 +586,9 @@ def test_chunked_prefill_max_seqs():
...
@@ -542,7 +586,9 @@ def test_chunked_prefill_max_seqs():
# Add new requests.
# Add new requests.
for
i
in
range
(
4
):
for
i
in
range
(
4
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
65
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
65
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
running
.
append
(
seq_group
)
running
.
append
(
seq_group
)
...
@@ -562,3 +608,44 @@ def test_chunked_prefill_max_seqs():
...
@@ -562,3 +608,44 @@ def test_chunked_prefill_max_seqs():
assert
len
(
get_sequence_groups
(
out
))
==
max_seqs
assert
len
(
get_sequence_groups
(
out
))
==
max_seqs
assert
not
running
[
0
].
is_prefill
()
assert
not
running
[
0
].
is_prefill
()
assert
not
running
[
1
].
is_prefill
()
assert
not
running
[
1
].
is_prefill
()
def test_perfix_caching():
    """Verify that only full blocks are allocated with prefix caching on.

    Two 50-token prompts are scheduled with chunked prefill enabled and a
    64-token batch budget. The first prompt is expected to be scheduled
    whole; the second is expected to get a 12-token chunk, for 62 batched
    tokens in total.
    """
    block_size = 4
    max_seqs = 10
    max_model_len = 80
    max_num_batched_tokens = 64

    scheduler_config = SchedulerConfig(
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
    )
    cache_config = CacheConfig(block_size,
                               1.0,
                               1,
                               "auto",
                               enable_prefix_caching=True)
    cache_config.num_cpu_blocks = 0
    cache_config.num_gpu_blocks = 32
    scheduler = Scheduler(scheduler_config, cache_config, None)

    # Register two prompts and remember their groups for comparison below.
    submitted: List[SequenceGroup] = []
    for request_idx in range(2):
        _unused_seq, group = create_dummy_prompt(str(request_idx),
                                                 block_size=block_size,
                                                 prompt_length=50)
        scheduler.add_seq_group(group)
        submitted.append(group)

    metas, scheduled = schedule_and_update_computed_tokens(scheduler)
    assert set(get_sequence_groups(scheduled)) == set(submitted)

    first_chunk = metas[0].token_chunk_size
    second_chunk = metas[1].token_chunk_size
    assert first_chunk == 50
    # The second prompt is chunked. Although the remaining budget is
    # 64-50=14, only full blocks are allocated for prefix caching, so just
    # 4*(14//4)=12 tokens are scheduled.
    assert second_chunk == 12
    assert scheduled.num_prefill_groups == 2
    assert scheduled.num_batched_tokens == 62
tests/core/test_num_computed_tokens_update.py
0 → 100644
View file @
ad385667
import
pytest
from
tests.conftest
import
VllmRunner
from
tests.core.utils
import
create_dummy_prompt
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
SequenceGroup
MODEL
=
"JackFram/llama-160m"
def add_seq_group_to_engine(engine: LLMEngine, seq_group: SequenceGroup):
    """Queue ``seq_group`` on the first scheduler of ``engine``.

    Args:
        engine: Engine whose ``scheduler`` attribute is indexed at 0.
        seq_group: Sequence group passed to that scheduler's
            ``add_seq_group``.
    """
    engine.scheduler[0].add_seq_group(seq_group)
@pytest.mark.parametrize("num_scheduler_steps", [1, 8])
@pytest.mark.parametrize("enable_chunked_prefill", [False, True])
@pytest.mark.parametrize("enforce_eager", [False, True])
def test_num_computed_tokens_update(num_scheduler_steps: int,
                                    enable_chunked_prefill: bool,
                                    enforce_eager: bool):
    """Track ``num_computed_tokens`` of a sequence through engine steps.

    For several output lengths, a dummy prompt is added to the engine and
    stepped to completion. The computed-token count is checked at three
    points: zero before any step, ``prompt_len + num_prompt_steps - 1``
    after the prompt steps (when the sequence is still running), and
    ``prompt_len + num_output_tokens - 1`` once the sequence has finished.
    During decoding it must grow by one per engine step.

    The multi-step + chunked-prefill combination is skipped on ROCm.
    """
    multi_step_chunked_prefill = (num_scheduler_steps > 1
                                  and enable_chunked_prefill)
    if multi_step_chunked_prefill and current_platform.is_rocm():
        pytest.skip("Multi-step with Chunked-Prefill does not support "
                    "rocm_flash_attn backend")

    # Build a vLLM engine; the runner is kept in a local for the whole test.
    runner = VllmRunner(model_name=MODEL,
                        gpu_memory_utilization=0.7,
                        num_scheduler_steps=num_scheduler_steps,
                        enable_chunked_prefill=enable_chunked_prefill,
                        enforce_eager=enforce_eager)
    engine: LLMEngine = runner.model.llm_engine

    # In multi-step + chunked-prefill there is no separate single prompt
    # step: whatever is scheduled always runs for num_scheduler_steps.
    if multi_step_chunked_prefill:
        prompt_steps = num_scheduler_steps
    else:
        prompt_steps = 1

    output_lengths = [4, 8, 12, 15, 16, 17]
    prompt_len = 10

    for req_idx, output_len in enumerate(output_lengths):
        # Create the sequence and hand its group to the engine.
        seq, group = create_dummy_prompt(request_id=str(req_idx),
                                         prompt_length=prompt_len,
                                         min_tokens=output_len,
                                         max_tokens=output_len)
        add_seq_group_to_engine(engine, group)

        assert seq.data.get_num_computed_tokens() == 0

        # Prompt phase.
        for _ in range(prompt_steps):
            engine.step()

        if not seq.is_finished():
            # Count right after the prompt steps; baseline for decoding.
            computed_after_prompt = seq.data.get_num_computed_tokens()
            assert computed_after_prompt == prompt_len + prompt_steps - 1

        # Decode phase: one token is expected per engine step.
        decode_steps_done = 0
        while not seq.is_finished():
            expected_now = computed_after_prompt + decode_steps_done
            assert seq.data.get_num_computed_tokens() == expected_now
            for _ in range(num_scheduler_steps):
                engine.step()
            decode_steps_done += num_scheduler_steps

        # Final count once the sequence has finished.
        expected_final = prompt_len + output_len - 1
        assert seq.data.get_num_computed_tokens() == expected_final
tests/core/test_scheduler.py
View file @
ad385667
...
@@ -4,43 +4,26 @@ from typing import List, Set, Tuple
...
@@ -4,43 +4,26 @@ from typing import List, Set, Tuple
from
unittest.mock
import
MagicMock
from
unittest.mock
import
MagicMock
import
pytest
# noqa
import
pytest
# noqa
from
torch
import
Use
# noqa
from
vllm.config
import
CacheConfig
,
LoRAConfig
,
SchedulerConfig
from
vllm.config
import
CacheConfig
,
LoRAConfig
,
SchedulerConfig
from
vllm.core.interfaces
import
AllocStatus
from
vllm.core.interfaces
import
AllocStatus
from
vllm.core.scheduler
import
Scheduler
,
SchedulingBudget
from
vllm.core.scheduler
import
Scheduler
,
SchedulingBudget
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
from
vllm.sequence
import
Logprob
,
SequenceGroup
,
SequenceStatus
from
vllm.sequence
import
SequenceGroup
,
SequenceStatus
from
.utils
import
create_dummy_prompt
from
.utils
import
(
append_new_token
,
append_new_token_seq_group
,
create_dummy_prompt
,
get_sequence_groups
,
schedule_and_update_computed_tokens
)
def
get_sequence_groups
(
scheduler_output
):
return
[
s
.
seq_group
for
s
in
scheduler_output
.
scheduled_seq_groups
]
def
append_new_token
(
out
,
token_id
:
int
):
seq_groups
=
get_sequence_groups
(
out
)
for
seq_group
in
seq_groups
:
for
seq
in
seq_group
.
get_seqs
():
seq
.
append_token_id
(
token_id
,
{
token_id
:
Logprob
(
token_id
)})
def
schedule_and_update_computed_tokens
(
scheduler
):
metas
,
out
=
scheduler
.
schedule
()
for
s
,
meta
in
zip
(
out
.
scheduled_seq_groups
,
metas
):
s
.
seq_group
.
update_num_computed_tokens
(
meta
.
token_chunk_size
)
return
metas
,
out
def
append_new_token_seq_group
(
token_chunk_size
,
seq_group
,
token_id
:
int
):
seq_group
.
update_num_computed_tokens
(
token_chunk_size
)
for
seq
in
seq_group
.
get_seqs
():
seq
.
append_token_id
(
token_id
,
{
token_id
:
Logprob
(
token_id
)})
def
test_scheduler_add_seq_group
():
def
test_scheduler_add_seq_group
():
block_size
=
4
block_size
=
4
scheduler_config
=
SchedulerConfig
(
100
,
64
,
1
)
scheduler_config
=
SchedulerConfig
(
100
,
64
,
1
,
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
cache_dtype
=
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
cache_dtype
=
"auto"
)
cache_config
.
num_cpu_blocks
=
4
cache_config
.
num_cpu_blocks
=
4
cache_config
.
num_gpu_blocks
=
4
cache_config
.
num_gpu_blocks
=
4
...
@@ -49,14 +32,20 @@ def test_scheduler_add_seq_group():
...
@@ -49,14 +32,20 @@ def test_scheduler_add_seq_group():
# Add seq group to scheduler.
# Add seq group to scheduler.
num_seq_group
=
4
num_seq_group
=
4
for
i
in
range
(
num_seq_group
):
for
i
in
range
(
num_seq_group
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
block_size
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
block_size
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
assert
scheduler
.
get_num_unfinished_seq_groups
()
==
i
+
1
assert
scheduler
.
get_num_unfinished_seq_groups
()
==
i
+
1
def
test_scheduler_abort_seq_group
():
def
test_scheduler_abort_seq_group
():
block_size
=
4
block_size
=
4
scheduler_config
=
SchedulerConfig
(
100
,
64
,
1
)
scheduler_config
=
SchedulerConfig
(
100
,
64
,
1
,
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
4
cache_config
.
num_cpu_blocks
=
4
cache_config
.
num_gpu_blocks
=
4
cache_config
.
num_gpu_blocks
=
4
...
@@ -80,7 +69,11 @@ def test_scheduler_schedule_simple():
...
@@ -80,7 +69,11 @@ def test_scheduler_schedule_simple():
block_size
=
4
block_size
=
4
num_seq_group
=
4
num_seq_group
=
4
max_model_len
=
16
max_model_len
=
16
scheduler_config
=
SchedulerConfig
(
64
,
num_seq_group
,
max_model_len
)
scheduler_config
=
SchedulerConfig
(
64
,
num_seq_group
,
max_model_len
,
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
...
@@ -89,7 +82,9 @@ def test_scheduler_schedule_simple():
...
@@ -89,7 +82,9 @@ def test_scheduler_schedule_simple():
# Add seq groups to scheduler.
# Add seq groups to scheduler.
for
i
in
range
(
num_seq_group
):
for
i
in
range
(
num_seq_group
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
block_size
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
block_size
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
running
.
append
(
seq_group
)
running
.
append
(
seq_group
)
...
@@ -118,15 +113,18 @@ def test_scheduler_prefill_prioritized():
...
@@ -118,15 +113,18 @@ def test_scheduler_prefill_prioritized():
block_size
=
4
block_size
=
4
max_model_len
=
30
max_model_len
=
30
max_batched_num_tokens
=
30
max_batched_num_tokens
=
30
scheduler_config
=
SchedulerConfig
(
max_batched_num_tokens
,
2
,
scheduler_config
=
SchedulerConfig
(
max_model_len
)
max_batched_num_tokens
,
2
,
max_model_len
,
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
2
cache_config
.
num_cpu_blocks
=
16
cache_config
.
num_gpu_blocks
=
2
cache_config
.
num_gpu_blocks
=
16
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
# Add seq groups to scheduler.
# Add seq groups to scheduler.
_
,
seq_group_a
=
create_dummy_prompt
(
"1"
,
1
)
_
,
seq_group_a
=
create_dummy_prompt
(
"1"
,
1
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group_a
)
scheduler
.
add_seq_group
(
seq_group_a
)
# Schedule seq groups prompts.
# Schedule seq groups prompts.
...
@@ -134,7 +132,7 @@ def test_scheduler_prefill_prioritized():
...
@@ -134,7 +132,7 @@ def test_scheduler_prefill_prioritized():
assert
get_sequence_groups
(
out
)
==
[
seq_group_a
]
assert
get_sequence_groups
(
out
)
==
[
seq_group_a
]
# Add a new prefill request B.
# Add a new prefill request B.
_
,
seq_group_b
=
create_dummy_prompt
(
"2"
,
30
)
_
,
seq_group_b
=
create_dummy_prompt
(
"2"
,
30
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group_b
)
scheduler
.
add_seq_group
(
seq_group_b
)
# Verify prefill requests are prioritized. Since max_batched_num_tokens
# Verify prefill requests are prioritized. Since max_batched_num_tokens
...
@@ -146,15 +144,23 @@ def test_scheduler_prefill_prioritized():
...
@@ -146,15 +144,23 @@ def test_scheduler_prefill_prioritized():
def
test_scheduler_schedule_preempt_abort
():
def
test_scheduler_schedule_preempt_abort
():
block_size
=
4
block_size
=
4
max_model_len
=
16
max_model_len
=
16
scheduler_config
=
SchedulerConfig
(
64
,
2
,
max_model_len
)
scheduler_config
=
SchedulerConfig
(
64
,
2
,
max_model_len
,
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
2
cache_config
.
num_cpu_blocks
=
2
cache_config
.
num_gpu_blocks
=
2
cache_config
.
num_gpu_blocks
=
2
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
# Add seq groups to scheduler.
# Add seq groups to scheduler.
seq_a
,
seq_group_a
=
create_dummy_prompt
(
"1"
,
block_size
)
seq_a
,
seq_group_a
=
create_dummy_prompt
(
"1"
,
seq_b
,
seq_group_b
=
create_dummy_prompt
(
"2"
,
block_size
)
block_size
,
block_size
=
block_size
)
seq_b
,
seq_group_b
=
create_dummy_prompt
(
"2"
,
block_size
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group_a
)
scheduler
.
add_seq_group
(
seq_group_a
)
scheduler
.
add_seq_group
(
seq_group_b
)
scheduler
.
add_seq_group
(
seq_group_b
)
...
@@ -197,7 +203,11 @@ def test_scheduler_max_seqs():
...
@@ -197,7 +203,11 @@ def test_scheduler_max_seqs():
num_seq_group
=
4
num_seq_group
=
4
max_seq_group
=
2
max_seq_group
=
2
max_model_len
=
16
max_model_len
=
16
scheduler_config
=
SchedulerConfig
(
64
,
max_seq_group
,
max_model_len
)
scheduler_config
=
SchedulerConfig
(
64
,
max_seq_group
,
max_model_len
,
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
...
@@ -206,7 +216,9 @@ def test_scheduler_max_seqs():
...
@@ -206,7 +216,9 @@ def test_scheduler_max_seqs():
all_seq_groups
:
List
[
SequenceGroup
]
=
[]
all_seq_groups
:
List
[
SequenceGroup
]
=
[]
# Add seq groups to scheduler.
# Add seq groups to scheduler.
for
i
in
range
(
num_seq_group
):
for
i
in
range
(
num_seq_group
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
block_size
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
block_size
,
block_size
=
block_size
)
all_seq_groups
.
append
(
seq_group
)
all_seq_groups
.
append
(
seq_group
)
# Append 1 seq group
# Append 1 seq group
...
@@ -235,7 +247,12 @@ def test_scheduler_max_seqs():
...
@@ -235,7 +247,12 @@ def test_scheduler_max_seqs():
def
test_scheduler_delay_factor
():
def
test_scheduler_delay_factor
():
block_size
=
4
block_size
=
4
scheduler_config
=
SchedulerConfig
(
100
,
64
,
16
,
delay_factor
=
0.5
)
scheduler_config
=
SchedulerConfig
(
100
,
64
,
16
,
delay_factor
=
0.5
,
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
...
@@ -243,7 +260,8 @@ def test_scheduler_delay_factor():
...
@@ -243,7 +260,8 @@ def test_scheduler_delay_factor():
# schedule first prompt
# schedule first prompt
seq_group_meta
,
seq_group
=
create_dummy_prompt
(
"0"
,
seq_group_meta
,
seq_group
=
create_dummy_prompt
(
"0"
,
prompt_length
=
block_size
)
prompt_length
=
block_size
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
seq_group_meta
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
seq_group_meta
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
assert
out
.
num_prefill_groups
>
0
assert
out
.
num_prefill_groups
>
0
...
@@ -253,7 +271,8 @@ def test_scheduler_delay_factor():
...
@@ -253,7 +271,8 @@ def test_scheduler_delay_factor():
# wait for a second before scheduling next prompt
# wait for a second before scheduling next prompt
time
.
sleep
(
1
)
time
.
sleep
(
1
)
seq_group_meta
,
seq_group
=
create_dummy_prompt
(
"1"
,
seq_group_meta
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
block_size
)
prompt_length
=
block_size
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
# second prompt should *not* be scheduled
# second prompt should *not* be scheduled
...
@@ -271,10 +290,17 @@ def test_scheduler_delay_factor():
...
@@ -271,10 +290,17 @@ def test_scheduler_delay_factor():
def
test_swapped_out_prioritized
():
def
test_swapped_out_prioritized
():
scheduler
=
initialize_scheduler
(
max_num_seqs
=
6
)
block_size
=
4
scheduler
=
initialize_scheduler
(
max_num_seqs
=
6
,
block_size
=
block_size
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
# best_of=2 * 3 == 6 sequences.
# best_of=2 * 3 == 6 sequences.
for
i
in
range
(
3
):
for
i
in
range
(
3
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
best_of
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
best_of
=
2
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
seq_group_meta
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
seq_group_meta
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
# prefill scheduled now.
# prefill scheduled now.
...
@@ -298,7 +324,10 @@ def test_swapped_out_prioritized():
...
@@ -298,7 +324,10 @@ def test_swapped_out_prioritized():
append_new_token
(
out
,
1
)
append_new_token
(
out
,
1
)
# Add 1 more task. Swap should be prioritized over prefill.
# Add 1 more task. Swap should be prioritized over prefill.
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
best_of
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
best_of
=
2
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
seq_group_meta
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
seq_group_meta
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
append_new_token
(
out
,
1
)
append_new_token
(
out
,
1
)
...
@@ -309,17 +338,25 @@ def test_swapped_out_prioritized():
...
@@ -309,17 +338,25 @@ def test_swapped_out_prioritized():
assert
out
.
blocks_to_swap_out
==
[]
assert
out
.
blocks_to_swap_out
==
[]
def
initialize_scheduler
(
*
,
def
initialize_scheduler
(
max_num_seqs
=
1000
,
*
,
max_token_budget
=
1000
,
max_num_seqs
=
1000
,
max_model_len
=
1000
,
max_token_budget
=
1000
,
lora_config
=
None
):
max_model_len
=
1000
,
block_size
=
4
lora_config
=
None
,
scheduler_config
=
SchedulerConfig
(
max_token_budget
,
max_num_seqs
,
block_size
=
4
,
max_model_len
)
num_cpu_blocks
=
8
,
num_gpu_blocks
=
8
,
):
block_size
=
block_size
scheduler_config
=
SchedulerConfig
(
max_token_budget
,
max_num_seqs
,
max_model_len
,
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
num_cpu_blocks
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
num_gpu_blocks
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
lora_config
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
lora_config
)
return
scheduler
return
scheduler
...
@@ -345,8 +382,11 @@ def test_prefill_schedule_max_prompt_len():
...
@@ -345,8 +382,11 @@ def test_prefill_schedule_max_prompt_len():
"""
"""
Test prompt longer than max_prompt_len is aborted.
Test prompt longer than max_prompt_len is aborted.
"""
"""
scheduler
=
initialize_scheduler
(
max_model_len
=
30
)
block_size
=
4
_
,
seq_group
=
create_dummy_prompt
(
"0"
,
prompt_length
=
60
)
scheduler
=
initialize_scheduler
(
max_model_len
=
30
,
block_size
=
block_size
)
_
,
seq_group
=
create_dummy_prompt
(
"0"
,
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
budget
=
create_token_budget
()
budget
=
create_token_budget
()
output
=
scheduler
.
_schedule_prefills
(
budget
,
None
)
output
=
scheduler
.
_schedule_prefills
(
budget
,
None
)
...
@@ -362,10 +402,15 @@ def test_prefill_schedule_token_budget():
...
@@ -362,10 +402,15 @@ def test_prefill_schedule_token_budget():
"""
"""
Test token budget respected.
Test token budget respected.
"""
"""
scheduler
=
initialize_scheduler
()
block_size
=
4
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
budget
=
create_token_budget
(
token_budget
=
0
)
budget
=
create_token_budget
(
token_budget
=
0
)
for
i
in
range
(
2
):
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
# 0 token budget == nothing is scheduled.
# 0 token budget == nothing is scheduled.
...
@@ -388,10 +433,14 @@ def test_prefill_schedule_token_budget():
...
@@ -388,10 +433,14 @@ def test_prefill_schedule_token_budget():
assert
len
(
remaining_waiting
)
==
1
assert
len
(
remaining_waiting
)
==
1
# Test when current_batched_tokens respected.
# Test when current_batched_tokens respected.
scheduler
=
initialize_scheduler
()
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
num_cpu_blocks
=
16
,
num_gpu_blocks
=
16
)
budget
=
create_token_budget
(
token_budget
=
60
)
budget
=
create_token_budget
(
token_budget
=
60
)
add_token_budget
(
budget
,
30
,
0
)
add_token_budget
(
budget
,
30
,
0
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
block_size
)
# Cannot schedule a prompt that doesn't fit the budget.
# Cannot schedule a prompt that doesn't fit the budget.
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
output
=
scheduler
.
_schedule_prefills
(
budget
,
None
)
output
=
scheduler
.
_schedule_prefills
(
budget
,
None
)
...
@@ -415,10 +464,15 @@ def test_prefill_schedule_max_seqs():
...
@@ -415,10 +464,15 @@ def test_prefill_schedule_max_seqs():
"""
"""
Test max seq respected.
Test max seq respected.
"""
"""
scheduler
=
initialize_scheduler
()
block_size
=
4
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
budget
=
create_token_budget
(
max_num_seqs
=
2
)
budget
=
create_token_budget
(
max_num_seqs
=
2
)
for
i
in
range
(
3
):
for
i
in
range
(
3
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
output
=
scheduler
.
_schedule_prefills
(
budget
,
None
)
output
=
scheduler
.
_schedule_prefills
(
budget
,
None
)
remaining_waiting
=
scheduler
.
waiting
remaining_waiting
=
scheduler
.
waiting
...
@@ -432,7 +486,9 @@ def test_prefill_schedule_max_seqs():
...
@@ -432,7 +486,9 @@ def test_prefill_schedule_max_seqs():
scheduler
.
waiting
=
deque
()
scheduler
.
waiting
=
deque
()
budget
=
create_token_budget
(
max_num_seqs
=
2
)
budget
=
create_token_budget
(
max_num_seqs
=
2
)
add_token_budget
(
budget
,
0
,
2
)
add_token_budget
(
budget
,
0
,
2
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
output
=
scheduler
.
_schedule_prefills
(
budget
,
None
)
output
=
scheduler
.
_schedule_prefills
(
budget
,
None
)
remaining_waiting
=
scheduler
.
waiting
remaining_waiting
=
scheduler
.
waiting
...
@@ -447,13 +503,18 @@ def test_prefill_schedule_max_lora():
...
@@ -447,13 +503,18 @@ def test_prefill_schedule_max_lora():
"""
"""
Test max lora is respected and prioritized.
Test max lora is respected and prioritized.
"""
"""
block_size
=
4
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
1
)
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
1
)
scheduler
=
initialize_scheduler
(
lora_config
=
lora_config
)
scheduler
=
initialize_scheduler
(
lora_config
=
lora_config
,
block_size
=
block_size
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
budget
=
create_token_budget
(
token_budget
=
120
)
budget
=
create_token_budget
(
token_budget
=
120
)
curr_loras
:
Set
[
int
]
=
set
()
curr_loras
:
Set
[
int
]
=
set
()
for
i
in
range
(
2
):
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
prompt_length
=
60
,
block_size
=
block_size
,
lora_request
=
LoRARequest
(
lora_request
=
LoRARequest
(
lora_name
=
str
(
i
),
lora_name
=
str
(
i
),
lora_int_id
=
i
+
1
,
lora_int_id
=
i
+
1
,
...
@@ -465,7 +526,9 @@ def test_prefill_schedule_max_lora():
...
@@ -465,7 +526,9 @@ def test_prefill_schedule_max_lora():
# If a request is not scheduled because it hits max lora, it is
# If a request is not scheduled because it hits max lora, it is
# prioritized. Verify that.
# prioritized. Verify that.
for
i
in
range
(
2
,
4
):
for
i
in
range
(
2
,
4
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
# Schedule 2 requests (0 and 2)
# Schedule 2 requests (0 and 2)
output
=
scheduler
.
_schedule_prefills
(
budget
,
curr_loras
)
output
=
scheduler
.
_schedule_prefills
(
budget
,
curr_loras
)
...
@@ -493,10 +556,15 @@ def test_prefill_schedule_no_block_manager_capacity():
...
@@ -493,10 +556,15 @@ def test_prefill_schedule_no_block_manager_capacity():
"""
"""
Test sequence cannot be scheduled due to block manager has no capacity.
Test sequence cannot be scheduled due to block manager has no capacity.
"""
"""
scheduler
=
initialize_scheduler
()
block_size
=
4
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
num_gpu_blocks
=
128
,
num_cpu_blocks
=
128
)
budget
=
create_token_budget
()
budget
=
create_token_budget
()
for
i
in
range
(
3
):
for
i
in
range
(
3
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
block_manager
.
can_allocate
=
MagicMock
()
scheduler
.
block_manager
.
can_allocate
=
MagicMock
()
scheduler
.
block_manager
.
can_allocate
.
return_value
=
AllocStatus
.
LATER
scheduler
.
block_manager
.
can_allocate
.
return_value
=
AllocStatus
.
LATER
...
@@ -511,7 +579,9 @@ def test_prefill_schedule_no_block_manager_capacity():
...
@@ -511,7 +579,9 @@ def test_prefill_schedule_no_block_manager_capacity():
scheduler
=
initialize_scheduler
()
scheduler
=
initialize_scheduler
()
budget
=
create_token_budget
()
budget
=
create_token_budget
()
for
i
in
range
(
3
):
for
i
in
range
(
3
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
block_manager
.
can_allocate
=
MagicMock
()
scheduler
.
block_manager
.
can_allocate
=
MagicMock
()
scheduler
.
block_manager
.
can_allocate
.
return_value
=
AllocStatus
.
NEVER
scheduler
.
block_manager
.
can_allocate
.
return_value
=
AllocStatus
.
NEVER
...
@@ -528,10 +598,15 @@ def test_decode_schedule_preempted():
...
@@ -528,10 +598,15 @@ def test_decode_schedule_preempted():
"""
"""
Test decodes cannot be scheduled and preempted.
Test decodes cannot be scheduled and preempted.
"""
"""
scheduler
=
initialize_scheduler
()
block_size
=
4
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
curr_loras
=
None
curr_loras
=
None
for
i
in
range
(
3
):
for
i
in
range
(
3
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
scheduler
.
_add_seq_group_to_running
(
seq_group
)
scheduler
.
_add_seq_group_to_running
(
seq_group
)
...
@@ -567,11 +642,17 @@ def test_decode_swap_beam_search():
...
@@ -567,11 +642,17 @@ def test_decode_swap_beam_search():
"""
"""
Test best_of > 1 swap out blocks
Test best_of > 1 swap out blocks
"""
"""
scheduler
=
initialize_scheduler
()
block_size
=
4
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
num_gpu_blocks
=
64
,
num_cpu_blocks
=
64
)
curr_loras
=
None
curr_loras
=
None
budget
=
create_token_budget
()
budget
=
create_token_budget
()
for
i
in
range
(
3
):
for
i
in
range
(
3
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
best_of
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
best_of
=
2
,
block_size
=
block_size
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
scheduler
.
_add_seq_group_to_running
(
seq_group
)
scheduler
.
_add_seq_group_to_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
...
@@ -615,8 +696,14 @@ def test_schedule_decode_blocks_to_copy_update():
...
@@ -615,8 +696,14 @@ def test_schedule_decode_blocks_to_copy_update():
"""
"""
Verify blocks_to_copy is updated.
Verify blocks_to_copy is updated.
"""
"""
scheduler
=
initialize_scheduler
()
block_size
=
4
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
scheduler
=
initialize_scheduler
(
block_size
=
4
,
num_cpu_blocks
=
16
,
num_gpu_blocks
=
16
)
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
,
block_size
=
block_size
)
curr_loras
=
None
curr_loras
=
None
scheduler
.
_allocate_and_set_running
(
seq_group
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
...
@@ -642,12 +729,16 @@ def test_schedule_decode_blocks_to_copy_update():
...
@@ -642,12 +729,16 @@ def test_schedule_decode_blocks_to_copy_update():
def
test_schedule_swapped_simple
():
def
test_schedule_swapped_simple
():
scheduler
=
initialize_scheduler
()
block_size
=
4
scheduler
=
initialize_scheduler
(
block_size
=
block_size
)
curr_loras
=
None
curr_loras
=
None
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
4
,
best_of
=
2
,
block_size
=
block_size
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
append_new_token_seq_group
(
4
,
seq_group
,
1
)
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
scheduler
.
_add_seq_group_to_swapped
(
seq_group
)
scheduler
.
_add_seq_group_to_swapped
(
seq_group
)
...
@@ -667,11 +758,14 @@ def test_schedule_swapped_simple():
...
@@ -667,11 +758,14 @@ def test_schedule_swapped_simple():
def
test_schedule_swapped_max_token_budget
():
def
test_schedule_swapped_max_token_budget
():
scheduler
=
initialize_scheduler
()
block_size
=
4
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
curr_loras
=
None
curr_loras
=
None
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
for
_
in
range
(
2
):
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
)
,
prompt_length
=
60
,
best_of
=
2
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
...
@@ -699,11 +793,16 @@ def test_schedule_swapped_max_token_budget():
...
@@ -699,11 +793,16 @@ def test_schedule_swapped_max_token_budget():
def
test_schedule_swapped_max_seqs
():
def
test_schedule_swapped_max_seqs
():
scheduler
=
initialize_scheduler
()
block_size
=
4
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
curr_loras
=
None
curr_loras
=
None
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
for
i
in
range
(
4
):
for
i
in
range
(
4
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
4
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
...
@@ -729,13 +828,18 @@ def test_schedule_swapped_max_seqs():
...
@@ -729,13 +828,18 @@ def test_schedule_swapped_max_seqs():
def
test_schedule_swapped_max_loras
():
def
test_schedule_swapped_max_loras
():
block_size
=
4
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
1
)
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
1
)
scheduler
=
initialize_scheduler
(
lora_config
=
lora_config
)
scheduler
=
initialize_scheduler
(
lora_config
=
lora_config
,
block_size
=
block_size
,
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
curr_loras
:
Set
[
int
]
=
set
()
curr_loras
:
Set
[
int
]
=
set
()
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
for
i
in
range
(
2
):
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
prompt_length
=
60
,
block_size
=
block_size
,
lora_request
=
LoRARequest
(
lora_request
=
LoRARequest
(
lora_name
=
str
(
i
),
lora_name
=
str
(
i
),
lora_int_id
=
i
+
1
,
lora_int_id
=
i
+
1
,
...
@@ -757,11 +861,17 @@ def test_schedule_swapped_max_loras():
...
@@ -757,11 +861,17 @@ def test_schedule_swapped_max_loras():
def
test_schedule_swapped_cannot_swap_in
():
def
test_schedule_swapped_cannot_swap_in
():
scheduler
=
initialize_scheduler
()
block_size
=
4
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
curr_loras
=
None
curr_loras
=
None
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
for
_
in
range
(
2
):
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
best_of
=
2
,
block_size
=
block_size
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
...
@@ -782,11 +892,17 @@ def test_schedule_swapped_cannot_swap_in():
...
@@ -782,11 +892,17 @@ def test_schedule_swapped_cannot_swap_in():
def
test_infeasible_swap
():
def
test_infeasible_swap
():
scheduler
=
initialize_scheduler
()
block_size
=
4
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
curr_loras
=
None
curr_loras
=
None
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
for
_
in
range
(
2
):
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
best_of
=
2
,
block_size
=
block_size
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
...
@@ -808,9 +924,15 @@ def test_infeasible_swap():
...
@@ -808,9 +924,15 @@ def test_infeasible_swap():
def
test_schedule_swapped_blocks_to_copy
():
def
test_schedule_swapped_blocks_to_copy
():
scheduler
=
initialize_scheduler
()
block_size
=
4
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
curr_loras
=
None
curr_loras
=
None
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
,
block_size
=
block_size
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
...
...
Prev
1
…
11
12
13
14
15
16
17
18
19
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment