Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
539aa992
Commit
539aa992
authored
Sep 27, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.6.2' into v0.6.2-dev
parents
93872128
7193774b
Changes
383
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
995 additions
and
423 deletions
+995
-423
tests/compile/utils.py
tests/compile/utils.py
+104
-0
tests/conftest.py
tests/conftest.py
+73
-38
tests/core/test_chunked_prefill_scheduler.py
tests/core/test_chunked_prefill_scheduler.py
+143
-82
tests/core/test_scheduler.py
tests/core/test_scheduler.py
+270
-93
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+8
-0
tests/encoder_decoder/__init__.py
tests/encoder_decoder/__init__.py
+0
-0
tests/encoder_decoder/test_e2e_correctness.py
tests/encoder_decoder/test_e2e_correctness.py
+98
-0
tests/engine/test_arg_utils.py
tests/engine/test_arg_utils.py
+40
-1
tests/entrypoints/llm/test_generate.py
tests/entrypoints/llm/test_generate.py
+35
-0
tests/entrypoints/openai/rpc/test_zmq_client.py
tests/entrypoints/openai/rpc/test_zmq_client.py
+0
-120
tests/entrypoints/openai/test_accuracy.py
tests/entrypoints/openai/test_accuracy.py
+29
-31
tests/entrypoints/openai/test_chat_template.py
tests/entrypoints/openai/test_chat_template.py
+1
-1
tests/entrypoints/openai/test_cli_args.py
tests/entrypoints/openai/test_cli_args.py
+91
-0
tests/entrypoints/openai/test_lora_lineage.py
tests/entrypoints/openai/test_lora_lineage.py
+83
-0
tests/entrypoints/openai/test_models.py
tests/entrypoints/openai/test_models.py
+4
-2
tests/entrypoints/openai/test_mp_api_server.py
tests/entrypoints/openai/test_mp_api_server.py
+0
-40
tests/entrypoints/openai/test_serving_chat.py
tests/entrypoints/openai/test_serving_chat.py
+7
-4
tests/entrypoints/openai/test_serving_engine.py
tests/entrypoints/openai/test_serving_engine.py
+5
-4
tests/entrypoints/openai/test_shutdown.py
tests/entrypoints/openai/test_shutdown.py
+1
-1
tests/kernels/test_activation.py
tests/kernels/test_activation.py
+3
-6
No files found.
tests/compile/utils.py
0 → 100644
View file @
539aa992
import
os
import
torch
from
tests.quantization.utils
import
is_quant_method_supported
from
vllm
import
LLM
,
SamplingParams
from
vllm.plugins
import
set_torch_compile_backend
from
vllm.utils
import
is_hip
# Each entry below is a (model identifier, extra keyword arguments) pair.

# Shorter list: one compressed-tensors checkpoint plus one unquantized one.
TEST_MODELS_SMOKE = [
    (
        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples",
        dict(quantization="compressed-tensors"),
    ),
    ("meta-llama/Meta-Llama-3-8B", dict()),
]

# Full list; entries that are always present regardless of which
# quantization methods are supported.
TEST_MODELS = [
    ("facebook/opt-125m", dict()),
    (
        "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
        dict(dtype=torch.float16, quantization="compressed-tensors"),
    ),
    (
        "neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
        dict(dtype=torch.float16, quantization="fp8"),
    ),
    (
        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples",
        dict(quantization="compressed-tensors"),
    ),
    ("meta-llama/Meta-Llama-3-8B", dict()),
]

# The two entries below are deliberately disabled by the leading ``False``
# (which also short-circuits the support probe).
# TODO: enable in pytorch 2.5
if False and is_quant_method_supported("aqlm"):  # noqa: SIM223
    TEST_MODELS += [
        ("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", dict(quantization="aqlm")),
    ]

# TODO: enable in pytorch 2.5
if False and is_quant_method_supported("gguf"):  # noqa: SIM223
    TEST_MODELS += [
        ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", dict(quantization="gguf")),
    ]

# Remaining entries are added only when the matching quantization method is
# reported as supported.
if is_quant_method_supported("gptq"):
    TEST_MODELS += [
        ("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", dict(quantization="gptq")),
    ]

if is_quant_method_supported("gptq_marlin"):
    TEST_MODELS += [
        (
            "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
            dict(quantization="gptq_marlin"),
        ),
    ]

if is_quant_method_supported("gptq_marlin_24"):
    TEST_MODELS += [
        (
            "alexm-nm/tinyllama-24-marlin24-4bit-g128",
            dict(quantization="gptq_marlin_24"),
        ),
    ]

if is_quant_method_supported("marlin"):
    TEST_MODELS += [
        (
            "robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin",
            dict(quantization="marlin"),
        ),
    ]

# The AWQ entry is additionally skipped when ``is_hip()`` is true.
if not is_hip() and is_quant_method_supported("awq"):
    TEST_MODELS += [
        ("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", dict(quantization="AWQ")),
    ]
def check_full_graph_support(model, model_kwargs, backend, tp_size=1):
    """Generate from ``model`` under the given torch.compile ``backend``.

    Used to make sure these models can be captured in full graph mode.

    Args:
        model: Model identifier forwarded to ``LLM(model=...)``.
        model_kwargs: Mapping of extra keyword arguments forwarded to
            ``LLM``; its optional ``"quantization"`` entry also decides
            whether the check is skipped.
        backend: Backend name handed to ``set_torch_compile_backend``.
        tp_size: Value passed as ``tensor_parallel_size`` (default 1).

    Returns:
        None. Generated text is printed, not returned.
    """
    # Keep a caller-provided graph-capture setting; the full-graph flag is
    # always forced to "1". Both are set even when the check is skipped below.
    os.environ.setdefault("VLLM_TEST_DYNAMO_GRAPH_CAPTURE", "1")
    os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"

    # Inductor doesn't support fp8/gptq_marlin_24 yet (gptq_marlin is skipped
    # as well), so only the "eager" backend runs these quantization methods.
    skipped_methods = ("fp8", "gptq_marlin", "gptq_marlin_24")
    quantization = model_kwargs.get("quantization")
    if quantization in skipped_methods and backend != "eager":
        return

    set_torch_compile_backend(backend)

    greedy = SamplingParams(temperature=0)
    engine = LLM(
        model=model,
        enforce_eager=True,
        tensor_parallel_size=tp_size,
        disable_custom_all_reduce=True,
        **model_kwargs,
    )
    results = engine.generate(
        [
            "Hello, my name is",
            "The president of the United States is",
            "The capital of France is",
            "The future of AI is",
        ],
        greedy,
    )

    # Print the outputs.
    for result in results:
        prompt, generated_text = result.prompt, result.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
tests/conftest.py
View file @
539aa992
...
@@ -20,6 +20,8 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
...
@@ -20,6 +20,8 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
BatchFeature
)
BatchFeature
)
from
transformers.models.auto.auto_factory
import
_BaseAutoModelClass
from
transformers.models.auto.auto_factory
import
_BaseAutoModelClass
from
tests.models.utils
import
(
TokensTextLogprobs
,
TokensTextLogprobsPromptLogprobs
)
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.video
import
VideoAsset
from
vllm.assets.video
import
VideoAsset
...
@@ -33,7 +35,6 @@ from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
...
@@ -33,7 +35,6 @@ from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
to_enc_dec_tuple_list
,
zip_enc_dec_prompts
)
to_enc_dec_tuple_list
,
zip_enc_dec_prompts
)
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.outputs
import
RequestOutput
from
vllm.outputs
import
RequestOutput
from
vllm.sequence
import
SampleLogprobs
from
vllm.utils
import
(
STR_DTYPE_TO_TORCH_DTYPE
,
cuda_device_count_stateless
,
from
vllm.utils
import
(
STR_DTYPE_TO_TORCH_DTYPE
,
cuda_device_count_stateless
,
identity
,
is_cpu
)
identity
,
is_cpu
)
...
@@ -158,10 +159,7 @@ def should_do_global_cleanup_after_test(request) -> bool:
...
@@ -158,10 +159,7 @@ def should_do_global_cleanup_after_test(request) -> bool:
to initialize torch.
to initialize torch.
"""
"""
if
request
.
node
.
get_closest_marker
(
"skip_global_cleanup"
):
return
not
request
.
node
.
get_closest_marker
(
"skip_global_cleanup"
)
return
False
return
True
@
pytest
.
fixture
(
autouse
=
True
)
@
pytest
.
fixture
(
autouse
=
True
)
...
@@ -171,6 +169,12 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
...
@@ -171,6 +169,12 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
cleanup
()
cleanup
()
@
pytest
.
fixture
(
autouse
=
True
)
def
dynamo_reset
():
yield
torch
.
_dynamo
.
reset
()
@
pytest
.
fixture
@
pytest
.
fixture
def
example_prompts
()
->
List
[
str
]:
def
example_prompts
()
->
List
[
str
]:
prompts
=
[]
prompts
=
[]
...
@@ -472,7 +476,7 @@ class HfRunner:
...
@@ -472,7 +476,7 @@ class HfRunner:
audios
:
Optional
[
PromptAudioInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
videos
:
Optional
[
List
[
np
.
ndarray
]]
=
None
,
videos
:
Optional
[
List
[
np
.
ndarray
]]
=
None
,
**
kwargs
:
Any
,
**
kwargs
:
Any
,
)
->
List
[
T
uple
[
List
[
int
],
str
,
List
[
Dict
[
int
,
float
]]]
]:
)
->
List
[
T
okensTextLogprobs
]:
all_logprobs
:
List
[
List
[
Dict
[
int
,
float
]]]
=
[]
all_logprobs
:
List
[
List
[
Dict
[
int
,
float
]]]
=
[]
all_output_ids
:
List
[
List
[
int
]]
=
[]
all_output_ids
:
List
[
List
[
int
]]
=
[]
all_output_strs
:
List
[
str
]
=
[]
all_output_strs
:
List
[
str
]
=
[]
...
@@ -528,7 +532,7 @@ class HfRunner:
...
@@ -528,7 +532,7 @@ class HfRunner:
max_tokens
:
int
,
max_tokens
:
int
,
num_logprobs
:
int
,
num_logprobs
:
int
,
**
kwargs
:
Any
,
**
kwargs
:
Any
,
)
->
List
[
T
uple
[
List
[
int
],
str
,
List
[
Dict
[
int
,
float
]]]
]:
)
->
List
[
T
okensTextLogprobs
]:
'''
'''
Greedy logprobs generation for vLLM encoder/decoder models
Greedy logprobs generation for vLLM encoder/decoder models
'''
'''
...
@@ -656,14 +660,16 @@ class VllmRunner:
...
@@ -656,14 +660,16 @@ class VllmRunner:
@
staticmethod
@
staticmethod
def
_final_steps_generate_w_logprobs
(
def
_final_steps_generate_w_logprobs
(
req_outputs
:
List
[
RequestOutput
],
req_outputs
:
List
[
RequestOutput
],
)
->
List
[
T
uple
[
List
[
int
],
str
,
Optional
[
Sample
Logprobs
]
]]
:
)
->
List
[
T
okensTextLogprobsPrompt
Logprobs
]:
outputs
:
List
[
T
uple
[
List
[
int
],
str
,
Optional
[
Sample
Logprobs
]
]]
=
[]
outputs
:
List
[
T
okensTextLogprobsPrompt
Logprobs
]
=
[]
for
req_output
in
req_outputs
:
for
req_output
in
req_outputs
:
assert
len
(
req_output
.
outputs
)
>
0
for
sample
in
req_output
.
outputs
:
for
sample
in
req_output
.
outputs
:
output_str
=
sample
.
text
output_str
=
sample
.
text
output_ids
=
list
(
sample
.
token_ids
)
output_ids
=
list
(
sample
.
token_ids
)
output_logprobs
=
sample
.
logprobs
output_logprobs
=
sample
.
logprobs
outputs
.
append
((
output_ids
,
output_str
,
output_logprobs
))
outputs
.
append
((
output_ids
,
output_str
,
output_logprobs
,
req_output
.
prompt_logprobs
))
return
outputs
return
outputs
def
generate_w_logprobs
(
def
generate_w_logprobs
(
...
@@ -673,9 +679,8 @@ class VllmRunner:
...
@@ -673,9 +679,8 @@ class VllmRunner:
images
:
Optional
[
PromptImageInput
]
=
None
,
images
:
Optional
[
PromptImageInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
)
->
List
[
Tuple
[
List
[
int
],
str
,
Optional
[
SampleLogprobs
]]]:
)
->
Union
[
List
[
TokensTextLogprobs
],
assert
sampling_params
.
logprobs
is
not
None
List
[
TokensTextLogprobsPromptLogprobs
]]:
if
images
is
not
None
:
if
images
is
not
None
:
assert
len
(
prompts
)
==
len
(
images
)
assert
len
(
prompts
)
==
len
(
images
)
...
@@ -698,13 +703,20 @@ class VllmRunner:
...
@@ -698,13 +703,20 @@ class VllmRunner:
req_outputs
=
self
.
model
.
generate
(
inputs
,
req_outputs
=
self
.
model
.
generate
(
inputs
,
sampling_params
=
sampling_params
)
sampling_params
=
sampling_params
)
return
self
.
_final_steps_generate_w_logprobs
(
req_outputs
)
toks_str_logsprobs_prompt_logprobs
=
(
self
.
_final_steps_generate_w_logprobs
(
req_outputs
))
# Omit prompt logprobs if not required by sampling params
return
([
x
[
0
:
-
1
]
for
x
in
toks_str_logsprobs_prompt_logprobs
]
if
sampling_params
.
prompt_logprobs
is
None
else
toks_str_logsprobs_prompt_logprobs
)
def
generate_encoder_decoder_w_logprobs
(
def
generate_encoder_decoder_w_logprobs
(
self
,
self
,
encoder_decoder_prompts
:
List
[
ExplicitEncoderDecoderPrompt
[
str
,
str
]],
encoder_decoder_prompts
:
List
[
ExplicitEncoderDecoderPrompt
[
str
,
str
]],
sampling_params
:
SamplingParams
,
sampling_params
:
SamplingParams
,
)
->
List
[
Tuple
[
List
[
int
],
str
,
Optional
[
SampleLogprobs
]]]:
)
->
Union
[
List
[
TokensTextLogprobs
],
List
[
TokensTextLogprobsPromptLogprobs
]]:
'''
'''
Logprobs generation for vLLM encoder/decoder models
Logprobs generation for vLLM encoder/decoder models
'''
'''
...
@@ -712,7 +724,12 @@ class VllmRunner:
...
@@ -712,7 +724,12 @@ class VllmRunner:
assert
sampling_params
.
logprobs
is
not
None
assert
sampling_params
.
logprobs
is
not
None
req_outputs
=
self
.
model
.
generate
(
encoder_decoder_prompts
,
req_outputs
=
self
.
model
.
generate
(
encoder_decoder_prompts
,
sampling_params
=
sampling_params
)
sampling_params
=
sampling_params
)
return
self
.
_final_steps_generate_w_logprobs
(
req_outputs
)
toks_str_logsprobs_prompt_logprobs
=
(
self
.
_final_steps_generate_w_logprobs
(
req_outputs
))
# Omit prompt logprobs if not required by sampling params
return
([
x
[
0
:
-
1
]
for
x
in
toks_str_logsprobs_prompt_logprobs
]
if
sampling_params
.
prompt_logprobs
is
None
else
toks_str_logsprobs_prompt_logprobs
)
def
generate_greedy
(
def
generate_greedy
(
self
,
self
,
...
@@ -730,44 +747,48 @@ class VllmRunner:
...
@@ -730,44 +747,48 @@ class VllmRunner:
prompts
:
List
[
str
],
prompts
:
List
[
str
],
max_tokens
:
int
,
max_tokens
:
int
,
num_logprobs
:
int
,
num_logprobs
:
int
,
num_prompt_logprobs
:
Optional
[
int
]
=
None
,
images
:
Optional
[
PromptImageInput
]
=
None
,
images
:
Optional
[
PromptImageInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
stop_token_ids
:
Optional
[
List
[
int
]]
=
None
,
stop_token_ids
:
Optional
[
List
[
int
]]
=
None
,
)
->
List
[
Tuple
[
List
[
int
],
str
,
Optional
[
SampleLogprobs
]]]:
)
->
Union
[
List
[
TokensTextLogprobs
],
greedy_logprobs_params
=
SamplingParams
(
temperature
=
0.0
,
List
[
TokensTextLogprobsPromptLogprobs
]]:
max_tokens
=
max_tokens
,
greedy_logprobs_params
=
SamplingParams
(
logprobs
=
num_logprobs
,
temperature
=
0.0
,
stop_token_ids
=
stop_token_ids
)
max_tokens
=
max_tokens
,
outputs
=
self
.
generate_w_logprobs
(
prompts
,
logprobs
=
num_logprobs
,
greedy_logprobs_params
,
prompt_logprobs
=
num_prompt_logprobs
,
images
=
images
,
stop_token_ids
=
stop_token_ids
)
audios
=
audios
,
videos
=
videos
)
return
self
.
generate_w_logprobs
(
prompts
,
greedy_logprobs_params
,
return
[(
output_ids
,
output_str
,
output_logprobs
)
images
=
images
,
for
output_ids
,
output_str
,
output_logprobs
in
outputs
]
audios
=
audios
,
videos
=
videos
)
def
generate_encoder_decoder_greedy_logprobs
(
def
generate_encoder_decoder_greedy_logprobs
(
self
,
self
,
encoder_decoder_prompts
:
List
[
ExplicitEncoderDecoderPrompt
[
str
,
str
]],
encoder_decoder_prompts
:
List
[
ExplicitEncoderDecoderPrompt
[
str
,
str
]],
max_tokens
:
int
,
max_tokens
:
int
,
num_logprobs
:
int
,
num_logprobs
:
int
,
)
->
List
[
Tuple
[
List
[
int
],
str
,
Optional
[
SampleLogprobs
]]]:
num_prompt_logprobs
:
Optional
[
int
]
=
None
,
greedy_logprobs_params
=
SamplingParams
(
temperature
=
0.0
,
)
->
Union
[
List
[
TokensTextLogprobs
],
use_beam_search
=
False
,
List
[
TokensTextLogprobsPromptLogprobs
]]:
max_tokens
=
max_tokens
,
greedy_logprobs_params
=
SamplingParams
(
logprobs
=
num_logprobs
)
temperature
=
0.0
,
use_beam_search
=
False
,
max_tokens
=
max_tokens
,
logprobs
=
num_logprobs
,
prompt_logprobs
=
(
num_prompt_logprobs
),
)
'''
'''
Greedy logprobs generation for vLLM encoder/decoder models
Greedy logprobs generation for vLLM encoder/decoder models
'''
'''
outputs
=
self
.
generate_encoder_decoder_w_logprobs
(
return
self
.
generate_encoder_decoder_w_logprobs
(
encoder_decoder_prompts
,
greedy_logprobs_params
)
encoder_decoder_prompts
,
greedy_logprobs_params
)
return
[(
output_ids
,
output_str
,
output_logprobs
)
for
output_ids
,
output_str
,
output_logprobs
in
outputs
]
def
generate_beam_search
(
def
generate_beam_search
(
self
,
self
,
prompts
:
List
[
str
],
prompts
:
List
[
str
],
...
@@ -781,6 +802,20 @@ class VllmRunner:
...
@@ -781,6 +802,20 @@ class VllmRunner:
outputs
=
self
.
generate
(
prompts
,
beam_search_params
)
outputs
=
self
.
generate
(
prompts
,
beam_search_params
)
return
outputs
return
outputs
def
generate_beam_search_new
(
self
,
prompts
:
Union
[
List
[
str
],
List
[
List
[
int
]]],
beam_width
:
int
,
max_tokens
:
int
,
)
->
List
[
Tuple
[
List
[
List
[
int
]],
List
[
str
]]]:
outputs
=
self
.
model
.
beam_search
(
prompts
,
beam_width
,
max_tokens
)
returned_outputs
=
[]
for
output
in
outputs
:
token_ids
=
[
x
.
tokens
for
x
in
output
.
sequences
]
texts
=
[
x
.
text
for
x
in
output
.
sequences
]
returned_outputs
.
append
((
token_ids
,
texts
))
return
returned_outputs
def
encode
(
self
,
prompts
:
List
[
str
])
->
List
[
List
[
float
]]:
def
encode
(
self
,
prompts
:
List
[
str
])
->
List
[
List
[
float
]]:
req_outputs
=
self
.
model
.
encode
(
prompts
)
req_outputs
=
self
.
model
.
encode
(
prompts
)
outputs
=
[]
outputs
=
[]
...
...
tests/core/test_chunked_prefill_scheduler.py
View file @
539aa992
...
@@ -27,16 +27,19 @@ def schedule_and_update_computed_tokens(scheduler):
...
@@ -27,16 +27,19 @@ def schedule_and_update_computed_tokens(scheduler):
return
metas
,
out
return
metas
,
out
def
test_simple
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_simple
(
use_v2_block_manager
:
bool
):
"""Verify basic scheduling works."""
"""Verify basic scheduling works."""
block_size
=
4
block_size
=
4
num_seq_group
=
4
num_seq_group
=
4
max_model_len
=
16
max_model_len
=
16
max_num_batched_tokens
=
64
max_num_batched_tokens
=
64
scheduler_config
=
SchedulerConfig
(
max_num_batched_tokens
,
scheduler_config
=
SchedulerConfig
(
num_seq_group
,
max_num_batched_tokens
,
max_model_len
,
num_seq_group
,
enable_chunked_prefill
=
True
)
max_model_len
,
enable_chunked_prefill
=
True
,
use_v2_block_manager
=
use_v2_block_manager
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
...
@@ -45,7 +48,9 @@ def test_simple():
...
@@ -45,7 +48,9 @@ def test_simple():
# Add seq groups to scheduler.
# Add seq groups to scheduler.
for
i
in
range
(
num_seq_group
):
for
i
in
range
(
num_seq_group
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
block_size
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
block_size
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
running
.
append
(
seq_group
)
running
.
append
(
seq_group
)
...
@@ -69,30 +74,36 @@ def test_simple():
...
@@ -69,30 +74,36 @@ def test_simple():
assert
len
(
seq_group_meta
)
==
num_seq_group
assert
len
(
seq_group_meta
)
==
num_seq_group
def
test_chunk
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_chunk
(
use_v2_block_manager
:
bool
):
"""Verify prefills are chunked properly."""
"""Verify prefills are chunked properly."""
block_size
=
4
block_size
=
4
max_seqs
=
60
max_seqs
=
60
max_model_len
=
80
max_model_len
=
80
max_num_batched_tokens
=
64
max_num_batched_tokens
=
64
scheduler_config
=
SchedulerConfig
(
max_num_batched_tokens
,
scheduler_config
=
SchedulerConfig
(
max_seqs
,
max_num_batched_tokens
,
max_model_len
,
max_seqs
,
enable_chunked_prefill
=
True
)
max_model_len
,
enable_chunked_prefill
=
True
,
use_v2_block_manager
=
use_v2_block_manager
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
32
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
32
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
List
[
SequenceGroup
]
=
[]
running
:
List
[
SequenceGroup
]
=
[]
# Add seq groups to scheduler.
# Add seq groups to scheduler.
for
i
in
range
(
2
):
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
running
.
append
(
seq_group
)
running
.
append
(
seq_group
)
# Verify the second request is chunked.
# Verify the second request is chunked.
seq_group_meta
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
seq_group_meta
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
print
()
assert
set
(
get_sequence_groups
(
out
))
==
set
(
running
)
assert
set
(
get_sequence_groups
(
out
))
==
set
(
running
)
assert
seq_group_meta
[
0
].
token_chunk_size
==
60
assert
seq_group_meta
[
0
].
token_chunk_size
==
60
# Verify it is chunked.
# Verify it is chunked.
...
@@ -113,24 +124,29 @@ def test_chunk():
...
@@ -113,24 +124,29 @@ def test_chunk():
assert
out
.
num_batched_tokens
==
57
assert
out
.
num_batched_tokens
==
57
def
test_complex
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_complex
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
max_seqs
=
60
max_seqs
=
60
max_model_len
=
80
max_model_len
=
80
max_num_batched_tokens
=
64
max_num_batched_tokens
=
64
scheduler_config
=
SchedulerConfig
(
max_num_batched_tokens
,
scheduler_config
=
SchedulerConfig
(
max_seqs
,
max_num_batched_tokens
,
max_model_len
,
max_seqs
,
enable_chunked_prefill
=
True
)
max_model_len
,
enable_chunked_prefill
=
True
,
use_v2_block_manager
=
use_v2_block_manager
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
64
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
64
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
List
[
SequenceGroup
]
=
[]
running
:
List
[
SequenceGroup
]
=
[]
# Add seq groups to scheduler.
# Add seq groups to scheduler.
for
i
in
range
(
2
):
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
running
.
append
(
seq_group
)
running
.
append
(
seq_group
)
assert
seq_group
.
is_prefill
()
assert
seq_group
.
is_prefill
()
...
@@ -151,7 +167,9 @@ def test_complex():
...
@@ -151,7 +167,9 @@ def test_complex():
# Add 2 more requests.
# Add 2 more requests.
for
i
in
range
(
2
,
4
):
for
i
in
range
(
2
,
4
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
running
.
append
(
seq_group
)
running
.
append
(
seq_group
)
...
@@ -176,16 +194,19 @@ def test_complex():
...
@@ -176,16 +194,19 @@ def test_complex():
assert
running
[
2
].
is_prefill
()
assert
running
[
2
].
is_prefill
()
def
test_maximal_decoding
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_maximal_decoding
(
use_v2_block_manager
:
bool
):
"""Verify decoding requests are prioritized."""
"""Verify decoding requests are prioritized."""
block_size
=
4
block_size
=
4
max_seqs
=
2
max_seqs
=
2
max_model_len
=
8
max_model_len
=
8
max_num_batched_tokens
=
2
max_num_batched_tokens
=
2
scheduler_config
=
SchedulerConfig
(
max_num_batched_tokens
,
scheduler_config
=
SchedulerConfig
(
max_seqs
,
max_num_batched_tokens
,
max_model_len
,
max_seqs
,
enable_chunked_prefill
=
True
)
max_model_len
,
enable_chunked_prefill
=
True
,
use_v2_block_manager
=
use_v2_block_manager
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
...
@@ -194,7 +215,9 @@ def test_maximal_decoding():
...
@@ -194,7 +215,9 @@ def test_maximal_decoding():
# Add seq groups to scheduler.
# Add seq groups to scheduler.
for
i
in
range
(
2
):
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
2
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
running
.
append
(
seq_group
)
running
.
append
(
seq_group
)
assert
seq_group
.
is_prefill
()
assert
seq_group
.
is_prefill
()
...
@@ -211,7 +234,9 @@ def test_maximal_decoding():
...
@@ -211,7 +234,9 @@ def test_maximal_decoding():
append_new_token
(
running
[
0
],
1
)
append_new_token
(
running
[
0
],
1
)
# Create one more seq_group.
# Create one more seq_group.
_
,
seq_group
=
create_dummy_prompt
(
"3"
,
prompt_length
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
"3"
,
prompt_length
=
2
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
running
.
append
(
seq_group
)
running
.
append
(
seq_group
)
assert
seq_group
.
is_prefill
()
assert
seq_group
.
is_prefill
()
...
@@ -263,23 +288,28 @@ def test_maximal_decoding():
...
@@ -263,23 +288,28 @@ def test_maximal_decoding():
assert
out
.
num_batched_tokens
==
2
assert
out
.
num_batched_tokens
==
2
def
test_prompt_limit
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_prompt_limit
(
use_v2_block_manager
:
bool
):
"""Verify max_num_batched_tokens < max_model_len is possible."""
"""Verify max_num_batched_tokens < max_model_len is possible."""
block_size
=
4
block_size
=
4
max_seqs
=
32
max_seqs
=
32
max_model_len
=
64
max_model_len
=
64
max_num_batched_tokens
=
32
max_num_batched_tokens
=
32
scheduler_config
=
SchedulerConfig
(
max_num_batched_tokens
,
scheduler_config
=
SchedulerConfig
(
max_seqs
,
max_num_batched_tokens
,
max_model_len
,
max_seqs
,
enable_chunked_prefill
=
True
)
max_model_len
,
enable_chunked_prefill
=
True
,
use_v2_block_manager
=
use_v2_block_manager
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
16
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
16
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
List
[
SequenceGroup
]
=
[]
running
:
List
[
SequenceGroup
]
=
[]
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
48
)
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
48
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
running
.
append
(
seq_group
)
running
.
append
(
seq_group
)
assert
seq_group
.
is_prefill
()
assert
seq_group
.
is_prefill
()
...
@@ -293,7 +323,8 @@ def test_prompt_limit():
...
@@ -293,7 +323,8 @@ def test_prompt_limit():
assert
out
.
num_batched_tokens
==
32
assert
out
.
num_batched_tokens
==
32
def
test_prompt_limit_exceed
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_prompt_limit_exceed
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
max_seqs
=
64
max_seqs
=
64
max_model_len
=
32
max_model_len
=
32
...
@@ -303,12 +334,13 @@ def test_prompt_limit_exceed():
...
@@ -303,12 +334,13 @@ def test_prompt_limit_exceed():
max_model_len
,
max_model_len
,
enable_chunked_prefill
=
True
)
enable_chunked_prefill
=
True
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
16
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
16
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
List
[
SequenceGroup
]
=
[]
running
:
List
[
SequenceGroup
]
=
[]
_
,
seq_group
=
create_dummy_prompt
(
"2"
,
_
,
seq_group
=
create_dummy_prompt
(
"2"
,
prompt_length
=
48
)
prompt_length
=
48
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
running
.
append
(
seq_group
)
running
.
append
(
seq_group
)
assert
seq_group
.
is_prefill
()
assert
seq_group
.
is_prefill
()
...
@@ -317,22 +349,28 @@ def test_prompt_limit_exceed():
...
@@ -317,22 +349,28 @@ def test_prompt_limit_exceed():
assert
out
.
ignored_seq_groups
[
0
]
==
seq_group
assert
out
.
ignored_seq_groups
[
0
]
==
seq_group
def
test_swap
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_swap
(
use_v2_block_manager
:
bool
):
"""Verify swapping works with chunked prefill requests"""
"""Verify swapping works with chunked prefill requests"""
block_size
=
4
block_size
=
4
max_seqs
=
30
max_seqs
=
30
max_model_len
=
200
max_model_len
=
200
max_num_batched_tokens
=
30
max_num_batched_tokens
=
30
scheduler_config
=
SchedulerConfig
(
max_num_batched_tokens
,
scheduler_config
=
SchedulerConfig
(
max_seqs
,
max_num_batched_tokens
,
max_model_len
,
max_seqs
,
enable_chunked_prefill
=
True
)
max_model_len
,
enable_chunked_prefill
=
True
,
use_v2_block_manager
=
use_v2_block_manager
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
16
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
16
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
_
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
_
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
# The request is chunked.
# The request is chunked.
...
@@ -369,21 +407,27 @@ def test_swap():
...
@@ -369,21 +407,27 @@ def test_swap():
assert
out
.
blocks_to_swap_out
==
[]
assert
out
.
blocks_to_swap_out
==
[]
def
test_running_prefill_prioritized_over_swap
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_running_prefill_prioritized_over_swap
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
max_seqs
=
30
max_seqs
=
30
max_model_len
=
200
max_model_len
=
200
max_num_batched_tokens
=
30
max_num_batched_tokens
=
30
scheduler_config
=
SchedulerConfig
(
max_num_batched_tokens
,
scheduler_config
=
SchedulerConfig
(
max_seqs
,
max_num_batched_tokens
,
max_model_len
,
max_seqs
,
enable_chunked_prefill
=
True
)
max_model_len
,
enable_chunked_prefill
=
True
,
use_v2_block_manager
=
use_v2_block_manager
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
32
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
32
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
_
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
_
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
# The request is chunked.
# The request is chunked.
...
@@ -413,7 +457,9 @@ def test_running_prefill_prioritized_over_swap():
...
@@ -413,7 +457,9 @@ def test_running_prefill_prioritized_over_swap():
scheduler
.
block_manager
.
can_swap_in
=
MagicMock
()
scheduler
.
block_manager
.
can_swap_in
=
MagicMock
()
scheduler
.
block_manager
.
can_swap_in
.
return_value
=
AllocStatus
.
LATER
scheduler
.
block_manager
.
can_swap_in
.
return_value
=
AllocStatus
.
LATER
_
,
seq_group2
=
create_dummy_prompt
(
"2"
,
prompt_length
=
60
)
_
,
seq_group2
=
create_dummy_prompt
(
"2"
,
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group2
)
scheduler
.
add_seq_group
(
seq_group2
)
_
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
_
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
assert
len
(
out
.
scheduled_seq_groups
)
==
1
assert
len
(
out
.
scheduled_seq_groups
)
==
1
...
@@ -455,22 +501,27 @@ def test_running_prefill_prioritized_over_swap():
...
@@ -455,22 +501,27 @@ def test_running_prefill_prioritized_over_swap():
assert
out
.
blocks_to_swap_out
==
[]
assert
out
.
blocks_to_swap_out
==
[]
def
test_chunked_prefill_preempt
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_chunked_prefill_preempt
(
use_v2_block_manager
:
bool
):
"""Verify preempt works with chunked prefill requests"""
"""Verify preempt works with chunked prefill requests"""
block_size
=
4
block_size
=
4
max_seqs
=
30
max_seqs
=
30
max_model_len
=
200
max_model_len
=
200
max_num_batched_tokens
=
30
max_num_batched_tokens
=
30
scheduler_config
=
SchedulerConfig
(
max_num_batched_tokens
,
scheduler_config
=
SchedulerConfig
(
max_seqs
,
max_num_batched_tokens
,
max_model_len
,
max_seqs
,
enable_chunked_prefill
=
True
)
max_model_len
,
enable_chunked_prefill
=
True
,
use_v2_block_manager
=
use_v2_block_manager
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
16
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
16
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
_
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
_
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
# The request is chunked.
# The request is chunked.
...
@@ -517,22 +568,27 @@ def test_chunked_prefill_preempt():
...
@@ -517,22 +568,27 @@ def test_chunked_prefill_preempt():
assert
out
.
num_batched_tokens
==
max_num_batched_tokens
assert
out
.
num_batched_tokens
==
max_num_batched_tokens
def
test_chunked_prefill_max_seqs
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_chunked_prefill_max_seqs
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
max_seqs
=
2
max_seqs
=
2
max_model_len
=
80
max_model_len
=
80
max_num_batched_tokens
=
64
max_num_batched_tokens
=
64
scheduler_config
=
SchedulerConfig
(
max_num_batched_tokens
,
scheduler_config
=
SchedulerConfig
(
max_seqs
,
max_num_batched_tokens
,
max_model_len
,
max_seqs
,
enable_chunked_prefill
=
True
)
max_model_len
,
enable_chunked_prefill
=
True
,
use_v2_block_manager
=
use_v2_block_manager
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
12
8
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
12
8
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
List
[
SequenceGroup
]
=
[]
running
:
List
[
SequenceGroup
]
=
[]
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
65
)
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
65
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
running
.
append
(
seq_group
)
running
.
append
(
seq_group
)
# The first prefill is chunked.
# The first prefill is chunked.
...
@@ -542,7 +598,9 @@ def test_chunked_prefill_max_seqs():
...
@@ -542,7 +598,9 @@ def test_chunked_prefill_max_seqs():
# Add new requests.
# Add new requests.
for
i
in
range
(
4
):
for
i
in
range
(
4
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
65
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
65
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
running
.
append
(
seq_group
)
running
.
append
(
seq_group
)
...
@@ -564,16 +622,19 @@ def test_chunked_prefill_max_seqs():
...
@@ -564,16 +622,19 @@ def test_chunked_prefill_max_seqs():
assert
not
running
[
1
].
is_prefill
()
assert
not
running
[
1
].
is_prefill
()
def
test_perfix_caching
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_perfix_caching
(
use_v2_block_manager
:
bool
):
"""Verify allocating full blocks when prefix caching is enabled."""
"""Verify allocating full blocks when prefix caching is enabled."""
block_size
=
4
block_size
=
4
max_seqs
=
10
max_seqs
=
10
max_model_len
=
80
max_model_len
=
80
max_num_batched_tokens
=
64
max_num_batched_tokens
=
64
scheduler_config
=
SchedulerConfig
(
max_num_batched_tokens
,
scheduler_config
=
SchedulerConfig
(
max_seqs
,
max_num_batched_tokens
,
max_model_len
,
max_seqs
,
enable_chunked_prefill
=
True
)
max_model_len
,
enable_chunked_prefill
=
True
,
use_v2_block_manager
=
use_v2_block_manager
)
cache_config
=
CacheConfig
(
block_size
,
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1.0
,
1
,
1
,
...
...
tests/core/test_scheduler.py
View file @
539aa992
...
@@ -3,7 +3,8 @@ from collections import deque
...
@@ -3,7 +3,8 @@ from collections import deque
from
typing
import
List
,
Set
,
Tuple
from
typing
import
List
,
Set
,
Tuple
from
unittest.mock
import
MagicMock
from
unittest.mock
import
MagicMock
import
pytest
# noqa
import
pytest
from
torch
import
Use
# noqa
from
vllm.config
import
CacheConfig
,
LoRAConfig
,
SchedulerConfig
from
vllm.config
import
CacheConfig
,
LoRAConfig
,
SchedulerConfig
from
vllm.core.interfaces
import
AllocStatus
from
vllm.core.interfaces
import
AllocStatus
...
@@ -16,9 +17,11 @@ from .utils import (append_new_token, append_new_token_seq_group,
...
@@ -16,9 +17,11 @@ from .utils import (append_new_token, append_new_token_seq_group,
schedule_and_update_computed_tokens
)
schedule_and_update_computed_tokens
)
def
test_scheduler_add_seq_group
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_scheduler_add_seq_group
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
scheduler_config
=
SchedulerConfig
(
100
,
64
,
1
)
scheduler_config
=
SchedulerConfig
(
100
,
64
,
1
,
use_v2_block_manager
=
use_v2_block_manager
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
cache_dtype
=
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
cache_dtype
=
"auto"
)
cache_config
.
num_cpu_blocks
=
4
cache_config
.
num_cpu_blocks
=
4
cache_config
.
num_gpu_blocks
=
4
cache_config
.
num_gpu_blocks
=
4
...
@@ -27,14 +30,18 @@ def test_scheduler_add_seq_group():
...
@@ -27,14 +30,18 @@ def test_scheduler_add_seq_group():
# Add seq group to scheduler.
# Add seq group to scheduler.
num_seq_group
=
4
num_seq_group
=
4
for
i
in
range
(
num_seq_group
):
for
i
in
range
(
num_seq_group
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
block_size
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
block_size
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
assert
scheduler
.
get_num_unfinished_seq_groups
()
==
i
+
1
assert
scheduler
.
get_num_unfinished_seq_groups
()
==
i
+
1
def
test_scheduler_abort_seq_group
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_scheduler_abort_seq_group
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
scheduler_config
=
SchedulerConfig
(
100
,
64
,
1
)
scheduler_config
=
SchedulerConfig
(
100
,
64
,
1
,
use_v2_block_manager
=
use_v2_block_manager
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
4
cache_config
.
num_cpu_blocks
=
4
cache_config
.
num_gpu_blocks
=
4
cache_config
.
num_gpu_blocks
=
4
...
@@ -54,11 +61,16 @@ def test_scheduler_abort_seq_group():
...
@@ -54,11 +61,16 @@ def test_scheduler_abort_seq_group():
assert
scheduler
.
get_num_unfinished_seq_groups
()
==
0
assert
scheduler
.
get_num_unfinished_seq_groups
()
==
0
def
test_scheduler_schedule_simple
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_scheduler_schedule_simple
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
num_seq_group
=
4
num_seq_group
=
4
max_model_len
=
16
max_model_len
=
16
scheduler_config
=
SchedulerConfig
(
64
,
num_seq_group
,
max_model_len
)
scheduler_config
=
SchedulerConfig
(
64
,
num_seq_group
,
max_model_len
,
use_v2_block_manager
=
use_v2_block_manager
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
...
@@ -67,7 +79,9 @@ def test_scheduler_schedule_simple():
...
@@ -67,7 +79,9 @@ def test_scheduler_schedule_simple():
# Add seq groups to scheduler.
# Add seq groups to scheduler.
for
i
in
range
(
num_seq_group
):
for
i
in
range
(
num_seq_group
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
block_size
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
block_size
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
running
.
append
(
seq_group
)
running
.
append
(
seq_group
)
...
@@ -91,20 +105,24 @@ def test_scheduler_schedule_simple():
...
@@ -91,20 +105,24 @@ def test_scheduler_schedule_simple():
append_new_token
(
out
,
1
)
append_new_token
(
out
,
1
)
def
test_scheduler_prefill_prioritized
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_scheduler_prefill_prioritized
(
use_v2_block_manager
:
bool
):
"""Verify running batched tokens are not applied to prefill requests."""
"""Verify running batched tokens are not applied to prefill requests."""
block_size
=
4
block_size
=
4
max_model_len
=
30
max_model_len
=
30
max_batched_num_tokens
=
30
max_batched_num_tokens
=
30
scheduler_config
=
SchedulerConfig
(
max_batched_num_tokens
,
2
,
scheduler_config
=
SchedulerConfig
(
max_model_len
)
max_batched_num_tokens
,
2
,
max_model_len
,
use_v2_block_manager
=
use_v2_block_manager
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
2
cache_config
.
num_cpu_blocks
=
16
cache_config
.
num_gpu_blocks
=
2
cache_config
.
num_gpu_blocks
=
16
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
# Add seq groups to scheduler.
# Add seq groups to scheduler.
_
,
seq_group_a
=
create_dummy_prompt
(
"1"
,
1
)
_
,
seq_group_a
=
create_dummy_prompt
(
"1"
,
1
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group_a
)
scheduler
.
add_seq_group
(
seq_group_a
)
# Schedule seq groups prompts.
# Schedule seq groups prompts.
...
@@ -112,7 +130,7 @@ def test_scheduler_prefill_prioritized():
...
@@ -112,7 +130,7 @@ def test_scheduler_prefill_prioritized():
assert
get_sequence_groups
(
out
)
==
[
seq_group_a
]
assert
get_sequence_groups
(
out
)
==
[
seq_group_a
]
# Add a new prefill request B.
# Add a new prefill request B.
_
,
seq_group_b
=
create_dummy_prompt
(
"2"
,
30
)
_
,
seq_group_b
=
create_dummy_prompt
(
"2"
,
30
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group_b
)
scheduler
.
add_seq_group
(
seq_group_b
)
# Verify prefill requests are prioritized. Since max_batched_num_tokens
# Verify prefill requests are prioritized. Since max_batched_num_tokens
...
@@ -121,18 +139,24 @@ def test_scheduler_prefill_prioritized():
...
@@ -121,18 +139,24 @@ def test_scheduler_prefill_prioritized():
assert
get_sequence_groups
(
out
)
==
[
seq_group_b
]
assert
get_sequence_groups
(
out
)
==
[
seq_group_b
]
def
test_scheduler_schedule_preempt_abort
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_scheduler_schedule_preempt_abort
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
max_model_len
=
16
max_model_len
=
16
scheduler_config
=
SchedulerConfig
(
64
,
2
,
max_model_len
)
scheduler_config
=
SchedulerConfig
(
64
,
2
,
max_model_len
,
use_v2_block_manager
=
use_v2_block_manager
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
2
cache_config
.
num_cpu_blocks
=
2
cache_config
.
num_gpu_blocks
=
2
cache_config
.
num_gpu_blocks
=
2
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
# Add seq groups to scheduler.
# Add seq groups to scheduler.
seq_a
,
seq_group_a
=
create_dummy_prompt
(
"1"
,
block_size
)
seq_a
,
seq_group_a
=
create_dummy_prompt
(
"1"
,
seq_b
,
seq_group_b
=
create_dummy_prompt
(
"2"
,
block_size
)
block_size
,
block_size
=
block_size
)
seq_b
,
seq_group_b
=
create_dummy_prompt
(
"2"
,
block_size
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group_a
)
scheduler
.
add_seq_group
(
seq_group_a
)
scheduler
.
add_seq_group
(
seq_group_b
)
scheduler
.
add_seq_group
(
seq_group_b
)
...
@@ -170,12 +194,17 @@ def test_scheduler_schedule_preempt_abort():
...
@@ -170,12 +194,17 @@ def test_scheduler_schedule_preempt_abort():
assert
scheduler
.
get_num_unfinished_seq_groups
()
==
1
assert
scheduler
.
get_num_unfinished_seq_groups
()
==
1
def
test_scheduler_max_seqs
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_scheduler_max_seqs
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
num_seq_group
=
4
num_seq_group
=
4
max_seq_group
=
2
max_seq_group
=
2
max_model_len
=
16
max_model_len
=
16
scheduler_config
=
SchedulerConfig
(
64
,
max_seq_group
,
max_model_len
)
scheduler_config
=
SchedulerConfig
(
64
,
max_seq_group
,
max_model_len
,
use_v2_block_manager
=
use_v2_block_manager
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
...
@@ -184,7 +213,9 @@ def test_scheduler_max_seqs():
...
@@ -184,7 +213,9 @@ def test_scheduler_max_seqs():
all_seq_groups
:
List
[
SequenceGroup
]
=
[]
all_seq_groups
:
List
[
SequenceGroup
]
=
[]
# Add seq groups to scheduler.
# Add seq groups to scheduler.
for
i
in
range
(
num_seq_group
):
for
i
in
range
(
num_seq_group
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
block_size
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
block_size
,
block_size
=
block_size
)
all_seq_groups
.
append
(
seq_group
)
all_seq_groups
.
append
(
seq_group
)
# Append 1 seq group
# Append 1 seq group
...
@@ -211,9 +242,15 @@ def test_scheduler_max_seqs():
...
@@ -211,9 +242,15 @@ def test_scheduler_max_seqs():
assert
set
(
get_sequence_groups
(
out
))
==
set
([
all_seq_groups
[
1
]])
assert
set
(
get_sequence_groups
(
out
))
==
set
([
all_seq_groups
[
1
]])
def
test_scheduler_delay_factor
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_scheduler_delay_factor
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
scheduler_config
=
SchedulerConfig
(
100
,
64
,
16
,
delay_factor
=
0.5
)
scheduler_config
=
SchedulerConfig
(
100
,
64
,
16
,
delay_factor
=
0.5
,
use_v2_block_manager
=
use_v2_block_manager
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
...
@@ -221,7 +258,8 @@ def test_scheduler_delay_factor():
...
@@ -221,7 +258,8 @@ def test_scheduler_delay_factor():
# schedule first prompt
# schedule first prompt
seq_group_meta
,
seq_group
=
create_dummy_prompt
(
"0"
,
seq_group_meta
,
seq_group
=
create_dummy_prompt
(
"0"
,
prompt_length
=
block_size
)
prompt_length
=
block_size
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
seq_group_meta
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
seq_group_meta
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
assert
out
.
num_prefill_groups
>
0
assert
out
.
num_prefill_groups
>
0
...
@@ -231,7 +269,8 @@ def test_scheduler_delay_factor():
...
@@ -231,7 +269,8 @@ def test_scheduler_delay_factor():
# wait for a second before scheduling next prompt
# wait for a second before scheduling next prompt
time
.
sleep
(
1
)
time
.
sleep
(
1
)
seq_group_meta
,
seq_group
=
create_dummy_prompt
(
"1"
,
seq_group_meta
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
block_size
)
prompt_length
=
block_size
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
# second prompt should *not* be scheduled
# second prompt should *not* be scheduled
...
@@ -248,11 +287,20 @@ def test_scheduler_delay_factor():
...
@@ -248,11 +287,20 @@ def test_scheduler_delay_factor():
append_new_token
(
out
,
1
)
append_new_token
(
out
,
1
)
def
test_swapped_out_prioritized
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
scheduler
=
initialize_scheduler
(
max_num_seqs
=
6
)
def
test_swapped_out_prioritized
(
use_v2_block_manager
:
bool
):
block_size
=
4
scheduler
=
initialize_scheduler
(
max_num_seqs
=
6
,
block_size
=
block_size
,
use_v2_block_manager
=
use_v2_block_manager
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
# best_of=2 * 3 == 6 sequences.
# best_of=2 * 3 == 6 sequences.
for
i
in
range
(
3
):
for
i
in
range
(
3
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
best_of
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
best_of
=
2
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
seq_group_meta
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
seq_group_meta
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
# prefill scheduled now.
# prefill scheduled now.
...
@@ -276,7 +324,10 @@ def test_swapped_out_prioritized():
...
@@ -276,7 +324,10 @@ def test_swapped_out_prioritized():
append_new_token
(
out
,
1
)
append_new_token
(
out
,
1
)
# Add 1 more task. Swap should be prioritized over prefill.
# Add 1 more task. Swap should be prioritized over prefill.
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
best_of
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
best_of
=
2
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
seq_group_meta
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
seq_group_meta
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
append_new_token
(
out
,
1
)
append_new_token
(
out
,
1
)
...
@@ -287,17 +338,26 @@ def test_swapped_out_prioritized():
...
@@ -287,17 +338,26 @@ def test_swapped_out_prioritized():
assert
out
.
blocks_to_swap_out
==
[]
assert
out
.
blocks_to_swap_out
==
[]
def
initialize_scheduler
(
*
,
def
initialize_scheduler
(
max_num_seqs
=
1000
,
*
,
max_token_budget
=
1000
,
max_num_seqs
=
1000
,
max_model_len
=
1000
,
max_token_budget
=
1000
,
lora_config
=
None
):
max_model_len
=
1000
,
block_size
=
4
lora_config
=
None
,
scheduler_config
=
SchedulerConfig
(
max_token_budget
,
max_num_seqs
,
use_v2_block_manager
=
False
,
max_model_len
)
block_size
=
4
,
num_cpu_blocks
=
8
,
num_gpu_blocks
=
8
,
):
block_size
=
block_size
scheduler_config
=
SchedulerConfig
(
max_token_budget
,
max_num_seqs
,
max_model_len
,
use_v2_block_manager
=
use_v2_block_manager
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
num_cpu_blocks
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
num_gpu_blocks
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
lora_config
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
lora_config
)
return
scheduler
return
scheduler
...
@@ -319,12 +379,18 @@ def add_token_budget(budget: SchedulingBudget,
...
@@ -319,12 +379,18 @@ def add_token_budget(budget: SchedulingBudget,
budget
.
add_num_seqs
(
mock_seq_group
.
request_id
,
num_curr_seqs
)
budget
.
add_num_seqs
(
mock_seq_group
.
request_id
,
num_curr_seqs
)
def
test_prefill_schedule_max_prompt_len
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_prefill_schedule_max_prompt_len
(
use_v2_block_manager
:
bool
):
"""
"""
Test prompt longer than max_prompt_len is aborted.
Test prompt longer than max_prompt_len is aborted.
"""
"""
scheduler
=
initialize_scheduler
(
max_model_len
=
30
)
block_size
=
4
_
,
seq_group
=
create_dummy_prompt
(
"0"
,
prompt_length
=
60
)
scheduler
=
initialize_scheduler
(
max_model_len
=
30
,
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
)
_
,
seq_group
=
create_dummy_prompt
(
"0"
,
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
budget
=
create_token_budget
()
budget
=
create_token_budget
()
output
=
scheduler
.
_schedule_prefills
(
budget
,
None
)
output
=
scheduler
.
_schedule_prefills
(
budget
,
None
)
...
@@ -336,14 +402,21 @@ def test_prefill_schedule_max_prompt_len():
...
@@ -336,14 +402,21 @@ def test_prefill_schedule_max_prompt_len():
assert
len
(
remaining_waiting
)
==
0
assert
len
(
remaining_waiting
)
==
0
def
test_prefill_schedule_token_budget
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_prefill_schedule_token_budget
(
use_v2_block_manager
:
bool
):
"""
"""
Test token budget respected.
Test token budget respected.
"""
"""
scheduler
=
initialize_scheduler
()
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
budget
=
create_token_budget
(
token_budget
=
0
)
budget
=
create_token_budget
(
token_budget
=
0
)
for
i
in
range
(
2
):
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
# 0 token budget == nothing is scheduled.
# 0 token budget == nothing is scheduled.
...
@@ -366,10 +439,15 @@ def test_prefill_schedule_token_budget():
...
@@ -366,10 +439,15 @@ def test_prefill_schedule_token_budget():
assert
len
(
remaining_waiting
)
==
1
assert
len
(
remaining_waiting
)
==
1
# Test when current_batched_tokens respected.
# Test when current_batched_tokens respected.
scheduler
=
initialize_scheduler
()
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
,
num_cpu_blocks
=
16
,
num_gpu_blocks
=
16
)
budget
=
create_token_budget
(
token_budget
=
60
)
budget
=
create_token_budget
(
token_budget
=
60
)
add_token_budget
(
budget
,
30
,
0
)
add_token_budget
(
budget
,
30
,
0
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
block_size
)
# Cannot schedule a prompt that doesn't fit the budget.
# Cannot schedule a prompt that doesn't fit the budget.
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
output
=
scheduler
.
_schedule_prefills
(
budget
,
None
)
output
=
scheduler
.
_schedule_prefills
(
budget
,
None
)
...
@@ -389,14 +467,21 @@ def test_prefill_schedule_token_budget():
...
@@ -389,14 +467,21 @@ def test_prefill_schedule_token_budget():
assert
len
(
remaining_waiting
)
==
0
assert
len
(
remaining_waiting
)
==
0
def
test_prefill_schedule_max_seqs
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_prefill_schedule_max_seqs
(
use_v2_block_manager
:
bool
):
"""
"""
Test max seq respected.
Test max seq respected.
"""
"""
scheduler
=
initialize_scheduler
()
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
budget
=
create_token_budget
(
max_num_seqs
=
2
)
budget
=
create_token_budget
(
max_num_seqs
=
2
)
for
i
in
range
(
3
):
for
i
in
range
(
3
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
output
=
scheduler
.
_schedule_prefills
(
budget
,
None
)
output
=
scheduler
.
_schedule_prefills
(
budget
,
None
)
remaining_waiting
=
scheduler
.
waiting
remaining_waiting
=
scheduler
.
waiting
...
@@ -410,7 +495,9 @@ def test_prefill_schedule_max_seqs():
...
@@ -410,7 +495,9 @@ def test_prefill_schedule_max_seqs():
scheduler
.
waiting
=
deque
()
scheduler
.
waiting
=
deque
()
budget
=
create_token_budget
(
max_num_seqs
=
2
)
budget
=
create_token_budget
(
max_num_seqs
=
2
)
add_token_budget
(
budget
,
0
,
2
)
add_token_budget
(
budget
,
0
,
2
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
output
=
scheduler
.
_schedule_prefills
(
budget
,
None
)
output
=
scheduler
.
_schedule_prefills
(
budget
,
None
)
remaining_waiting
=
scheduler
.
waiting
remaining_waiting
=
scheduler
.
waiting
...
@@ -421,17 +508,24 @@ def test_prefill_schedule_max_seqs():
...
@@ -421,17 +508,24 @@ def test_prefill_schedule_max_seqs():
assert
len
(
remaining_waiting
)
==
1
assert
len
(
remaining_waiting
)
==
1
def
test_prefill_schedule_max_lora
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_prefill_schedule_max_lora
(
use_v2_block_manager
:
bool
):
"""
"""
Test max lora is respected and prioritized.
Test max lora is respected and prioritized.
"""
"""
block_size
=
4
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
1
)
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
1
)
scheduler
=
initialize_scheduler
(
lora_config
=
lora_config
)
scheduler
=
initialize_scheduler
(
lora_config
=
lora_config
,
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
budget
=
create_token_budget
(
token_budget
=
120
)
budget
=
create_token_budget
(
token_budget
=
120
)
curr_loras
:
Set
[
int
]
=
set
()
curr_loras
:
Set
[
int
]
=
set
()
for
i
in
range
(
2
):
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
prompt_length
=
60
,
block_size
=
block_size
,
lora_request
=
LoRARequest
(
lora_request
=
LoRARequest
(
lora_name
=
str
(
i
),
lora_name
=
str
(
i
),
lora_int_id
=
i
+
1
,
lora_int_id
=
i
+
1
,
...
@@ -443,7 +537,9 @@ def test_prefill_schedule_max_lora():
...
@@ -443,7 +537,9 @@ def test_prefill_schedule_max_lora():
# If a request is not scheduled because it hits max lora, it is
# If a request is not scheduled because it hits max lora, it is
# prioritized. Verify that.
# prioritized. Verify that.
for
i
in
range
(
2
,
4
):
for
i
in
range
(
2
,
4
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
# Schedule 2 requests (0 and 2)
# Schedule 2 requests (0 and 2)
output
=
scheduler
.
_schedule_prefills
(
budget
,
curr_loras
)
output
=
scheduler
.
_schedule_prefills
(
budget
,
curr_loras
)
...
@@ -467,14 +563,21 @@ def test_prefill_schedule_max_lora():
...
@@ -467,14 +563,21 @@ def test_prefill_schedule_max_lora():
assert
budget
.
num_batched_tokens
==
60
assert
budget
.
num_batched_tokens
==
60
def
test_prefill_schedule_no_block_manager_capacity
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_prefill_schedule_no_block_manager_capacity
(
use_v2_block_manager
):
"""
"""
Test sequence cannot be scheduled due to block manager has no capacity.
Test sequence cannot be scheduled due to block manager has no capacity.
"""
"""
scheduler
=
initialize_scheduler
()
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
,
num_gpu_blocks
=
128
,
num_cpu_blocks
=
128
)
budget
=
create_token_budget
()
budget
=
create_token_budget
()
for
i
in
range
(
3
):
for
i
in
range
(
3
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
block_manager
.
can_allocate
=
MagicMock
()
scheduler
.
block_manager
.
can_allocate
=
MagicMock
()
scheduler
.
block_manager
.
can_allocate
.
return_value
=
AllocStatus
.
LATER
scheduler
.
block_manager
.
can_allocate
.
return_value
=
AllocStatus
.
LATER
...
@@ -489,7 +592,9 @@ def test_prefill_schedule_no_block_manager_capacity():
...
@@ -489,7 +592,9 @@ def test_prefill_schedule_no_block_manager_capacity():
scheduler
=
initialize_scheduler
()
scheduler
=
initialize_scheduler
()
budget
=
create_token_budget
()
budget
=
create_token_budget
()
for
i
in
range
(
3
):
for
i
in
range
(
3
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
block_manager
.
can_allocate
=
MagicMock
()
scheduler
.
block_manager
.
can_allocate
=
MagicMock
()
scheduler
.
block_manager
.
can_allocate
.
return_value
=
AllocStatus
.
NEVER
scheduler
.
block_manager
.
can_allocate
.
return_value
=
AllocStatus
.
NEVER
...
@@ -502,14 +607,21 @@ def test_prefill_schedule_no_block_manager_capacity():
...
@@ -502,14 +607,21 @@ def test_prefill_schedule_no_block_manager_capacity():
assert
len
(
remaining_waiting
)
==
0
assert
len
(
remaining_waiting
)
==
0
def
test_decode_schedule_preempted
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_decode_schedule_preempted
(
use_v2_block_manager
:
bool
):
"""
"""
Test decodes cannot be scheduled and preempted.
Test decodes cannot be scheduled and preempted.
"""
"""
scheduler
=
initialize_scheduler
()
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
curr_loras
=
None
curr_loras
=
None
for
i
in
range
(
3
):
for
i
in
range
(
3
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
scheduler
.
_add_seq_group_to_running
(
seq_group
)
scheduler
.
_add_seq_group_to_running
(
seq_group
)
...
@@ -541,15 +653,23 @@ def test_decode_schedule_preempted():
...
@@ -541,15 +653,23 @@ def test_decode_schedule_preempted():
assert
output
.
blocks_to_copy
==
[]
assert
output
.
blocks_to_copy
==
[]
def
test_decode_swap_beam_search
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_decode_swap_beam_search
(
use_v2_block_manager
:
bool
):
"""
"""
Test best_of > 1 swap out blocks
Test best_of > 1 swap out blocks
"""
"""
scheduler
=
initialize_scheduler
()
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
,
num_gpu_blocks
=
64
,
num_cpu_blocks
=
64
)
curr_loras
=
None
curr_loras
=
None
budget
=
create_token_budget
()
budget
=
create_token_budget
()
for
i
in
range
(
3
):
for
i
in
range
(
3
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
best_of
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
best_of
=
2
,
block_size
=
block_size
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
scheduler
.
_add_seq_group_to_running
(
seq_group
)
scheduler
.
_add_seq_group_to_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
...
@@ -589,12 +709,20 @@ def test_decode_swap_beam_search():
...
@@ -589,12 +709,20 @@ def test_decode_swap_beam_search():
assert
output
.
blocks_to_copy
==
[]
assert
output
.
blocks_to_copy
==
[]
def
test_schedule_decode_blocks_to_copy_update
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_schedule_decode_blocks_to_copy_update
(
use_v2_block_manager
:
bool
):
"""
"""
Verify blocks_to_copy is updated.
Verify blocks_to_copy is updated.
"""
"""
scheduler
=
initialize_scheduler
()
block_size
=
4
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
4
,
num_cpu_blocks
=
16
,
num_gpu_blocks
=
16
)
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
,
block_size
=
block_size
)
curr_loras
=
None
curr_loras
=
None
scheduler
.
_allocate_and_set_running
(
seq_group
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
...
@@ -619,13 +747,19 @@ def test_schedule_decode_blocks_to_copy_update():
...
@@ -619,13 +747,19 @@ def test_schedule_decode_blocks_to_copy_update():
assert
output
.
blocks_to_copy
==
[(
2
,
3
)]
assert
output
.
blocks_to_copy
==
[(
2
,
3
)]
def
test_schedule_swapped_simple
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
scheduler
=
initialize_scheduler
()
def
test_schedule_swapped_simple
(
use_v2_block_manager
:
bool
):
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
)
curr_loras
=
None
curr_loras
=
None
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
4
,
best_of
=
2
,
block_size
=
block_size
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
append_new_token_seq_group
(
4
,
seq_group
,
1
)
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
scheduler
.
_add_seq_group_to_swapped
(
seq_group
)
scheduler
.
_add_seq_group_to_swapped
(
seq_group
)
...
@@ -644,12 +778,17 @@ def test_schedule_swapped_simple():
...
@@ -644,12 +778,17 @@ def test_schedule_swapped_simple():
assert
blocks_to_swap_out
==
blocks_to_swap_in_reverse
assert
blocks_to_swap_out
==
blocks_to_swap_in_reverse
def
test_schedule_swapped_max_token_budget
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
scheduler
=
initialize_scheduler
()
def
test_schedule_swapped_max_token_budget
(
use_v2_block_manager
:
bool
):
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
,
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
curr_loras
=
None
curr_loras
=
None
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
for
_
in
range
(
2
):
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
)
,
prompt_length
=
60
,
best_of
=
2
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
...
@@ -676,12 +815,19 @@ def test_schedule_swapped_max_token_budget():
...
@@ -676,12 +815,19 @@ def test_schedule_swapped_max_token_budget():
assert
len
(
output
.
prefill_seq_groups
)
==
0
assert
len
(
output
.
prefill_seq_groups
)
==
0
def
test_schedule_swapped_max_seqs
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
scheduler
=
initialize_scheduler
()
def
test_schedule_swapped_max_seqs
(
use_v2_block_manager
:
bool
):
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
curr_loras
=
None
curr_loras
=
None
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
for
i
in
range
(
4
):
for
i
in
range
(
4
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
4
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
...
@@ -706,14 +852,21 @@ def test_schedule_swapped_max_seqs():
...
@@ -706,14 +852,21 @@ def test_schedule_swapped_max_seqs():
assert
len
(
output
.
prefill_seq_groups
)
==
0
assert
len
(
output
.
prefill_seq_groups
)
==
0
def
test_schedule_swapped_max_loras
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_schedule_swapped_max_loras
(
use_v2_block_manager
:
bool
):
block_size
=
4
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
1
)
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
1
)
scheduler
=
initialize_scheduler
(
lora_config
=
lora_config
)
scheduler
=
initialize_scheduler
(
lora_config
=
lora_config
,
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
,
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
curr_loras
:
Set
[
int
]
=
set
()
curr_loras
:
Set
[
int
]
=
set
()
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
for
i
in
range
(
2
):
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
prompt_length
=
60
,
block_size
=
block_size
,
lora_request
=
LoRARequest
(
lora_request
=
LoRARequest
(
lora_name
=
str
(
i
),
lora_name
=
str
(
i
),
lora_int_id
=
i
+
1
,
lora_int_id
=
i
+
1
,
...
@@ -734,12 +887,20 @@ def test_schedule_swapped_max_loras():
...
@@ -734,12 +887,20 @@ def test_schedule_swapped_max_loras():
assert
len
(
curr_loras
)
==
1
assert
len
(
curr_loras
)
==
1
def
test_schedule_swapped_cannot_swap_in
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
scheduler
=
initialize_scheduler
()
def
test_schedule_swapped_cannot_swap_in
(
use_v2_block_manager
:
bool
):
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
,
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
curr_loras
=
None
curr_loras
=
None
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
for
_
in
range
(
2
):
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
best_of
=
2
,
block_size
=
block_size
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
...
@@ -759,12 +920,20 @@ def test_schedule_swapped_cannot_swap_in():
...
@@ -759,12 +920,20 @@ def test_schedule_swapped_cannot_swap_in():
assert
len
(
output
.
prefill_seq_groups
)
==
0
assert
len
(
output
.
prefill_seq_groups
)
==
0
def
test_infeasible_swap
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
scheduler
=
initialize_scheduler
()
def
test_infeasible_swap
(
use_v2_block_manager
:
bool
):
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
,
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
curr_loras
=
None
curr_loras
=
None
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
for
_
in
range
(
2
):
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
best_of
=
2
,
block_size
=
block_size
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
...
@@ -785,10 +954,18 @@ def test_infeasible_swap():
...
@@ -785,10 +954,18 @@ def test_infeasible_swap():
assert
len
(
output
.
prefill_seq_groups
)
==
0
assert
len
(
output
.
prefill_seq_groups
)
==
0
def
test_schedule_swapped_blocks_to_copy
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
scheduler
=
initialize_scheduler
()
def
test_schedule_swapped_blocks_to_copy
(
use_v2_block_manager
:
bool
):
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
,
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
curr_loras
=
None
curr_loras
=
None
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
,
block_size
=
block_size
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
...
...
tests/distributed/test_pipeline_parallel.py
View file @
539aa992
...
@@ -8,6 +8,8 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
...
@@ -8,6 +8,8 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
import
os
import
os
import
pytest
import
pytest
from
packaging
import
version
from
transformers
import
__version__
as
transformers_version
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
...
@@ -37,6 +39,7 @@ VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
...
@@ -37,6 +39,7 @@ VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
(
1
,
2
,
1
,
1
,
1
,
"OpenGVLab/InternVL2-1B"
,
"mp"
),
(
1
,
2
,
1
,
1
,
1
,
"OpenGVLab/InternVL2-1B"
,
"mp"
),
(
1
,
2
,
1
,
1
,
1
,
"OpenGVLab/InternVL2-2B"
,
"mp"
),
(
1
,
2
,
1
,
1
,
1
,
"OpenGVLab/InternVL2-2B"
,
"mp"
),
(
1
,
2
,
1
,
0
,
1
,
"OpenGVLab/InternVL2-4B"
,
"mp"
),
(
1
,
2
,
1
,
0
,
1
,
"OpenGVLab/InternVL2-4B"
,
"mp"
),
(
1
,
2
,
0
,
1
,
0
,
"Qwen/Qwen2-VL-2B-Instruct"
,
"mp"
)
],
],
)
)
@
fork_new_process_for_each_test
@
fork_new_process_for_each_test
...
@@ -46,6 +49,11 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL,
...
@@ -46,6 +49,11 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL,
pytest
.
skip
(
"Skipping multi-node pipeline parallel test for "
pytest
.
skip
(
"Skipping multi-node pipeline parallel test for "
"multiprocessing distributed backend"
)
"multiprocessing distributed backend"
)
# Skip tests that require transformers>=4.45.0
if
"Qwen2-VL"
in
MODEL_NAME
and
version
.
parse
(
transformers_version
)
<
version
.
parse
(
"4.45.0.dev0"
):
pytest
.
skip
(
"This test requires transformers>=4.45.0"
)
pp_args
=
[
pp_args
=
[
# use half precision for speed and memory savings in CI environment
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"--dtype"
,
...
...
vllm/model_executor/layers/ops
/__init__.py
→
tests/encoder_decoder
/__init__.py
View file @
539aa992
File moved
tests/encoder_decoder/test_e2e_correctness.py
0 → 100644
View file @
539aa992
"""E2E tests to verify the correctness of the encoder-decoder framework
Run `pytest tests/encoder_decoder/test_e2e_correctness.py`.
"""
from
typing
import
List
,
Optional
,
Tuple
import
pytest
from
transformers
import
AutoModelForSeq2SeqLM
from
vllm.sequence
import
SampleLogprobs
from
vllm.utils
import
is_cpu
from
..conftest
import
DecoderPromptType
from
..models.utils
import
check_logprobs_close
def
vllm_to_hf_output
(
vllm_output
:
Tuple
[
List
[
int
],
str
,
Optional
[
SampleLogprobs
]],
decoder_prompt_type
:
DecoderPromptType
,
):
"""Sanitize vllm output to be comparable with hf output."""
output_ids
,
output_str
,
out_logprobs
=
vllm_output
hf_output_str
=
output_str
+
"</s>"
if
decoder_prompt_type
==
DecoderPromptType
.
NONE
:
hf_output_str
=
"<s>"
+
hf_output_str
return
output_ids
,
hf_output_str
,
out_logprobs
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"facebook/bart-large-cnn"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"decoder_prompt_type"
,
list
(
DecoderPromptType
))
@
pytest
.
mark
.
parametrize
(
"enforce_eager"
,
[
True
,
False
])
@
pytest
.
mark
.
skipif
(
is_cpu
(),
reason
=
"CPU backend is not currently supported with encoder/decoder models"
)
def
test_encoder_decoder_e2e
(
hf_runner
,
vllm_runner
,
example_encoder_decoder_prompts
,
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
,
decoder_prompt_type
:
DecoderPromptType
,
enforce_eager
:
bool
,
)
->
None
:
'''
End-to-End (E2E) test for the encoder-decoder framework.
This test evaluates the encoder-decoder functionality using the BART
model. We compare the outputs of the Hugging Face and vLLM
implementations to ensure that both implementations produce consistent
and correct results.
'''
test_case_prompts
=
example_encoder_decoder_prompts
[
decoder_prompt_type
]
# Configuration settings for HF baseline
hf_kwargs
=
{
"top_k"
:
None
,
"num_beams"
:
1
,
"repetition_penalty"
:
1.0
,
"top_p"
:
1.0
,
"length_penalty"
:
1.0
,
"early_stopping"
:
False
,
"no_repeat_ngram_size"
:
None
,
"min_length"
:
0
}
with
hf_runner
(
model
,
dtype
=
dtype
,
auto_cls
=
AutoModelForSeq2SeqLM
)
as
hf_model
:
hf_outputs
=
(
hf_model
.
generate_encoder_decoder_greedy_logprobs_limit
(
test_case_prompts
,
max_tokens
,
num_logprobs
,
**
hf_kwargs
,
))
with
vllm_runner
(
model
,
dtype
=
dtype
,
enforce_eager
=
enforce_eager
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_encoder_decoder_greedy_logprobs
(
test_case_prompts
,
max_tokens
,
num_logprobs
)
hf_skip_tokens
=
(
1
if
decoder_prompt_type
==
DecoderPromptType
.
NONE
else
0
)
check_logprobs_close
(
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
[
vllm_to_hf_output
(
vllm_output
,
decoder_prompt_type
)
for
vllm_output
in
vllm_outputs
],
name_0
=
"hf"
,
name_1
=
"vllm"
,
num_outputs_0_skip_tokens
=
hf_skip_tokens
,
)
tests/engine/test_arg_utils.py
View file @
539aa992
from
argparse
import
ArgumentTypeError
import
pytest
import
pytest
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.engine.arg_utils
import
EngineArgs
,
nullable_kvs
from
vllm.utils
import
FlexibleArgumentParser
from
vllm.utils
import
FlexibleArgumentParser
...
@@ -13,6 +15,10 @@ from vllm.utils import FlexibleArgumentParser
...
@@ -13,6 +15,10 @@ from vllm.utils import FlexibleArgumentParser
"image"
:
16
,
"image"
:
16
,
"video"
:
2
"video"
:
2
}),
}),
(
"Image=16, Video=2"
,
{
"image"
:
16
,
"video"
:
2
}),
])
])
def
test_limit_mm_per_prompt_parser
(
arg
,
expected
):
def
test_limit_mm_per_prompt_parser
(
arg
,
expected
):
parser
=
EngineArgs
.
add_cli_args
(
FlexibleArgumentParser
())
parser
=
EngineArgs
.
add_cli_args
(
FlexibleArgumentParser
())
...
@@ -22,3 +28,36 @@ def test_limit_mm_per_prompt_parser(arg, expected):
...
@@ -22,3 +28,36 @@ def test_limit_mm_per_prompt_parser(arg, expected):
args
=
parser
.
parse_args
([
"--limit-mm-per-prompt"
,
arg
])
args
=
parser
.
parse_args
([
"--limit-mm-per-prompt"
,
arg
])
assert
args
.
limit_mm_per_prompt
==
expected
assert
args
.
limit_mm_per_prompt
==
expected
@
pytest
.
mark
.
parametrize
(
(
"arg"
),
[
"image"
,
# Missing =
"image=4,image=5"
,
# Conflicting values
"image=video=4"
# Too many = in tokenized arg
])
def
test_bad_nullable_kvs
(
arg
):
with
pytest
.
raises
(
ArgumentTypeError
):
nullable_kvs
(
arg
)
@
pytest
.
mark
.
parametrize
((
"arg"
,
"expected"
),
[
(
None
,
None
),
(
"{}"
,
{}),
(
'{"num_crops": 4}'
,
{
"num_crops"
:
4
}),
(
'{"foo": {"bar": "baz"}}'
,
{
"foo"
:
{
"bar"
:
"baz"
}
}),
])
def
test_mm_processor_kwargs_prompt_parser
(
arg
,
expected
):
parser
=
EngineArgs
.
add_cli_args
(
FlexibleArgumentParser
())
if
arg
is
None
:
args
=
parser
.
parse_args
([])
else
:
args
=
parser
.
parse_args
([
"--mm-processor-kwargs"
,
arg
])
assert
args
.
mm_processor_kwargs
==
expected
tests/entrypoints/llm/test_generate.py
View file @
539aa992
...
@@ -162,6 +162,41 @@ def test_chat():
...
@@ -162,6 +162,41 @@ def test_chat():
assert
len
(
outputs
)
==
1
assert
len
(
outputs
)
==
1
def
test_multi_chat
():
llm
=
LLM
(
model
=
"meta-llama/Meta-Llama-3-8B-Instruct"
)
prompt1
=
"Explain the concept of entropy."
prompt2
=
"Explain what among us is."
conversation1
=
[
{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
prompt1
},
]
conversation2
=
[
{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
prompt2
},
]
messages
=
[
conversation1
,
conversation2
]
outputs
=
llm
.
chat
(
messages
)
assert
len
(
outputs
)
==
2
@
pytest
.
mark
.
parametrize
(
"image_urls"
,
@
pytest
.
mark
.
parametrize
(
"image_urls"
,
[[
TEST_IMAGE_URLS
[
0
],
TEST_IMAGE_URLS
[
1
]]])
[[
TEST_IMAGE_URLS
[
0
],
TEST_IMAGE_URLS
[
1
]]])
def
test_chat_multi_image
(
image_urls
:
List
[
str
]):
def
test_chat_multi_image
(
image_urls
:
List
[
str
]):
...
...
tests/entrypoints/openai/rpc/test_zmq_client.py
deleted
100644 → 0
View file @
93872128
import
asyncio
import
tempfile
import
unittest
import
unittest.mock
import
uuid
import
pytest
import
pytest_asyncio
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.entrypoints.openai.rpc.client
import
(
AsyncEngineRPCClient
,
RPCClientClosedError
)
from
vllm.entrypoints.openai.rpc.server
import
AsyncEngineRPCServer
@
pytest
.
fixture
(
scope
=
"function"
)
def
tmp_socket
():
with
tempfile
.
TemporaryDirectory
()
as
td
:
yield
f
"ipc://
{
td
}
/
{
uuid
.
uuid4
()
}
"
@
pytest_asyncio
.
fixture
(
scope
=
"function"
)
async
def
dummy_server
(
tmp_socket
,
monkeypatch
):
dummy_engine
=
unittest
.
mock
.
AsyncMock
()
def
dummy_engine_builder
(
*
args
,
**
kwargs
):
return
dummy_engine
with
monkeypatch
.
context
()
as
m
:
m
.
setattr
(
AsyncLLMEngine
,
"from_engine_args"
,
dummy_engine_builder
)
server
=
AsyncEngineRPCServer
(
None
,
None
,
rpc_path
=
tmp_socket
)
loop
=
asyncio
.
get_running_loop
()
server_task
=
loop
.
create_task
(
server
.
run_server_loop
())
try
:
yield
server
finally
:
server_task
.
cancel
()
server
.
cleanup
()
@
pytest_asyncio
.
fixture
(
scope
=
"function"
)
async
def
client
(
tmp_socket
):
client
=
AsyncEngineRPCClient
(
rpc_path
=
tmp_socket
)
# Sanity check: the server is connected
await
client
.
_wait_for_server_rpc
()
try
:
yield
client
finally
:
client
.
close
()
@
pytest
.
mark
.
asyncio
async
def
test_client_data_methods_use_timeouts
(
monkeypatch
,
dummy_server
,
client
:
AsyncEngineRPCClient
):
with
monkeypatch
.
context
()
as
m
:
# Make the server _not_ reply with a model config
m
.
setattr
(
dummy_server
,
"get_config"
,
lambda
x
:
None
)
m
.
setattr
(
client
,
"_data_timeout"
,
10
)
# And ensure the task completes anyway
# (client.setup() invokes server.get_config())
client_task
=
asyncio
.
get_running_loop
().
create_task
(
client
.
setup
())
with
pytest
.
raises
(
TimeoutError
,
match
=
"Server didn't reply within"
):
await
asyncio
.
wait_for
(
client_task
,
timeout
=
0.05
)
@
pytest
.
mark
.
asyncio
async
def
test_client_aborts_use_timeouts
(
monkeypatch
,
dummy_server
,
client
:
AsyncEngineRPCClient
):
with
monkeypatch
.
context
()
as
m
:
# Hang all abort requests
m
.
setattr
(
dummy_server
,
"abort"
,
lambda
x
:
None
)
m
.
setattr
(
client
,
"_data_timeout"
,
10
)
# The client should suppress timeouts on `abort`s
# and return normally, assuming the server will eventually
# abort the request.
client_task
=
asyncio
.
get_running_loop
().
create_task
(
client
.
abort
(
"test request id"
))
await
asyncio
.
wait_for
(
client_task
,
timeout
=
0.05
)
@
pytest
.
mark
.
asyncio
async
def
test_client_data_methods_reraise_exceptions
(
monkeypatch
,
dummy_server
,
client
:
AsyncEngineRPCClient
):
with
monkeypatch
.
context
()
as
m
:
# Make the server raise some random exception
exception
=
RuntimeError
(
"Client test exception"
)
def
raiser
():
raise
exception
m
.
setattr
(
dummy_server
.
engine
,
"get_model_config"
,
raiser
)
m
.
setattr
(
client
,
"_data_timeout"
,
10
)
client_task
=
asyncio
.
get_running_loop
().
create_task
(
client
.
setup
())
# And ensure the task completes, raising the exception
with
pytest
.
raises
(
RuntimeError
,
match
=
str
(
exception
)):
await
asyncio
.
wait_for
(
client_task
,
timeout
=
0.05
)
@
pytest
.
mark
.
asyncio
async
def
test_client_errors_after_closing
(
monkeypatch
,
dummy_server
,
client
:
AsyncEngineRPCClient
):
client
.
close
()
# Healthchecks and generate requests will fail with explicit errors
with
pytest
.
raises
(
RPCClientClosedError
):
await
client
.
check_health
()
with
pytest
.
raises
(
RPCClientClosedError
):
async
for
_
in
client
.
generate
(
None
,
None
,
None
):
pass
# But no-ops like aborting will pass
await
client
.
abort
(
"test-request-id"
)
await
client
.
do_log_stats
()
tests/entrypoints/openai/test_accuracy.py
View file @
539aa992
...
@@ -18,38 +18,36 @@ TASK = "gsm8k"
...
@@ -18,38 +18,36 @@ TASK = "gsm8k"
FILTER
=
"exact_match,strict-match"
FILTER
=
"exact_match,strict-match"
RTOL
=
0.03
RTOL
=
0.03
EXPECTED_VALUE
=
0.58
EXPECTED_VALUE
=
0.58
DEFAULT_ARGS
=
[
"--max-model-len"
,
"4096"
,
"--disable-log-requests"
]
MORE_ARGS_LIST
=
[
[
"--enable-chunked-prefill"
],
# Chunked
[
"--num-scheduler-steps"
,
"8"
],
# MS
[
"--num-scheduler-steps"
,
"8"
,
"--multi-step-stream-outputs"
]
# MS+Stream
]
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
mark
.
parametrize
(
"more_args"
,
MORE_ARGS_LIST
)
def
server
():
def
test_lm_eval_accuracy
(
more_args
):
args
=
[
args
=
list
(
DEFAULT_ARGS
)
"--max-model-len"
,
"4096"
,
"--enable-chunked-prefill"
,
args
.
extend
(
more_args
)
"--disable-log-requests"
,
"--enforce-eager"
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
@
pytest
.
fixture
(
scope
=
"module"
)
def
server_data
(
server
):
return
{
"url"
:
f
"
{
server
.
url_for
(
'v1'
)
}
/completions"
,
}
print
(
f
"Running with:
{
args
}
"
)
def
test_lm_eval_accuracy
(
server_data
):
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
model_args
=
(
f
"model=
{
MODEL_NAME
}
,"
url
=
f
"
{
remote_server
.
url_for
(
'v1'
)
}
/completions"
f
"base_url=
{
server_data
[
'url'
]
}
,"
f
"num_concurrent=
{
NUM_CONCURRENT
}
,tokenized_requests=False"
)
model_args
=
(
f
"model=
{
MODEL_NAME
}
,"
results
=
lm_eval
.
simple_evaluate
(
f
"base_url=
{
url
}
,"
model
=
"local-completions"
,
f
"num_concurrent=
{
NUM_CONCURRENT
}
,tokenized_requests=False"
)
model_args
=
model_args
,
tasks
=
TASK
,
results
=
lm_eval
.
simple_evaluate
(
)
model
=
"local-completions"
,
model_args
=
model_args
,
measured_value
=
results
[
"results"
][
TASK
][
FILTER
]
tasks
=
TASK
,
assert
(
measured_value
-
RTOL
<
EXPECTED_VALUE
)
and
measured_value
+
RTOL
>
EXPECTED_VALUE
),
f
"Expected:
{
EXPECTED_VALUE
}
| Measured:
{
measured_value
}
"
measured_value
=
results
[
"results"
][
TASK
][
FILTER
]
assert
(
measured_value
-
RTOL
<
EXPECTED_VALUE
and
measured_value
+
RTOL
>
EXPECTED_VALUE
),
f
"Expected:
{
EXPECTED_VALUE
}
| Measured:
{
measured_value
}
"
tests/
async_engine
/test_chat_template.py
→
tests/
entrypoints/openai
/test_chat_template.py
View file @
539aa992
...
@@ -5,7 +5,7 @@ from vllm.entrypoints.chat_utils import (apply_hf_chat_template,
...
@@ -5,7 +5,7 @@ from vllm.entrypoints.chat_utils import (apply_hf_chat_template,
from
vllm.entrypoints.openai.protocol
import
ChatCompletionRequest
from
vllm.entrypoints.openai.protocol
import
ChatCompletionRequest
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
..utils
import
VLLM_PATH
from
..
.
utils
import
VLLM_PATH
chatml_jinja_path
=
VLLM_PATH
/
"examples/template_chatml.jinja"
chatml_jinja_path
=
VLLM_PATH
/
"examples/template_chatml.jinja"
assert
chatml_jinja_path
.
exists
()
assert
chatml_jinja_path
.
exists
()
...
...
tests/entrypoints/openai/test_cli_args.py
0 → 100644
View file @
539aa992
import
json
import
unittest
from
vllm.entrypoints.openai.cli_args
import
make_arg_parser
from
vllm.entrypoints.openai.serving_engine
import
LoRAModulePath
from
vllm.utils
import
FlexibleArgumentParser
LORA_MODULE
=
{
"name"
:
"module2"
,
"path"
:
"/path/to/module2"
,
"base_model_name"
:
"llama"
}
class
TestLoraParserAction
(
unittest
.
TestCase
):
def
setUp
(
self
):
# Setting up argparse parser for tests
parser
=
FlexibleArgumentParser
(
description
=
"vLLM's remote OpenAI server."
)
self
.
parser
=
make_arg_parser
(
parser
)
def
test_valid_key_value_format
(
self
):
# Test old format: name=path
args
=
self
.
parser
.
parse_args
([
'--lora-modules'
,
'module1=/path/to/module1'
,
])
expected
=
[
LoRAModulePath
(
name
=
'module1'
,
path
=
'/path/to/module1'
)]
self
.
assertEqual
(
args
.
lora_modules
,
expected
)
def
test_valid_json_format
(
self
):
# Test valid JSON format input
args
=
self
.
parser
.
parse_args
([
'--lora-modules'
,
json
.
dumps
(
LORA_MODULE
),
])
expected
=
[
LoRAModulePath
(
name
=
'module2'
,
path
=
'/path/to/module2'
,
base_model_name
=
'llama'
)
]
self
.
assertEqual
(
args
.
lora_modules
,
expected
)
def
test_invalid_json_format
(
self
):
# Test invalid JSON format input, missing closing brace
with
self
.
assertRaises
(
SystemExit
):
self
.
parser
.
parse_args
([
'--lora-modules'
,
'{"name": "module3", "path": "/path/to/module3"'
])
def
test_invalid_type_error
(
self
):
# Test type error when values are not JSON or key=value
with
self
.
assertRaises
(
SystemExit
):
self
.
parser
.
parse_args
([
'--lora-modules'
,
'invalid_format'
# This is not JSON or key=value format
])
def
test_invalid_json_field
(
self
):
# Test valid JSON format but missing required fields
with
self
.
assertRaises
(
SystemExit
):
self
.
parser
.
parse_args
([
'--lora-modules'
,
'{"name": "module4"}'
# Missing required 'path' field
])
def
test_empty_values
(
self
):
# Test when no LoRA modules are provided
args
=
self
.
parser
.
parse_args
([
'--lora-modules'
,
''
])
self
.
assertEqual
(
args
.
lora_modules
,
[])
def
test_multiple_valid_inputs
(
self
):
# Test multiple valid inputs (both old and JSON format)
args
=
self
.
parser
.
parse_args
([
'--lora-modules'
,
'module1=/path/to/module1'
,
json
.
dumps
(
LORA_MODULE
),
])
expected
=
[
LoRAModulePath
(
name
=
'module1'
,
path
=
'/path/to/module1'
),
LoRAModulePath
(
name
=
'module2'
,
path
=
'/path/to/module2'
,
base_model_name
=
'llama'
)
]
self
.
assertEqual
(
args
.
lora_modules
,
expected
)
if
__name__
==
'__main__'
:
unittest
.
main
()
tests/entrypoints/openai/test_lora_lineage.py
0 → 100644
View file @
539aa992
import
json
import
openai
# use the official client for correctness check
import
pytest
import
pytest_asyncio
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
...utils
import
RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
# generation quality here
LORA_NAME
=
"typeof/zephyr-7b-beta-lora"
@
pytest
.
fixture
(
scope
=
"module"
)
def
zephyr_lora_files
():
return
snapshot_download
(
repo_id
=
LORA_NAME
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
server_with_lora_modules_json
(
zephyr_lora_files
):
# Define the json format LoRA module configurations
lora_module_1
=
{
"name"
:
"zephyr-lora"
,
"path"
:
zephyr_lora_files
,
"base_model_name"
:
MODEL_NAME
}
lora_module_2
=
{
"name"
:
"zephyr-lora2"
,
"path"
:
zephyr_lora_files
,
"base_model_name"
:
MODEL_NAME
}
args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"bfloat16"
,
"--max-model-len"
,
"8192"
,
"--enforce-eager"
,
# lora config below
"--enable-lora"
,
"--lora-modules"
,
json
.
dumps
(
lora_module_1
),
json
.
dumps
(
lora_module_2
),
"--max-lora-rank"
,
"64"
,
"--max-cpu-loras"
,
"2"
,
"--max-num-seqs"
,
"64"
,
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
@
pytest_asyncio
.
fixture
async
def
client_for_lora_lineage
(
server_with_lora_modules_json
):
async
with
server_with_lora_modules_json
.
get_async_client
(
)
as
async_client
:
yield
async_client
@
pytest
.
mark
.
asyncio
async
def
test_check_lora_lineage
(
client_for_lora_lineage
:
openai
.
AsyncOpenAI
,
zephyr_lora_files
):
models
=
await
client_for_lora_lineage
.
models
.
list
()
models
=
models
.
data
served_model
=
models
[
0
]
lora_models
=
models
[
1
:]
assert
served_model
.
id
==
MODEL_NAME
assert
served_model
.
root
==
MODEL_NAME
assert
served_model
.
parent
is
None
assert
all
(
lora_model
.
root
==
zephyr_lora_files
for
lora_model
in
lora_models
)
assert
all
(
lora_model
.
parent
==
MODEL_NAME
for
lora_model
in
lora_models
)
assert
lora_models
[
0
].
id
==
"zephyr-lora"
assert
lora_models
[
1
].
id
==
"zephyr-lora2"
tests/entrypoints/openai/test_models.py
View file @
539aa992
...
@@ -51,12 +51,14 @@ async def client(server):
...
@@ -51,12 +51,14 @@ async def client(server):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_check_models
(
client
:
openai
.
AsyncOpenAI
):
async
def
test_check_models
(
client
:
openai
.
AsyncOpenAI
,
zephyr_lora_files
):
models
=
await
client
.
models
.
list
()
models
=
await
client
.
models
.
list
()
models
=
models
.
data
models
=
models
.
data
served_model
=
models
[
0
]
served_model
=
models
[
0
]
lora_models
=
models
[
1
:]
lora_models
=
models
[
1
:]
assert
served_model
.
id
==
MODEL_NAME
assert
served_model
.
id
==
MODEL_NAME
assert
all
(
model
.
root
==
MODEL_NAME
for
model
in
models
)
assert
served_model
.
root
==
MODEL_NAME
assert
all
(
lora_model
.
root
==
zephyr_lora_files
for
lora_model
in
lora_models
)
assert
lora_models
[
0
].
id
==
"zephyr-lora"
assert
lora_models
[
0
].
id
==
"zephyr-lora"
assert
lora_models
[
1
].
id
==
"zephyr-lora2"
assert
lora_models
[
1
].
id
==
"zephyr-lora2"
tests/entrypoints/openai/test_mp_api_server.py
deleted
100644 → 0
View file @
93872128
import
time
import
pytest
from
vllm.entrypoints.openai.api_server
import
build_async_engine_client
from
vllm.entrypoints.openai.cli_args
import
make_arg_parser
from
vllm.utils
import
FlexibleArgumentParser
@
pytest
.
mark
.
asyncio
async
def
test_mp_crash_detection
():
parser
=
FlexibleArgumentParser
(
description
=
"vLLM's remote OpenAI server."
)
parser
=
make_arg_parser
(
parser
)
args
=
parser
.
parse_args
([])
# use an invalid tensor_parallel_size to trigger the
# error in the server
args
.
tensor_parallel_size
=
65536
start
=
time
.
perf_counter
()
async
with
build_async_engine_client
(
args
):
pass
end
=
time
.
perf_counter
()
assert
end
-
start
<
60
,
(
"Expected vLLM to gracefully shutdown in <60s "
"if there is an error in the startup."
)
@
pytest
.
mark
.
asyncio
async
def
test_mp_cuda_init
():
# it should not crash, when cuda is initialized
# in the API server process
import
torch
torch
.
cuda
.
init
()
parser
=
FlexibleArgumentParser
(
description
=
"vLLM's remote OpenAI server."
)
parser
=
make_arg_parser
(
parser
)
args
=
parser
.
parse_args
([])
async
with
build_async_engine_client
(
args
):
pass
tests/entrypoints/openai/test_serving_chat.py
View file @
539aa992
...
@@ -4,13 +4,15 @@ from dataclasses import dataclass
...
@@ -4,13 +4,15 @@ from dataclasses import dataclass
from
unittest.mock
import
MagicMock
from
unittest.mock
import
MagicMock
from
vllm.config
import
MultiModalConfig
from
vllm.config
import
MultiModalConfig
from
vllm.engine.
async_llm_engine
import
Async
LLMEngine
from
vllm.engine.
multiprocessing.client
import
MQ
LLMEngine
Client
from
vllm.entrypoints.openai.protocol
import
ChatCompletionRequest
from
vllm.entrypoints.openai.protocol
import
ChatCompletionRequest
from
vllm.entrypoints.openai.serving_chat
import
OpenAIServingChat
from
vllm.entrypoints.openai.serving_chat
import
OpenAIServingChat
from
vllm.entrypoints.openai.serving_engine
import
BaseModelPath
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
MODEL_NAME
=
"openai-community/gpt2"
MODEL_NAME
=
"openai-community/gpt2"
CHAT_TEMPLATE
=
"Dummy chat template for testing {}"
CHAT_TEMPLATE
=
"Dummy chat template for testing {}"
BASE_MODEL_PATHS
=
[
BaseModelPath
(
name
=
MODEL_NAME
,
model_path
=
MODEL_NAME
)]
@
dataclass
@
dataclass
...
@@ -37,7 +39,7 @@ async def _async_serving_chat_init():
...
@@ -37,7 +39,7 @@ async def _async_serving_chat_init():
serving_completion
=
OpenAIServingChat
(
engine
,
serving_completion
=
OpenAIServingChat
(
engine
,
model_config
,
model_config
,
served_model_names
=
[
MODEL_
NAME
]
,
BASE_
MODEL_
PATHS
,
response_role
=
"assistant"
,
response_role
=
"assistant"
,
chat_template
=
CHAT_TEMPLATE
,
chat_template
=
CHAT_TEMPLATE
,
lora_modules
=
None
,
lora_modules
=
None
,
...
@@ -52,12 +54,13 @@ def test_async_serving_chat_init():
...
@@ -52,12 +54,13 @@ def test_async_serving_chat_init():
def
test_serving_chat_should_set_correct_max_tokens
():
def
test_serving_chat_should_set_correct_max_tokens
():
mock_engine
=
MagicMock
(
spec
=
Async
LLMEngine
)
mock_engine
=
MagicMock
(
spec
=
MQ
LLMEngine
Client
)
mock_engine
.
get_tokenizer
.
return_value
=
get_tokenizer
(
MODEL_NAME
)
mock_engine
.
get_tokenizer
.
return_value
=
get_tokenizer
(
MODEL_NAME
)
mock_engine
.
errored
=
False
serving_chat
=
OpenAIServingChat
(
mock_engine
,
serving_chat
=
OpenAIServingChat
(
mock_engine
,
MockModelConfig
(),
MockModelConfig
(),
served_model_names
=
[
MODEL_
NAME
]
,
BASE_
MODEL_
PATHS
,
response_role
=
"assistant"
,
response_role
=
"assistant"
,
chat_template
=
CHAT_TEMPLATE
,
chat_template
=
CHAT_TEMPLATE
,
lora_modules
=
None
,
lora_modules
=
None
,
...
...
tests/entrypoints/openai/test_serving_engine.py
View file @
539aa992
...
@@ -4,13 +4,14 @@ from unittest.mock import MagicMock
...
@@ -4,13 +4,14 @@ from unittest.mock import MagicMock
import
pytest
import
pytest
from
vllm.config
import
ModelConfig
from
vllm.config
import
ModelConfig
from
vllm.engine.protocol
import
Async
EngineClient
from
vllm.engine.protocol
import
EngineClient
from
vllm.entrypoints.openai.protocol
import
(
ErrorResponse
,
from
vllm.entrypoints.openai.protocol
import
(
ErrorResponse
,
LoadLoraAdapterRequest
,
LoadLoraAdapterRequest
,
UnloadLoraAdapterRequest
)
UnloadLoraAdapterRequest
)
from
vllm.entrypoints.openai.serving_engine
import
OpenAIServing
from
vllm.entrypoints.openai.serving_engine
import
BaseModelPath
,
OpenAIServing
MODEL_NAME
=
"meta-llama/Llama-2-7b"
MODEL_NAME
=
"meta-llama/Llama-2-7b"
BASE_MODEL_PATHS
=
[
BaseModelPath
(
name
=
MODEL_NAME
,
model_path
=
MODEL_NAME
)]
LORA_LOADING_SUCCESS_MESSAGE
=
(
LORA_LOADING_SUCCESS_MESSAGE
=
(
"Success: LoRA adapter '{lora_name}' added successfully."
)
"Success: LoRA adapter '{lora_name}' added successfully."
)
LORA_UNLOADING_SUCCESS_MESSAGE
=
(
LORA_UNLOADING_SUCCESS_MESSAGE
=
(
...
@@ -18,14 +19,14 @@ LORA_UNLOADING_SUCCESS_MESSAGE = (
...
@@ -18,14 +19,14 @@ LORA_UNLOADING_SUCCESS_MESSAGE = (
async
def
_async_serving_engine_init
():
async
def
_async_serving_engine_init
():
mock_engine_client
=
MagicMock
(
spec
=
Async
EngineClient
)
mock_engine_client
=
MagicMock
(
spec
=
EngineClient
)
mock_model_config
=
MagicMock
(
spec
=
ModelConfig
)
mock_model_config
=
MagicMock
(
spec
=
ModelConfig
)
# Set the max_model_len attribute to avoid missing attribute
# Set the max_model_len attribute to avoid missing attribute
mock_model_config
.
max_model_len
=
2048
mock_model_config
.
max_model_len
=
2048
serving_engine
=
OpenAIServing
(
mock_engine_client
,
serving_engine
=
OpenAIServing
(
mock_engine_client
,
mock_model_config
,
mock_model_config
,
served_model_names
=
[
MODEL_
NAME
]
,
BASE_
MODEL_
PATHS
,
lora_modules
=
None
,
lora_modules
=
None
,
prompt_adapters
=
None
,
prompt_adapters
=
None
,
request_logger
=
None
)
request_logger
=
None
)
...
...
tests/entrypoints/openai/test_shutdown.py
View file @
539aa992
...
@@ -44,5 +44,5 @@ async def test_shutdown_on_engine_failure(tmp_path):
...
@@ -44,5 +44,5 @@ async def test_shutdown_on_engine_failure(tmp_path):
prompt
=
"Hello, my name is"
)
prompt
=
"Hello, my name is"
)
# Now the server should shut down
# Now the server should shut down
return_code
=
remote_server
.
proc
.
wait
(
timeout
=
3
)
return_code
=
remote_server
.
proc
.
wait
(
timeout
=
8
)
assert
return_code
is
not
None
assert
return_code
is
not
None
tests/kernels/test_activation.py
View file @
539aa992
...
@@ -7,6 +7,7 @@ from tests.kernels.utils import opcheck
...
@@ -7,6 +7,7 @@ from tests.kernels.utils import opcheck
from
vllm.model_executor.layers.activation
import
(
FastGELU
,
GeluAndMul
,
from
vllm.model_executor.layers.activation
import
(
FastGELU
,
GeluAndMul
,
NewGELU
,
QuickGELU
,
NewGELU
,
QuickGELU
,
SiluAndMul
)
SiluAndMul
)
from
vllm.utils
import
seed_everything
from
.allclose_default
import
get_default_atol
,
get_default_rtol
from
.allclose_default
import
get_default_atol
,
get_default_rtol
...
@@ -34,9 +35,7 @@ def test_act_and_mul(
...
@@ -34,9 +35,7 @@ def test_act_and_mul(
seed
:
int
,
seed
:
int
,
device
:
str
,
device
:
str
,
)
->
None
:
)
->
None
:
torch
.
random
.
manual_seed
(
seed
)
seed_everything
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
x
=
torch
.
randn
(
num_tokens
,
2
*
d
,
dtype
=
dtype
)
x
=
torch
.
randn
(
num_tokens
,
2
*
d
,
dtype
=
dtype
)
if
activation
==
"silu"
:
if
activation
==
"silu"
:
...
@@ -77,9 +76,7 @@ def test_activation(
...
@@ -77,9 +76,7 @@ def test_activation(
seed
:
int
,
seed
:
int
,
device
:
str
,
device
:
str
,
)
->
None
:
)
->
None
:
torch
.
random
.
manual_seed
(
seed
)
seed_everything
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
x
=
torch
.
randn
(
num_tokens
,
d
,
dtype
=
dtype
)
x
=
torch
.
randn
(
num_tokens
,
d
,
dtype
=
dtype
)
layer
=
activation
[
0
]()
layer
=
activation
[
0
]()
...
...
Prev
1
2
3
4
5
6
7
8
9
10
…
20
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment