Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
af7f4372
Commit
af7f4372
authored
Sep 03, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.5.5' into v0.5.5-dtk24.04.1
parents
5e19cdef
09c77926
Changes
448
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
695 additions
and
117 deletions
+695
-117
tests/core/block/test_block_manager_v2.py
tests/core/block/test_block_manager_v2.py
+62
-0
tests/core/block/test_naive_block.py
tests/core/block/test_naive_block.py
+42
-0
tests/core/block/test_prefix_caching_block.py
tests/core/block/test_prefix_caching_block.py
+80
-0
tests/core/test_scheduler.py
tests/core/test_scheduler.py
+4
-26
tests/core/test_scheduler_encoder_decoder.py
tests/core/test_scheduler_encoder_decoder.py
+99
-0
tests/core/test_serialization.py
tests/core/test_serialization.py
+33
-0
tests/core/utils.py
tests/core/utils.py
+69
-38
tests/distributed/test_basic_distributed_correctness.py
tests/distributed/test_basic_distributed_correctness.py
+2
-1
tests/distributed/test_basic_distributed_correctness_enc_dec.py
...distributed/test_basic_distributed_correctness_enc_dec.py
+102
-0
tests/distributed/test_chunked_prefill_distributed.py
tests/distributed/test_chunked_prefill_distributed.py
+7
-0
tests/distributed/test_comm_ops.py
tests/distributed/test_comm_ops.py
+9
-9
tests/distributed/test_custom_all_reduce.py
tests/distributed/test_custom_all_reduce.py
+4
-4
tests/distributed/test_distributed_oot.py
tests/distributed/test_distributed_oot.py
+6
-0
tests/distributed/test_multi_node_assignment.py
tests/distributed/test_multi_node_assignment.py
+64
-0
tests/distributed/test_multimodal_broadcast.py
tests/distributed/test_multimodal_broadcast.py
+4
-0
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+26
-39
tests/distributed/test_pp_cudagraph.py
tests/distributed/test_pp_cudagraph.py
+30
-0
tests/engine/test_arg_utils.py
tests/engine/test_arg_utils.py
+24
-0
tests/entrypoints/llm/test_generate.py
tests/entrypoints/llm/test_generate.py
+19
-0
tests/entrypoints/llm/test_prompt_validation.py
tests/entrypoints/llm/test_prompt_validation.py
+9
-0
No files found.
Too many changes to show.
To preserve performance only
448 of 448+
files are displayed.
Plain diff
Email patch
tests/core/block/test_block_manager_v2.py
View file @
af7f4372
...
...
@@ -311,6 +311,68 @@ def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots,
assert
before_gpu_blocks
==
after_gpu_blocks
+
len
(
cpu_blocks
)
@pytest.mark.parametrize("block_size", [8])
@pytest.mark.parametrize("num_gpu_blocks", [4])
@pytest.mark.parametrize("num_lookahead_slots", [3, 8, 10])
@pytest.mark.parametrize("enable_caching", [True, False])
def test_can_swap(block_size, num_gpu_blocks, num_lookahead_slots,
                  enable_caching):
    """Check that the block manager reports swap feasibility correctly.

    A first sequence group is allocated and swapped out to the CPU pool.
    ``can_swap_in`` is then probed twice: once right after the swap-out,
    and once more after a second prompt has been allocated on the GPU.
    """
    # The CPU pool is sized identically to the GPU pool.
    num_cpu_blocks = num_gpu_blocks
    manager = BlockSpaceManagerV2(block_size,
                                  num_cpu_blocks,
                                  num_gpu_blocks,
                                  watermark=0,
                                  enable_caching=enable_caching)

    first_prompt_len = (num_gpu_blocks - 1) * block_size - 1
    prompt, seq_group = create_dummy_prompt("1",
                                            prompt_length=first_prompt_len)
    prompt.status = SequenceStatus.WAITING
    manager.allocate(seq_group)
    prompt.status = SequenceStatus.RUNNING

    # Swap the sequence group out: GPU -> CPU.
    gpu_blocks = manager.get_block_table(prompt)
    assert manager.can_swap_out(seq_group)
    free_cpu_before = manager.get_num_free_cpu_blocks()
    free_gpu_before = manager.get_num_free_gpu_blocks()
    swap_mapping = manager.swap_out(seq_group)
    swapped_sources = [src for src, _ in swap_mapping]
    assert swapped_sources == gpu_blocks
    free_cpu_after = manager.get_num_free_cpu_blocks()
    free_gpu_after = manager.get_num_free_gpu_blocks()
    num_moved = len(gpu_blocks)
    # Every moved block consumes one CPU block and releases one GPU block.
    assert free_cpu_before == free_cpu_after + num_moved
    assert free_gpu_before + num_moved == free_gpu_after
    prompt.status = SequenceStatus.SWAPPED

    # The test expects NEVER whenever the lookahead exceeds one block,
    # regardless of how many blocks are currently free.
    lookahead_fits = num_lookahead_slots <= block_size

    # Right after the swap-out the GPU still has room to swap back in.
    expected_status = AllocStatus.OK if lookahead_fits else AllocStatus.NEVER
    assert manager.can_swap_in(seq_group,
                               num_lookahead_slots) == expected_status

    # Occupy GPU blocks with a second, unrelated prompt so that the first
    # prompt can no longer be swapped in immediately.
    prompt2_len = 2 * block_size - 1
    prompt2, seq_group2 = create_dummy_prompt(
        "2",
        prompt_length=prompt2_len,
        prompt_tokens=[10000 + i for i in range(prompt2_len)])
    prompt2.status = SequenceStatus.WAITING
    manager.allocate(seq_group2)

    # Probe the CPU -> GPU direction again: now it must wait (or never fit).
    expected_status = (AllocStatus.LATER
                       if lookahead_fits else AllocStatus.NEVER)
    assert manager.can_swap_in(seq_group,
                               num_lookahead_slots) == expected_status
# TODO(cade/kaiyang): add comprehensive tests for swapping at allocator level.
...
...
tests/core/block/test_naive_block.py
View file @
af7f4372
...
...
@@ -100,3 +100,45 @@ class TestNaiveBlockAllocator:
for
i
,
block
in
enumerate
(
blocks
):
assert
allocator
.
get_num_free_blocks
()
==
i
allocator
.
free
(
block
)
@staticmethod
@pytest.mark.parametrize("num_blocks", [4])
@pytest.mark.parametrize("block_size", [8])
def test_naive_block_get_num_blocks_touched(num_blocks, block_size):
    """Check ``get_num_blocks_touched`` of the naive allocator.

    Blocks are allocated from a source allocator and the destination
    allocator is asked how many blocks they would touch, for several
    lookahead-slot counts.
    """
    allocator_kwargs = dict(create_block=NaiveBlock,
                            num_blocks=num_blocks,
                            block_size=block_size)
    allocator_src = NaiveBlockAllocator(**allocator_kwargs)
    allocator_dst = NaiveBlockAllocator(**allocator_kwargs)

    # Fill all but one block of the source allocator with full
    # (immutable) blocks.
    allocate_full_block = TestNaiveBlockAllocator.create_allocate_lambda(
        "immutable",
        allocator_src,
        prev_block=None,
        token_ids=list(range(block_size)))
    src_blocks = [allocate_full_block() for _ in range(num_blocks - 1)]

    # Each full source block is expected to count as one touched block.
    assert allocator_dst.get_num_blocks_touched(
        src_blocks) == num_blocks - 1

    # Append one partially filled (mutable) block holding a single token.
    allocate_partial_block = TestNaiveBlockAllocator.create_allocate_lambda(
        "mutable", allocator_src, prev_block=src_blocks[-1], token_ids=[])
    src_blocks.append(allocate_partial_block())
    src_blocks[-1].append_token_ids([0])

    # (lookahead slots, expected touched blocks): a lookahead of a full
    # block size is expected to spill into one additional block.
    lookahead_expectations = (
        (1, num_blocks),
        (block_size - 1, num_blocks),
        (block_size, num_blocks + 1),
    )
    for lookahead, expected_touched in lookahead_expectations:
        assert allocator_dst.get_num_blocks_touched(
            src_blocks, num_lookahead_slots=lookahead) == expected_touched
tests/core/block/test_prefix_caching_block.py
View file @
af7f4372
...
...
@@ -315,6 +315,60 @@ class TestPrefixCachingBlockAllocator:
i
)
allocator
.
free
(
block
)
@staticmethod
@pytest.mark.parametrize("num_blocks", [4])
@pytest.mark.parametrize("block_size", [8])
def test_prefix_caching_block_get_num_blocks_touched(
        num_blocks, block_size):
    """Check ``get_num_blocks_touched`` with cached prefixes.

    The same token chain is built in a source and a destination
    allocator; the destination is then asked how many blocks the source
    chain would touch, with and without lookahead slots.
    """
    allocator_src = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                                block_size=block_size)
    allocator_dst = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                                block_size=block_size)

    # Enough tokens to fill every block except the last one.
    token_ids = list(range((num_blocks - 1) * block_size))

    # Build the chain in the destination first, then the identical chain
    # in the source.
    chain_kwargs = dict(block_size=block_size, token_ids=token_ids)
    dst_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
        allocator=allocator_dst, **chain_kwargs)
    blocks_to_swap_in = \
        TestPrefixCachingBlockAllocator.create_immutable_chain(
            allocator=allocator_src, **chain_kwargs)

    # The destination already holds every block of the chain.
    assert allocator_dst.get_num_blocks_touched(blocks_to_swap_in) == 0

    # Release the first destination block; swapping in is then expected
    # to have to reclaim exactly that one block.
    allocator_dst.free(dst_chain[0])
    assert allocator_dst.get_num_blocks_touched(blocks_to_swap_in) == 1

    # Extend the source chain with a partially filled block.
    partial_block = allocator_src.allocate_mutable_block(
        blocks_to_swap_in[-1])
    partial_block.append_token_ids([0])
    blocks_to_swap_in.append(partial_block)

    # (lookahead slots, expected touched blocks)
    lookahead_expectations = (
        (1, 2),
        (block_size - 1, 2),
        (block_size, 3),
    )
    for lookahead, expected_touched in lookahead_expectations:
        assert allocator_dst.get_num_blocks_touched(
            blocks_to_swap_in,
            num_lookahead_slots=lookahead) == expected_touched
@
staticmethod
@
pytest
.
mark
.
parametrize
(
"num_blocks"
,
[
1024
])
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
16
])
...
...
@@ -628,6 +682,32 @@ class TestPrefixCachingBlockAllocator:
assert
new_block
[
0
].
block_id
==
last_block_id
# Test case for cache metrics
@staticmethod
def test_metric():
    """Check the prefix-cache hit rate reported by the allocator.

    The same full block is allocated repeatedly; the first allocation is
    expected to be a miss and every later one a hit.
    """
    block_size = 16
    token_ids = list(range(block_size))
    allocator = PrefixCachingBlockAllocator(num_blocks=4,
                                            block_size=block_size)

    def allocate_same_block():
        # Always request the identical root block.
        allocator.allocate_immutable_block(prev_block=None,
                                           token_ids=token_ids)

    # No query yet (0/0).
    assert allocator.get_prefix_cache_hit_rate() == 0.0

    # First query: 0 hits out of 1.
    allocate_same_block()
    assert allocator.get_prefix_cache_hit_rate() == 0.0

    # Second query: 1 hit out of 2.
    allocate_same_block()
    assert allocator.get_prefix_cache_hit_rate() == 0.5

    # Many more identical queries push the rate towards 1.
    for _ in range(2, 1005):
        allocate_same_block()
    assert allocator.get_prefix_cache_hit_rate() > 0.99
@
staticmethod
def
create_immutable_chain
(
block_size
:
int
,
...
...
tests/core/test_scheduler.py
View file @
af7f4372
...
...
@@ -9,33 +9,11 @@ from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
from
vllm.core.interfaces
import
AllocStatus
from
vllm.core.scheduler
import
Scheduler
,
SchedulingBudget
from
vllm.lora.request
import
LoRARequest
from
vllm.sequence
import
Logprob
,
SequenceGroup
,
SequenceStatus
from
vllm.sequence
import
SequenceGroup
,
SequenceStatus
from
.utils
import
create_dummy_prompt
def
get_sequence_groups
(
scheduler_output
):
return
[
s
.
seq_group
for
s
in
scheduler_output
.
scheduled_seq_groups
]
def
append_new_token
(
out
,
token_id
:
int
):
seq_groups
=
get_sequence_groups
(
out
)
for
seq_group
in
seq_groups
:
for
seq
in
seq_group
.
get_seqs
():
seq
.
append_token_id
(
token_id
,
{
token_id
:
Logprob
(
token_id
)})
def
schedule_and_update_computed_tokens
(
scheduler
):
metas
,
out
=
scheduler
.
schedule
()
for
s
,
meta
in
zip
(
out
.
scheduled_seq_groups
,
metas
):
s
.
seq_group
.
update_num_computed_tokens
(
meta
.
token_chunk_size
)
return
metas
,
out
def
append_new_token_seq_group
(
token_chunk_size
,
seq_group
,
token_id
:
int
):
seq_group
.
update_num_computed_tokens
(
token_chunk_size
)
for
seq
in
seq_group
.
get_seqs
():
seq
.
append_token_id
(
token_id
,
{
token_id
:
Logprob
(
token_id
)})
from
.utils
import
(
append_new_token
,
append_new_token_seq_group
,
create_dummy_prompt
,
get_sequence_groups
,
schedule_and_update_computed_tokens
)
def
test_scheduler_add_seq_group
():
...
...
tests/core/test_scheduler_encoder_decoder.py
0 → 100644
View file @
af7f4372
from
typing
import
List
import
pytest
# noqa
from
vllm.config
import
CacheConfig
,
SchedulerConfig
from
vllm.core.scheduler
import
Scheduler
from
vllm.sequence
import
SequenceGroup
from
.utils
import
(
append_new_token
,
create_dummy_prompt_encoder_decoder
,
get_sequence_groups
,
schedule_and_update_computed_tokens
)
def test_scheduler_schedule_simple_encoder_decoder():
    '''
    Test basic scheduler functionality in the context
    of an encoder/decoder model. Focus on testing
    enc/dec-specific functionality since tests already
    exist for decoder-only functionality

    Test behavior:
    * Construct Scheduler
    * Construct dummy encoder/decoder sequence groups
    * Add dummy seq groups to scheduler backlog
    * Schedule the next seq group & validate:
        * Cross-attn block tables
        * Updated states of seq groups
        * Number of batched tokens
        * Number of blocks to copy/swap-in/swap-out
        * Number of scheduled seq groups
    * Repeat for both prefill- and decode-phase
    * Abort scheduled seq groups
    * Assert that aborted seq groups no longer appear in
      cross-attention block table
    '''
    block_size = 4
    num_seq_group = 4
    max_model_len = 16

    scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len)
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    # Room for the encoder and decoder prompts of every seq group.
    cache_config.num_cpu_blocks = 16
    cache_config.num_gpu_blocks = 16
    scheduler = Scheduler(scheduler_config, cache_config, None)

    # Queue one dummy encoder/decoder seq group per request id.
    running: List[SequenceGroup] = []
    req_id_list = []
    for index in range(num_seq_group):
        req_id = str(index)
        req_id_list.append(req_id)
        _, _, seq_group = create_dummy_prompt_encoder_decoder(
            req_id, block_size, block_size, block_size)
        scheduler.add_seq_group(seq_group)
        running.append(seq_group)

    # ---- Prefill phase ----
    expected_prefill_tokens = block_size * num_seq_group
    seq_group_meta_list, out = schedule_and_update_computed_tokens(scheduler)
    # Every request has a cross-attention block table registered.
    cross_tables = scheduler.block_manager.cross_block_tables
    assert all(req_id in cross_tables for req_id in req_id_list)
    # All queued seq groups were scheduled.
    assert set(get_sequence_groups(out)) == set(running)
    # One full prompt per seq group was batched.
    assert out.num_batched_tokens == expected_prefill_tokens
    # Nothing is left to copy or swap.
    assert not (out.blocks_to_copy or out.blocks_to_swap_in
                or out.blocks_to_swap_out)
    assert len(seq_group_meta_list) == num_seq_group
    append_new_token(out, 1)

    # ---- Decode phase ----
    seq_group_meta_list, out = schedule_and_update_computed_tokens(scheduler)
    # Metadata carries both encoder data and the cross-attention table.
    assert all(meta.encoder_seq_data is not None
               and meta.cross_block_table is not None
               for meta in seq_group_meta_list)
    assert set(get_sequence_groups(out)) == set(running)
    # Exactly one new token per seq group.
    assert out.num_batched_tokens == num_seq_group
    assert not (out.blocks_to_copy or out.blocks_to_swap_in
                or out.blocks_to_swap_out)
    assert len(seq_group_meta_list) == num_seq_group
    append_new_token(out, 1)

    # ---- Abort ----
    for req_id in req_id_list:
        scheduler.abort_seq_group(req_id)
        # The aborted request must no longer own a cross-attention
        # block table.
        assert req_id not in scheduler.block_manager.cross_block_tables
tests/core/test_serialization.py
0 → 100644
View file @
af7f4372
import
msgspec
from
vllm.executor.msgspec_utils
import
decode_hook
,
encode_hook
from
vllm.sequence
import
ExecuteModelRequest
from
..spec_decode.utils
import
create_batch
def test_msgspec_serialization():
    """Round-trip an ExecuteModelRequest through msgspec's msgpack codec.

    The request is encoded with ``encode_hook`` and decoded with
    ``decode_hook``; selected fields of the first sequence-group metadata
    entry must survive unchanged.
    """
    num_lookahead_slots = 4
    seq_group_metadata_list, _, _ = create_batch(16, num_lookahead_slots)
    original_req = ExecuteModelRequest(
        seq_group_metadata_list=seq_group_metadata_list,
        num_lookahead_slots=num_lookahead_slots,
        running_queue_size=4)

    encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
    decoder = msgspec.msgpack.Decoder(ExecuteModelRequest,
                                      dec_hook=decode_hook)
    payload = encoder.encode(original_req)
    decoded_req = decoder.decode(payload)

    expected_list = original_req.seq_group_metadata_list
    actual_list = decoded_req.seq_group_metadata_list
    assert len(expected_list) == len(actual_list)

    # Only the first entry is compared field by field.
    expected = expected_list[0]
    actual = actual_list[0]
    assert expected.block_tables == actual.block_tables
    assert expected.is_prompt == actual.is_prompt
    assert expected.request_id == actual.request_id

    expected_seq = expected.seq_data[0]
    actual_seq = actual.seq_data[0]
    assert expected_seq.prompt_token_ids == actual_seq.prompt_token_ids
    assert expected_seq.output_token_ids == actual_seq.output_token_ids
tests/core/utils.py
View file @
af7f4372
...
...
@@ -15,13 +15,15 @@ def create_dummy_prompt(
lora_request
:
Optional
[
LoRARequest
]
=
None
,
use_beam_search
:
bool
=
False
,
best_of
:
int
=
1
,
prompt_tokens
:
Optional
[
List
[
int
]]
=
None
,
)
->
Tuple
[
Sequence
,
SequenceGroup
]:
if
not
block_size
:
block_size
=
prompt_length
# Create dummy prompt sequence with tokens 0...block_size-1
# and prompt "0 ... block_size".
prompt_tokens
=
list
(
range
(
prompt_length
))
if
prompt_tokens
is
None
:
# Create dummy prompt sequence with tokens 0...block_size-1
# and prompt "0 ... block_size".
prompt_tokens
=
list
(
range
(
prompt_length
))
prompt_str
=
" "
.
join
([
str
(
t
)
for
t
in
prompt_tokens
])
prompt
=
Sequence
(
int
(
request_id
),
inputs
=
{
...
...
@@ -53,27 +55,30 @@ def create_dummy_prompt_encoder_decoder(
block_size
=
decoder_prompt_length
# Create dummy prompt sequence with tokens 0...block_size-1
# and prompt "0 ... block_size".
# and prompt "0 ... block_size". Note that the prompt string
# doesn't actually match the tokens
decoder_prompt_tokens
=
list
(
range
(
decoder_prompt_length
))
decoder_prompt_str
=
" "
.
join
([
str
(
t
)
for
t
in
decoder_prompt_tokens
])
encoder_prompt_tokens
=
list
(
reversed
(
list
(
range
(
encoder_prompt_length
))))
encoder_prompt_str
=
" "
.
join
([
str
(
t
)
for
t
in
encoder_prompt_tokens
])
inputs
=
{
"prompt"
:
decoder_prompt_str
,
"prompt_token_ids"
:
decoder_prompt_tokens
,
"encoder_prompt"
:
encoder_prompt_str
,
"encoder_prompt_token_ids"
:
encoder_prompt_tokens
,
"multi_modal_data"
:
None
,
}
decoder_prompt
=
Sequence
(
int
(
request_id
),
inputs
=
{
"prompt"
:
decoder_prompt_str
,
"prompt_token_ids"
:
decoder_prompt_tokens
,
"multi_modal_data"
:
None
,
},
block_size
=
block_size
)
inputs
=
inputs
,
block_size
=
block_size
,
from_decoder_prompt
=
True
)
encoder_prompt_tokens
=
list
(
reversed
(
list
(
range
(
encoder_prompt_length
))))
encoder_prompt_str
=
" "
.
join
([
str
(
t
)
for
t
in
encoder_prompt_tokens
])
encoder_prompt
=
Sequence
(
int
(
request_id
),
inputs
=
{
"prompt"
:
encoder_prompt_str
,
"prompt_token_ids"
:
encoder_prompt_tokens
,
"multi_modal_data"
:
None
,
},
block_size
=
block_size
)
inputs
=
inputs
,
block_size
=
block_size
,
from_decoder_prompt
=
False
)
seq_group
=
SequenceGroup
(
request_id
=
request_id
,
seqs
=
[
decoder_prompt
],
sampling_params
=
SamplingParams
(
...
...
@@ -139,17 +144,21 @@ def create_seq_group_encoder_decoder(
prompt_token_ids
=
[
0
]
*
seq_prompt_len
inputs
=
{
"prompt"
:
""
,
"prompt_token_ids"
:
prompt_token_ids
,
"encoder_prompt"
:
""
,
"encoder_prompt_token_ids"
:
prompt_token_ids
,
"multi_modal_data"
:
None
,
}
seqs
=
[]
for
seq_id_offset
,
output_len
in
enumerate
(
seq_output_lens
):
seq
=
Sequence
(
seq_id
=
seq_id_start
+
seq_id_offset
,
inputs
=
{
"prompt"
:
""
,
"prompt_token_ids"
:
prompt_token_ids
,
"multi_modal_data"
:
None
,
},
block_size
=
16
,
)
# Construct decoder input sequences
seq
=
Sequence
(
seq_id
=
seq_id_start
+
seq_id_offset
,
inputs
=
inputs
,
block_size
=
16
,
from_decoder_prompt
=
True
)
for
i
in
range
(
output_len
):
seq
.
append_token_id
(
...
...
@@ -158,16 +167,11 @@ def create_seq_group_encoder_decoder(
)
seqs
.
append
(
seq
)
# Encoder sequence
encoder_seq
=
Sequence
(
seq_id
=
seq_id_start
+
len
(
seq_output_lens
),
inputs
=
{
"prompt"
:
""
,
"prompt_token_ids"
:
prompt_token_ids
,
"multi_modal_data"
:
None
,
},
block_size
=
16
,
)
# Encoder input sequence
encoder_seq
=
Sequence
(
seq_id
=
seq_id_start
+
len
(
seq_output_lens
),
inputs
=
inputs
,
block_size
=
16
,
from_decoder_prompt
=
False
)
return
SequenceGroup
(
request_id
=
request_id
,
seqs
=
seqs
,
...
...
@@ -177,4 +181,31 @@ def create_seq_group_encoder_decoder(
def round_up_to_next_block(seq_len: int, block_size: int) -> int:
    """Return ``seq_len`` divided by ``block_size``, rounded up.

    The result is a block count, not a token count: for example 17 tokens
    with a block size of 16 yield 2.
    """
    # Adding block_size - 1 before floor division rounds the quotient up.
    padded_len = seq_len + block_size - 1
    num_blocks, _ = divmod(padded_len, block_size)
    return num_blocks
# Helper functions for scheduler tests
def get_sequence_groups(scheduler_output):
    """Collect the ``seq_group`` of every scheduled entry, in order."""
    seq_groups = []
    for scheduled in scheduler_output.scheduled_seq_groups:
        seq_groups.append(scheduled.seq_group)
    return seq_groups
def append_new_token(out, token_id: int):
    """Append ``token_id`` to every sequence of every scheduled seq group.

    Each sequence receives its own freshly built logprob mapping
    ``{token_id: Logprob(token_id)}``.
    """
    for group in get_sequence_groups(out):
        for sequence in group.get_seqs():
            logprobs = {token_id: Logprob(token_id)}
            sequence.append_token_id(token_id, logprobs)
def schedule_and_update_computed_tokens(scheduler):
    """Run one scheduling step and record the computed tokens.

    Calls ``scheduler.schedule()`` and, for each scheduled entry paired
    with its metadata, advances the seq group's computed-token count by
    that metadata's ``token_chunk_size``.

    Returns:
        The ``(metas, out)`` pair produced by ``scheduler.schedule()``.
    """
    metas, out = scheduler.schedule()
    scheduled_with_meta = zip(out.scheduled_seq_groups, metas)
    for scheduled, meta in scheduled_with_meta:
        chunk_size = meta.token_chunk_size
        scheduled.seq_group.update_num_computed_tokens(chunk_size)
    return metas, out
def append_new_token_seq_group(token_chunk_size, seq_group, token_id: int):
    """Advance one seq group and append ``token_id`` to its sequences.

    The group's computed-token count is first advanced by
    ``token_chunk_size``; each sequence then receives ``token_id`` with
    its own ``{token_id: Logprob(token_id)}`` mapping.
    """
    seq_group.update_num_computed_tokens(token_chunk_size)
    for sequence in seq_group.get_seqs():
        logprobs = {token_id: Logprob(token_id)}
        sequence.append_token_id(token_id, logprobs)
tests/distributed/test_basic_distributed_correctness.py
View file @
af7f4372
...
...
@@ -22,7 +22,8 @@ TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
@
pytest
.
mark
.
skipif
(
cuda_device_count_stateless
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
)
@
pytest
.
mark
.
parametrize
(
"model, distributed_executor_backend, attention_backend, test_suite"
,
[
"model, distributed_executor_backend, attention_backend, "
"test_suite"
,
[
(
"facebook/opt-125m"
,
"ray"
,
""
,
"L4"
),
(
"facebook/opt-125m"
,
"mp"
,
""
,
"L4"
),
(
"meta-llama/Llama-2-7b-hf"
,
"ray"
,
""
,
"L4"
),
...
...
tests/distributed/test_basic_distributed_correctness_enc_dec.py
0 → 100644
View file @
af7f4372
"""For encoder/decoder models only:
Compare the outputs of HF and distributed vLLM when using greedy sampling.
Run:
```sh
cd $VLLM_PATH/tests
pytest distributed/test_basic_distributed_correctness_enc_dec.py
```
"""
import
pytest
from
transformers
import
AutoModelForSeq2SeqLM
from
vllm.utils
import
cuda_device_count_stateless
from
..conftest
import
DecoderPromptType
from
..models.utils
import
check_logprobs_close
from
..utils
import
fork_new_process_for_each_test
@pytest.mark.skipif(cuda_device_count_stateless() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("model, distributed_executor_backend", [
    ("facebook/bart-large-cnn", "ray"),
    ("facebook/bart-large-cnn", "mp"),
])
@fork_new_process_for_each_test
def test_models(
    model: str,
    distributed_executor_backend: str,
    hf_runner,
    vllm_runner,
    example_encoder_decoder_prompts,
) -> None:
    '''
    Run vLLM BART inference with tensor parallelism across two GPUs and
    compare the greedy logprobs against a HuggingFace baseline.

    Each test is forked into a new process so that successive tests in
    the same process do not re-initialize CUDA.

    Arguments:

    * model: the HF ID of the specific BART variant under test
    * distributed_executor_backend
    * hf_runner: HuggingFace (HF) test model runner
    * vllm_runner: vLLM test model runner
    * example_encoder_decoder_prompts: test fixture which provides a
                                        dictionary of dummy prompts
    '''
    dtype = "float"
    max_tokens = 64
    num_logprobs = 5

    # Prompts whose encoder and decoder parts are both non-trivial
    # (i.e. not None/empty).
    test_prompts = example_encoder_decoder_prompts[DecoderPromptType.CUSTOM]

    # Generation settings for the HF baseline.
    hf_kwargs = dict(
        top_k=None,
        num_beams=1,
        repetition_penalty=1.0,
        top_p=1.0,
        length_penalty=1.0,
        early_stopping=False,
        no_repeat_ngram_size=None,
        min_length=0,
    )

    # NOTE: order matters. vLLM must run first, in a process where CUDA
    # has not been initialized yet; running HF first would initialize
    # CUDA and hurt the multiprocessing backend with the fork method
    # (the default method).
    vllm_context = vllm_runner(
        model,
        dtype=dtype,
        tensor_parallel_size=2,
        distributed_executor_backend=distributed_executor_backend,
        enforce_eager=True,
    )
    with vllm_context as vllm_model:
        vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
            test_prompts, max_tokens, num_logprobs)

    hf_context = hf_runner(model, dtype=dtype, auto_cls=AutoModelForSeq2SeqLM)
    with hf_context as hf_model:
        hf_outputs = hf_model.generate_encoder_decoder_greedy_logprobs_limit(
            test_prompts,
            max_tokens,
            num_logprobs,
            **hf_kwargs,
        )

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
tests/distributed/test_chunked_prefill_distributed.py
View file @
af7f4372
...
...
@@ -6,6 +6,8 @@ pytest test_chunked_prefill_distributed.py
```
"""
import
os
import
pytest
from
vllm.utils
import
cuda_device_count_stateless
...
...
@@ -30,6 +32,11 @@ def test_models(
model
:
str
,
distributed_executor_backend
:
str
,
)
->
None
:
if
model
==
"meta-llama/Llama-2-7b-hf"
and
distributed_executor_backend
==
"ray"
:
# noqa
assert
distributed_executor_backend
==
"ray"
# test ray adag
os
.
environ
[
'VLLM_USE_RAY_SPMD_WORKER'
]
=
"1"
os
.
environ
[
'VLLM_USE_RAY_COMPILED_DAG'
]
=
"1"
dtype
=
"half"
max_tokens
=
5
...
...
tests/distributed/test_comm_ops.py
View file @
af7f4372
...
...
@@ -34,7 +34,7 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
expected
=
torch
.
sum
(
torch
.
stack
(
all_tensors
,
dim
=
0
),
dim
=
0
)
t
=
all_tensors
[
rank
%
tp_size
]
t
=
tensor_model_parallel_all_reduce
(
t
)
assert
torch
.
all
close
(
t
,
expected
)
torch
.
testing
.
assert_
close
(
t
,
expected
)
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
...
...
@@ -62,7 +62,7 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
expected
=
torch
.
cat
(
all_tensors
,
dim
=
all_gather_dimension
)
t
=
all_tensors
[
rank
%
tp_size
]
t
=
tensor_model_parallel_all_gather
(
t
,
all_gather_dimension
)
assert
torch
.
all
close
(
t
,
expected
)
torch
.
testing
.
assert_
close
(
t
,
expected
)
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
...
...
@@ -96,12 +96,12 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
else
:
recv_dict
=
broadcast_tensor_dict
(
src
=
0
)
assert
len
(
recv_dict
)
==
len
(
test_dict
)
assert
torch
.
all
close
(
recv_dict
[
"a"
],
test_dict
[
"a"
])
assert
torch
.
all
close
(
recv_dict
[
"b"
],
test_dict
[
"b"
])
torch
.
testing
.
assert_
close
(
recv_dict
[
"a"
],
test_dict
[
"a"
])
torch
.
testing
.
assert_
close
(
recv_dict
[
"b"
],
test_dict
[
"b"
])
assert
recv_dict
[
"c"
]
==
test_dict
[
"c"
]
assert
recv_dict
[
"d"
]
==
test_dict
[
"d"
]
assert
recv_dict
[
"e"
]
==
test_dict
[
"e"
]
assert
torch
.
all
close
(
recv_dict
[
"f"
],
test_dict
[
"f"
])
torch
.
testing
.
assert_
close
(
recv_dict
[
"f"
],
test_dict
[
"f"
])
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
...
...
@@ -136,12 +136,12 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
if
not
get_pp_group
().
is_first_rank
:
assert
len
(
recv_dict
)
==
len
(
test_dict
)
assert
torch
.
all
close
(
recv_dict
[
"a"
],
test_dict
[
"a"
])
assert
torch
.
all
close
(
recv_dict
[
"b"
],
test_dict
[
"b"
])
torch
.
testing
.
assert_
close
(
recv_dict
[
"a"
],
test_dict
[
"a"
])
torch
.
testing
.
assert_
close
(
recv_dict
[
"b"
],
test_dict
[
"b"
])
assert
recv_dict
[
"c"
]
==
test_dict
[
"c"
]
assert
recv_dict
[
"d"
]
==
test_dict
[
"d"
]
assert
recv_dict
[
"e"
]
==
test_dict
[
"e"
]
assert
torch
.
all
close
(
recv_dict
[
"f"
],
test_dict
[
"f"
])
torch
.
testing
.
assert_
close
(
recv_dict
[
"f"
],
test_dict
[
"f"
])
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
...
...
@@ -163,7 +163,7 @@ def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
get_pp_group
().
send
(
test_tensor
)
if
not
get_pp_group
().
is_first_rank
:
assert
torch
.
all
close
(
test_tensor
,
recv_tensor
)
torch
.
testing
.
assert_
close
(
test_tensor
,
recv_tensor
)
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
...
...
tests/distributed/test_custom_all_reduce.py
View file @
af7f4372
...
...
@@ -72,8 +72,8 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
out2
=
tensor_model_parallel_all_reduce
(
inp2
)
dist
.
all_reduce
(
inp2
,
group
=
group
)
graph
.
replay
()
assert
torch
.
all
close
(
out1
,
inp1
)
assert
torch
.
all
close
(
out2
,
inp2
)
torch
.
testing
.
assert_
close
(
out1
,
inp1
)
torch
.
testing
.
assert_
close
(
out2
,
inp2
)
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
...
...
@@ -96,13 +96,13 @@ def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
out
=
inp
for
_
in
range
(
num_communication
):
out
=
fa
.
all_reduce_unreg
(
out
)
assert
torch
.
all
close
(
out
,
inp
*
(
tp_size
**
num_communication
))
torch
.
testing
.
assert_
close
(
out
,
inp
*
(
tp_size
**
num_communication
))
inp
=
torch
.
ones
(
sz
*
4
,
dtype
=
torch
.
bfloat16
,
device
=
device
)
out
=
inp
for
_
in
range
(
num_communication
):
out
=
fa
.
all_reduce_unreg
(
out
)
assert
torch
.
all
close
(
out
,
inp
*
(
tp_size
**
num_communication
))
torch
.
testing
.
assert_
close
(
out
,
inp
*
(
tp_size
**
num_communication
))
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
2
])
...
...
tests/distributed/test_distributed_oot.py
0 → 100644
View file @
af7f4372
from
..entrypoints.openai.test_oot_registration
import
(
run_and_test_dummy_opt_api_server
)
def test_distributed_oot(dummy_opt_path: str):
    """Run the shared dummy-OPT API-server check with ``tp=2``."""
    tp_size = 2
    run_and_test_dummy_opt_api_server(dummy_opt_path, tp=tp_size)
tests/distributed/test_multi_node_assignment.py
0 → 100644
View file @
af7f4372
"""Make sure ray assigns GPU workers to the correct node.
Run:
```sh
cd $VLLM_PATH/tests
pytest distributed/test_multi_node_assignment.py
```
"""
import
os
import
pytest
import
ray
from
ray.util.scheduling_strategies
import
PlacementGroupSchedulingStrategy
from
vllm
import
initialize_ray_cluster
from
vllm.config
import
ParallelConfig
from
vllm.executor.ray_utils
import
_wait_until_pg_removed
from
vllm.utils
import
get_ip
VLLM_MULTI_NODE
=
os
.
getenv
(
"VLLM_MULTI_NODE"
,
"0"
)
==
"1"
@pytest.mark.skipif(not VLLM_MULTI_NODE,
                    reason="Need at least 2 nodes to run the test.")
def test_multi_node_assignment() -> None:
    """Check that GPU workers are placed on the same node as the driver.

    Repeats the following cycle ten times: initialize the Ray cluster for
    a fresh ``ParallelConfig``, start one actor per GPU bundle of the
    resulting placement group, assert that each actor reports the same IP
    as this process, then kill the actors and wait for the placement
    group to be removed.
    """

    # NOTE: important to keep this class definition here
    # to let ray use cloudpickle to serialize it.
    class Actor:

        # Report the IP of the node this actor runs on.
        def get_ip(self):
            return get_ip()

    for _ in range(10):
        # NOTE(review): positional arguments are presumably
        # (pipeline_parallel_size, tensor_parallel_size) - confirm
        # against the ParallelConfig signature.
        config = ParallelConfig(1, 2)
        initialize_ray_cluster(config)

        # IP of the process running this test.
        current_ip = get_ip()
        workers = []
        for bundle_id, bundle in enumerate(
                config.placement_group.bundle_specs):
            # Skip bundles that do not reserve a GPU.
            if not bundle.get("GPU", 0):
                continue
            # Pin the actor to this specific bundle of the placement group.
            scheduling_strategy = PlacementGroupSchedulingStrategy(
                placement_group=config.placement_group,
                placement_group_capture_child_tasks=True,
                placement_group_bundle_index=bundle_id,
            )

            worker = ray.remote(
                num_cpus=0,
                num_gpus=1,
                scheduling_strategy=scheduling_strategy,
            )(Actor).remote()
            worker_ip = ray.get(worker.get_ip.remote())
            # Every GPU worker must run on the driver's node.
            assert worker_ip == current_ip
            workers.append(worker)

        # Tear the actors down before the next iteration.
        for worker in workers:
            ray.kill(worker)

        # Wait for the placement group to be removed before the next
        # iteration initializes the cluster again.
        _wait_until_pg_removed(config.placement_group)
tests/distributed/test_multimodal_broadcast.py
View file @
af7f4372
...
...
@@ -18,8 +18,10 @@ from ..utils import fork_new_process_for_each_test
@
pytest
.
mark
.
parametrize
(
"model, distributed_executor_backend"
,
[
(
"llava-hf/llava-1.5-7b-hf"
,
"ray"
),
(
"llava-hf/llava-v1.6-mistral-7b-hf"
,
"ray"
),
(
"facebook/chameleon-7b"
,
"ray"
),
(
"llava-hf/llava-1.5-7b-hf"
,
"mp"
),
(
"llava-hf/llava-v1.6-mistral-7b-hf"
,
"mp"
),
(
"facebook/chameleon-7b"
,
"mp"
),
])
@
fork_new_process_for_each_test
def
test_models
(
hf_runner
,
vllm_runner
,
image_assets
,
model
:
str
,
...
...
@@ -34,6 +36,8 @@ def test_models(hf_runner, vllm_runner, image_assets, model: str,
from
..models.test_llava
import
models
,
run_test
elif
model
.
startswith
(
"llava-hf/llava-v1.6"
):
from
..models.test_llava_next
import
models
,
run_test
elif
model
.
startswith
(
"facebook/chameleon"
):
from
..models.test_chameleon
import
models
,
run_test
else
:
raise
NotImplementedError
(
f
"Unsupported model:
{
model
}
"
)
...
...
tests/distributed/test_pipeline_parallel.py
View file @
af7f4372
...
...
@@ -9,34 +9,36 @@ import os
import
pytest
from
vllm.logger
import
init_logger
from
..utils
import
compare_two_settings
,
fork_new_process_for_each_test
logger
=
init_logger
(
"test_pipeline_parallel"
)
VLLM_MULTI_NODE
=
os
.
getenv
(
"VLLM_MULTI_NODE"
,
"0"
)
==
"1"
@
pytest
.
mark
.
parametrize
((
"TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, "
"MODEL_NAME, DIST_BACKEND"
),
[
(
2
,
2
,
0
,
1
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
(
2
,
2
,
1
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
(
1
,
3
,
0
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
(
1
,
4
,
0
,
1
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
(
1
,
4
,
1
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
(
2
,
2
,
0
,
1
,
"meta-llama/Meta-Llama-3-8B"
,
"mp"
),
(
2
,
2
,
1
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"mp"
),
(
1
,
3
,
0
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"mp"
),
(
1
,
4
,
0
,
1
,
"meta-llama/Meta-Llama-3-8B"
,
"mp"
),
(
1
,
4
,
1
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"mp"
),
(
1
,
3
,
0
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
(
1
,
4
,
0
,
1
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
(
1
,
4
,
1
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
(
2
,
2
,
1
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
(
2
,
2
,
0
,
1
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
])
@
fork_new_process_for_each_test
def
test_compare_tp
(
TP_SIZE
,
PP_SIZE
,
EAGER_MODE
,
CHUNKED_PREFILL
,
MODEL_NAME
,
DIST_BACKEND
):
if
VLLM_MULTI_NODE
and
DIST_BACKEND
==
"mp"
:
pytest
.
skip
(
"Skipping multi-node pipeline parallel test for "
"multiprocessing distributed backend"
)
USE_RAY_ADAG_NCCL
=
0
USE_RAY_ADAG
=
0
pp_args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
...
...
@@ -70,39 +72,24 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
pp_args
.
append
(
"--enforce-eager"
)
tp_args
.
append
(
"--enforce-eager"
)
pp_env
=
None
if
USE_RAY_ADAG
:
assert
DIST_BACKEND
==
"ray"
,
(
"
Ray ADAG
is only supported with Ray distributed backend"
)
if
(
DIST_BACKEND
==
"ray"
and
TP_SIZE
==
2
and
PP_SIZE
==
2
and
CHUNKED_PREFILL
):
# Test
Ray ADAG
for a subset of the tests
pp_env
=
{
"VLLM_USE_RAY_COMPILED_DAG"
:
"1"
,
"VLLM_USE_RAY_SPMD_WORKER"
:
"1"
,
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL"
:
str
(
int
(
USE_RAY_ADAG_NCCL
)),
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL"
:
"1"
,
}
# Temporary. Currently when zeromq + SPMD is used, it does not properly
# terminate because of aDAG issue.
pp_args
.
append
(
"--disable-frontend-multiprocessing"
)
tp_args
.
append
(
"--disable-frontend-multiprocessing"
)
compare_two_settings
(
MODEL_NAME
,
pp_args
,
tp_args
,
pp_env
)
@
pytest
.
mark
.
parametrize
(
"PP_SIZE, MODEL_NAME"
,
[
(
2
,
"JackFram/llama-160m"
),
])
@
pytest
.
mark
.
parametrize
(
"ATTN_BACKEND"
,
[
"FLASH_ATTN"
,
"FLASHINFER"
,
])
@
fork_new_process_for_each_test
def
test_pp_cudagraph
(
PP_SIZE
,
MODEL_NAME
,
ATTN_BACKEND
):
cudagraph_args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"float16"
,
"--pipeline-parallel-size"
,
str
(
PP_SIZE
),
"--distributed-executor-backend"
,
"mp"
,
]
os
.
environ
[
"VLLM_ATTENTION_BACKEND"
]
=
ATTN_BACKEND
eager_args
=
cudagraph_args
+
[
"--enforce-eager"
]
compare_two_settings
(
MODEL_NAME
,
eager_args
,
cudagraph_args
)
try
:
compare_two_settings
(
MODEL_NAME
,
pp_args
,
tp_args
,
pp_env
)
except
Exception
:
if
pp_env
is
None
:
raise
else
:
# Ray ADAG tests are flaky, so we don't want to fail the test
logger
.
exception
(
"Ray ADAG tests failed"
)
tests/distributed/test_pp_cudagraph.py
0 → 100644
View file @
af7f4372
import
os
import
pytest
from
..utils
import
compare_two_settings
,
fork_new_process_for_each_test
@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
    (2, "JackFram/llama-160m"),
])
@pytest.mark.parametrize("ATTN_BACKEND", [
    "FLASH_ATTN",
    "FLASHINFER",
])
@fork_new_process_for_each_test
def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
    """Run ``compare_two_settings`` with and without ``--enforce-eager``.

    Both argument lists request float16, ``PP_SIZE`` pipeline stages and the
    "mp" distributed executor backend; the eager list additionally carries
    ``--enforce-eager``. The attention backend under test is selected through
    the ``VLLM_ATTENTION_BACKEND`` environment variable.
    """
    # Select the attention backend for this parametrized run.
    os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND

    pp_size_flag = str(PP_SIZE)
    cudagraph_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--pipeline-parallel-size",
        pp_size_flag,
        "--distributed-executor-backend",
        "mp",
    ]
    # Same flags as above, plus the switch that forces eager execution.
    eager_args = [*cudagraph_args, "--enforce-eager"]

    compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
tests/engine/test_arg_utils.py
0 → 100644
View file @
af7f4372
import
pytest
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.utils
import
FlexibleArgumentParser
@pytest.mark.parametrize(("arg", "expected"), [
    (None, None),
    ("image=16", {
        "image": 16
    }),
    ("image=16,video=2", {
        "image": 16,
        "video": 2
    }),
])
def test_limit_mm_per_prompt_parser(arg, expected):
    """Check how ``--limit-mm-per-prompt`` is parsed by the engine CLI.

    When ``arg`` is None the flag is omitted from the command line entirely;
    otherwise it is passed as the flag's value. In both cases the parsed
    ``limit_mm_per_prompt`` attribute must equal ``expected``.
    """
    cli_parser = EngineArgs.add_cli_args(FlexibleArgumentParser())

    # Leave the flag out of argv when no value is given.
    argv = [] if arg is None else ["--limit-mm-per-prompt", arg]
    parsed = cli_parser.parse_args(argv)

    assert parsed.limit_mm_per_prompt == expected
tests/entrypoints/llm/test_generate.py
View file @
af7f4372
...
...
@@ -140,3 +140,22 @@ def test_multiple_sampling_params(llm: LLM):
# sampling_params is None, default params should be applied
outputs
=
llm
.
generate
(
PROMPTS
,
sampling_params
=
None
)
assert
len
(
PROMPTS
)
==
len
(
outputs
)
def test_chat():
    """Send one system + user conversation to ``LLM.chat``.

    The call is expected to return exactly one output for the single
    conversation passed in.
    """
    llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")

    prompt1 = "Explain the concept of entropy."
    system_turn = {"role": "system", "content": "You are a helpful assistant"}
    user_turn = {"role": "user", "content": prompt1}

    outputs = llm.chat([system_turn, user_turn])
    assert len(outputs) == 1
tests/entrypoints/llm/test_prompt_validation.py
0 → 100644
View file @
af7f4372
import
pytest
from
vllm
import
LLM
def test_empty_prompt():
    """Generating from an empty prompt string must raise ``ValueError``.

    The error message is expected to match 'Prompt cannot be empty'.
    """
    engine = LLM(model="gpt2")
    empty_prompts = [""]

    with pytest.raises(ValueError, match='Prompt cannot be empty'):
        engine.generate(empty_prompts)
Prev
1
…
4
5
6
7
8
9
10
11
12
…
23
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment