Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ad385667
Commit
ad385667
authored
Oct 23, 2024
by
zhuwenwen
Browse files
Merge branch 'v0.6.3.post1-dev'
parents
be0967c1
903593d3
Changes
364
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
958 additions
and
338 deletions
+958
-338
tests/core/test_scheduler_encoder_decoder.py
tests/core/test_scheduler_encoder_decoder.py
+99
-0
tests/core/test_serialization.py
tests/core/test_serialization.py
+33
-0
tests/core/utils.py
tests/core/utils.py
+75
-45
tests/data/test_config.yaml
tests/data/test_config.yaml
+3
-0
tests/distributed/test_basic_distributed_correctness.py
tests/distributed/test_basic_distributed_correctness.py
+0
-79
tests/distributed/test_chunked_prefill_distributed.py
tests/distributed/test_chunked_prefill_distributed.py
+0
-68
tests/distributed/test_comm_ops.py
tests/distributed/test_comm_ops.py
+9
-9
tests/distributed/test_custom_all_reduce.py
tests/distributed/test_custom_all_reduce.py
+4
-4
tests/distributed/test_distributed_oot.py
tests/distributed/test_distributed_oot.py
+6
-0
tests/distributed/test_multi_node_assignment.py
tests/distributed/test_multi_node_assignment.py
+64
-0
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+334
-65
tests/distributed/test_pp_cudagraph.py
tests/distributed/test_pp_cudagraph.py
+30
-0
tests/distributed/test_same_node.py
tests/distributed/test_same_node.py
+7
-7
tests/encoder_decoder/__init__.py
tests/encoder_decoder/__init__.py
+0
-0
tests/encoder_decoder/test_e2e_correctness.py
tests/encoder_decoder/test_e2e_correctness.py
+98
-0
tests/engine/test_arg_utils.py
tests/engine/test_arg_utils.py
+83
-0
tests/engine/test_custom_executor.py
tests/engine/test_custom_executor.py
+4
-4
tests/engine/test_multiproc_workers.py
tests/engine/test_multiproc_workers.py
+3
-3
tests/engine/test_skip_tokenizer_init.py
tests/engine/test_skip_tokenizer_init.py
+3
-2
tests/engine/test_stop_strings.py
tests/engine/test_stop_strings.py
+103
-52
No files found.
Too many changes to show.
To preserve performance only
364 of 364+
files are displayed.
Plain diff
Email patch
tests/core/test_scheduler_encoder_decoder.py
0 → 100644
View file @
ad385667
from
typing
import
List
import
pytest
# noqa
from
vllm.config
import
CacheConfig
,
SchedulerConfig
from
vllm.core.scheduler
import
Scheduler
from
vllm.sequence
import
SequenceGroup
from
.utils
import
(
append_new_token
,
create_dummy_prompt_encoder_decoder
,
get_sequence_groups
,
schedule_and_update_computed_tokens
)
def test_scheduler_schedule_simple_encoder_decoder():
    '''
    Exercise the scheduler end to end with encoder/decoder sequence groups.

    Only encoder/decoder-specific behavior is checked here, since the
    decoder-only paths are already covered by other tests.

    Steps:

    * Build a Scheduler and a set of dummy encoder/decoder sequence groups
    * Queue the sequence groups with the scheduler
    * Run one prefill step and one decode step, validating after each:
        * Cross-attention block tables / encoder metadata
        * The set of scheduled sequence groups
        * Number of batched tokens
        * That nothing has to be copied, swapped in or swapped out
        * Number of scheduled sequence groups
    * Abort every sequence group and check that its cross-attention
      block table is no longer registered with the block manager
    '''

    block_size = 4
    num_seq_group = 4
    max_model_len = 16

    scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len)
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    # enc and dec prompts per seq_group
    cache_config.num_cpu_blocks = 16
    cache_config.num_gpu_blocks = 16
    scheduler = Scheduler(scheduler_config, cache_config, None)

    # Queue one encoder/decoder sequence group per request id.
    expected_groups: List[SequenceGroup] = []
    request_ids = []
    for index in range(num_seq_group):
        request_id = str(index)
        request_ids.append(request_id)
        dummy = create_dummy_prompt_encoder_decoder(request_id, block_size,
                                                    block_size, block_size)
        # The helper returns a 3-tuple; only the last item is needed here.
        _, _, group = dummy
        scheduler.add_seq_group(group)
        expected_groups.append(group)

    # ---- Prefill step ----
    expected_prefill_tokens = block_size * num_seq_group
    metas, out = schedule_and_update_computed_tokens(scheduler)

    # Every request must have a cross-attention block table registered
    # with the block manager.
    assert all(request_id in scheduler.block_manager.cross_block_tables
               for request_id in request_ids)
    # All queued sequence groups were scheduled.
    assert set(get_sequence_groups(out)) == set(expected_groups)
    # The whole prompt of every group fits in this step.
    assert out.num_batched_tokens == expected_prefill_tokens
    # Nothing to copy or swap.
    assert not (out.blocks_to_copy or out.blocks_to_swap_in
                or out.blocks_to_swap_out)
    assert len(metas) == num_seq_group
    append_new_token(out, 1)

    # ---- Decode step ----
    metas, out = schedule_and_update_computed_tokens(scheduler)

    # Sequence group metadata must carry both the encoder sequence data
    # and the cross-attention block table.
    assert all(meta.encoder_seq_data is not None
               and meta.cross_block_table is not None for meta in metas)
    assert set(get_sequence_groups(out)) == set(expected_groups)
    # Exactly one token per sequence group is batched while decoding.
    assert out.num_batched_tokens == num_seq_group
    assert not (out.blocks_to_copy or out.blocks_to_swap_in
                or out.blocks_to_swap_out)
    assert len(metas) == num_seq_group
    append_new_token(out, 1)

    # ---- Abort ----
    for request_id in request_ids:
        scheduler.abort_seq_group(request_id)
        # The aborted request's cross-attention block table must be gone.
        assert request_id not in scheduler.block_manager.cross_block_tables
tests/core/test_serialization.py
0 → 100644
View file @
ad385667
import
msgspec
from
vllm.executor.msgspec_utils
import
decode_hook
,
encode_hook
from
vllm.sequence
import
ExecuteModelRequest
from
..spec_decode.utils
import
create_batch
def test_msgspec_serialization():
    """Round-trip an ExecuteModelRequest through msgpack and compare fields.

    The request is encoded with ``encode_hook`` and decoded with
    ``decode_hook``; the first sequence-group metadata entry of the decoded
    request must match the original on block tables, prompt flag, request id
    and the prompt/output token ids of sequence 0.
    """
    num_lookahead_slots = 4
    seq_group_metadata_list, _, _ = create_batch(16, num_lookahead_slots)
    original_req = ExecuteModelRequest(
        seq_group_metadata_list=seq_group_metadata_list,
        num_lookahead_slots=num_lookahead_slots,
        running_queue_size=4,
    )

    encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
    decoder = msgspec.msgpack.Decoder(ExecuteModelRequest,
                                      dec_hook=decode_hook)
    payload = encoder.encode(original_req)
    decoded_req = decoder.decode(payload)

    expected_list = original_req.seq_group_metadata_list
    actual_list = decoded_req.seq_group_metadata_list
    assert len(expected_list) == len(actual_list)

    # Only the first metadata entry is compared field by field.
    expected_meta = expected_list[0]
    actual_meta = actual_list[0]

    assert expected_meta.block_tables == actual_meta.block_tables
    assert expected_meta.is_prompt == actual_meta.is_prompt
    assert expected_meta.request_id == actual_meta.request_id

    expected_seq = expected_meta.seq_data[0]
    actual_seq = actual_meta.seq_data[0]
    assert expected_seq.prompt_token_ids == actual_seq.prompt_token_ids
    assert expected_seq.output_token_ids == actual_seq.output_token_ids
tests/core/utils.py
View file @
ad385667
...
...
@@ -13,15 +13,18 @@ def create_dummy_prompt(
prompt_length
:
int
,
block_size
:
Optional
[
int
]
=
None
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
use_beam_search
:
bool
=
False
,
best_of
:
int
=
1
,
prompt_tokens
:
Optional
[
List
[
int
]]
=
None
,
min_tokens
:
int
=
0
,
max_tokens
:
int
=
16
,
)
->
Tuple
[
Sequence
,
SequenceGroup
]:
if
not
block_size
:
block_size
=
prompt_length
# Create dummy prompt sequence with tokens 0...block_size-1
# and prompt "0 ... block_size".
prompt_tokens
=
list
(
range
(
prompt_length
))
if
prompt_tokens
is
None
:
# Create dummy prompt sequence with tokens 0...block_size-1
# and prompt "0 ... block_size".
prompt_tokens
=
list
(
range
(
prompt_length
))
prompt_str
=
" "
.
join
([
str
(
t
)
for
t
in
prompt_tokens
])
prompt
=
Sequence
(
int
(
request_id
),
inputs
=
{
...
...
@@ -33,8 +36,9 @@ def create_dummy_prompt(
seqs
=
[
prompt
],
arrival_time
=
time
.
time
(),
sampling_params
=
SamplingParams
(
use_beam_search
=
use_beam_search
,
best_of
=
best_of
),
best_of
=
best_of
,
max_tokens
=
max_tokens
,
min_tokens
=
min_tokens
),
lora_request
=
lora_request
)
return
prompt
,
seq_group
...
...
@@ -46,39 +50,39 @@ def create_dummy_prompt_encoder_decoder(
encoder_prompt_length
:
int
,
block_size
:
Optional
[
int
]
=
None
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
use_beam_search
:
bool
=
False
,
best_of
:
int
=
1
,
)
->
Tuple
[
Sequence
,
Sequence
,
SequenceGroup
]:
if
not
block_size
:
block_size
=
decoder_prompt_length
# Create dummy prompt sequence with tokens 0...block_size-1
# and prompt "0 ... block_size".
# and prompt "0 ... block_size". Note that the prompt string
# doesn't actually match the tokens
decoder_prompt_tokens
=
list
(
range
(
decoder_prompt_length
))
decoder_prompt_str
=
" "
.
join
([
str
(
t
)
for
t
in
decoder_prompt_tokens
])
encoder_prompt_tokens
=
list
(
reversed
(
list
(
range
(
encoder_prompt_length
))))
encoder_prompt_str
=
" "
.
join
([
str
(
t
)
for
t
in
encoder_prompt_tokens
])
inputs
=
{
"prompt"
:
decoder_prompt_str
,
"prompt_token_ids"
:
decoder_prompt_tokens
,
"encoder_prompt"
:
encoder_prompt_str
,
"encoder_prompt_token_ids"
:
encoder_prompt_tokens
,
"multi_modal_data"
:
None
,
}
decoder_prompt
=
Sequence
(
int
(
request_id
),
inputs
=
{
"prompt"
:
decoder_prompt_str
,
"prompt_token_ids"
:
decoder_prompt_tokens
,
"multi_modal_data"
:
None
,
},
block_size
=
block_size
)
inputs
=
inputs
,
block_size
=
block_size
,
from_decoder_prompt
=
True
)
encoder_prompt_tokens
=
list
(
reversed
(
list
(
range
(
encoder_prompt_length
))))
encoder_prompt_str
=
" "
.
join
([
str
(
t
)
for
t
in
encoder_prompt_tokens
])
encoder_prompt
=
Sequence
(
int
(
request_id
),
inputs
=
{
"prompt"
:
encoder_prompt_str
,
"prompt_token_ids"
:
encoder_prompt_tokens
,
"multi_modal_data"
:
None
,
},
block_size
=
block_size
)
inputs
=
inputs
,
block_size
=
block_size
,
from_decoder_prompt
=
False
)
seq_group
=
SequenceGroup
(
request_id
=
request_id
,
seqs
=
[
decoder_prompt
],
sampling_params
=
SamplingParams
(
use_beam_search
=
use_beam_search
,
best_of
=
best_of
),
sampling_params
=
SamplingParams
(
best_of
=
best_of
),
arrival_time
=
time
.
time
(),
lora_request
=
lora_request
,
encoder_seq
=
encoder_prompt
)
...
...
@@ -139,17 +143,21 @@ def create_seq_group_encoder_decoder(
prompt_token_ids
=
[
0
]
*
seq_prompt_len
inputs
=
{
"prompt"
:
""
,
"prompt_token_ids"
:
prompt_token_ids
,
"encoder_prompt"
:
""
,
"encoder_prompt_token_ids"
:
prompt_token_ids
,
"multi_modal_data"
:
None
,
}
seqs
=
[]
for
seq_id_offset
,
output_len
in
enumerate
(
seq_output_lens
):
seq
=
Sequence
(
seq_id
=
seq_id_start
+
seq_id_offset
,
inputs
=
{
"prompt"
:
""
,
"prompt_token_ids"
:
prompt_token_ids
,
"multi_modal_data"
:
None
,
},
block_size
=
16
,
)
# Construct decoder input sequences
seq
=
Sequence
(
seq_id
=
seq_id_start
+
seq_id_offset
,
inputs
=
inputs
,
block_size
=
16
,
from_decoder_prompt
=
True
)
for
i
in
range
(
output_len
):
seq
.
append_token_id
(
...
...
@@ -158,16 +166,11 @@ def create_seq_group_encoder_decoder(
)
seqs
.
append
(
seq
)
# Encoder sequence
encoder_seq
=
Sequence
(
seq_id
=
seq_id_start
+
len
(
seq_output_lens
),
inputs
=
{
"prompt"
:
""
,
"prompt_token_ids"
:
prompt_token_ids
,
"multi_modal_data"
:
None
,
},
block_size
=
16
,
)
# Encoder input sequence
encoder_seq
=
Sequence
(
seq_id
=
seq_id_start
+
len
(
seq_output_lens
),
inputs
=
inputs
,
block_size
=
16
,
from_decoder_prompt
=
False
)
return
SequenceGroup
(
request_id
=
request_id
,
seqs
=
seqs
,
...
...
@@ -177,4 +180,31 @@ def create_seq_group_encoder_decoder(
def round_up_to_next_block(seq_len: int, block_size: int) -> int:
    """Return ``seq_len`` divided by ``block_size``, rounded up.

    Computed with floor division on ``seq_len + block_size - 1``.
    """
    padded_len = seq_len + block_size - 1
    return padded_len // block_size
# Helper functions for scheduler tests
def get_sequence_groups(scheduler_output):
    """Collect the ``seq_group`` of every scheduled entry, in order.

    Reads ``scheduler_output.scheduled_seq_groups`` and returns a new list
    holding each entry's ``seq_group`` attribute.
    """
    groups = []
    for scheduled in scheduler_output.scheduled_seq_groups:
        groups.append(scheduled.seq_group)
    return groups
def append_new_token(out, token_id: int):
    """Append ``token_id`` to every sequence of every scheduled group.

    The groups come from ``get_sequence_groups(out)``. Each sequence gets
    the token together with a logprob mapping ``{token_id:
    Logprob(token_id)}`` built freshly for that sequence.
    """
    for group in get_sequence_groups(out):
        for sequence in group.get_seqs():
            logprobs = {token_id: Logprob(token_id)}
            sequence.append_token_id(token_id, logprobs)
def schedule_and_update_computed_tokens(scheduler):
    """Run one scheduling step and record the computed tokens.

    Calls ``scheduler.schedule()`` (which must return a 3-tuple), then, for
    each scheduled entry paired with its metadata, advances the sequence
    group's computed-token count by the metadata's ``token_chunk_size``.

    Returns:
        The ``(metas, out)`` pair, i.e. the first two items returned by
        ``scheduler.schedule()``.
    """
    seq_group_metas, scheduler_out, _ = scheduler.schedule()
    pairs = zip(scheduler_out.scheduled_seq_groups, seq_group_metas)
    for scheduled, seq_group_meta in pairs:
        chunk_size = seq_group_meta.token_chunk_size
        scheduled.seq_group.update_num_computed_tokens(chunk_size)
    return seq_group_metas, scheduler_out
def append_new_token_seq_group(token_chunk_size, seq_group, token_id: int):
    """Advance one sequence group and append a token to all its sequences.

    First bumps the group's computed-token count by ``token_chunk_size``,
    then appends ``token_id`` with a ``{token_id: Logprob(token_id)}``
    mapping to every sequence returned by ``seq_group.get_seqs()``.
    """
    seq_group.update_num_computed_tokens(token_chunk_size)
    for sequence in seq_group.get_seqs():
        logprobs = {token_id: Logprob(token_id)}
        sequence.append_token_id(token_id, logprobs)
tests/data/test_config.yaml
0 → 100644
View file @
ad385667
port
:
12312
served_model_name
:
mymodel
tensor_parallel_size
:
2
tests/distributed/test_basic_distributed_correctness.py
deleted
100644 → 0
View file @
be0967c1
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
Run:
```sh
cd $VLLM_PATH/tests
pytest distributed/test_basic_distributed_correctness.py
```
"""
import
os
import
pytest
from
vllm.utils
import
cuda_device_count_stateless
from
..models.utils
import
check_outputs_equal
from
..utils
import
fork_new_process_for_each_test
TARGET_TEST_SUITE
=
os
.
environ
.
get
(
"TARGET_TEST_SUITE"
,
"L4"
)
@pytest.mark.skipif(cuda_device_count_stateless() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize(
    "model, distributed_executor_backend, attention_backend, test_suite", [
        ("facebook/opt-125m", "ray", "", "L4"),
        ("facebook/opt-125m", "mp", "", "L4"),
        ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
        ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
        ("facebook/opt-125m", "ray", "", "A100"),
        ("facebook/opt-125m", "mp", "", "A100"),
        ("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
    ])
@fork_new_process_for_each_test
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    distributed_executor_backend: str,
    attention_backend: str,
    test_suite: str,
) -> None:
    """Compare greedy outputs of HF and vLLM running with 2-way TP.

    Parameter sets whose ``test_suite`` differs from ``TARGET_TEST_SUITE``
    are skipped. One specific combination additionally turns on the Ray
    SPMD worker / compiled DAG environment flags, and a non-empty
    ``attention_backend`` is exported as ``VLLM_ATTENTION_BACKEND``.
    """
    if test_suite != TARGET_TEST_SUITE:
        pytest.skip(f"Skip test for {test_suite}")

    current_case = (model, distributed_executor_backend, attention_backend,
                    test_suite)
    if current_case == ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"):
        # test ray adag
        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"

    if attention_backend:
        os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend

    dtype = "half"
    max_tokens = 5

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    vllm_context = vllm_runner(
        model,
        dtype=dtype,
        tensor_parallel_size=2,
        distributed_executor_backend=distributed_executor_backend,
    )
    with vllm_context as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
tests/distributed/test_chunked_prefill_distributed.py
deleted
100644 → 0
View file @
be0967c1
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
Run:
```sh
pytest test_chunked_prefill_distributed.py
```
"""
import
pytest
from
vllm.utils
import
cuda_device_count_stateless
from
..models.utils
import
check_outputs_equal
from
..utils
import
fork_new_process_for_each_test
@pytest.mark.skipif(cuda_device_count_stateless() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("model, distributed_executor_backend", [
    ("facebook/opt-125m", "ray"),
    ("meta-llama/Llama-2-7b-hf", "ray"),
    ("facebook/opt-125m", "mp"),
    ("meta-llama/Llama-2-7b-hf", "mp"),
])
@fork_new_process_for_each_test
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    distributed_executor_backend: str,
) -> None:
    """Compare greedy outputs of HF and 2-way TP vLLM with chunked prefill.

    vLLM is run with chunked prefill enabled and a 16-token batch budget;
    its greedy generations must equal the HF runner's.
    """
    dtype = "half"
    max_tokens = 5

    # Add a chunked prefill config.
    chunked_prefill_token_size = 16
    max_num_seqs = min(chunked_prefill_token_size, 256)
    assert chunked_prefill_token_size != -1
    enable_chunked_prefill = True
    max_num_batched_tokens = chunked_prefill_token_size

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    vllm_context = vllm_runner(
        model,
        dtype=dtype,
        tensor_parallel_size=2,
        max_num_seqs=max_num_seqs,
        enable_chunked_prefill=enable_chunked_prefill,
        max_num_batched_tokens=max_num_batched_tokens,
        distributed_executor_backend=distributed_executor_backend,
    )
    with vllm_context as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
tests/distributed/test_comm_ops.py
View file @
ad385667
...
...
@@ -34,7 +34,7 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
expected
=
torch
.
sum
(
torch
.
stack
(
all_tensors
,
dim
=
0
),
dim
=
0
)
t
=
all_tensors
[
rank
%
tp_size
]
t
=
tensor_model_parallel_all_reduce
(
t
)
assert
torch
.
all
close
(
t
,
expected
)
torch
.
testing
.
assert_
close
(
t
,
expected
)
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
...
...
@@ -62,7 +62,7 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
expected
=
torch
.
cat
(
all_tensors
,
dim
=
all_gather_dimension
)
t
=
all_tensors
[
rank
%
tp_size
]
t
=
tensor_model_parallel_all_gather
(
t
,
all_gather_dimension
)
assert
torch
.
all
close
(
t
,
expected
)
torch
.
testing
.
assert_
close
(
t
,
expected
)
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
...
...
@@ -96,12 +96,12 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
else
:
recv_dict
=
broadcast_tensor_dict
(
src
=
0
)
assert
len
(
recv_dict
)
==
len
(
test_dict
)
assert
torch
.
all
close
(
recv_dict
[
"a"
],
test_dict
[
"a"
])
assert
torch
.
all
close
(
recv_dict
[
"b"
],
test_dict
[
"b"
])
torch
.
testing
.
assert_
close
(
recv_dict
[
"a"
],
test_dict
[
"a"
])
torch
.
testing
.
assert_
close
(
recv_dict
[
"b"
],
test_dict
[
"b"
])
assert
recv_dict
[
"c"
]
==
test_dict
[
"c"
]
assert
recv_dict
[
"d"
]
==
test_dict
[
"d"
]
assert
recv_dict
[
"e"
]
==
test_dict
[
"e"
]
assert
torch
.
all
close
(
recv_dict
[
"f"
],
test_dict
[
"f"
])
torch
.
testing
.
assert_
close
(
recv_dict
[
"f"
],
test_dict
[
"f"
])
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
...
...
@@ -136,12 +136,12 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
if
not
get_pp_group
().
is_first_rank
:
assert
len
(
recv_dict
)
==
len
(
test_dict
)
assert
torch
.
all
close
(
recv_dict
[
"a"
],
test_dict
[
"a"
])
assert
torch
.
all
close
(
recv_dict
[
"b"
],
test_dict
[
"b"
])
torch
.
testing
.
assert_
close
(
recv_dict
[
"a"
],
test_dict
[
"a"
])
torch
.
testing
.
assert_
close
(
recv_dict
[
"b"
],
test_dict
[
"b"
])
assert
recv_dict
[
"c"
]
==
test_dict
[
"c"
]
assert
recv_dict
[
"d"
]
==
test_dict
[
"d"
]
assert
recv_dict
[
"e"
]
==
test_dict
[
"e"
]
assert
torch
.
all
close
(
recv_dict
[
"f"
],
test_dict
[
"f"
])
torch
.
testing
.
assert_
close
(
recv_dict
[
"f"
],
test_dict
[
"f"
])
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
...
...
@@ -163,7 +163,7 @@ def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
get_pp_group
().
send
(
test_tensor
)
if
not
get_pp_group
().
is_first_rank
:
assert
torch
.
all
close
(
test_tensor
,
recv_tensor
)
torch
.
testing
.
assert_
close
(
test_tensor
,
recv_tensor
)
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
...
...
tests/distributed/test_custom_all_reduce.py
View file @
ad385667
...
...
@@ -72,8 +72,8 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
out2
=
tensor_model_parallel_all_reduce
(
inp2
)
dist
.
all_reduce
(
inp2
,
group
=
group
)
graph
.
replay
()
assert
torch
.
all
close
(
out1
,
inp1
)
assert
torch
.
all
close
(
out2
,
inp2
)
torch
.
testing
.
assert_
close
(
out1
,
inp1
)
torch
.
testing
.
assert_
close
(
out2
,
inp2
)
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
...
...
@@ -96,13 +96,13 @@ def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
out
=
inp
for
_
in
range
(
num_communication
):
out
=
fa
.
all_reduce_unreg
(
out
)
assert
torch
.
all
close
(
out
,
inp
*
(
tp_size
**
num_communication
))
torch
.
testing
.
assert_
close
(
out
,
inp
*
(
tp_size
**
num_communication
))
inp
=
torch
.
ones
(
sz
*
4
,
dtype
=
torch
.
bfloat16
,
device
=
device
)
out
=
inp
for
_
in
range
(
num_communication
):
out
=
fa
.
all_reduce_unreg
(
out
)
assert
torch
.
all
close
(
out
,
inp
*
(
tp_size
**
num_communication
))
torch
.
testing
.
assert_
close
(
out
,
inp
*
(
tp_size
**
num_communication
))
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
2
])
...
...
tests/distributed/test_distributed_oot.py
0 → 100644
View file @
ad385667
from
..entrypoints.openai.test_oot_registration
import
(
run_and_test_dummy_opt_api_server
)
def test_distributed_oot(dummy_opt_path: str):
    """Run the dummy OPT API-server check with ``tp=2``."""
    tensor_parallel = 2
    run_and_test_dummy_opt_api_server(dummy_opt_path, tp=tensor_parallel)
tests/distributed/test_multi_node_assignment.py
0 → 100644
View file @
ad385667
"""Make sure ray assigns GPU workers to the correct node.
Run:
```sh
cd $VLLM_PATH/tests
pytest distributed/test_multi_node_assignment.py
```
"""
import
os
import
pytest
import
ray
from
ray.util.scheduling_strategies
import
PlacementGroupSchedulingStrategy
from
vllm
import
initialize_ray_cluster
from
vllm.config
import
ParallelConfig
from
vllm.executor.ray_utils
import
_wait_until_pg_removed
from
vllm.utils
import
get_ip
VLLM_MULTI_NODE
=
os
.
getenv
(
"VLLM_MULTI_NODE"
,
"0"
)
==
"1"
@pytest.mark.skipif(not VLLM_MULTI_NODE,
                    reason="Need at least 2 nodes to run the test.")
def test_multi_node_assignment() -> None:
    """Check that GPU worker actors report the same IP as the driver.

    Repeated 10 times: initialize a ray cluster for ``ParallelConfig(1, 2)``,
    start one actor per GPU bundle of the placement group, assert each
    actor's ``get_ip()`` equals the driver's, then kill the actors and wait
    for the placement group to be removed.
    """

    # NOTE: important to keep this class definition here
    # to let ray use cloudpickle to serialize it.
    class Actor:

        def get_ip(self):
            return get_ip()

    for _ in range(10):
        config = ParallelConfig(1, 2)
        initialize_ray_cluster(config)
        driver_ip = get_ip()

        # Only bundles that reserve a GPU get a worker.
        gpu_bundle_indices = [
            index
            for index, bundle in enumerate(config.placement_group.bundle_specs)
            if bundle.get("GPU", 0)
        ]

        actors = []
        for bundle_index in gpu_bundle_indices:
            strategy = PlacementGroupSchedulingStrategy(
                placement_group=config.placement_group,
                placement_group_capture_child_tasks=True,
                placement_group_bundle_index=bundle_index,
            )
            remote_actor_cls = ray.remote(
                num_cpus=0,
                num_gpus=1,
                scheduling_strategy=strategy,
            )(Actor)
            actor = remote_actor_cls.remote()
            actor_ip = ray.get(actor.get_ip.remote())
            assert actor_ip == driver_ip
            actors.append(actor)

        for actor in actors:
            ray.kill(actor)

        _wait_until_pg_removed(config.placement_group)
tests/distributed/test_pipeline_parallel.py
View file @
ad385667
...
...
@@ -6,47 +6,267 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
to fail.
"""
import
os
from
dataclasses
import
dataclass
from
typing
import
List
,
Literal
,
NamedTuple
,
Optional
import
pytest
from
vllm.logger
import
init_logger
from
..utils
import
compare_two_settings
,
fork_new_process_for_each_test
logger
=
init_logger
(
"test_pipeline_parallel"
)
VLLM_MULTI_NODE
=
os
.
getenv
(
"VLLM_MULTI_NODE"
,
"0"
)
==
"1"
@
pytest
.
mark
.
parametrize
((
"TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, "
"MODEL_NAME, DIST_BACKEND"
),
[
(
2
,
2
,
0
,
1
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
(
2
,
2
,
1
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
(
1
,
3
,
0
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
(
1
,
4
,
0
,
1
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
(
1
,
4
,
1
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
(
2
,
2
,
0
,
1
,
"meta-llama/Meta-Llama-3-8B"
,
"mp"
),
(
2
,
2
,
1
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"mp"
),
(
1
,
3
,
0
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"mp"
),
(
1
,
4
,
0
,
1
,
"meta-llama/Meta-Llama-3-8B"
,
"mp"
),
(
1
,
4
,
1
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"mp"
),
])
def
test_compare_tp
(
TP_SIZE
,
PP_SIZE
,
EAGER_MODE
,
CHUNKED_PREFILL
,
MODEL_NAME
,
DIST_BACKEND
):
if
VLLM_MULTI_NODE
and
DIST_BACKEND
==
"mp"
:
class ParallelSetup(NamedTuple):
    """One parallelism configuration for a pipeline-parallel comparison run.

    Instances are built by ``PPTestSettings.detailed`` / ``PPTestSettings.fast``
    and unpacked positionally (tp_size, pp_size, eager_mode, chunked_prefill)
    by ``_compare_tp``, so the field order matters.
    """
    # Value passed as "--tensor-parallel-size".
    tp_size: int
    # Value passed as "--pipeline-parallel-size".
    pp_size: int
    # When true, "--enforce-eager" is added to the server arguments.
    eager_mode: bool
    # When true, "--enable-chunked-prefill" is added to the server arguments.
    chunked_prefill: bool
@dataclass
class PPTestSettings:
    """Per-model test matrix: parallel setups crossed with backends.

    ``detailed`` and ``fast`` are the two factory presets; ``iter_params``
    expands the settings into parameter tuples.
    """
    # Parallel configurations to run for the model.
    parallel_setups: List[ParallelSetup]
    # Distributed executor backends to run each setup with.
    distributed_backends: List[str]
    trust_remote_code: bool
    tokenizer_mode: Optional[str]

    @staticmethod
    def detailed(
        *,
        tp_base: int = 1,
        pp_base: int = 2,
        trust_remote_code: bool = False,
        tokenizer_mode: Optional[str] = None,
    ):
        """Build the full preset: five setups on both "mp" and "ray".

        The setups combine ``tp_base``/``pp_base`` and their doubles with
        different eager-mode and chunked-prefill flags.
        """
        double_tp = 2 * tp_base
        double_pp = 2 * pp_base
        # Rows are (tp_size, pp_size, eager_mode, chunked_prefill).
        setup_table = [
            (tp_base, pp_base, False, False),
            (tp_base, double_pp, False, True),
            (tp_base, double_pp, True, False),
            (double_tp, pp_base, False, True),
            (double_tp, pp_base, True, False),
        ]
        return PPTestSettings(
            parallel_setups=[ParallelSetup(*row) for row in setup_table],
            distributed_backends=["mp", "ray"],
            trust_remote_code=trust_remote_code,
            tokenizer_mode=tokenizer_mode,
        )

    @staticmethod
    def fast(
        *,
        tp_base: int = 1,
        pp_base: int = 2,
        trust_remote_code: bool = False,
        tokenizer_mode: Optional[str] = None,
    ):
        """Build the minimal preset: one eager-mode setup on "mp" only."""
        only_setup = ParallelSetup(tp_size=tp_base,
                                   pp_size=pp_base,
                                   eager_mode=True,
                                   chunked_prefill=False)
        return PPTestSettings(
            parallel_setups=[only_setup],
            distributed_backends=["mp"],
            trust_remote_code=trust_remote_code,
            tokenizer_mode=tokenizer_mode,
        )

    def iter_params(self, model_name: str):
        """Yield one parameter tuple per (parallel setup, backend) pair.

        Each tuple is ``(model_name, parallel_setup, distributed_backend,
        trust_remote_code, tokenizer_mode)``; setups vary slowest.
        """
        yield from ((model_name, setup, backend, self.trust_remote_code,
                     self.tokenizer_mode) for setup in self.parallel_setups
                    for backend in self.distributed_backends)
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
# The values displayed here are only a rough indicator of the size of the model
# yapf: disable
GENERATION_MODEL_SETTINGS
=
{
# [DETAILED TESTS]
"meta-llama/Meta-Llama-3-8B"
:
PPTestSettings
.
detailed
(),
# [FAST TESTS]
# Uses Llama
# "BAAI/AquilaChat-7B": PPTestSettings.fast(),
"Snowflake/snowflake-arctic-instruct"
:
PPTestSettings
.
fast
(
tp_base
=
8
,
trust_remote_code
=
True
),
# noqa: E501
"baichuan-inc/Baichuan-7B"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"baichuan-inc/Baichuan2-13B-Chat"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
# noqa: E501
"bigscience/bloomz-1b1"
:
PPTestSettings
.
fast
(),
"THUDM/chatglm3-6b"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"CohereForAI/c4ai-command-r-v01"
:
PPTestSettings
.
fast
(
tp_base
=
2
,
trust_remote_code
=
True
),
# noqa: E501
"databricks/dbrx-instruct"
:
PPTestSettings
.
fast
(
tp_base
=
8
),
"Deci/DeciLM-7B-instruct"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"deepseek-ai/deepseek-llm-7b-chat"
:
PPTestSettings
.
fast
(),
"deepseek-ai/DeepSeek-V2-Lite-Chat"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
# noqa: E501
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"
:
PPTestSettings
.
fast
(),
"tiiuae/falcon-7b"
:
PPTestSettings
.
fast
(),
"google/gemma-2b"
:
PPTestSettings
.
fast
(),
"google/gemma-2-9b"
:
PPTestSettings
.
fast
(),
"gpt2"
:
PPTestSettings
.
fast
(),
"bigcode/starcoder"
:
PPTestSettings
.
fast
(),
"EleutherAI/gpt-j-6b"
:
PPTestSettings
.
fast
(),
"EleutherAI/pythia-12b"
:
PPTestSettings
.
fast
(),
"ibm/PowerLM-3b"
:
PPTestSettings
.
fast
(),
"ibm/PowerMoE-3b"
:
PPTestSettings
.
fast
(),
# Uses Llama
# "internlm/internlm-chat-7b": PPTestSettings.fast(),
"internlm/internlm2-chat-7b"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"core42/jais-13b-chat"
:
PPTestSettings
.
fast
(),
# TODO: Implement PP
# "ai21labs/AI21-Jamba-1.5-Mini": PPTestSettings.fast(),
"openbmb/MiniCPM-2B-sft-bf16"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"openbmb/MiniCPM3-4B"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
# Uses Llama
# "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
"mistralai/Mixtral-8x7B-Instruct-v0.1"
:
PPTestSettings
.
fast
(
tp_base
=
4
),
"mosaicml/mpt-7b"
:
PPTestSettings
.
fast
(),
"nvidia/Minitron-8B-Base"
:
PPTestSettings
.
fast
(),
"allenai/OLMoE-1B-7B-0924-Instruct"
:
PPTestSettings
.
fast
(),
"allenai/OLMo-1B-hf"
:
PPTestSettings
.
fast
(),
"facebook/opt-iml-max-1.3b"
:
PPTestSettings
.
fast
(),
"OrionStarAI/Orion-14B-Chat"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"microsoft/phi-2"
:
PPTestSettings
.
fast
(),
"microsoft/Phi-3-mini-4k-instruct"
:
PPTestSettings
.
fast
(),
"microsoft/Phi-3-small-8k-instruct"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
# noqa: E501
# FIXME: https://github.com/vllm-project/vllm/issues/8553
# "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"adept/persimmon-8b-chat"
:
PPTestSettings
.
fast
(),
"Qwen/Qwen-7B-Chat"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"Qwen/Qwen2-beta-7B-Chat"
:
PPTestSettings
.
fast
(),
"Qwen/Qwen1.5-MoE-A2.7B-Chat"
:
PPTestSettings
.
fast
(),
"stabilityai/stablelm-3b-4e1t"
:
PPTestSettings
.
fast
(),
"bigcode/starcoder2-3b"
:
PPTestSettings
.
fast
(),
"upstage/solar-pro-preview-instruct"
:
PPTestSettings
.
fast
(
tp_base
=
2
),
# FIXME: Cannot load tokenizer in latest transformers version
# "xverse/XVERSE-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
}
EMBEDDING_MODEL_SETTINGS
=
{
# type: ignore[var-annotated]
# [FAST TESTS]
"intfloat/e5-mistral-7b-instruct"
:
PPTestSettings
.
fast
(),
"BAAI/bge-multilingual-gemma2"
:
PPTestSettings
.
fast
(),
"Qwen/Qwen2.5-Math-RM-72B"
:
PPTestSettings
.
fast
(
tp_base
=
4
,
trust_remote_code
=
True
),
# noqa: E501
}
MULTIMODAL_MODEL_SETTINGS
=
{
# [FAST TESTS]
"Salesforce/blip2-opt-2.7b"
:
PPTestSettings
.
fast
(),
"facebook/chameleon-7b"
:
PPTestSettings
.
fast
(),
"adept/fuyu-8b"
:
PPTestSettings
.
fast
(),
"OpenGVLab/InternVL2-1B"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"llava-hf/llava-1.5-7b-hf"
:
PPTestSettings
.
fast
(),
"llava-hf/llava-v1.6-mistral-7b-hf"
:
PPTestSettings
.
fast
(),
"llava-hf/LLaVA-NeXT-Video-7B-hf"
:
PPTestSettings
.
fast
(),
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
:
PPTestSettings
.
fast
(),
"openbmb/MiniCPM-Llama3-V-2_5"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
# TODO: Implement PP
# "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
"microsoft/Phi-3-vision-128k-instruct"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
# noqa: E501
"mistralai/Pixtral-12B-2409"
:
PPTestSettings
.
fast
(
tp_base
=
2
,
tokenizer_mode
=
"mistral"
),
# noqa: E501
"Qwen/Qwen-VL-Chat"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"Qwen/Qwen2-VL-2B-Instruct"
:
PPTestSettings
.
fast
(),
"fixie-ai/ultravox-v0_3"
:
PPTestSettings
.
fast
(),
}
CONDITIONAL_GENERATION_MODEL_SETTINGS
=
{
# type: ignore[var-annotated]
# [FAST TESTS]
# TODO: Implement PP
# "facebook/bart-base": PPTestSettings.fast(),
}
# yapf: enable
# NOTE: You can update this on your local machine to run specific tests
TEST_MODELS
=
[
# [LANGUAGE GENERATION]
"meta-llama/Meta-Llama-3-8B"
,
"ibm/PowerLM-3b"
,
# [LANGUAGE EMBEDDING]
"intfloat/e5-mistral-7b-instruct"
,
"BAAI/bge-multilingual-gemma2"
,
# [MULTIMODAL GENERATION]
"OpenGVLab/InternVL2-1B"
,
"microsoft/Phi-3-vision-128k-instruct"
,
"fixie-ai/ultravox-v0_3"
,
]
def
_compare_tp
(
model_name
:
str
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
trust_remote_code
:
bool
,
tokenizer_mode
:
Optional
[
str
],
num_gpus_available
:
int
,
*
,
method
:
Literal
[
"generate"
,
"encode"
]
=
"encode"
,
):
tp_size
,
pp_size
,
eager_mode
,
chunked_prefill
=
parallel_setup
if
num_gpus_available
<
tp_size
*
pp_size
:
pytest
.
skip
(
f
"Need at least
{
tp_size
}
x
{
pp_size
}
GPUs"
)
if
VLLM_MULTI_NODE
and
distributed_backend
==
"mp"
:
pytest
.
skip
(
"Skipping multi-node pipeline parallel test for "
"multiprocessing distributed backend"
)
USE_RAY_ADAG_NCCL
=
0
USE_RAY_ADAG
=
0
pp_args
=
[
common_args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"float16"
,
"--max-model-len"
,
"2048"
,
"--max-num-seqs"
,
"8"
,
]
if
chunked_prefill
:
common_args
.
append
(
"--enable-chunked-prefill"
)
if
eager_mode
:
common_args
.
append
(
"--enforce-eager"
)
if
trust_remote_code
:
common_args
.
append
(
"--trust-remote-code"
)
if
tokenizer_mode
:
common_args
.
extend
([
"--tokenizer-mode"
,
tokenizer_mode
])
if
(
distributed_backend
==
"ray"
and
tp_size
==
2
and
pp_size
==
2
and
chunked_prefill
):
# Test Ray ADAG for a subset of the tests
pp_env
=
{
"VLLM_USE_RAY_COMPILED_DAG"
:
"1"
,
"VLLM_USE_RAY_SPMD_WORKER"
:
"1"
,
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL"
:
"1"
,
}
# Temporary. Currently when zeromq + SPMD is used, it does not properly
# terminate because of aDAG issue.
common_args
.
append
(
"--disable-frontend-multiprocessing"
)
else
:
pp_env
=
None
pp_args
=
[
*
common_args
,
"--pipeline-parallel-size"
,
str
(
PP_SIZE
),
str
(
pp_size
),
"--tensor-parallel-size"
,
str
(
TP_SIZE
),
str
(
tp_size
),
"--distributed-executor-backend"
,
DIST_BACKEND
,
distributed_backend
,
]
# compare without pipeline parallelism
...
...
@@ -55,54 +275,103 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
# schedule all workers in a node other than the head node,
# which can cause the test to fail.
tp_args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"bfloat16"
,
*
common_args
,
"--tensor-parallel-size"
,
str
(
max
(
TP_SIZE
,
2
)),
# We only use 2 GPUs in the CI.
str
(
tp_size
),
"--distributed-executor-backend"
,
"mp"
,
]
if
CHUNKED_PREFILL
:
pp_args
.
append
(
"--enable-chunked-prefill"
)
tp_args
.
append
(
"--enable-chunked-prefill"
)
if
EAGER_MODE
:
pp_args
.
append
(
"--enforce-eager"
)
tp_args
.
append
(
"--enforce-eager"
)
pp_env
=
None
if
USE_RAY_ADAG
:
assert
DIST_BACKEND
==
"ray"
,
(
"Ray ADAG is only supported with Ray distributed backend"
)
pp_env
=
{
"VLLM_USE_RAY_COMPILED_DAG"
:
"1"
,
"VLLM_USE_RAY_SPMD_WORKER"
:
"1"
,
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL"
:
str
(
int
(
USE_RAY_ADAG_NCCL
)),
}
compare_two_settings
(
MODEL_NAME
,
pp_args
,
tp_args
,
pp_env
)
try
:
compare_two_settings
(
model_name
,
pp_args
,
tp_args
,
pp_env
,
method
=
method
)
except
Exception
:
if
pp_env
is
None
:
raise
else
:
# Ray ADAG tests are flaky, so we don't want to fail the test
logger
.
exception
(
"Ray ADAG tests failed"
)
@
pytest
.
mark
.
parametrize
(
"PP_SIZE, MODEL_NAME"
,
[
(
2
,
"JackFram/llama-160m"
),
])
@
pytest
.
mark
.
parametrize
(
"ATTN_BACKEND"
,
[
"FLASH_ATTN"
,
"FLASHINFER"
,
])
@
pytest
.
mark
.
parametrize
(
(
"model_name"
,
"parallel_setup"
,
"distributed_backend"
,
"trust_remote_code"
,
"tokenizer_mode"
),
[
params
for
model_name
,
settings
in
GENERATION_MODEL_SETTINGS
.
items
()
for
params
in
settings
.
iter_params
(
model_name
)
if
model_name
in
TEST_MODELS
],
)
@
fork_new_process_for_each_test
def
test_pp_cudagraph
(
PP_SIZE
,
MODEL_NAME
,
ATTN_BACKEND
):
cudagraph_args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"float16"
,
"--pipeline-parallel-size"
,
str
(
PP_SIZE
),
"--distributed-executor-backend"
,
"mp"
,
]
os
.
environ
[
"VLLM_ATTENTION_BACKEND"
]
=
ATTN_BACKEND
def
test_tp_language_generation
(
model_name
:
str
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
trust_remote_code
:
bool
,
tokenizer_mode
:
Optional
[
str
],
num_gpus_available
,
):
_compare_tp
(
model_name
,
parallel_setup
,
distributed_backend
,
trust_remote_code
,
tokenizer_mode
,
num_gpus_available
,
method
=
"generate"
)
eager_args
=
cudagraph_args
+
[
"--enforce-eager"
]
@pytest.mark.parametrize(
    ("model_name", "parallel_setup", "distributed_backend",
     "trust_remote_code", "tokenizer_mode"),
    [
        case
        for name, cfg in EMBEDDING_MODEL_SETTINGS.items()
        for case in cfg.iter_params(name)
        if name in TEST_MODELS
    ],
)
@fork_new_process_for_each_test
def test_tp_language_embedding(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    trust_remote_code: bool,
    tokenizer_mode: Optional[str],
    num_gpus_available,
):
    """Run the shared `_compare_tp` check for embedding models.

    Parameter sets come from every entry of EMBEDDING_MODEL_SETTINGS whose
    model name is also listed in TEST_MODELS. The comparison is driven
    through the "encode" method.

    NOTE(review): `num_gpus_available` is presumably a pytest fixture
    defined elsewhere -- confirm against conftest.
    """
    _compare_tp(
        model_name,
        parallel_setup,
        distributed_backend,
        trust_remote_code,
        tokenizer_mode,
        num_gpus_available,
        method="encode",
    )
compare_two_settings
(
MODEL_NAME
,
eager_args
,
cudagraph_args
)
@pytest.mark.parametrize(
    ("model_name", "parallel_setup", "distributed_backend",
     "trust_remote_code", "tokenizer_mode"),
    [
        case
        for name, cfg in MULTIMODAL_MODEL_SETTINGS.items()
        for case in cfg.iter_params(name)
        if name in TEST_MODELS
    ],
)
@fork_new_process_for_each_test
def test_tp_multimodal_generation(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    trust_remote_code: bool,
    tokenizer_mode: Optional[str],
    num_gpus_available,
):
    """Run the shared `_compare_tp` check for multimodal generation models.

    Parameter sets come from every entry of MULTIMODAL_MODEL_SETTINGS whose
    model name is also listed in TEST_MODELS. The comparison is driven
    through the "generate" method.

    NOTE(review): `num_gpus_available` is presumably a pytest fixture
    defined elsewhere -- confirm against conftest.
    """
    _compare_tp(
        model_name,
        parallel_setup,
        distributed_backend,
        trust_remote_code,
        tokenizer_mode,
        num_gpus_available,
        method="generate",
    )
tests/distributed/test_pp_cudagraph.py
0 → 100644
View file @
ad385667
import
os
import
pytest
from
..utils
import
compare_two_settings
,
fork_new_process_for_each_test
@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
    (2, "JackFram/llama-160m"),
])
@pytest.mark.parametrize("ATTN_BACKEND", [
    "FLASH_ATTN",
    "FLASHINFER",
])
@fork_new_process_for_each_test
def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
    """Compare an eager-mode run against a run without `--enforce-eager`.

    Both argument lists request the same pipeline-parallel size and the
    "mp" executor backend; the eager list only adds `--enforce-eager`.
    The attention backend is selected via the VLLM_ATTENTION_BACKEND
    environment variable before the comparison is made.
    """
    # Select the attention backend through the process environment.
    os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND

    graph_mode_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--pipeline-parallel-size",
        str(PP_SIZE),
        "--distributed-executor-backend",
        "mp",
    ]
    # Same settings, plus the flag that forces eager execution.
    eager_mode_args = [*graph_mode_args, "--enforce-eager"]

    compare_two_settings(MODEL_NAME, eager_mode_args, graph_mode_args)
tests/distributed/test_same_node.py
View file @
ad385667
import
os
import
torch
import
torch
.distributed
as
dist
from
vllm.distributed.parallel_state
import
in_the_same_node_as
torch
.
distributed
.
init_process_group
(
backend
=
"gloo"
)
test_result
=
all
(
in_the_same_node_as
(
torch
.
distributed
.
group
.
WORLD
,
source_rank
=
0
))
if
__name__
==
"__main__"
:
dist
.
init_process_group
(
backend
=
"gloo"
)
test_result
=
all
(
in_the_same_node_as
(
dist
.
group
.
WORLD
,
source_rank
=
0
))
expected
=
os
.
environ
.
get
(
"VLLM_TEST_SAME_HOST"
,
"1"
)
==
"1"
assert
test_result
==
expected
,
f
"Expected
{
expected
}
, got
{
test_result
}
"
print
(
"Same node test passed!"
)
expected
=
os
.
environ
.
get
(
"VLLM_TEST_SAME_HOST"
,
"1"
)
==
"1"
assert
test_result
==
expected
,
f
"Expected
{
expected
}
, got
{
test_result
}
"
print
(
"Same node test passed!"
)
tests/encoder_decoder/__init__.py
0 → 100644
View file @
ad385667
tests/encoder_decoder/test_e2e_correctness.py
0 → 100644
View file @
ad385667
"""E2E tests to verify the correctness of the encoder-decoder framework
Run `pytest tests/encoder_decoder/test_e2e_correctness.py`.
"""
from
typing
import
List
,
Optional
,
Tuple
import
pytest
from
transformers
import
AutoModelForSeq2SeqLM
from
vllm.sequence
import
SampleLogprobs
from
vllm.utils
import
is_cpu
from
..conftest
import
DecoderPromptType
from
..models.utils
import
check_logprobs_close
def vllm_to_hf_output(
    vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]],
    decoder_prompt_type: DecoderPromptType,
):
    """Reshape one vLLM output so it can be compared with an HF output.

    The token ids and logprobs are passed through untouched. The text gets
    a trailing "</s>" and, when `decoder_prompt_type` is
    `DecoderPromptType.NONE`, a leading "<s>" as well.

    Returns:
        A `(token_ids, text, logprobs)` tuple.
    """
    token_ids, text, logprobs = vllm_output

    # The "<s>" prefix is only added when the decoder prompt type is NONE.
    prefix = "<s>" if decoder_prompt_type == DecoderPromptType.NONE else ""

    return token_ids, prefix + text + "</s>", logprobs
@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
@pytest.mark.parametrize("enforce_eager", [True, False])
@pytest.mark.skipif(
    is_cpu(),
    reason="CPU backend is not currently supported with encoder/decoder models"
)
def test_encoder_decoder_e2e(
    hf_runner,
    vllm_runner,
    example_encoder_decoder_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    decoder_prompt_type: DecoderPromptType,
    enforce_eager: bool,
) -> None:
    """End-to-end check of the encoder-decoder path against Hugging Face.

    The same prompts are run through the Hugging Face runner and the vLLM
    runner (with and without `enforce_eager`), and the resulting logprobs
    are compared with `check_logprobs_close`.
    """
    prompts = example_encoder_decoder_prompts[decoder_prompt_type]

    # Generation settings passed to the HF baseline run.
    hf_generation_overrides = {
        "top_k": None,
        "num_beams": 1,
        "repetition_penalty": 1.0,
        "top_p": 1.0,
        "length_penalty": 1.0,
        "early_stopping": False,
        "no_repeat_ngram_size": None,
        "min_length": 0
    }

    with hf_runner(model, dtype=dtype,
                   auto_cls=AutoModelForSeq2SeqLM) as hf_model:
        hf_results = hf_model.generate_encoder_decoder_greedy_logprobs_limit(
            prompts,
            max_tokens,
            num_logprobs,
            **hf_generation_overrides,
        )

    with vllm_runner(model, dtype=dtype,
                     enforce_eager=enforce_eager) as vllm_model:
        vllm_results = vllm_model.generate_encoder_decoder_greedy_logprobs(
            prompts, max_tokens, num_logprobs)

    # Bring the vLLM outputs into the HF-comparable form.
    vllm_as_hf = [
        vllm_to_hf_output(result, decoder_prompt_type)
        for result in vllm_results
    ]

    # One leading HF token is skipped only when the decoder prompt type is
    # NONE; otherwise nothing is skipped.
    if decoder_prompt_type == DecoderPromptType.NONE:
        hf_tokens_to_skip = 1
    else:
        hf_tokens_to_skip = 0

    check_logprobs_close(
        outputs_0_lst=hf_results,
        outputs_1_lst=vllm_as_hf,
        name_0="hf",
        name_1="vllm",
        num_outputs_0_skip_tokens=hf_tokens_to_skip,
    )
tests/engine/test_arg_utils.py
0 → 100644
View file @
ad385667
from
argparse
import
ArgumentTypeError
import
pytest
from
vllm.engine.arg_utils
import
EngineArgs
,
nullable_kvs
from
vllm.utils
import
FlexibleArgumentParser
@pytest.mark.parametrize(("arg", "expected"), [
    (None, None),
    ("image=16", {
        "image": 16
    }),
    ("image=16,video=2", {
        "image": 16,
        "video": 2
    }),
    ("Image=16, Video=2", {
        "image": 16,
        "video": 2
    }),
])
def test_limit_mm_per_prompt_parser(arg, expected):
    """Check how `--limit-mm-per-prompt` values are parsed by the CLI.

    When `arg` is None the option is omitted from the command line;
    otherwise it is passed as the option's value. The parsed
    `limit_mm_per_prompt` attribute must equal `expected`.
    """
    cli_parser = EngineArgs.add_cli_args(FlexibleArgumentParser())

    # Omit the flag entirely for the None case.
    argv = [] if arg is None else ["--limit-mm-per-prompt", arg]
    parsed = cli_parser.parse_args(argv)

    assert parsed.limit_mm_per_prompt == expected
@pytest.mark.parametrize(
    "arg",
    [
        "image",  # Missing =
        "image=4,image=5",  # Conflicting values
        "image=video=4",  # Too many = in tokenized arg
    ],
)
def test_bad_nullable_kvs(arg):
    """`nullable_kvs` must raise ArgumentTypeError for each malformed input."""
    with pytest.raises(ArgumentTypeError):
        nullable_kvs(arg)
# yapf: disable
@pytest.mark.parametrize(("arg", "expected", "option"), [
    (None, None, "mm-processor-kwargs"),
    ("{}", {}, "mm-processor-kwargs"),
    (
        '{"num_crops": 4}',
        {"num_crops": 4},
        "mm-processor-kwargs",
    ),
    (
        '{"foo": {"bar": "baz"}}',
        {"foo": {"bar": "baz"}},
        "mm-processor-kwargs",
    ),
    (
        '{"cast_logits_dtype":"bfloat16","sequence_parallel_norm":true,"sequence_parallel_norm_threshold":2048}',
        {
            "cast_logits_dtype": "bfloat16",
            "sequence_parallel_norm": True,
            "sequence_parallel_norm_threshold": 2048,
        },
        "override-neuron-config",
    ),
])
# yapf: enable
def test_composite_arg_parser(arg, expected, option):
    """Check that JSON-valued CLI options are parsed into Python objects.

    `option` is the flag name without the leading dashes. When `arg` is
    None the flag is omitted; otherwise `arg` is passed as its value. The
    parsed attribute (flag name with "-" replaced by "_") must equal
    `expected`.
    """
    cli_parser = EngineArgs.add_cli_args(FlexibleArgumentParser())

    # Omit the flag entirely for the None case.
    argv = [] if arg is None else [f"--{option}", arg]
    parsed = cli_parser.parse_args(argv)

    attribute = option.replace("-", "_")
    assert getattr(parsed, attribute) == expected
tests/engine/test_custom_executor.py
View file @
ad385667
...
...
@@ -48,9 +48,9 @@ def test_custom_executor_type_checking(model):
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"facebook/opt-125m"
])
def
test_custom_executor
(
model
,
tmp
dir
):
def
test_custom_executor
(
model
,
tmp
_path
):
cwd
=
os
.
path
.
abspath
(
"."
)
os
.
chdir
(
tmp
dir
)
os
.
chdir
(
tmp
_path
)
try
:
assert
not
os
.
path
.
exists
(
".marker"
)
...
...
@@ -68,9 +68,9 @@ def test_custom_executor(model, tmpdir):
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"facebook/opt-125m"
])
def
test_custom_executor_async
(
model
,
tmp
dir
):
def
test_custom_executor_async
(
model
,
tmp
_path
):
cwd
=
os
.
path
.
abspath
(
"."
)
os
.
chdir
(
tmp
dir
)
os
.
chdir
(
tmp
_path
)
try
:
assert
not
os
.
path
.
exists
(
".marker"
)
...
...
tests/engine/test_multiproc_workers.py
View file @
ad385667
...
...
@@ -83,7 +83,7 @@ def test_local_workers() -> None:
workers
[
3
].
process
.
kill
()
# Other workers should get shut down here
worker_monitor
.
join
(
2
)
worker_monitor
.
join
(
2
0
)
# Ensure everything is stopped
assert
not
worker_monitor
.
is_alive
()
...
...
@@ -108,7 +108,7 @@ def test_local_workers_clean_shutdown() -> None:
# Clean shutdown
worker_monitor
.
close
()
worker_monitor
.
join
(
5
)
worker_monitor
.
join
(
20
)
# Ensure everything is stopped
assert
not
worker_monitor
.
is_alive
()
...
...
@@ -161,7 +161,7 @@ async def test_local_workers_async() -> None:
workers
[
3
].
process
.
kill
()
# Other workers should get shut down here
worker_monitor
.
join
(
2
)
worker_monitor
.
join
(
2
0
)
# Ensure everything is stopped
assert
not
worker_monitor
.
is_alive
()
...
...
tests/engine/test_skip_tokenizer_init.py
View file @
ad385667
...
...
@@ -11,9 +11,10 @@ def test_skip_tokenizer_initialization(model: str):
# token ids.
llm
=
LLM
(
model
=
model
,
skip_tokenizer_init
=
True
)
sampling_params
=
SamplingParams
(
prompt_logprobs
=
True
,
detokenize
=
True
)
with
pytest
.
raises
(
ValueError
)
as
err
:
with
pytest
.
raises
(
ValueError
,
match
=
"cannot pass text prompts when"
):
llm
.
generate
(
"abc"
,
sampling_params
)
assert
"prompts must be None if"
in
str
(
err
.
value
)
outputs
=
llm
.
generate
({
"prompt_token_ids"
:
[
1
,
2
,
3
]},
sampling_params
=
sampling_params
)
assert
len
(
outputs
)
>
0
...
...
tests/engine/test_stop_strings.py
View file @
ad385667
...
...
@@ -7,6 +7,8 @@ from vllm import CompletionOutput, LLMEngine, SamplingParams
MODEL
=
"meta-llama/llama-2-7b-hf"
MAX_TOKENS
=
200
IS_ASYNC
=
False
@
pytest
.
fixture
(
scope
=
"session"
)
def
vllm_model
(
vllm_runner
):
...
...
@@ -14,99 +16,148 @@ def vllm_model(vllm_runner):
yield
vllm_model
@
pytest
.
mark
.
skip_global_cleanup
def
test_stop_basic
(
vllm_model
):
_test_stopping
(
vllm_model
.
model
.
llm_engine
,
def
_test_stopping
(
llm_engine
:
LLMEngine
,
expected_output
:
str
,
expected_reason
:
Any
,
stop
:
Optional
[
List
[
str
]]
=
None
,
stop_token_ids
:
Optional
[
List
[
int
]]
=
None
,
include_in_output
:
bool
=
False
,
use_async_output_proc
:
bool
=
False
)
->
None
:
llm_engine
.
add_request
(
"id"
,
"A story about vLLM:
\n
"
,
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
MAX_TOKENS
,
stop
=
stop
,
stop_token_ids
=
stop_token_ids
,
include_stop_str_in_output
=
include_in_output
,
),
None
)
output
:
Optional
[
CompletionOutput
]
=
None
output_text
=
""
stop_reason
=
None
if
use_async_output_proc
:
llm_engine
.
step
()
while
llm_engine
.
has_unfinished_requests
():
(
request_output
,
)
=
llm_engine
.
step
()
(
output
,
)
=
request_output
.
outputs
# Ensure we don't backtrack
assert
output
.
text
.
startswith
(
output_text
)
output_text
=
output
.
text
stop_reason
=
output
.
stop_reason
assert
output
is
not
None
assert
output_text
==
expected_output
assert
stop_reason
==
expected_reason
def
_set_async_mode
(
llm_engine
,
is_async
):
llm_engine
.
scheduler
[
0
].
use_async_output_proc
=
is_async
def
_stop_basic
(
llm_engine
,
is_async
):
_test_stopping
(
llm_engine
,
stop
=
[
"."
],
include_in_output
=
False
,
expected_output
=
"VLLM is a 100% volunteer organization"
,
expected_reason
=
"."
)
expected_reason
=
"."
,
use_async_output_proc
=
is_async
)
_test_stopping
(
vllm_model
.
model
.
llm_engine
,
_test_stopping
(
llm_engine
,
stop
=
[
"."
],
include_in_output
=
True
,
expected_output
=
"VLLM is a 100% volunteer organization."
,
expected_reason
=
"."
)
expected_reason
=
"."
,
use_async_output_proc
=
is_async
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_stop_multi_tokens
(
vllm_model
):
def
_stop_multi_tokens
(
llm_engine
,
is_async
):
_test_stopping
(
vllm_model
.
model
.
llm_engine
,
llm_engine
,
stop
=
[
"group of peo"
,
"short"
],
include_in_output
=
False
,
expected_output
=
"VLLM is a 100% volunteer organization. We are a "
,
expected_reason
=
"group of peo"
)
expected_reason
=
"group of peo"
,
use_async_output_proc
=
is_async
)
_test_stopping
(
vllm_model
.
model
.
llm_engine
,
llm_engine
,
stop
=
[
"group of peo"
,
"short"
],
include_in_output
=
True
,
expected_output
=
"VLLM is a 100% volunteer organization. We are a group of peo"
,
expected_reason
=
"group of peo"
)
expected_reason
=
"group of peo"
,
use_async_output_proc
=
is_async
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_stop_partial_token
(
vllm_model
):
_test_stopping
(
vllm_model
.
model
.
llm_engine
,
def
_stop_partial_token
(
llm_engine
,
is_async
):
_test_stopping
(
llm_engine
,
stop
=
[
"gani"
],
include_in_output
=
False
,
expected_output
=
"VLLM is a 100% volunteer or"
,
expected_reason
=
"gani"
)
expected_reason
=
"gani"
,
use_async_output_proc
=
is_async
)
_test_stopping
(
vllm_model
.
model
.
llm_engine
,
_test_stopping
(
llm_engine
,
stop
=
[
"gani"
],
include_in_output
=
True
,
expected_output
=
"VLLM is a 100% volunteer organi"
,
expected_reason
=
"gani"
)
expected_reason
=
"gani"
,
use_async_output_proc
=
is_async
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_stop_token_id
(
vllm_model
):
def
_stop_token_id
(
llm_engine
,
is_async
):
# token id 13013 => " organization"
_test_stopping
(
vllm_model
.
model
.
llm_engine
,
_test_stopping
(
llm_engine
,
stop_token_ids
=
[
13013
],
include_in_output
=
False
,
expected_output
=
"VLLM is a 100% volunteer"
,
expected_reason
=
13013
)
expected_reason
=
13013
,
use_async_output_proc
=
is_async
)
_test_stopping
(
vllm_model
.
model
.
llm_engine
,
_test_stopping
(
llm_engine
,
stop_token_ids
=
[
13013
],
include_in_output
=
True
,
expected_output
=
"VLLM is a 100% volunteer organization"
,
expected_reason
=
13013
)
expected_reason
=
13013
,
use_async_output_proc
=
is_async
)
def
_test_stopping
(
llm_engine
:
LLMEngine
,
expected_output
:
str
,
expected_reason
:
Any
,
stop
:
Optional
[
List
[
str
]]
=
None
,
stop_token_ids
:
Optional
[
List
[
int
]]
=
None
,
include_in_output
:
bool
=
False
)
->
None
:
llm_engine
.
add_request
(
"id"
,
"A story about vLLM:
\n
"
,
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
MAX_TOKENS
,
stop
=
stop
,
stop_token_ids
=
stop_token_ids
,
include_stop_str_in_output
=
include_in_output
,
),
None
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_stop_basic
(
vllm_model
):
_set_async_mode
(
vllm_model
.
model
.
llm_engine
,
True
)
_stop_basic
(
vllm_model
.
model
.
llm_engine
,
is_async
=
True
)
output
:
Optional
[
CompletionOutput
]
=
None
output_text
=
""
stop_reason
=
None
while
llm_engine
.
has_unfinished_requests
():
(
request_output
,
)
=
llm_engine
.
step
()
(
output
,
)
=
request_output
.
outputs
_set_async_mode
(
vllm_model
.
model
.
llm_engine
,
False
)
_stop_basic
(
vllm_model
.
model
.
llm_engine
,
is_async
=
False
)
# Ensure we don't backtrack
assert
output
.
text
.
startswith
(
output_text
)
output_text
=
output
.
text
stop_reason
=
output
.
stop_reason
assert
output
is
not
None
assert
output_text
==
expected_output
assert
stop_reason
==
expected_reason
@
pytest
.
mark
.
skip_global_cleanup
def
test_stop_multi_tokens
(
vllm_model
):
_set_async_mode
(
vllm_model
.
model
.
llm_engine
,
True
)
_stop_multi_tokens
(
vllm_model
.
model
.
llm_engine
,
is_async
=
True
)
_set_async_mode
(
vllm_model
.
model
.
llm_engine
,
False
)
_stop_multi_tokens
(
vllm_model
.
model
.
llm_engine
,
is_async
=
False
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_stop_partial_token
(
vllm_model
):
_set_async_mode
(
vllm_model
.
model
.
llm_engine
,
True
)
_stop_partial_token
(
vllm_model
.
model
.
llm_engine
,
is_async
=
True
)
_set_async_mode
(
vllm_model
.
model
.
llm_engine
,
False
)
_stop_partial_token
(
vllm_model
.
model
.
llm_engine
,
is_async
=
False
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_stop_token_id
(
vllm_model
):
_set_async_mode
(
vllm_model
.
model
.
llm_engine
,
True
)
_stop_token_id
(
vllm_model
.
model
.
llm_engine
,
is_async
=
True
)
_set_async_mode
(
vllm_model
.
model
.
llm_engine
,
False
)
_stop_token_id
(
vllm_model
.
model
.
llm_engine
,
is_async
=
False
)
Prev
1
…
12
13
14
15
16
17
18
19
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment