Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
705f6a35
Commit 705f6a35 authored Jul 16, 2024 by zhuwenwen
Browse files
Merge tag 'v0.5.2' into v0.5.2-dtk24.04.1
parents
af837396
4cf256ae
Changes
439
Hide whitespace changes
Inline
Side-by-side
Showing 20 changed files with 651 additions and 146 deletions
+651
-146
tests/core/block/test_naive_block.py
tests/core/block/test_naive_block.py
+3
-3
tests/core/block/test_prefix_caching_block.py
tests/core/block/test_prefix_caching_block.py
+69
-41
tests/core/test_chunked_prefill_scheduler.py
tests/core/test_chunked_prefill_scheduler.py
+6
-6
tests/core/test_scheduler.py
tests/core/test_scheduler.py
+26
-26
tests/core/utils.py
tests/core/utils.py
+7
-5
tests/distributed/test_basic_distributed_correctness.py
tests/distributed/test_basic_distributed_correctness.py
+18
-17
tests/distributed/test_chunked_prefill_distributed.py
tests/distributed/test_chunked_prefill_distributed.py
+18
-11
tests/distributed/test_comm_ops.py
tests/distributed/test_comm_ops.py
+91
-7
tests/distributed/test_custom_all_reduce.py
tests/distributed/test_custom_all_reduce.py
+8
-8
tests/distributed/test_multimodal_broadcast.py
tests/distributed/test_multimodal_broadcast.py
+54
-0
tests/distributed/test_parallel_state.py
tests/distributed/test_parallel_state.py
+57
-0
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+140
-0
tests/distributed/test_pynccl.py
tests/distributed/test_pynccl.py
+23
-10
tests/distributed/test_same_node.py
tests/distributed/test_same_node.py
+4
-2
tests/distributed/test_shm_broadcast.py
tests/distributed/test_shm_broadcast.py
+88
-0
tests/distributed/test_utils.py
tests/distributed/test_utils.py
+33
-0
tests/engine/output_processor/test_multi_step.py
tests/engine/output_processor/test_multi_step.py
+4
-4
tests/entrypoints/llm/__init__.py
tests/entrypoints/llm/__init__.py
+0
-0
tests/entrypoints/llm/test_encode.py
tests/entrypoints/llm/test_encode.py
+1
-3
tests/entrypoints/llm/test_generate.py
tests/entrypoints/llm/test_generate.py
+1
-3
No files found.
Too many changes to show.
To preserve performance only 439 of 439+ files are displayed.
Plain diff
Email patch
tests/core/block/test_naive_block.py
View file @
705f6a35
...
...
@@ -14,11 +14,11 @@ class TestNaiveBlockAllocator:
prev_block
:
Optional
[
Block
],
token_ids
:
List
[
int
]):
if
allocate_type
==
"immutable"
:
allocate_block
=
lambda
:
allocator
.
allocate_immutable
(
allocate_block
=
lambda
:
allocator
.
allocate_immutable
_block
(
prev_block
=
prev_block
,
token_ids
=
token_ids
)
elif
allocate_type
==
"mutable"
:
allocate_block
=
lambda
:
allocator
.
allocate_mutable
(
prev
_block
=
prev_block
)
allocate_block
=
lambda
:
allocator
.
allocate_mutable_block
(
prev_block
=
prev_block
)
else
:
raise
ValueError
()
...
...
tests/core/block/test_prefix_caching_block.py
View file @
705f6a35
...
...
@@ -26,11 +26,10 @@ class TestPrefixCachingBlock:
token_ids
=
list
(
range
(
num_to_fill
))
mock_allocator
=
MagicMock
(
spec
=
PrefixCachingBlockAllocator
)
block_with_prev
=
PrefixCachingBlock
(
prev_block
=
None
,
token_ids
=
token_ids
,
block_size
=
block_size
,
prefix_caching_allocator
=
mock_allocator
)
block_with_prev
=
PrefixCachingBlock
(
prev_block
=
None
,
token_ids
=
token_ids
,
block_size
=
block_size
,
allocator
=
mock_allocator
)
if
is_curr_block_full
:
# Expect hash since block is full.
...
...
@@ -71,7 +70,7 @@ class TestPrefixCachingBlock:
prev_block
=
previous_block
,
token_ids
=
token_ids
,
block_size
=
block_size
,
prefix_caching_
allocator
=
mock_allocator
,
allocator
=
mock_allocator
,
)
if
is_curr_block_full
and
prev_block_has_hash
:
...
...
@@ -123,7 +122,7 @@ class TestPrefixCachingBlock:
num_empty_trailing_blocks
=
0
)
->
List
[
PrefixCachingBlock
]:
"""Helper method which creates a chain of blocks.
"""
blocks
=
[]
blocks
:
List
[
PrefixCachingBlock
]
=
[]
num_blocks
=
math
.
ceil
(
len
(
token_ids
)
/
block_size
)
+
num_empty_trailing_blocks
...
...
@@ -138,7 +137,7 @@ class TestPrefixCachingBlock:
prev_block
=
prev_block
,
token_ids
=
[],
block_size
=
block_size
,
prefix_caching_
allocator
=
allocator
,
allocator
=
allocator
,
)
tokens_to_append
=
token_ids
[
block_number
*
...
...
@@ -159,11 +158,11 @@ class TestPrefixCachingBlockAllocator:
prev_block
:
Optional
[
Block
],
token_ids
:
List
[
int
]):
if
allocate_type
==
"immutable"
:
allocate_block
=
lambda
:
allocator
.
allocate_immutable
(
allocate_block
=
lambda
:
allocator
.
allocate_immutable
_block
(
prev_block
=
prev_block
,
token_ids
=
token_ids
)
elif
allocate_type
==
"mutable"
:
allocate_block
=
lambda
:
allocator
.
allocate_mutable
(
prev
_block
=
prev_block
)
allocate_block
=
lambda
:
allocator
.
allocate_mutable_block
(
prev_block
=
prev_block
)
else
:
raise
ValueError
()
...
...
@@ -233,12 +232,13 @@ class TestPrefixCachingBlockAllocator:
# Expect allocation with unseen hash to fail.
with
pytest
.
raises
(
BlockAllocator
.
NoFreeBlocksError
):
allocator
.
allocate_immutable
(
prev_block
=
chain
[
-
1
],
token_ids
=
list
(
range
(
block_size
)))
allocator
.
allocate_immutable_block
(
prev_block
=
chain
[
-
1
],
token_ids
=
list
(
range
(
block_size
)))
# Expect mutable allocation to fail.
with
pytest
.
raises
(
BlockAllocator
.
NoFreeBlocksError
):
allocator
.
allocate_mutable
(
prev_block
=
chain
[
-
1
])
allocator
.
allocate_mutable
_block
(
prev_block
=
chain
[
-
1
])
# Expect allocation of exact same chain to pass.
second_chain
=
TestPrefixCachingBlockAllocator
.
create_immutable_chain
(
...
...
@@ -270,7 +270,7 @@ class TestPrefixCachingBlockAllocator:
# Expect mutable allocation to fail.
with
pytest
.
raises
(
BlockAllocator
.
NoFreeBlocksError
):
allocator
.
allocate_mutable
(
prev_block
=
None
)
allocator
.
allocate_mutable
_block
(
prev_block
=
None
)
block_to_free
=
chain
[
-
1
]
...
...
@@ -280,11 +280,11 @@ class TestPrefixCachingBlockAllocator:
allocator
.
free
(
block_to_free
)
assert
block_to_free
.
block_id
is
None
,
i
new_block
=
allocator
.
allocate_mutable
(
prev_block
=
None
)
new_block
=
allocator
.
allocate_mutable
_block
(
prev_block
=
None
)
assert
new_block
.
block_id
==
block_id
,
i
with
pytest
.
raises
(
BlockAllocator
.
NoFreeBlocksError
):
allocator
.
allocate_mutable
(
prev_block
=
None
)
allocator
.
allocate_mutable
_block
(
prev_block
=
None
)
block_to_free
=
new_block
...
...
@@ -376,7 +376,6 @@ class TestPrefixCachingBlockAllocator:
# Create token ids that will exhaust all blocks.
token_ids
=
list
(
range
(
num_blocks_to_consume
*
block_size
))
blocks
=
list
(
range
(
num_blocks_to_consume
))
first_chain
=
TestPrefixCachingBlockAllocator
.
create_immutable_chain
(
block_size
=
block_size
,
...
...
@@ -384,9 +383,6 @@ class TestPrefixCachingBlockAllocator:
allocator
=
allocator
,
)
# mark all blocks in first chain as computed
allocator
.
mark_blocks_as_computed
(
blocks
)
# After zero_point, second_chain's token_ids would be set -1, which
# make it different from here comparing with first_chain
zero_point
=
random
.
randint
(
1
,
len
(
token_ids
)
-
1
)
...
...
@@ -424,15 +420,16 @@ class TestPrefixCachingBlockAllocator:
block_size
=
block_size
)
token_ids
=
list
(
range
(
block_size
))
block
=
allocator
.
allocate_immutable
(
prev_block
=
None
,
token_ids
=
token_ids
)
block
=
allocator
.
allocate_immutable
_block
(
prev_block
=
None
,
token_ids
=
token_ids
)
assert
allocator
.
_refcounter
.
get
(
block
.
block_id
)
==
1
m
=
allocator
.
allocate_mutable
(
prev_block
=
None
)
m
=
allocator
.
allocate_mutable
_block
(
prev_block
=
None
)
block_id
=
m
.
block_id
for
i
in
range
(
block_size
):
m
.
append_token_ids
([
i
])
# After block get promoted to immutable from mutable, if there is
# already same content hash block, then it shall be released into
# hashless_allocator
...
...
@@ -452,48 +449,79 @@ class TestPrefixCachingBlockAllocator:
all_blocks_list
=
[
i
for
i
in
range
(
num_blocks
)]
zero_ref
=
{
i
:
0
for
i
in
range
(
num_blocks
)}
one_ref
=
{
i
:
1
for
i
in
range
(
num_blocks
)}
allocator
=
PrefixCachingBlockAllocator
(
num_blocks
=
num_blocks
,
block_size
=
block_size
)
token_ids
=
list
(
range
(
num_blocks
*
block_size
))
# now we have num_blocks free blocks in hashless allocator
# with internal tracking list _blocks _cached_blocks and evictor
# empty and block's ref shall be 0
# Verify initial/pre-alloc state
# Ensure all blocks are free inside hashless allocator
assert
list
(
allocator
.
_hashless_allocator
.
_free_block_indices
)
==
all_blocks_list
assert
len
(
allocator
.
_blocks
.
keys
())
==
0
# Ensure no tracked blocks
assert
len
(
allocator
.
_block_tracker
.
keys
())
==
num_blocks
for
block_id
in
range
(
num_blocks
):
assert
not
allocator
.
_block_tracker
[
block_id
].
active
# Ensure no cached blocks
assert
len
(
allocator
.
_cached_blocks
.
values
())
==
0
# Ensure no evicted blocks
assert
len
(
allocator
.
evictor
.
free_table
.
keys
())
==
0
# Ensure 0s ref counts for all blocks
assert
allocator
.
_refcounter
.
_refcounts
==
zero_ref
# Allocate immutable chains with only one block residuled in
new_block
=
[]
for
i
in
range
(
num_blocks
):
block
=
allocator
.
allocate_immutable
(
block
=
allocator
.
allocate_immutable
_block
(
prev_block
=
None
,
token_ids
=
token_ids
[
block_size
*
i
:
block_size
*
(
i
+
1
)])
new_block
.
append
(
block
)
# Verify post-alloc state
# Ensure no blocks are free inside hashless allocator
assert
(
len
(
allocator
.
_hashless_allocator
.
_free_block_indices
)
==
0
)
# Ensure all blocks are tracked
assert
len
(
allocator
.
_block_tracker
.
keys
())
==
num_blocks
for
block_id
in
range
(
num_blocks
):
assert
allocator
.
_block_tracker
[
block_id
].
active
# Ensure all blocks are cached (all promoted)
assert
len
(
allocator
.
_cached_blocks
.
values
())
==
num_blocks
# Ensure no evicted blocks
assert
len
(
allocator
.
evictor
.
free_table
.
keys
())
==
0
# Ensure 1s ref counts for all blocks
assert
allocator
.
_refcounter
.
_refcounts
==
one_ref
# Free all blocks, and now all blocks shall be in the evictor
# there shall be no tracking data left in _blocks
# there shall be no tracking data left in _block_tracker
# all blocks shall be tracked in _cached_blocks
# all blocks' ref shall be zero
for
block
in
new_block
:
allocator
.
free
(
block
)
assert
len
(
allocator
.
_blocks
.
keys
())
==
0
# Verify post-free state
# Ensure no tracked blocks
assert
len
(
allocator
.
_block_tracker
.
keys
())
==
num_blocks
for
block_id
in
range
(
num_blocks
):
assert
not
allocator
.
_block_tracker
[
block_id
].
active
# Ensure no blocks in hashless allocator (all promoted)
assert
len
(
allocator
.
_hashless_allocator
.
_free_block_indices
)
==
0
# Ensure all blocks are cached
assert
list
(
allocator
.
_cached_blocks
.
values
())
==
all_blocks_list
# Ensure all blocks are inside the evictor
assert
list
(
allocator
.
evictor
.
free_table
.
keys
())
==
all_blocks_list
# Ensure 0s refcounts
assert
allocator
.
_refcounter
.
_refcounts
==
zero_ref
# Allocate a mutable block, and the first block shall be evicted
# and set its content hash into None, ref to 1
mutable
=
allocator
.
allocate_mutable
(
prev_block
=
None
)
mutable
=
allocator
.
allocate_mutable
_block
(
prev_block
=
None
)
assert
mutable
.
block_id
==
0
assert
mutable
.
content_hash
is
None
assert
0
in
allocator
.
_block
s
assert
allocator
.
_block
_tracker
[
0
].
active
assert
allocator
.
_refcounter
.
get
(
0
)
==
1
assert
0
not
in
allocator
.
_cached_blocks
assert
0
not
in
allocator
.
evictor
...
...
@@ -502,27 +530,27 @@ class TestPrefixCachingBlockAllocator:
# hashless allocator
allocator
.
free
(
mutable
)
assert
len
(
allocator
.
_block
s
.
keys
())
==
0
assert
not
allocator
.
_block
_tracker
[
0
].
active
assert
allocator
.
_refcounter
.
_refcounts
==
zero_ref
assert
0
not
in
allocator
.
_cached_blocks
assert
0
not
in
allocator
.
evictor
assert
0
in
allocator
.
_hashless_allocator
.
_free_block_indices
# when allocate immutable with first block_size tokens, we
# When allocate immutable with first block_size tokens, we
# shall get free block from hashless allocator, thus no block left
# in hashless
block
=
allocator
.
allocate_immutable
(
prev
_block
=
None
,
token_ids
=
token_ids
[:
block_size
])
block
=
allocator
.
allocate_immutable_block
(
prev_block
=
None
,
token_ids
=
token_ids
[:
block_size
])
assert
block
.
block_id
==
0
assert
len
(
allocator
.
_hashless_allocator
.
_free_block_indices
)
==
0
assert
0
in
allocator
.
_block
s
assert
allocator
.
_block
_tracker
[
0
].
active
assert
0
in
allocator
.
_cached_blocks
.
values
()
assert
allocator
.
_refcounter
.
get
(
0
)
==
1
assert
0
not
in
allocator
.
evictor
# allocate mutable block again, it shall be popped from evictor
mutable
=
allocator
.
allocate_mutable
(
prev_block
=
None
)
mutable
=
allocator
.
allocate_mutable
_block
(
prev_block
=
None
)
assert
len
(
allocator
.
_hashless_allocator
.
_free_block_indices
)
==
0
assert
mutable
.
block_id
not
in
allocator
.
evictor
.
free_table
assert
allocator
.
_refcounter
.
get
(
mutable
.
block_id
)
==
1
...
...
@@ -608,7 +636,7 @@ class TestPrefixCachingBlockAllocator:
)
->
List
[
PrefixCachingBlock
]:
"""Helper method which creates a chain of blocks.
"""
blocks
=
[]
blocks
:
List
[
Block
]
=
[]
num_blocks
=
math
.
ceil
(
len
(
token_ids
)
/
block_size
)
if
num_blocks
==
0
:
...
...
@@ -619,7 +647,7 @@ class TestPrefixCachingBlockAllocator:
block_token_ids
=
token_ids
[
block_number
*
block_size
:(
block_number
+
1
)
*
block_size
]
prev_block
=
allocator
.
allocate_immutable
(
prev_block
=
allocator
.
allocate_immutable
_block
(
prev_block
=
prev_block
,
token_ids
=
block_token_ids
)
blocks
.
append
(
prev_block
)
...
...
tests/core/test_chunked_prefill_scheduler.py
View file @
705f6a35
...
...
@@ -149,7 +149,7 @@ def test_complex():
# Only the first seq group has a new token appended.
append_new_token
(
running
[
0
],
1
)
# Add 2 more requsets.
# Add 2 more requests.
for
i
in
range
(
2
,
4
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
scheduler
.
add_seq_group
(
seq_group
)
...
...
@@ -483,11 +483,11 @@ def test_chunked_prefill_preempt():
# The request should be preempted.
scheduler
.
block_manager
.
can_append_slots
=
MagicMock
()
def
cannot_append_second_group
(
seq_group
,
num_lookahead_slots
):
def
cannot_append_second_group
1
(
seq_group
,
num_lookahead_slots
):
return
seq_group
.
request_id
!=
"1"
scheduler
.
block_manager
.
can_append_slots
.
side_effect
=
(
cannot_append_second_group
)
cannot_append_second_group
1
)
# The running prefill is now preempted.
_
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
...
...
@@ -505,11 +505,11 @@ def test_chunked_prefill_preempt():
assert
seq_group
.
get_num_uncomputed_tokens
()
==
30
# We should be able to run prefill twice as it is chunked.
def
cannot_append_second_group
(
seq_group
,
num_lookahead_slots
):
def
cannot_append_second_group
2
(
seq_group
,
num_lookahead_slots
):
return
True
scheduler
.
block_manager
.
can_append_slots
.
side_effect
=
(
cannot_append_second_group
)
cannot_append_second_group
2
)
_
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
assert
len
(
out
.
scheduled_seq_groups
)
==
1
assert
out
.
num_prefill_groups
==
1
...
...
@@ -530,7 +530,7 @@ def test_chunked_prefill_max_seqs():
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
=
[]
running
:
List
[
SequenceGroup
]
=
[]
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
65
)
scheduler
.
add_seq_group
(
seq_group
)
...
...
tests/core/test_scheduler.py
View file @
705f6a35
import
time
from
collections
import
deque
from
typing
import
List
from
typing
import
Deque
,
List
,
Set
,
Tuple
from
unittest.mock
import
MagicMock
import
pytest
# noqa
...
...
@@ -65,7 +65,7 @@ def test_scheduler_abort_seq_group():
# Add multiple seq groups to scheduler.
num_seq_group
=
4
request_ids
=
set
()
request_ids
:
Set
[
str
]
=
set
()
for
i
in
range
(
num_seq_group
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
...
...
@@ -347,7 +347,7 @@ def test_prefill_schedule_max_prompt_len():
Test prompt longer than max_prompt_len is aborted.
"""
scheduler
=
initialize_scheduler
(
max_model_len
=
30
)
_
,
seq_group
=
create_dummy_prompt
(
0
,
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
"0"
,
prompt_length
=
60
)
waiting
=
deque
([
seq_group
])
budget
=
create_token_budget
()
remaining_waiting
,
output
=
scheduler
.
_schedule_prefills
(
...
...
@@ -364,7 +364,7 @@ def test_prefill_schedule_token_budget():
Test token budget respected.
"""
scheduler
=
initialize_scheduler
()
waiting
=
deque
()
waiting
:
Deque
[
SequenceGroup
]
=
deque
()
budget
=
create_token_budget
(
token_budget
=
0
)
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
...
...
@@ -419,7 +419,7 @@ def test_prefill_schedule_max_seqs():
Test max seq respected.
"""
scheduler
=
initialize_scheduler
()
waiting
=
deque
()
waiting
:
Deque
[
SequenceGroup
]
=
deque
()
budget
=
create_token_budget
(
max_num_seqs
=
2
)
for
i
in
range
(
3
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
...
...
@@ -453,9 +453,9 @@ def test_prefill_schedule_max_lora():
"""
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
1
)
scheduler
=
initialize_scheduler
(
lora_config
=
lora_config
)
waiting
=
deque
()
waiting
:
Deque
[
SequenceGroup
]
=
deque
()
budget
=
create_token_budget
(
token_budget
=
120
)
curr_loras
=
set
()
curr_loras
:
Set
[
int
]
=
set
()
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
...
...
@@ -499,7 +499,7 @@ def test_prefill_schedule_no_block_manager_capacity():
Test sequence cannot be scheduled due to block manager has no capacity.
"""
scheduler
=
initialize_scheduler
()
waiting
=
deque
()
waiting
:
Deque
[
SequenceGroup
]
=
deque
()
budget
=
create_token_budget
()
for
i
in
range
(
3
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
...
...
@@ -536,7 +536,7 @@ def test_decode_schedule_preempted():
Test decodes cannot be scheduled and preempted.
"""
scheduler
=
initialize_scheduler
()
running
=
deque
()
running
:
Deque
[
SequenceGroup
]
=
deque
()
policy
=
PolicyFactory
.
get_policy
(
policy_name
=
"fcfs"
)
curr_loras
=
None
for
i
in
range
(
3
):
...
...
@@ -577,7 +577,7 @@ def test_decode_swap_beam_search():
Test best_of > 1 swap out blocks
"""
scheduler
=
initialize_scheduler
()
running
=
deque
()
running
:
Deque
[
SequenceGroup
]
=
deque
()
policy
=
PolicyFactory
.
get_policy
(
policy_name
=
"fcfs"
)
curr_loras
=
None
budget
=
create_token_budget
()
...
...
@@ -628,7 +628,7 @@ def test_schedule_decode_blocks_to_copy_update():
"""
scheduler
=
initialize_scheduler
()
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
running
=
deque
()
running
:
Deque
[
SequenceGroup
]
=
deque
()
policy
=
PolicyFactory
.
get_policy
(
policy_name
=
"fcfs"
)
curr_loras
=
None
scheduler
.
_allocate_and_set_running
(
seq_group
)
...
...
@@ -656,10 +656,10 @@ def test_schedule_decode_blocks_to_copy_update():
def
test_schedule_swapped_simple
():
scheduler
=
initialize_scheduler
()
swapped
=
deque
()
swapped
:
Deque
[
SequenceGroup
]
=
deque
()
policy
=
PolicyFactory
.
get_policy
(
policy_name
=
"fcfs"
)
curr_loras
=
None
blocks_to_swap_out
=
[]
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
...
...
@@ -683,10 +683,10 @@ def test_schedule_swapped_simple():
def
test_schedule_swapped_max_token_budget
():
scheduler
=
initialize_scheduler
()
swapped
=
deque
()
swapped
:
Deque
[
SequenceGroup
]
=
deque
()
policy
=
PolicyFactory
.
get_policy
(
policy_name
=
"fcfs"
)
curr_loras
=
None
blocks_to_swap_out
=
[]
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
for
_
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
...
...
@@ -717,10 +717,10 @@ def test_schedule_swapped_max_token_budget():
def
test_schedule_swapped_max_seqs
():
scheduler
=
initialize_scheduler
()
swapped
=
deque
()
swapped
:
Deque
[
SequenceGroup
]
=
deque
()
policy
=
PolicyFactory
.
get_policy
(
policy_name
=
"fcfs"
)
curr_loras
=
None
blocks_to_swap_out
=
[]
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
for
i
in
range
(
4
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
...
...
@@ -750,10 +750,10 @@ def test_schedule_swapped_max_seqs():
def
test_schedule_swapped_max_loras
():
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
1
)
scheduler
=
initialize_scheduler
(
lora_config
=
lora_config
)
swapped
=
deque
()
swapped
:
Deque
[
SequenceGroup
]
=
deque
()
policy
=
PolicyFactory
.
get_policy
(
policy_name
=
"fcfs"
)
curr_loras
=
set
()
blocks_to_swap_out
=
[]
curr_loras
:
Set
[
int
]
=
set
()
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
...
...
@@ -779,10 +779,10 @@ def test_schedule_swapped_max_loras():
def
test_schedule_swapped_cannot_swap_in
():
scheduler
=
initialize_scheduler
()
swapped
=
deque
()
swapped
:
Deque
[
SequenceGroup
]
=
deque
()
policy
=
PolicyFactory
.
get_policy
(
policy_name
=
"fcfs"
)
curr_loras
=
None
blocks_to_swap_out
=
[]
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
for
_
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
...
...
@@ -806,10 +806,10 @@ def test_schedule_swapped_cannot_swap_in():
def
test_infeasible_swap
():
scheduler
=
initialize_scheduler
()
swapped
=
deque
()
swapped
:
Deque
[
SequenceGroup
]
=
deque
()
policy
=
PolicyFactory
.
get_policy
(
policy_name
=
"fcfs"
)
curr_loras
=
None
blocks_to_swap_out
=
[]
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
for
_
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
...
...
@@ -834,13 +834,13 @@ def test_infeasible_swap():
def
test_schedule_swapped_blocks_to_copy
():
scheduler
=
initialize_scheduler
()
swapped
=
deque
()
swapped
:
Deque
[
SequenceGroup
]
=
deque
()
policy
=
PolicyFactory
.
get_policy
(
policy_name
=
"fcfs"
)
curr_loras
=
None
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
blocks_to_swap_out
=
[]
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
swapped
.
append
(
seq_group
)
...
...
tests/core/utils.py
View file @
705f6a35
import
time
from
typing
import
Iterable
,
Optional
,
Tuple
from
typing
import
List
,
Optional
from
typing
import
Sequence
as
GenericSequence
from
typing
import
Tuple
from
vllm
import
SamplingParams
from
vllm.lora.request
import
LoRARequest
...
...
@@ -46,7 +48,7 @@ def create_dummy_prompt_encoder_decoder(
lora_request
:
Optional
[
LoRARequest
]
=
None
,
use_beam_search
:
bool
=
False
,
best_of
:
int
=
1
,
)
->
Tuple
[
Sequence
,
SequenceGroup
]:
)
->
Tuple
[
Sequence
,
Sequence
,
SequenceGroup
]:
if
not
block_size
:
block_size
=
decoder_prompt_length
...
...
@@ -86,7 +88,7 @@ def create_dummy_prompt_encoder_decoder(
def
create_seq_group
(
seq_prompt_len
:
int
=
1024
,
seq_output_lens
:
Iterabl
e
[
int
]
=
(
128
,
),
seq_output_lens
:
GenericSequenc
e
[
int
]
=
(
128
,
),
request_id
:
str
=
'0'
,
seq_id_start
:
int
=
0
,
sampling_params
:
Optional
[
SamplingParams
]
=
None
)
->
SequenceGroup
:
...
...
@@ -98,7 +100,7 @@ def create_seq_group(
prompt_token_ids
=
[
0
]
*
seq_prompt_len
seqs
=
[]
seqs
:
List
[
Sequence
]
=
[]
for
seq_id_offset
,
output_len
in
enumerate
(
seq_output_lens
):
seq
=
Sequence
(
seq_id
=
seq_id_start
+
seq_id_offset
,
...
...
@@ -125,7 +127,7 @@ def create_seq_group(
def
create_seq_group_encoder_decoder
(
seq_prompt_len
:
int
=
1024
,
seq_output_lens
:
Iterabl
e
[
int
]
=
(
128
,
),
seq_output_lens
:
GenericSequenc
e
[
int
]
=
(
128
,
),
request_id
:
str
=
'0'
,
seq_id_start
:
int
=
0
,
sampling_params
:
Optional
[
SamplingParams
]
=
None
)
->
SequenceGroup
:
...
...
tests/distributed/test_basic_distributed_correctness.py
View file @
705f6a35
...
...
@@ -15,16 +15,18 @@ TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \
import
os
import
pytest
import
torch
from
vllm.utils
import
cuda_device_count_stateless
from
..models.utils
import
check_outputs_equal
MODELS
=
[
os
.
environ
[
"TEST_DIST_MODEL"
],
]
DISTRIBUTED_EXECUTOR_BACKEND
=
"DISTRIBUTED_EXECUTOR_BACKEND"
VLLM_ATTENTION_BACKEND
=
"VLLM_ATTENTION_BACKEND"
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
@
pytest
.
mark
.
skipif
(
cuda
_
device_count
_stateless
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
...
...
@@ -39,24 +41,23 @@ def test_models(
)
->
None
:
distributed_executor_backend
=
os
.
getenv
(
DISTRIBUTED_EXECUTOR_BACKEND
)
backend_by_env_var
=
os
.
getenv
(
VLLM_ATTENTION_BACKEND
)
enforce_eager
=
backend_by_env_var
==
"FLASHINFER"
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
with
vllm_runner
(
model
,
dtype
=
dtype
,
tensor_parallel_size
=
2
,
enforce_eager
=
enforce_eager
,
distributed_executor_backend
=
distributed_executor_backend
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
for
i
in
range
(
len
(
example_prompts
)):
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
vllm_output_ids
,
vllm_output_str
=
vllm_outputs
[
i
]
assert
hf_output_str
==
vllm_output_str
,
(
f
"Test
{
i
}
:
\n
HF:
{
hf_output_str
!
r
}
\n
vLLM:
{
vllm_output_str
!
r
}
"
)
assert
hf_output_ids
==
vllm_output_ids
,
(
f
"Test
{
i
}
:
\n
HF:
{
hf_output_ids
}
\n
vLLM:
{
vllm_output_ids
}
"
)
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
check_outputs_equal
(
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
vllm_outputs
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
tests/distributed/test_chunked_prefill_distributed.py
View file @
705f6a35
...
...
@@ -14,7 +14,10 @@ TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \
import
os
import
pytest
import
torch
from
vllm.utils
import
cuda_device_count_stateless
from
..models.utils
import
check_outputs_equal
MODELS
=
[
os
.
environ
[
"TEST_DIST_MODEL"
],
...
...
@@ -22,7 +25,7 @@ MODELS = [
DISTRIBUTED_EXECUTOR_BACKEND
=
"DISTRIBUTED_EXECUTOR_BACKEND"
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
@
pytest
.
mark
.
skipif
(
cuda
_
device_count
_stateless
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
...
...
@@ -45,8 +48,10 @@ def test_models(
enable_chunked_prefill
=
True
max_num_batched_tokens
=
chunked_prefill_token_size
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
with
vllm_runner
(
model
,
...
...
@@ -59,10 +64,12 @@ def test_models(
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
for
i
in
range
(
len
(
example_prompts
)):
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
vllm_output_ids
,
vllm_output_str
=
vllm_outputs
[
i
]
assert
hf_output_str
==
vllm_output_str
,
(
f
"Test
{
i
}
:
\n
HF:
{
hf_output_str
!
r
}
\n
vLLM:
{
vllm_output_str
!
r
}
"
)
assert
hf_output_ids
==
vllm_output_ids
,
(
f
"Test
{
i
}
:
\n
HF:
{
hf_output_ids
}
\n
vLLM:
{
vllm_output_ids
}
"
)
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
check_outputs_equal
(
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
vllm_outputs
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
tests/distributed/test_comm_ops.py
View file @
705f6a35
...
...
@@ -8,12 +8,11 @@ import pytest
import
ray
import
torch
from
vllm.distributed
import
(
broadcast_tensor_dict
,
from
vllm.distributed
import
(
broadcast_tensor_dict
,
get_pp_group
,
tensor_model_parallel_all_gather
,
tensor_model_parallel_all_reduce
)
from
..utils
import
(
init_test_distributed_environment
,
multi_process_tensor_parallel
)
from
..utils
import
init_test_distributed_environment
,
multi_process_parallel
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
...
...
@@ -33,7 +32,7 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
(
r
+
1
)
for
r
in
range
(
tp_size
)
]
expected
=
torch
.
sum
(
torch
.
stack
(
all_tensors
,
dim
=
0
),
dim
=
0
)
t
=
all_tensors
[
rank
]
t
=
all_tensors
[
rank
%
tp_size
]
t
=
tensor_model_parallel_all_reduce
(
t
)
assert
torch
.
allclose
(
t
,
expected
)
...
...
@@ -61,7 +60,7 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
for
r
in
range
(
tp_size
)
]
expected
=
torch
.
cat
(
all_tensors
,
dim
=
all_gather_dimension
)
t
=
all_tensors
[
rank
]
t
=
all_tensors
[
rank
%
tp_size
]
t
=
tensor_model_parallel_all_gather
(
t
,
all_gather_dimension
)
assert
torch
.
allclose
(
t
,
expected
)
...
...
@@ -92,7 +91,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
"f"
:
torch
.
tensor
([],
dtype
=
torch
.
float32
,
device
=
"cuda"
),
}
if
rank
==
0
:
if
(
rank
%
tp_size
)
==
0
:
broadcast_tensor_dict
(
test_dict
,
src
=
0
)
else
:
recv_dict
=
broadcast_tensor_dict
(
src
=
0
)
...
...
@@ -105,6 +104,68 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
assert
torch
.
allclose
(
recv_dict
[
"f"
],
test_dict
[
"f"
])
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
def
send_recv_tensor_dict_test_worker
(
tp_size
:
int
,
pp_size
:
int
,
rank
:
int
,
distributed_init_port
:
str
):
del
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
distributed_init_port
)
test_dict
=
{
# device tensor
"a"
:
torch
.
arange
(
8
,
dtype
=
torch
.
float32
,
device
=
"cuda"
),
# CPU tensor
"b"
:
torch
.
arange
(
16
,
dtype
=
torch
.
int8
,
device
=
"cpu"
),
"c"
:
"test"
,
"d"
:
[
1
,
2
,
3
],
"e"
:
{
"a"
:
1
,
"b"
:
2
},
# empty tensor
"f"
:
torch
.
tensor
([],
dtype
=
torch
.
float32
,
device
=
"cuda"
),
}
if
not
get_pp_group
().
is_first_rank
:
recv_dict
=
get_pp_group
().
recv_tensor_dict
()
if
not
get_pp_group
().
is_last_rank
:
get_pp_group
().
send_tensor_dict
(
test_dict
)
if
not
get_pp_group
().
is_first_rank
:
assert
len
(
recv_dict
)
==
len
(
test_dict
)
assert
torch
.
allclose
(
recv_dict
[
"a"
],
test_dict
[
"a"
])
assert
torch
.
allclose
(
recv_dict
[
"b"
],
test_dict
[
"b"
])
assert
recv_dict
[
"c"
]
==
test_dict
[
"c"
]
assert
recv_dict
[
"d"
]
==
test_dict
[
"d"
]
assert
recv_dict
[
"e"
]
==
test_dict
[
"e"
]
assert
torch
.
allclose
(
recv_dict
[
"f"
],
test_dict
[
"f"
])
@ray.remote(num_gpus=1, max_calls=1)
def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
                          distributed_init_port: str):
    """Ray worker: send one float32 tensor to the next pipeline stage.

    Ranks that are not the first pipeline rank receive a 64-element tensor,
    ranks that are not the last one send their own, and each receiver checks
    the received tensor against its local copy.

    Args:
        tp_size: Tensor-parallel size passed to the test environment setup.
        pp_size: Pipeline-parallel size passed to the test environment setup.
        rank: Rank of this worker; also used as the CUDA device index.
        distributed_init_port: Port passed to the test environment setup.
    """
    # The mask is removed before ``cuda:{rank}`` is selected; presumably so the
    # worker can address GPUs by global rank index -- TODO confirm.
    del os.environ["CUDA_VISIBLE_DEVICES"]
    worker_device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(worker_device)
    init_test_distributed_environment(tp_size, pp_size, rank,
                                      distributed_init_port)

    size = 64
    test_tensor = torch.arange(size, dtype=torch.float32, device="cuda")

    is_first = get_pp_group().is_first_rank

    # Receive first, then send, in the same order on every rank.
    if not is_first:
        recv_tensor = get_pp_group().recv(size, dtype=torch.float32)
    if not get_pp_group().is_last_rank:
        get_pp_group().send(test_tensor)

    if is_first:
        return
    assert torch.allclose(test_tensor, recv_tensor)
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
)
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
2
])
...
...
@@ -113,4 +174,27 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
broadcast_tensor_dict_test_worker
])
def
test_multi_process_tensor_parallel
(
tp_size
,
test_target
):
multi_process_tensor_parallel
(
tp_size
,
1
,
test_target
)
multi_process_parallel
(
tp_size
,
1
,
test_target
)
@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("pp_size", [2])
@pytest.mark.parametrize(
    "test_target",
    [send_recv_test_worker, send_recv_tensor_dict_test_worker])
def test_multi_process_pipeline_parallel(pp_size, test_target):
    """Run each send/recv worker with pipeline parallelism only."""
    # Tensor-parallel size is fixed at 1 here.
    tp_size = 1
    multi_process_parallel(tp_size, pp_size, test_target)
@pytest.mark.skipif(torch.cuda.device_count() < 4,
                    reason="Need at least 4 GPUs to run the test.")
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("pp_size", [2])
@pytest.mark.parametrize("test_target", [
    send_recv_test_worker,
    send_recv_tensor_dict_test_worker,
    all_reduce_test_worker,
    all_gather_test_worker,
    broadcast_tensor_dict_test_worker,
])
def test_multi_process_tensor_parallel_pipeline_parallel(
        tp_size, pp_size, test_target):
    """Run every communication worker with both TP and PP enabled."""
    parallel_sizes = (tp_size, pp_size)
    multi_process_parallel(*parallel_sizes, test_target)
tests/distributed/test_custom_all_reduce.py
View file @
705f6a35
...
...
@@ -7,12 +7,12 @@ import torch
import
torch.distributed
as
dist
from
vllm.distributed.communication_op
import
(
# noqa
graph_capture
,
tensor_model_parallel_all_reduce
)
tensor_model_parallel_all_reduce
)
from
vllm.distributed.parallel_state
import
(
get_tensor_model_parallel_group
,
get_tp_
ca_communicator
)
get_tp_
group
,
graph_capture
)
from
..utils
import
(
init_test_distributed_environment
,
multi_process_
tensor_
parallel
)
from
..utils
import
(
ensure_model_parallel_initialized
,
init_test_distributed_environment
,
multi_process_parallel
)
random
.
seed
(
42
)
test_sizes
=
[
random
.
randint
(
1024
,
2048
*
1024
)
for
_
in
range
(
8
)]
...
...
@@ -27,8 +27,8 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
torch
.
cuda
.
set_device
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
distributed_init_port
)
group
=
get_tensor_model_parallel_group
()
ensure_model_parallel_initialized
(
tp_size
,
pp_size
)
group
=
get_tensor_model_parallel_group
()
.
device_group
# A small all_reduce for warmup.
# this is needed because device communicators might be created lazily
...
...
@@ -91,7 +91,7 @@ def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
# communicate independently
num_communication
=
rank
//
tp_size
+
1
sz
=
1024
fa
=
get_tp_
ca_communicator
()
fa
=
get_tp_
group
().
ca_comm
inp
=
torch
.
ones
(
sz
,
dtype
=
torch
.
float32
,
device
=
device
)
out
=
inp
for
_
in
range
(
num_communication
):
...
...
@@ -112,4 +112,4 @@ def test_custom_allreduce(tp_size, pipeline_parallel_size, test_target):
world_size
=
tp_size
*
pipeline_parallel_size
if
world_size
>
torch
.
cuda
.
device_count
():
pytest
.
skip
(
"Not enough GPUs to run the test."
)
multi_process_
tensor_
parallel
(
tp_size
,
pipeline_parallel_size
,
test_target
)
multi_process_parallel
(
tp_size
,
pipeline_parallel_size
,
test_target
)
tests/distributed/test_multimodal_broadcast.py
0 → 100644
View file @
705f6a35
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
The second test will hang if more than one test is run per command, so we need
to run the tests one by one. The solution is to pass arguments (model name) by
environment variables.

Run:
```sh
TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf \
    pytest -s -v test_multimodal_broadcast.py
TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct \
    pytest -s -v test_multimodal_broadcast.py
```
"""
import
os
import
pytest
from
vllm.utils
import
cuda_device_count_stateless
# The model under test comes from the environment; a missing TEST_DIST_MODEL
# raises KeyError while this module is being imported.
model = os.environ["TEST_DIST_MODEL"]

# Reject anything that is not one of the two supported model families.
if not model.startswith(("llava-hf/llava", "microsoft/Phi-3-vision")):
    raise NotImplementedError(f"Unsupported model: {model}")

# Reuse the model list and the test driver of the matching model test module.
if model.startswith("llava-hf/llava"):
    from ..models.test_llava import models, run_test
else:
    from ..models.test_phi3v import models, run_test
@pytest.mark.parametrize("tensor_parallel_size", [2])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, image_assets,
                tensor_parallel_size: int, dtype: str, max_tokens: int,
                num_logprobs: int) -> None:
    """Run the imported ``run_test`` driver for the first model in ``models``.

    The test is skipped when fewer GPUs than ``tensor_parallel_size`` are
    reported. The executor backend is read from the
    ``DISTRIBUTED_EXECUTOR_BACKEND`` environment variable (``None`` if unset).
    """
    available_gpus = cuda_device_count_stateless()
    if available_gpus < tensor_parallel_size:
        pytest.skip(
            f"Need at least {tensor_parallel_size} GPUs to run the test.")

    run_kwargs = {
        "model": models[0],
        "size_factors": [1.0],
        "dtype": dtype,
        "max_tokens": max_tokens,
        "num_logprobs": num_logprobs,
        "tensor_parallel_size": tensor_parallel_size,
        "distributed_executor_backend":
        os.getenv("DISTRIBUTED_EXECUTOR_BACKEND"),
    }
    run_test(hf_runner, vllm_runner, image_assets, **run_kwargs)
tests/distributed/test_parallel_state.py
0 → 100644
View file @
705f6a35
from
typing
import
Any
,
Dict
import
pytest
import
torch
from
vllm.distributed.parallel_state
import
(
_split_tensor_dict
,
_update_nested_dict
)
def test_split_tensor_dict():
    """Check that ``_split_tensor_dict`` flattens a nested dict as expected.

    The input mixes plain values, tensors, a nested dict and an empty dict.
    The test expects six metadata entries and the three tensors returned in
    the order ``key_b``, ``key_c/key_1``, ``key_c/key_2``.
    """
    nested = {
        "key_1": torch.arange(5, dtype=torch.float32),
        "key_2": torch.tensor([], dtype=torch.float32),
        "key_3": 123,
    }
    test_dict = {
        "key_a": "a",
        "key_b": torch.arange(8, dtype=torch.float32),
        "key_c": nested,
        "key_d": {},
    }

    metadata_list, tensor_list = _split_tensor_dict(test_dict)

    assert len(metadata_list) == 6
    expected_tensors = [test_dict["key_b"], nested["key_1"], nested["key_2"]]
    for position, expected in enumerate(expected_tensors):
        assert torch.allclose(tensor_list[position], expected)
def test_split_tensor_dict_invalid_key():
    """A key containing ``%`` must make ``_split_tensor_dict`` assert."""
    # "%" is the separator used by the flattened keys in
    # test_update_nested_dict; presumably that is why it is rejected here.
    bad_dict = {"a%b": "a"}
    with pytest.raises(AssertionError):
        _split_tensor_dict(bad_dict)
def test_update_nested_dict():
    """Check that ``%``-separated flat keys are rebuilt into nested dicts."""
    flattened_keys_values = [
        ("key1%key2%key3", "value1"),
        ("key1%key2%key4", "value2"),
        ("key1%key5", "value3"),
        ("key6%key7", "value4"),
        ("key8", "value5"),
    ]
    expected = {
        "key1": {
            "key2": {
                "key3": "value1",
                "key4": "value2"
            },
            "key5": "value3"
        },
        "key6": {
            "key7": "value4"
        },
        "key8": "value5"
    }

    res: Dict[str, Any] = {}
    # Entries are applied one at a time to the same accumulator dict.
    for entry in flattened_keys_values:
        flat_key, value = entry
        _update_nested_dict(res, flat_key, value)

    assert res == expected
tests/distributed/test_pipeline_parallel.py
0 → 100644
View file @
705f6a35
import
os
import
openai
# use the official client for correctness check
import
pytest
from
..utils
import
RemoteOpenAIServer
# Model served by the test server; it is also the id the tests expect the
# server to report.
MODEL_NAME = "meta-llama/Meta-Llama-3-8B"

# Test configuration read from the environment. The two flags are parsed as
# integers first, so "0" disables and any non-zero integer enables them.
EAGER_MODE = bool(int(os.getenv("EAGER_MODE", "0")))
CHUNKED_PREFILL = bool(int(os.getenv("CHUNKED_PREFILL", "0")))
TP_SIZE = int(os.getenv("TP_SIZE", "1"))
PP_SIZE = int(os.getenv("PP_SIZE", "1"))
# Apply the asyncio marker to every test in this module; the tests below are
# coroutine functions.
pytestmark = pytest.mark.asyncio
@pytest.fixture(scope="module")
def server():
    """Module-scoped fixture yielding a running ``RemoteOpenAIServer``.

    The command line is assembled from the module constants; chunked prefill
    and eager mode are added only when their flags are set.
    """
    args = [
        "--model",
        MODEL_NAME,
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--pipeline-parallel-size",
        str(PP_SIZE),
        "--tensor-parallel-size",
        str(TP_SIZE),
        "--distributed-executor-backend",
        "ray",
    ]
    if CHUNKED_PREFILL:
        args.append("--enable-chunked-prefill")
    if EAGER_MODE:
        args.append("--enforce-eager")

    # The context manager is left only after the fixture is finalized.
    with RemoteOpenAIServer(args) as remote_server:
        yield remote_server
@pytest.fixture(scope="module")
def client(server):
    """Module-scoped async client obtained from the ``server`` fixture."""
    async_client = server.get_async_client()
    return async_client
async def test_check_models(server, client: openai.AsyncOpenAI):
    """The server must list ``MODEL_NAME`` first and as the root of all models."""
    listing = await client.models.list()
    served_models = listing.data

    first_model = served_models[0]
    assert first_model.id == MODEL_NAME
    assert all(entry.root == MODEL_NAME for entry in served_models)
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_single_completion(server, client: openai.AsyncOpenAI,
                                 model_name: str):
    """Request one greedy completion, by text prompt and by token ids.

    Both requests target the parametrized ``model_name`` so the test keeps
    exercising the right model if more entries are added to the parameter
    list.
    """
    completion = await client.completions.create(model=model_name,
                                                 prompt="Hello, my name is",
                                                 max_tokens=5,
                                                 temperature=0.0)

    assert completion.id is not None
    assert completion.choices is not None and len(completion.choices) == 1
    assert completion.choices[0].text is not None and len(
        completion.choices[0].text) >= 5
    # max_tokens=5 is expected to be the reason generation stops.
    assert completion.choices[0].finish_reason == "length"
    assert completion.usage == openai.types.CompletionUsage(
        completion_tokens=5, prompt_tokens=6, total_tokens=11)

    # test using token IDs
    completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    assert completion.choices[0].text is not None and len(
        completion.choices[0].text) >= 5
@pytest.mark.parametrize(
    # a single model is enough for the batch tests
    "model_name",
    [MODEL_NAME],
)
async def test_batch_completions(server, client: openai.AsyncOpenAI,
                                 model_name: str):
    """Exercise batched prompts: plain list, ``n=2`` beam search, streaming."""
    # The same prompt twice, so both results can be compared with each other.
    prompts = ["Hello, my name is", "Hello, my name is"]

    # test simple list
    batch = await client.completions.create(
        model=model_name,
        prompt=prompts,
        max_tokens=5,
        temperature=0.0,
    )
    assert len(batch.choices) == 2
    assert batch.choices[0].text == batch.choices[1].text

    # test n = 2
    batch = await client.completions.create(
        model=model_name,
        prompt=prompts,
        n=2,
        max_tokens=5,
        temperature=0.0,
        # NOTE: this has to be true for n > 1 in vLLM, but not necessary
        # for official client.
        extra_body={"use_beam_search": True},
    )
    assert len(batch.choices) == 4
    assert batch.choices[0].text != batch.choices[
        1].text, "beam search should be different"
    assert batch.choices[0].text == batch.choices[
        2].text, "two copies of the same prompt should be the same"
    assert batch.choices[1].text == batch.choices[
        3].text, "two copies of the same prompt should be the same"

    # test streaming
    stream = await client.completions.create(
        model=model_name,
        prompt=prompts,
        max_tokens=5,
        temperature=0.0,
        stream=True,
    )
    # Each chunk carries one choice; its text is appended per prompt index.
    texts = ["", ""]
    async for chunk in stream:
        assert len(chunk.choices) == 1
        piece = chunk.choices[0]
        texts[piece.index] += piece.text
    assert texts[0] == texts[1]
tests/distributed/test_pynccl.py
View file @
705f6a35
import
multiprocessing
import
os
from
typing
import
Dict
,
List
import
pytest
import
torch
import
torch.distributed
from
vllm.distributed.communication_op
import
(
# noqa
graph_capture
,
tensor_model_parallel_all_reduce
)
tensor_model_parallel_all_reduce
)
from
vllm.distributed.device_communicators.pynccl
import
PyNcclCommunicator
from
vllm.distributed.device_communicators.pynccl_wrapper
import
NCCLLibrary
from
vllm.distributed.parallel_state
import
(
ensure_model_parallel_initialized
,
get_world_group
,
graph_capture
,
init_distributed_environment
)
from
vllm.utils
import
update_environment_variables
def
distributed_run
(
fn
,
world_size
):
number_of_processes
=
world_size
processes
=
[]
processes
:
List
[
multiprocessing
.
Process
]
=
[]
for
i
in
range
(
number_of_processes
):
env
=
{}
env
:
Dict
[
str
,
str
]
=
{}
env
[
'RANK'
]
=
str
(
i
)
env
[
'LOCAL_RANK'
]
=
str
(
i
)
env
[
'WORLD_SIZE'
]
=
str
(
number_of_processes
)
...
...
@@ -53,7 +55,8 @@ def worker_fn_wrapper(fn):
@
worker_fn_wrapper
def
worker_fn
():
pynccl_comm
=
PyNcclCommunicator
()
pynccl_comm
=
PyNcclCommunicator
(
get_world_group
().
cpu_group
,
device
=
get_world_group
().
device
)
tensor
=
torch
.
ones
(
16
,
1024
,
1024
,
dtype
=
torch
.
float32
).
cuda
(
pynccl_comm
.
rank
)
with
pynccl_comm
.
change_state
(
enable
=
True
):
...
...
@@ -129,7 +132,8 @@ def test_pynccl_multiple_allreduce_with_vllm():
def
worker_fn_with_cudagraph
():
with
torch
.
no_grad
():
graph
=
torch
.
cuda
.
CUDAGraph
()
pynccl_comm
=
PyNcclCommunicator
()
pynccl_comm
=
PyNcclCommunicator
(
get_world_group
().
cpu_group
,
device
=
get_world_group
().
device
)
# run something in the default stream to initialize torch engine
a
=
torch
.
ones
((
4
,
4
),
device
=
f
'cuda:
{
pynccl_comm
.
rank
}
'
)
torch
.
cuda
.
synchronize
()
...
...
@@ -154,7 +158,8 @@ def test_pynccl_with_cudagraph():
@
worker_fn_wrapper
def
send_recv_worker_fn
():
pynccl_comm
=
PyNcclCommunicator
()
pynccl_comm
=
PyNcclCommunicator
(
get_world_group
().
cpu_group
,
device
=
get_world_group
().
device
)
if
pynccl_comm
.
rank
==
0
:
tensor
=
torch
.
ones
(
16
,
1024
,
1024
,
dtype
=
torch
.
float32
).
cuda
(
pynccl_comm
.
rank
)
...
...
@@ -163,9 +168,13 @@ def send_recv_worker_fn():
dtype
=
torch
.
float32
).
cuda
(
pynccl_comm
.
rank
)
with
pynccl_comm
.
change_state
(
enable
=
True
):
if
pynccl_comm
.
rank
==
0
:
pynccl_comm
.
send
(
tensor
)
pynccl_comm
.
send
(
tensor
,
dst
=
(
pynccl_comm
.
rank
+
1
)
%
pynccl_comm
.
world_size
)
else
:
pynccl_comm
.
recv
(
tensor
)
pynccl_comm
.
recv
(
tensor
,
src
=
(
pynccl_comm
.
rank
-
1
)
%
pynccl_comm
.
world_size
)
result
=
tensor
.
mean
().
cpu
().
item
()
assert
result
==
1
...
...
@@ -198,9 +207,13 @@ def multiple_send_recv_worker_fn():
device
=
device
)
with
pynccl_comm
.
change_state
(
enable
=
True
):
if
torch
.
distributed
.
get_rank
()
in
[
0
,
1
]:
pynccl_comm
.
send
(
tensor
)
pynccl_comm
.
send
(
tensor
,
dst
=
(
pynccl_comm
.
rank
+
1
)
%
pynccl_comm
.
world_size
)
else
:
pynccl_comm
.
recv
(
tensor
)
pynccl_comm
.
recv
(
tensor
,
src
=
(
pynccl_comm
.
rank
-
1
)
%
pynccl_comm
.
world_size
)
result
=
tensor
.
mean
().
cpu
().
item
()
if
torch
.
distributed
.
get_rank
()
in
[
0
,
2
]:
assert
result
==
1
...
...
tests/distributed/test_same_node.py
View file @
705f6a35
...
...
@@ -2,10 +2,12 @@ import os
import
torch
from
vllm.distributed.parallel_state
import
is_
in_the_same_node
from
vllm.distributed.parallel_state
import
in_the_same_node
_as
torch
.
distributed
.
init_process_group
(
backend
=
"gloo"
)
test_result
=
is_in_the_same_node
(
torch
.
distributed
.
group
.
WORLD
)
test_result
=
all
(
in_the_same_node_as
(
torch
.
distributed
.
group
.
WORLD
,
source_rank
=
0
))
expected
=
os
.
environ
.
get
(
"VLLM_TEST_SAME_HOST"
,
"1"
)
==
"1"
assert
test_result
==
expected
,
f
"Expected
{
expected
}
, got
{
test_result
}
"
print
(
"Same node test passed!"
)
tests/distributed/test_shm_broadcast.py
0 → 100644
View file @
705f6a35
import
multiprocessing
import
random
import
time
from
typing
import
List
import
numpy
as
np
import
torch.distributed
as
dist
from
vllm.distributed.device_communicators.shm_broadcast
import
MessageQueue
from
vllm.utils
import
update_environment_variables
def get_arrays(n: int, seed: int = 0) -> List[np.ndarray]:
    """Return ``n`` pseudo-random integer arrays, reproducible from ``seed``.

    NumPy's global random state is reseeded with ``seed``. Lengths are drawn
    from ``[1, 10_000)`` and element values from ``[1, 100)``.
    """
    np.random.seed(seed)
    lengths = np.random.randint(1, 10_000, n)
    # on average, each array will have 5k elements
    # with int64, each array will have 40kb
    arrays: List[np.ndarray] = []
    for length in lengths:
        arrays.append(np.random.randint(1, 100, length))
    return arrays
def distributed_run(fn, world_size):
    """Start ``world_size`` processes running ``fn(env)`` and check they succeed.

    Each process receives a dict with the rank and rendezvous variables for
    its rank. The function waits for all processes and asserts that every
    exit code is 0.
    """
    number_of_processes = world_size
    processes: List[multiprocessing.Process] = []
    for rank in range(number_of_processes):
        env = {
            'RANK': str(rank),
            'LOCAL_RANK': str(rank),
            'WORLD_SIZE': str(number_of_processes),
            'LOCAL_WORLD_SIZE': str(number_of_processes),
            'MASTER_ADDR': 'localhost',
            'MASTER_PORT': '12345',
        }
        proc = multiprocessing.Process(target=fn, args=(env, ))
        processes.append(proc)
        proc.start()

    # Wait for everyone before looking at any exit code.
    for proc in processes:
        proc.join()

    for proc in processes:
        assert proc.exitcode == 0
def worker_fn_wrapper(fn):
    """Wrap ``fn`` so it runs with the given environment in a gloo group.

    The returned callable takes the environment dict, applies it, initializes
    the default process group with the ``gloo`` backend and then calls ``fn``
    with no arguments.
    """

    # `multiprocessing.Process` cannot accept environment variables directly
    # so we need to pass the environment variables as arguments
    # and update the environment variables in the function
    def wrapped_fn(env):
        update_environment_variables(env)
        dist.init_process_group(backend="gloo")
        fn()

    return wrapped_fn
@worker_fn_wrapper
def worker_fn():
    """Broadcast many arrays from the writer rank and verify them on readers.

    The writer rank picks a random seed and shares it, so every rank can
    generate the same arrays locally. The writer then pushes each array
    through the message queue and the other ranks compare what they read
    against their local copy.
    """
    writer_rank = 2
    broadcaster = MessageQueue.create_from_process_group(
        dist.group.WORLD, 40 * 1024, 2, writer_rank)
    rank = dist.get_rank()
    is_writer = rank == writer_rank

    # Only the writer fills in a seed; the other ranks receive it in place.
    seed_box = [random.randint(0, 1000)] if is_writer else [None]
    dist.broadcast_object_list(seed_box, writer_rank)
    seed = seed_box[0]  # type: ignore
    dist.barrier()
    # in case we find a race condition
    # print the seed so that we can reproduce the error
    print(f"Rank {rank} got seed {seed}")

    # test broadcasting with about 400MB of data
    N = 10_000
    arrs = get_arrays(N, seed)
    for x in arrs:
        if is_writer:
            broadcaster.broadcast_object(x)
        else:
            y = broadcaster.broadcast_object(None)
            assert np.array_equal(x, y)
        # Random sub-millisecond pause after every message on every rank.
        time.sleep(random.random() / 1000)
    dist.barrier()
def test_shm_broadcast():
    """Run ``worker_fn`` in four processes."""
    # worker_fn uses rank 2 as the writer, so more than two ranks are needed.
    world_size = 4
    distributed_run(worker_fn, world_size)
tests/distributed/test_utils.py
0 → 100644
View file @
705f6a35
import
ray
import
vllm.envs
as
envs
from
vllm.utils
import
(
cuda_device_count_stateless
,
update_environment_variables
)
@ray.remote
class _CUDADeviceCountStatelessTestActor:
    """Ray actor used to query and change GPU visibility in its own process."""

    def get_count(self):
        """Return the current result of ``cuda_device_count_stateless()``."""
        count = cuda_device_count_stateless()
        return count

    def set_cuda_visible_devices(self, cuda_visible_devices: str):
        """Set ``CUDA_VISIBLE_DEVICES`` to the given string."""
        new_env = {"CUDA_VISIBLE_DEVICES": cuda_visible_devices}
        update_environment_variables(new_env)

    def get_cuda_visible_devices(self):
        """Return ``CUDA_VISIBLE_DEVICES`` as exposed by ``vllm.envs``."""
        return envs.CUDA_VISIBLE_DEVICES
def test_cuda_device_count_stateless():
    """Test that cuda_device_count_stateless changes return value if
    CUDA_VISIBLE_DEVICES is changed."""
    actor = _CUDADeviceCountStatelessTestActor.options(  # type: ignore
        num_gpus=2).remote()

    # With two GPUs requested, the actor is expected to see devices 0 and 1.
    visible = ray.get(actor.get_cuda_visible_devices.remote())
    assert sorted(visible.split(",")) == ["0", "1"]
    assert ray.get(actor.get_count.remote()) == 2

    # Narrow the visible devices step by step and re-check the count.
    for devices, expected_count in (("0", 1), ("", 0)):
        ray.get(actor.set_cuda_visible_devices.remote(devices))
        assert ray.get(actor.get_count.remote()) == expected_count
tests/engine/output_processor/test_multi_step.py
View file @
705f6a35
...
...
@@ -32,7 +32,7 @@ def test_appends_token_ids(num_new_tokens: int, seq_output_len: int):
output_processor
=
MultiStepOutputProcessor
(
detokenizer
=
detokenizer
,
scheduler
=
scheduler
,
scheduler
=
[
scheduler
]
,
seq_counter
=
seq_counter
,
get_tokenizer_for_seq
=
lambda
_
:
mock_tokenizer
(),
stop_checker
=
stop_checker
,
...
...
@@ -86,7 +86,7 @@ def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int,
output_processor
=
MultiStepOutputProcessor
(
detokenizer
=
detokenizer
,
scheduler
=
scheduler
,
scheduler
=
[
scheduler
]
,
seq_counter
=
seq_counter
,
get_tokenizer_for_seq
=
lambda
_
:
mock_tokenizer
(),
stop_checker
=
stop_checker
,
...
...
@@ -148,7 +148,7 @@ def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int,
output_processor
=
MultiStepOutputProcessor
(
detokenizer
=
detokenizer
,
scheduler
=
scheduler
,
scheduler
=
[
scheduler
]
,
seq_counter
=
seq_counter
,
get_tokenizer_for_seq
=
lambda
_
:
mock_tokenizer
(
eos_token_id
),
stop_checker
=
stop_checker
,
...
...
@@ -215,7 +215,7 @@ def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int,
output_processor
=
MultiStepOutputProcessor
(
detokenizer
=
detokenizer
,
scheduler
=
scheduler
,
scheduler
=
[
scheduler
]
,
seq_counter
=
seq_counter
,
get_tokenizer_for_seq
=
lambda
_
:
mock_tokenizer
(
eos_token_id
),
stop_checker
=
stop_checker
,
...
...
tests/entrypoints/llm/__init__.py
0 → 100644
View file @
705f6a35
tests/entrypoints/test
_llm
_encode.py
→
tests/entrypoints/
llm/
test_encode.py
View file @
705f6a35
...
...
@@ -5,7 +5,7 @@ import pytest
from
vllm
import
LLM
,
EmbeddingRequestOutput
,
PoolingParams
from
..conftest
import
cleanup
from
..
.
conftest
import
cleanup
MODEL_NAME
=
"intfloat/e5-mistral-7b-instruct"
...
...
@@ -25,8 +25,6 @@ TOKEN_IDS = [
[
1000
,
1003
,
1001
,
1002
],
]
pytestmark
=
pytest
.
mark
.
llm
@
pytest
.
fixture
(
scope
=
"module"
)
def
llm
():
...
...
tests/entrypoints/test
_llm
_generate.py
→
tests/entrypoints/
llm/
test_generate.py
View file @
705f6a35
...
...
@@ -5,7 +5,7 @@ import pytest
from
vllm
import
LLM
,
RequestOutput
,
SamplingParams
from
..conftest
import
cleanup
from
..
.
conftest
import
cleanup
MODEL_NAME
=
"facebook/opt-125m"
...
...
@@ -23,8 +23,6 @@ TOKEN_IDS = [
[
0
,
3
,
1
,
2
],
]
pytestmark
=
pytest
.
mark
.
llm
@
pytest
.
fixture
(
scope
=
"module"
)
def
llm
():
...
...
Prev
1
…
6
7
8
9
10
11
12
13
14
…
22
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment