Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
9ed82e70
Unverified
Commit
9ed82e70
authored
Jul 19, 2024
by
Antoni Baum
Committed by
GitHub
Jul 19, 2024
Browse files
[Misc] Small perf improvements (#6520)
parent
51f8aa90
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
46 additions
and
23 deletions
+46
-23
tests/core/block/test_block_manager_v2.py
tests/core/block/test_block_manager_v2.py
+7
-4
tests/core/block/test_cpu_gpu_block_allocator.py
tests/core/block/test_cpu_gpu_block_allocator.py
+4
-4
vllm/core/block/block_table.py
vllm/core/block/block_table.py
+14
-5
vllm/core/block/prefix_caching_block.py
vllm/core/block/prefix_caching_block.py
+4
-1
vllm/model_executor/models/__init__.py
vllm/model_executor/models/__init__.py
+10
-4
vllm/sequence.py
vllm/sequence.py
+4
-3
vllm/utils.py
vllm/utils.py
+3
-2
No files found.
tests/core/block/test_block_manager_v2.py
View file @
9ed82e70
...
...
@@ -249,10 +249,13 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append,
# Expect consumed blocks to be new blocks required to support the new slots.
expected_consumed_blocks
=
len
(
list
(
chunk_list
(
list
(
range
(
prompt_len
+
num_slots_to_append
+
num_lookahead_slots
)),
block_size
))
-
len
(
chunk_list
(
list
(
range
(
prompt_len
)),
block_size
))
range
(
prompt_len
+
num_slots_to_append
+
num_lookahead_slots
)),
block_size
)))
-
len
(
list
(
chunk_list
(
list
(
range
(
prompt_len
)),
block_size
)))
assert
num_consumed_blocks
==
expected_consumed_blocks
...
...
tests/core/block/test_cpu_gpu_block_allocator.py
View file @
9ed82e70
...
...
@@ -58,10 +58,10 @@ def test_allocate_immutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
unique_token_ids
=
list
(
range
((
num_cpu_blocks
+
num_gpu_blocks
)
*
block_size
))
gpu_token_ids
=
chunk_list
(
unique_token_ids
[:
num_gpu_blocks
*
block_size
],
block_size
)
cpu_token_ids
=
chunk_list
(
unique_token_ids
[
num_gpu_blocks
*
block_size
:],
block_size
)
gpu_token_ids
=
list
(
chunk_list
(
unique_token_ids
[:
num_gpu_blocks
*
block_size
],
block_size
)
)
cpu_token_ids
=
list
(
chunk_list
(
unique_token_ids
[
num_gpu_blocks
*
block_size
:],
block_size
)
)
assert
allocator
.
get_num_free_blocks
(
Device
.
CPU
)
==
num_cpu_blocks
assert
allocator
.
get_num_free_blocks
(
Device
.
GPU
)
==
num_gpu_blocks
...
...
vllm/core/block/block_table.py
View file @
9ed82e70
import
math
from
typing
import
List
,
Optional
from
vllm.core.block.common
import
BlockList
...
...
@@ -337,10 +338,17 @@ class BlockTable:
This is required for the scheduler to determine whether a sequence can
continue generation, or if it must be preempted.
"""
# Math below is equivalent to:
# all_token_ids = token_ids + [-1] * num_lookahead_slots
# token_blocks = self._chunk_token_blocks_for_append(all_token_ids)
# return len(token_blocks)
all_token_ids
=
token_ids
+
[
-
1
]
*
num_lookahead_slots
token_blocks
=
self
.
_chunk_token_blocks_for_append
(
all_token_ids
)
return
len
(
token_blocks
)
num_token_ids
=
len
(
token_ids
)
+
num_lookahead_slots
first_chunk_size
=
self
.
_block_size
-
(
self
.
_num_full_slots
%
self
.
_block_size
)
num_token_blocks
=
(
1
+
math
.
ceil
(
(
num_token_ids
-
first_chunk_size
)
/
self
.
_block_size
))
return
num_token_blocks
def
_chunk_token_blocks_for_append
(
self
,
token_ids
:
List
[
int
])
->
List
[
List
[
int
]]:
...
...
@@ -351,6 +359,7 @@ class BlockTable:
"""
first_chunk_size
=
self
.
_block_size
-
(
self
.
_num_full_slots
%
self
.
_block_size
)
token_blocks
=
[
token_ids
[:
first_chunk_size
]]
+
chunk_list
(
token_ids
[
first_chunk_size
:],
self
.
_block_size
)
token_blocks
=
[
token_ids
[:
first_chunk_size
]]
token_blocks
.
extend
(
chunk_list
(
token_ids
[
first_chunk_size
:],
self
.
_block_size
))
return
token_blocks
vllm/core/block/prefix_caching_block.py
View file @
9ed82e70
...
...
@@ -552,9 +552,12 @@ class PrefixCachingBlockAllocator(BlockAllocator):
# runner.
# It returns a list of int although type annotation says list of string.
if
len
(
computed_seq_block_ids
)
==
1
:
return
computed_seq_block_ids
[
0
]
return
commonprefix
([
ids
for
ids
in
computed_seq_block_ids
# type: ignore
if
ids
!=
[]
if
ids
])
def
get_num_blocks_touched
(
self
,
...
...
vllm/model_executor/models/__init__.py
View file @
9ed82e70
import
functools
import
importlib
from
typing
import
Dict
,
List
,
Optional
,
Type
...
...
@@ -98,6 +99,14 @@ _ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = {
class
ModelRegistry
:
@
staticmethod
@
functools
.
lru_cache
(
maxsize
=
128
)
def
_get_model
(
model_arch
:
str
):
module_name
,
model_cls_name
=
_MODELS
[
model_arch
]
module
=
importlib
.
import_module
(
f
"vllm.model_executor.models.
{
module_name
}
"
)
return
getattr
(
module
,
model_cls_name
,
None
)
@
staticmethod
def
load_model_cls
(
model_arch
:
str
)
->
Optional
[
Type
[
nn
.
Module
]]:
if
model_arch
in
_OOT_MODELS
:
...
...
@@ -114,10 +123,7 @@ class ModelRegistry:
"Model architecture %s is partially supported by ROCm: %s"
,
model_arch
,
_ROCM_PARTIALLY_SUPPORTED_MODELS
[
model_arch
])
module_name
,
model_cls_name
=
_MODELS
[
model_arch
]
module
=
importlib
.
import_module
(
f
"vllm.model_executor.models.
{
module_name
}
"
)
return
getattr
(
module
,
model_cls_name
,
None
)
return
ModelRegistry
.
_get_model
(
model_arch
)
@
staticmethod
def
get_supported_archs
()
->
List
[
str
]:
...
...
vllm/sequence.py
View file @
9ed82e70
...
...
@@ -457,24 +457,25 @@ class SequenceGroup:
self
.
prompt_adapter_request
=
prompt_adapter_request
self
.
encoder_seq
=
encoder_seq
self
.
trace_headers
=
trace_headers
self
.
_first_seq
=
next
(
iter
(
self
.
seqs_dict
.
values
()))
@
property
def
prompt
(
self
)
->
Optional
[
str
]:
# All sequences in the group should have the same prompt.
# We use the prompt of an arbitrary sequence.
return
next
(
iter
(
self
.
seqs_dict
.
values
()))
.
prompt
return
self
.
_first_seq
.
prompt
@
property
def
prompt_token_ids
(
self
)
->
List
[
int
]:
# All sequences in the group should have the same prompt.
# We use the prompt of an arbitrary sequence.
return
next
(
iter
(
self
.
seqs_dict
.
values
()))
.
prompt_token_ids
return
self
.
_first_seq
.
prompt_token_ids
@
property
def
multi_modal_data
(
self
)
->
"MultiModalDataDict"
:
# All sequences in the group should have the same multi-modal data.
# We use the multi-modal data of an arbitrary sequence.
return
next
(
iter
(
self
.
seqs_dict
.
values
()))
.
multi_modal_data
return
self
.
_first_seq
.
multi_modal_data
@
property
def
lora_int_id
(
self
)
->
int
:
...
...
vllm/utils.py
View file @
9ed82e70
...
...
@@ -415,9 +415,10 @@ def init_kmp_env():
os
.
environ
[
'KMP_REDUCTION_BARRIER_PATTERN'
]
=
"dist,dist"
def
chunk_list
(
lst
:
List
[
T
],
chunk_size
:
int
)
->
List
[
List
[
T
]]
:
def
chunk_list
(
lst
:
List
[
T
],
chunk_size
:
int
):
"""Yield successive chunk_size chunks from lst."""
return
[
lst
[
i
:
i
+
chunk_size
]
for
i
in
range
(
0
,
len
(
lst
),
chunk_size
)]
for
i
in
range
(
0
,
len
(
lst
),
chunk_size
):
yield
lst
[
i
:
i
+
chunk_size
]
def
cdiv
(
a
:
int
,
b
:
int
)
->
int
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment