Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
67b4221a
Unverified
Commit
67b4221a
authored
Apr 11, 2024
by
SangBin Cho
Committed by
GitHub
Apr 10, 2024
Browse files
[Core][5/N] Fully working chunked prefill e2e (#3884)
parent
63e7176f
Changes
26
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
261 additions
and
90 deletions
+261
-90
vllm/distributed/communication_op.py
vllm/distributed/communication_op.py
+9
-1
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+2
-3
vllm/engine/llm_engine.py
vllm/engine/llm_engine.py
+4
-1
vllm/lora/layers.py
vllm/lora/layers.py
+3
-2
vllm/sequence.py
vllm/sequence.py
+2
-1
vllm/worker/model_runner.py
vllm/worker/model_runner.py
+241
-82
No files found.
vllm/distributed/communication_op.py
View file @
67b4221a
...
...
@@ -173,10 +173,18 @@ def broadcast_tensor_dict(
torch
.
distributed
.
broadcast_object_list
([
metadata_list
],
src
=
src
,
group
=
group
)
async_handles
=
[]
for
key
,
value
in
metadata_list
:
if
isinstance
(
value
,
TensorMetadata
):
tensor
=
tensor_dict
[
key
]
torch
.
distributed
.
broadcast
(
tensor
,
src
=
src
,
group
=
group
)
async_handles
.
append
(
torch
.
distributed
.
broadcast
(
tensor
,
src
=
src
,
group
=
group
,
async_op
=
True
))
for
async_handle
in
async_handles
:
async_handle
.
wait
()
else
:
recv_metadata_list
=
[
None
]
torch
.
distributed
.
broadcast_object_list
(
recv_metadata_list
,
...
...
vllm/engine/arg_utils.py
View file @
67b4221a
...
...
@@ -386,9 +386,8 @@ class EngineArgs:
'prompt latency) before scheduling next prompt.'
)
parser
.
add_argument
(
'--enable-chunked-prefill'
,
type
=
bool
,
default
=
False
,
help
=
'If True, the prefill requests can be chunked based on the '
action
=
'store_true'
,
help
=
'If set, the prefill requests can be chunked based on the '
'max_num_batched_tokens'
)
parser
.
add_argument
(
...
...
vllm/engine/llm_engine.py
View file @
67b4221a
...
...
@@ -633,7 +633,10 @@ class LLMEngine:
seq_group
=
scheduled_seq_group
.
seq_group
seq_group
.
update_num_computed_tokens
(
scheduled_seq_group
.
token_chunk_size
)
self
.
_process_sequence_group_outputs
(
seq_group
,
outputs
)
# If uncomputed tokens > 0, it means prefill is chunked.
# We don't need to process outputs in that case.
if
seq_group
.
get_num_uncomputed_tokens
()
==
0
:
self
.
_process_sequence_group_outputs
(
seq_group
,
outputs
)
# Free the finished sequence groups.
self
.
scheduler
.
free_finished_seq_groups
()
...
...
vllm/lora/layers.py
View file @
67b4221a
...
...
@@ -267,12 +267,13 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
def
forward
(
self
,
x
:
torch
.
Tensor
)
->
torch
.
Tensor
:
added_tokens_mask
=
x
>
self
.
base_layer
.
org_vocab_size
-
1
indices
=
self
.
embeddings_indices
[
1
][:
self
.
indices_len
[
3
]].
view_as
(
x
)
embedding_len
=
self
.
indices_len
[
3
]
indices
=
self
.
embeddings_indices
[
1
][:
embedding_len
].
view_as
(
x
)
full_lora_a_embeddings
=
F
.
embedding
(
x
+
indices
,
self
.
lora_a_stacked_2d
,
)
indices
=
self
.
embeddings_indices
[
0
][:
self
.
indices
_len
[
3
]
].
view_as
(
x
)
indices
=
self
.
embeddings_indices
[
0
][:
embedding
_len
].
view_as
(
x
)
full_output
=
self
.
base_layer
.
forward
(
x
.
add_
(
indices
*
added_tokens_mask
))
...
...
vllm/sequence.py
View file @
67b4221a
...
...
@@ -500,7 +500,8 @@ class SequenceGroup:
def
get_num_uncomputed_tokens
(
self
)
->
int
:
num_uncomputed_tokens
=
0
for
seq
in
self
.
get_seqs
():
num_uncomputed_tokens
+=
seq
.
data
.
get_num_uncomputed_tokens
()
if
not
seq
.
is_finished
():
num_uncomputed_tokens
+=
seq
.
data
.
get_num_uncomputed_tokens
()
return
num_uncomputed_tokens
def
num_seqs
(
self
,
status
:
Optional
[
SequenceStatus
]
=
None
)
->
int
:
...
...
vllm/worker/model_runner.py
View file @
67b4221a
This diff is collapsed.
Click to expand it.
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment