norm / vllm · Commits

Commit e8671783 (unverified) · authored May 23, 2023 by Woosuk Kwon, committed by GitHub on May 23, 2023
Incrementally decode output tokens (#121)
Parent: aedba6d5

Changes: 4 changed files with 83 additions and 17 deletions

  cacheflow/core/scheduler.py           +1   -1
  cacheflow/sequence.py                 +8   -3
  cacheflow/server/llm_server.py        +12  -12
  cacheflow/server/tokenizer_utils.py   +62  -1
cacheflow/core/scheduler.py

@@ -291,7 +291,7 @@ class Scheduler:
             for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
                 # Append a new token to the sequence.
                 output = seq_outputs[seq.seq_id]
-                seq.append_token(output.output_token, output.logprobs)
+                seq.append_token_id(output.output_token, output.logprobs)
         return self.running.copy()

     def free_seq(self, seq: Sequence) -> None:
cacheflow/sequence.py

@@ -24,7 +24,7 @@ class SequenceData:
         self.output_token_ids: List[int] = []
         self.cumulative_logprob = 0.0

-    def append_token(self, token_id: int, logprob: float) -> None:
+    def append_token_id(self, token_id: int, logprob: float) -> None:
         self.output_token_ids.append(token_id)
         self.cumulative_logprob += logprob

@@ -64,6 +64,7 @@ class Sequence:
         self.data = SequenceData(prompt_token_ids)
         self.output_logprobs: List[Dict[int, float]] = []
+        self.output_tokens: List[str] = []
         self.output_text = ""

         self.logical_token_blocks: List[LogicalTokenBlock] = []

@@ -92,11 +93,15 @@ class Sequence:
             last_block.append_tokens(token_ids[:num_empty_slots])
             token_ids = token_ids[num_empty_slots:]

-    def append_token(self, token_id: int, logprobs: Dict[int, float]) -> None:
+    def append_token_id(
+        self,
+        token_id: int,
+        logprobs: Dict[int, float],
+    ) -> None:
         assert token_id in logprobs
         self._append_tokens_to_blocks([token_id])
         self.output_logprobs.append(logprobs)
-        self.data.append_token(token_id, logprobs[token_id])
+        self.data.append_token_id(token_id, logprobs[token_id])

     def get_len(self) -> int:
         return self.data.get_len()
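For illustration, here is a simplified, self-contained stand-in for the per-sequence bookkeeping these changes set up (not the real cacheflow classes; the class name and example values are invented for this sketch). The cached token strings and running text are filled in later by the server's incremental decode step, shown in the next file:

from typing import Dict, List


class SequenceStateSketch:
    """Simplified stand-in for the per-sequence state in cacheflow/sequence.py."""

    def __init__(self) -> None:
        self.output_token_ids: List[int] = []              # sampled token IDs
        self.output_logprobs: List[Dict[int, float]] = []  # per-step logprobs
        self.output_tokens: List[str] = []                 # cached detokenized token strings (new in this commit)
        self.output_text = ""                              # running decoded text
        self.cumulative_logprob = 0.0

    def append_token_id(self, token_id: int, logprobs: Dict[int, float]) -> None:
        # Record the sampled token and accumulate its log-probability.
        assert token_id in logprobs
        self.output_token_ids.append(token_id)
        self.output_logprobs.append(logprobs)
        self.cumulative_logprob += logprobs[token_id]


state = SequenceStateSketch()
state.append_token_id(42, {42: -0.1})
state.append_token_id(7, {7: -0.3, 11: -1.2})
print(state.output_token_ids, round(state.cumulative_logprob, 2))  # [42, 7] -0.4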
cacheflow/server/llm_server.py

@@ -14,7 +14,8 @@ from cacheflow.outputs import RequestOutput
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.server.arg_utils import ServerArgs
 from cacheflow.server.ray_utils import initialize_cluster
-from cacheflow.server.tokenizer_utils import get_tokenizer
+from cacheflow.server.tokenizer_utils import (get_tokenizer,
+                                              detokenize_incrementally)
 from cacheflow.sequence import Sequence, SequenceGroup, SequenceStatus
 from cacheflow.utils import Counter
 from cacheflow.worker.worker import Worker

@@ -185,18 +186,17 @@ class LLMServer:
         return request_outputs

     def _decode_sequences(self, seq_groups: List[SequenceGroup]) -> None:
-        # Batch-decode the sequence outputs.
-        seqs: List[Sequence] = []
+        # Decode the sequence outputs.
         for seq_group in seq_groups:
-            seqs.extend(seq_group.get_seqs(status=SequenceStatus.RUNNING))
-        output_tokens_per_seq = []
-        for seq in seqs:
-            output_tokens_per_seq.append(seq.get_output_token_ids())
-        output_texts = self.tokenizer.batch_decode(output_tokens_per_seq,
-                                                   skip_special_tokens=True)
-        # Update the sequences with the output texts.
-        for seq, output_text in zip(seqs, output_texts):
-            seq.output_text = output_text
+            for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
+                new_token, new_output_text = detokenize_incrementally(
+                    self.tokenizer,
+                    seq.output_tokens,
+                    seq.get_last_token_id(),
+                    skip_special_tokens=True,
+                )
+                seq.output_tokens.append(new_token)
+                seq.output_text = new_output_text

     def _stop_sequences(self, seq_groups: List[SequenceGroup]) -> None:
         # Stop the sequences.
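Why the switch away from batch_decode: the old code re-decodes every previously generated token ID for each running sequence on every step, while the new code converts only the newest ID and reuses the cached token strings. A toy, self-contained sketch of that difference (plain Python, not the cacheflow or Hugging Face API; decode_id merely stands in for the tokenizer's id-to-string conversion, and the counter just makes the repeated work visible):

from typing import List

id_to_token_calls = 0


def decode_id(token_id: int) -> str:
    """Pretend id -> token-string conversion (stand-in for the tokenizer)."""
    global id_to_token_calls
    id_to_token_calls += 1
    return f"<tok{token_id}>"


def full_redecode(all_token_ids: List[int]) -> str:
    # Old style: re-convert every output token ID on every step.
    return "".join(decode_id(t) for t in all_token_ids)


def incremental_decode(cached_tokens: List[str], new_token_id: int) -> str:
    # New style: convert only the newest ID; reuse cached token strings.
    cached_tokens.append(decode_id(new_token_id))
    return "".join(cached_tokens)


num_steps = 100

# Old approach: id -> token conversions grow quadratically with output length.
id_to_token_calls = 0
generated: List[int] = []
for step in range(num_steps):
    generated.append(step)
    text = full_redecode(generated)
print("full re-decode conversions:", id_to_token_calls)  # 5050

# New approach: exactly one conversion per generated token.
id_to_token_calls = 0
cached: List[str] = []
for step in range(num_steps):
    text = incremental_decode(cached, step)
print("incremental conversions:", id_to_token_calls)     # 100

In this sketch the string join is still proportional to the output length, but the repeated id-to-token conversion work drops from quadratic to linear in the number of generated tokens.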
cacheflow/server/tokenizer_utils.py

-from typing import Union
+from typing import List, Tuple, Union

 from transformers import (AutoConfig, AutoTokenizer, PreTrainedTokenizer,
                           PreTrainedTokenizerFast)

 from cacheflow.logger import init_logger

+logger = init_logger(__name__)
+
 _MODEL_TYPES_WITH_SLOW_TOKENIZER = [
     # LLaMA fast tokenizer has a bug related to protobuf.
     # See https://github.com/WoosukKwon/cacheflow/issues/80#issue-1698550554

@@ -17,5 +21,62 @@ def get_tokenizer(
 ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
     config = AutoConfig.from_pretrained(model_name)
     if config.model_type in _MODEL_TYPES_WITH_SLOW_TOKENIZER:
+        if getattr(kwargs, "use_fast", False) == True:
+            raise ValueError(
+                f"Cannot use the fast tokenizer for {config.model_type} due to "
+                "bugs in the fast tokenizer.")
+        logger.info(
+            f"Using the slow tokenizer for {config.model_type} due to bugs in "
+            "the fast tokenizer. This could potentially lead to performance "
+            "degradation.")
         kwargs["use_fast"] = False
     return AutoTokenizer.from_pretrained(model_name, *args, **kwargs)
+
+
+def detokenize_incrementally(
+    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+    prev_output_tokens: List[str],
+    new_token_id: int,
+    skip_special_tokens: bool,
+) -> Tuple[str, str]:
+    """Detokenizes the new token in conjuction with the previous output tokens.
+
+    NOTE: This function does not update prev_output_tokens.
+
+    Returns:
+        new_token: The new token as a string.
+        output_text: The new output text as a string.
+    """
+    new_token = tokenizer.convert_ids_to_tokens(
+        new_token_id, skip_special_tokens=skip_special_tokens)
+    output_tokens = prev_output_tokens + [new_token]
+
+    # Convert the tokens to a string.
+    # Optimization: If the tokenizer does not have `added_tokens_encoder`,
+    # then we can directly use `convert_tokens_to_string`.
+    if not getattr(tokenizer, "added_tokens_encoder", {}):
+        output_text = tokenizer.convert_tokens_to_string(output_tokens)
+        return new_token, output_text
+
+    # Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921
+    # NOTE(woosuk): The following code is slow because it runs a for loop over
+    # the output_tokens. In Python, running a for loop over a list can be slow
+    # even when the loop body is very simple.
+    sub_texts = []
+    current_sub_text = []
+    for token in output_tokens:
+        if skip_special_tokens and token in tokenizer.all_special_ids:
+            continue
+        if token in tokenizer.added_tokens_encoder:
+            if current_sub_text:
+                sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
+                sub_texts.append(sub_text)
+                current_sub_text = []
+            sub_texts.append(token)
+        else:
+            current_sub_text.append(token)
+    if current_sub_text:
+        sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
+        sub_texts.append(sub_text)
+    output_text = " ".join(sub_texts)
+    return new_token, output_text
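A minimal usage sketch for the new helper, assuming the cacheflow package and the Hugging Face transformers library (with the gpt2 tokenizer) are installed; the loop simply replays pre-encoded token IDs instead of sampling from a model:

from typing import List

from transformers import AutoTokenizer

from cacheflow.server.tokenizer_utils import detokenize_incrementally

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Pretend these IDs were produced one at a time by the model.
token_ids = tokenizer.encode("Incremental detokenization avoids redundant work.")

output_tokens: List[str] = []  # mirrors Sequence.output_tokens
output_text = ""               # mirrors Sequence.output_text

for token_id in token_ids:
    # Convert only the newest ID; previously decoded token strings are reused.
    new_token, output_text = detokenize_incrementally(
        tokenizer,
        output_tokens,
        token_id,
        skip_special_tokens=True,
    )
    output_tokens.append(new_token)

print(output_text)

Each iteration mirrors what _decode_sequences now does for every running sequence: one id-to-token conversion plus a join over the cached token strings. The helper leaves prev_output_tokens untouched, so the caller appends the returned new_token itself.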