Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
25aeb7d4
Unverified
Commit
25aeb7d4
authored
Oct 18, 2024
by
Nick Hill
Committed by
GitHub
Oct 18, 2024
Browse files
[BugFix] Fix and simplify completion API usage streaming (#9475)
parent
d2b1bf55
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
61 additions
and
62 deletions
+61
-62
vllm/entrypoints/openai/serving_completion.py
vllm/entrypoints/openai/serving_completion.py
+61
-62
No files found.
vllm/entrypoints/openai/serving_completion.py
View file @
25aeb7d4
...
@@ -258,6 +258,14 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -258,6 +258,14 @@ class OpenAIServingCompletion(OpenAIServing):
has_echoed
=
[
False
]
*
num_choices
*
num_prompts
has_echoed
=
[
False
]
*
num_choices
*
num_prompts
num_prompt_tokens
=
[
0
]
*
num_prompts
num_prompt_tokens
=
[
0
]
*
num_prompts
stream_options
=
request
.
stream_options
if
stream_options
:
include_usage
=
stream_options
.
include_usage
include_continuous_usage
=
include_usage
and
\
stream_options
.
continuous_usage_stats
else
:
include_usage
,
include_continuous_usage
=
False
,
False
try
:
try
:
async
for
prompt_idx
,
res
in
result_generator
:
async
for
prompt_idx
,
res
in
result_generator
:
prompt_token_ids
=
res
.
prompt_token_ids
prompt_token_ids
=
res
.
prompt_token_ids
...
@@ -276,28 +284,25 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -276,28 +284,25 @@ class OpenAIServingCompletion(OpenAIServing):
i
=
output
.
index
+
prompt_idx
*
num_choices
i
=
output
.
index
+
prompt_idx
*
num_choices
assert
request
.
max_tokens
is
not
None
assert
request
.
max_tokens
is
not
None
if
request
.
echo
and
request
.
max_tokens
==
0
:
if
request
.
echo
and
not
has_echoed
[
i
]
:
assert
prompt_token_ids
is
not
None
assert
prompt_token_ids
is
not
None
assert
prompt_text
is
not
None
assert
prompt_text
is
not
None
# only return the prompt
if
request
.
max_tokens
==
0
:
delta_text
=
prompt_text
# only return the prompt
delta_token_ids
=
prompt_token_ids
delta_text
=
prompt_text
out_logprobs
=
prompt_logprobs
delta_token_ids
=
prompt_token_ids
has_echoed
[
i
]
=
True
out_logprobs
=
prompt_logprobs
elif
(
request
.
echo
and
request
.
max_tokens
>
0
else
:
and
not
has_echoed
[
i
]):
assert
prompt_logprobs
is
not
None
assert
prompt_token_ids
is
not
None
# echo the prompt and first token
assert
prompt_text
is
not
None
delta_text
=
prompt_text
+
output
.
text
assert
prompt_logprobs
is
not
None
delta_token_ids
=
[
# echo the prompt and first token
*
prompt_token_ids
,
*
output
.
token_ids
delta_text
=
prompt_text
+
output
.
text
]
delta_token_ids
=
[
out_logprobs
=
[
*
prompt_token_ids
,
*
output
.
token_ids
*
prompt_logprobs
,
]
*
(
output
.
logprobs
or
[]),
out_logprobs
=
[
]
*
prompt_logprobs
,
*
(
output
.
logprobs
or
[]),
]
has_echoed
[
i
]
=
True
has_echoed
[
i
]
=
True
else
:
else
:
# return just the delta
# return just the delta
...
@@ -341,45 +346,39 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -341,45 +346,39 @@ class OpenAIServingCompletion(OpenAIServing):
stop_reason
=
stop_reason
,
stop_reason
=
stop_reason
,
)
)
])
])
if
(
request
.
stream_options
if
include_continuous_usage
:
and
request
.
stream_options
.
include_usage
):
prompt_tokens
=
num_prompt_tokens
[
prompt_idx
]
if
(
request
.
stream_options
.
continuous_usage_stats
completion_tokens
=
previous_num_tokens
[
i
]
or
output
.
finish_reason
is
not
None
):
chunk
.
usage
=
UsageInfo
(
prompt_tokens
=
num_prompt_tokens
[
prompt_idx
]
prompt_tokens
=
prompt_tokens
,
completion_tokens
=
previous_num_tokens
[
i
]
completion_tokens
=
completion_tokens
,
usage
=
UsageInfo
(
total_tokens
=
prompt_tokens
+
completion_tokens
,
prompt_tokens
=
prompt_tokens
,
)
completion_tokens
=
completion_tokens
,
total_tokens
=
prompt_tokens
+
completion_tokens
,
)
if
request
.
stream_options
.
continuous_usage_stats
:
chunk
.
usage
=
usage
else
:
chunk
.
usage
=
None
response_json
=
chunk
.
model_dump_json
(
exclude_unset
=
False
)
response_json
=
chunk
.
model_dump_json
(
exclude_unset
=
False
)
yield
f
"data:
{
response_json
}
\n\n
"
yield
f
"data:
{
response_json
}
\n\n
"
if
(
request
.
stream_options
total_prompt_tokens
=
sum
(
num_prompt_tokens
)
and
request
.
stream_options
.
include_usage
):
total_completion_tokens
=
sum
(
previous_num_tokens
)
final_usage_info
=
UsageInfo
(
prompt_tokens
=
total_prompt_tokens
,
completion_tokens
=
total_completion_tokens
,
total_tokens
=
total_prompt_tokens
+
total_completion_tokens
)
if
include_usage
:
final_usage_chunk
=
CompletionStreamResponse
(
final_usage_chunk
=
CompletionStreamResponse
(
id
=
request_id
,
id
=
request_id
,
created
=
created_time
,
created
=
created_time
,
model
=
model_name
,
model
=
model_name
,
choices
=
[],
choices
=
[],
usage
=
usage
,
usage
=
final_usage_info
,
)
)
final_usage_data
=
(
final_usage_chunk
.
model_dump_json
(
final_usage_data
=
(
final_usage_chunk
.
model_dump_json
(
exclude_unset
=
False
,
exclude_none
=
True
))
exclude_unset
=
False
,
exclude_none
=
True
))
yield
f
"data:
{
final_usage_data
}
\n\n
"
yield
f
"data:
{
final_usage_data
}
\n\n
"
# report to FastAPI middleware aggregate usage across all choices
# report to FastAPI middleware aggregate usage across all choices
total_prompt_tokens
=
sum
(
num_prompt_tokens
)
request_metadata
.
final_usage_info
=
final_usage_info
total_completion_tokens
=
sum
(
previous_num_tokens
)
request_metadata
.
final_usage_info
=
UsageInfo
(
prompt_tokens
=
total_prompt_tokens
,
completion_tokens
=
total_completion_tokens
,
total_tokens
=
total_prompt_tokens
+
total_completion_tokens
)
except
ValueError
as
e
:
except
ValueError
as
e
:
# TODO: Use a vllm-specific Validation Error
# TODO: Use a vllm-specific Validation Error
...
@@ -413,26 +412,26 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -413,26 +412,26 @@ class OpenAIServingCompletion(OpenAIServing):
for
output
in
final_res
.
outputs
:
for
output
in
final_res
.
outputs
:
assert
request
.
max_tokens
is
not
None
assert
request
.
max_tokens
is
not
None
if
request
.
echo
and
request
.
max_tokens
==
0
:
if
request
.
echo
:
assert
prompt_text
is
not
None
token_ids
=
prompt_token_ids
out_logprobs
=
prompt_logprobs
output_text
=
prompt_text
elif
request
.
echo
and
request
.
max_tokens
>
0
:
assert
prompt_text
is
not
None
assert
prompt_text
is
not
None
token_ids
=
[
*
prompt_token_ids
,
*
output
.
token_ids
]
if
request
.
max_tokens
==
0
:
token_ids
=
prompt_token_ids
if
request
.
logprobs
is
None
:
out_logprobs
=
prompt_logprobs
out
_logprobs
=
None
out
put_text
=
prompt_text
else
:
else
:
assert
prompt_logprobs
is
not
None
token_ids
=
[
*
prompt_token_ids
,
*
output
.
token_ids
]
assert
output
.
logprobs
is
not
None
out_logprobs
=
[
if
request
.
logprobs
is
None
:
*
prompt_logprobs
,
out_logprobs
=
None
*
output
.
logprobs
,
else
:
]
assert
prompt_logprobs
is
not
None
assert
output
.
logprobs
is
not
None
output_text
=
prompt_text
+
output
.
text
out_logprobs
=
[
*
prompt_logprobs
,
*
output
.
logprobs
,
]
output_text
=
prompt_text
+
output
.
text
else
:
else
:
token_ids
=
output
.
token_ids
token_ids
=
output
.
token_ids
out_logprobs
=
output
.
logprobs
out_logprobs
=
output
.
logprobs
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment