Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0b4997e0
Unverified
Commit
0b4997e0
authored
Mar 25, 2024
by
Dylan Hawk
Committed by
GitHub
Mar 25, 2024
Browse files
[Bugfix] API stream returning two stops (#3450)
Co-authored-by:
Dylan Hawk
<
dylanwawk@gmail.com
>
parent
c13ad1b7
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
25 additions
and
27 deletions
+25
-27
tests/entrypoints/test_openai_server.py
tests/entrypoints/test_openai_server.py
+12
-0
vllm/entrypoints/openai/serving_completion.py
vllm/entrypoints/openai/serving_completion.py
+13
-27
No files found.
tests/entrypoints/test_openai_server.py
View file @
0b4997e0
...
@@ -322,9 +322,15 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
...
@@ -322,9 +322,15 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
temperature
=
0.0
,
temperature
=
0.0
,
stream
=
True
)
stream
=
True
)
chunks
=
[]
chunks
=
[]
finish_reason_count
=
0
async
for
chunk
in
stream
:
async
for
chunk
in
stream
:
chunks
.
append
(
chunk
.
choices
[
0
].
text
)
chunks
.
append
(
chunk
.
choices
[
0
].
text
)
if
chunk
.
choices
[
0
].
finish_reason
is
not
None
:
finish_reason_count
+=
1
# finish reason should only return in last block
assert
finish_reason_count
==
1
assert
chunk
.
choices
[
0
].
finish_reason
==
"length"
assert
chunk
.
choices
[
0
].
finish_reason
==
"length"
assert
chunk
.
choices
[
0
].
text
assert
chunk
.
usage
==
single_usage
assert
chunk
.
usage
==
single_usage
assert
""
.
join
(
chunks
)
==
single_output
assert
""
.
join
(
chunks
)
==
single_output
...
@@ -363,13 +369,19 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI,
...
@@ -363,13 +369,19 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI,
stream
=
True
,
stream
=
True
,
)
)
chunks
=
[]
chunks
=
[]
finish_reason_count
=
0
async
for
chunk
in
stream
:
async
for
chunk
in
stream
:
delta
=
chunk
.
choices
[
0
].
delta
delta
=
chunk
.
choices
[
0
].
delta
if
delta
.
role
:
if
delta
.
role
:
assert
delta
.
role
==
"assistant"
assert
delta
.
role
==
"assistant"
if
delta
.
content
:
if
delta
.
content
:
chunks
.
append
(
delta
.
content
)
chunks
.
append
(
delta
.
content
)
if
chunk
.
choices
[
0
].
finish_reason
is
not
None
:
finish_reason_count
+=
1
# finish reason should only return in last block
assert
finish_reason_count
==
1
assert
chunk
.
choices
[
0
].
finish_reason
==
stop_reason
assert
chunk
.
choices
[
0
].
finish_reason
==
stop_reason
assert
delta
.
content
assert
""
.
join
(
chunks
)
==
output
assert
""
.
join
(
chunks
)
==
output
...
...
vllm/entrypoints/openai/serving_completion.py
View file @
0b4997e0
...
@@ -266,23 +266,7 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -266,23 +266,7 @@ class OpenAIServingCompletion(OpenAIServing):
previous_texts
[
i
]
=
output
.
text
previous_texts
[
i
]
=
output
.
text
previous_num_tokens
[
i
]
=
len
(
output
.
token_ids
)
previous_num_tokens
[
i
]
=
len
(
output
.
token_ids
)
finish_reason
=
output
.
finish_reason
finish_reason
=
output
.
finish_reason
response_json
=
CompletionStreamResponse
(
id
=
request_id
,
created
=
created_time
,
model
=
model_name
,
choices
=
[
CompletionResponseStreamChoice
(
index
=
i
,
text
=
delta_text
,
logprobs
=
logprobs
,
finish_reason
=
finish_reason
,
)
]).
model_dump_json
()
yield
f
"data:
{
response_json
}
\n\n
"
if
output
.
finish_reason
is
not
None
:
# return final usage
if
output
.
finish_reason
is
not
None
:
# return final usage
logprobs
=
LogProbs
(
)
if
request
.
logprobs
is
not
None
else
None
prompt_tokens
=
len
(
res
.
prompt_token_ids
)
prompt_tokens
=
len
(
res
.
prompt_token_ids
)
completion_tokens
=
len
(
output
.
token_ids
)
completion_tokens
=
len
(
output
.
token_ids
)
final_usage
=
UsageInfo
(
final_usage
=
UsageInfo
(
...
@@ -290,6 +274,8 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -290,6 +274,8 @@ class OpenAIServingCompletion(OpenAIServing):
completion_tokens
=
completion_tokens
,
completion_tokens
=
completion_tokens
,
total_tokens
=
prompt_tokens
+
completion_tokens
,
total_tokens
=
prompt_tokens
+
completion_tokens
,
)
)
else
:
final_usage
=
None
response_json
=
CompletionStreamResponse
(
response_json
=
CompletionStreamResponse
(
id
=
request_id
,
id
=
request_id
,
created
=
created_time
,
created
=
created_time
,
...
@@ -297,13 +283,13 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -297,13 +283,13 @@ class OpenAIServingCompletion(OpenAIServing):
choices
=
[
choices
=
[
CompletionResponseStreamChoice
(
CompletionResponseStreamChoice
(
index
=
i
,
index
=
i
,
text
=
""
,
text
=
delta_text
,
logprobs
=
logprobs
,
logprobs
=
logprobs
,
finish_reason
=
output
.
finish_reason
,
finish_reason
=
finish_reason
,
)
)
],
],
usage
=
final_usage
,
usage
=
final_usage
,
).
model_dump_json
()
).
model_dump_json
(
exclude_unset
=
True
)
yield
f
"data:
{
response_json
}
\n\n
"
yield
f
"data:
{
response_json
}
\n\n
"
except
ValueError
as
e
:
except
ValueError
as
e
:
# TODO: Use a vllm-specific Validation Error
# TODO: Use a vllm-specific Validation Error
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment