Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f67ee8b8
Unverified
Commit
f67ee8b8
authored
Feb 04, 2026
by
Chauncey
Committed by
GitHub
Feb 04, 2026
Browse files
[Perf] Optimize chat completion streaming performance (#33782)
Signed-off-by:
chaunceyjiang
<
chaunceyjiang@gmail.com
>
parent
e57ef99b
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
17 additions
and
14 deletions
+17
-14
vllm/entrypoints/openai/chat_completion/serving.py
vllm/entrypoints/openai/chat_completion/serving.py
+17
-14
No files found.
vllm/entrypoints/openai/chat_completion/serving.py
View file @
f67ee8b8
...
...
@@ -679,6 +679,7 @@ class OpenAIServingChat(OpenAIServing):
# For reasoning parser and tool call all enabled
added_content_delta_arr
=
[
False
]
*
num_choices
reasoning_end_arr
=
[
False
]
*
num_choices
prompt_is_reasoning_end_arr
:
list
[
bool
|
None
]
=
[
None
]
*
num_choices
else
:
all_previous_token_ids
=
None
...
...
@@ -824,6 +825,16 @@ class OpenAIServingChat(OpenAIServing):
i
=
output
.
index
tool_parser
=
tool_parsers
[
i
]
if
(
self
.
reasoning_parser
and
res
.
prompt_token_ids
and
prompt_is_reasoning_end_arr
[
i
]
is
None
):
# only check once per choice, because prompt_token_ids
# are the same for all deltas in that choice
prompt_is_reasoning_end_arr
[
i
]
=
(
reasoning_parser
.
is_reasoning_end
(
res
.
prompt_token_ids
)
)
if
finish_reason_sent
[
i
]:
continue
...
...
@@ -926,13 +937,11 @@ class OpenAIServingChat(OpenAIServing):
# i.e {"enable_thinking": False},
# set reasoning status to end.
# Only keep 'content', remove 'reasoning'.
if
reasoning_parser
.
is_reasoning_end
(
if
(
reasoning_parser
.
is_reasoning_end
(
as_list
(
output
.
token_ids
)
)
or
(
res
.
prompt_token_ids
and
reasoning_parser
.
is_reasoning_end
(
res
.
prompt_token_ids
)
or
prompt_is_reasoning_end_arr
[
i
]
):
reasoning_end_arr
[
i
]
=
True
if
delta_message
and
delta_message
.
content
:
...
...
@@ -991,8 +1000,7 @@ class OpenAIServingChat(OpenAIServing):
if
(
self
.
reasoning_parser
is
not
None
and
not
reasoning_end_arr
[
i
]
and
res
.
prompt_token_ids
and
reasoning_parser
.
is_reasoning_end
(
res
.
prompt_token_ids
)
and
prompt_is_reasoning_end_arr
[
i
]
):
reasoning_end_arr
[
i
]
=
True
...
...
@@ -1049,12 +1057,7 @@ class OpenAIServingChat(OpenAIServing):
# When encountering think end id in prompt_token_ids
# i.e {"enable_thinking": False},
# set reasoning status to end.
if
(
res
.
prompt_token_ids
and
reasoning_parser
.
is_reasoning_end
(
res
.
prompt_token_ids
)
):
if
prompt_is_reasoning_end_arr
[
i
]:
reasoning_end_arr
[
i
]
=
True
current_token_ids
=
output_token_ids
# Don't update current_text, keep it as is from delta
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment