Unverified commit f67ee8b8, authored by Chauncey, committed by GitHub
Browse files

[Perf] Optimize chat completion streaming performance (#33782)


Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
parent e57ef99b
...@@ -679,6 +679,7 @@ class OpenAIServingChat(OpenAIServing): ...@@ -679,6 +679,7 @@ class OpenAIServingChat(OpenAIServing):
# For reasoning parser and tool call all enabled # For reasoning parser and tool call all enabled
added_content_delta_arr = [False] * num_choices added_content_delta_arr = [False] * num_choices
reasoning_end_arr = [False] * num_choices reasoning_end_arr = [False] * num_choices
prompt_is_reasoning_end_arr: list[bool | None] = [None] * num_choices
else: else:
all_previous_token_ids = None all_previous_token_ids = None
...@@ -824,6 +825,16 @@ class OpenAIServingChat(OpenAIServing): ...@@ -824,6 +825,16 @@ class OpenAIServingChat(OpenAIServing):
i = output.index i = output.index
tool_parser = tool_parsers[i] tool_parser = tool_parsers[i]
if (
self.reasoning_parser
and res.prompt_token_ids
and prompt_is_reasoning_end_arr[i] is None
):
# only check once per choice, because prompt_token_ids
# are the same for all deltas in that choice
prompt_is_reasoning_end_arr[i] = (
reasoning_parser.is_reasoning_end(res.prompt_token_ids)
)
if finish_reason_sent[i]: if finish_reason_sent[i]:
continue continue
...@@ -926,13 +937,11 @@ class OpenAIServingChat(OpenAIServing): ...@@ -926,13 +937,11 @@ class OpenAIServingChat(OpenAIServing):
# i.e {"enable_thinking": False}, # i.e {"enable_thinking": False},
# set reasoning status to end. # set reasoning status to end.
# Only keep 'content', remove 'reasoning'. # Only keep 'content', remove 'reasoning'.
if reasoning_parser.is_reasoning_end( if (
as_list(output.token_ids) reasoning_parser.is_reasoning_end(
) or ( as_list(output.token_ids)
res.prompt_token_ids
and reasoning_parser.is_reasoning_end(
res.prompt_token_ids
) )
or prompt_is_reasoning_end_arr[i]
): ):
reasoning_end_arr[i] = True reasoning_end_arr[i] = True
if delta_message and delta_message.content: if delta_message and delta_message.content:
...@@ -991,8 +1000,7 @@ class OpenAIServingChat(OpenAIServing): ...@@ -991,8 +1000,7 @@ class OpenAIServingChat(OpenAIServing):
if ( if (
self.reasoning_parser is not None self.reasoning_parser is not None
and not reasoning_end_arr[i] and not reasoning_end_arr[i]
and res.prompt_token_ids and prompt_is_reasoning_end_arr[i]
and reasoning_parser.is_reasoning_end(res.prompt_token_ids)
): ):
reasoning_end_arr[i] = True reasoning_end_arr[i] = True
...@@ -1049,12 +1057,7 @@ class OpenAIServingChat(OpenAIServing): ...@@ -1049,12 +1057,7 @@ class OpenAIServingChat(OpenAIServing):
# When encountering think end id in prompt_token_ids # When encountering think end id in prompt_token_ids
# i.e {"enable_thinking": False}, # i.e {"enable_thinking": False},
# set reasoning status to end. # set reasoning status to end.
if ( if prompt_is_reasoning_end_arr[i]:
res.prompt_token_ids
and reasoning_parser.is_reasoning_end(
res.prompt_token_ids
)
):
reasoning_end_arr[i] = True reasoning_end_arr[i] = True
current_token_ids = output_token_ids current_token_ids = output_token_ids
# Don't update current_text, keep it as is from delta # Don't update current_text, keep it as is from delta
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment