Unverified commit 1afe3d07, authored by Xihuai Wang, committed by GitHub

Align finish reason and stream mode in openai api (#4388)

parent 44f47d3e
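
This commit makes the OpenAI-compatible endpoints report `finish_reason` the way the upstream API does: `None` (JSON `null`) instead of `""` while a stream is still in progress, and the terminal value (`"stop"`, `"length"`, `"tool_calls"`, ...) exactly once, on the final chunk, which also carries aggregated usage when `stream_options.include_usage` is set. A minimal client-side sketch of the contract the diff below enforces (the base URL and model name are placeholders, not part of this commit):

```python
# Sketch of the streaming contract this commit aligns with, using the
# official openai client. Endpoint and model name are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

stream = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Say hi"}],
    stream=True,
    stream_options={"include_usage": True},
)

for chunk in stream:
    if chunk.choices:
        # In-progress chunks report finish_reason=None (JSON null), never "".
        # Exactly one chunk carries the terminal value, e.g. "stop",
        # "length", or "tool_calls".
        print(chunk.choices[0].finish_reason)
    if chunk.usage:
        # With stream_options.include_usage, the last chunk also carries
        # the aggregated token usage.
        print(chunk.usage.total_tokens)
```
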
@@ -645,7 +645,7 @@ def v1_generate_response(
                 "index": 0,
                 "text": text,
                 "logprobs": logprobs,
-                "finish_reason": (finish_reason["type"] if finish_reason else ""),
+                "finish_reason": finish_reason["type"] if finish_reason else None,
                 "matched_stop": (
                     finish_reason["matched"]
                     if finish_reason and "matched" in finish_reason
@@ -657,7 +657,7 @@ def v1_generate_response(
                 index=idx,
                 text=text,
                 logprobs=logprobs,
-                finish_reason=(finish_reason["type"] if finish_reason else ""),
+                finish_reason=finish_reason["type"] if finish_reason else None,
                 matched_stop=(
                     finish_reason["matched"]
                     if finish_reason and "matched" in finish_reason
@@ -805,7 +805,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
                         index=index,
                         text=delta,
                         logprobs=logprobs,
-                        finish_reason=(finish_reason["type"] if finish_reason else ""),
+                        finish_reason=finish_reason["type"] if finish_reason else None,
                         matched_stop=(
                             finish_reason["matched"]
                             if finish_reason and "matched" in finish_reason
@@ -1216,7 +1216,7 @@ def v1_chat_generate_response(
                     "reasoning_content": reasoning_text if reasoning_text else None,
                 },
                 "logprobs": choice_logprobs.model_dump() if choice_logprobs else None,
-                "finish_reason": (finish_reason["type"] if finish_reason else ""),
+                "finish_reason": finish_reason["type"] if finish_reason else None,
                 "matched_stop": (
                     finish_reason["matched"]
                     if finish_reason and "matched" in finish_reason
@@ -1233,7 +1233,7 @@ def v1_chat_generate_response(
                     reasoning_content=reasoning_text if reasoning_text else None,
                 ),
                 logprobs=choice_logprobs,
-                finish_reason=(finish_reason["type"] if finish_reason else ""),
+                finish_reason=finish_reason["type"] if finish_reason else None,
                 matched_stop=(
                     finish_reason["matched"]
                     if finish_reason and "matched" in finish_reason
@@ -1377,23 +1377,11 @@ async def v1_chat_completions(
                     if is_first:
                         # First chunk with role
                         is_first = False
-                        if (
-                            tokenizer_manager.server_args.reasoning_parser
-                            and request.separate_reasoning
-                        ):
-                            delta = DeltaMessage(
-                                role="assistant", reasoning_content=None
-                            )
-                        else:
-                            delta = DeltaMessage(role="assistant", content=None)
+                        delta = DeltaMessage(role="assistant")
                         choice_data = ChatCompletionResponseStreamChoice(
                             index=index,
                             delta=delta,
-                            finish_reason=(
-                                None
-                                if finish_reason_type and len(finish_reason_type) == 0
-                                else finish_reason_type
-                            ),
+                            finish_reason=finish_reason_type,
                             matched_stop=(
                                 finish_reason["matched"]
                                 if finish_reason and "matched" in finish_reason
@@ -1434,12 +1422,7 @@ async def v1_chat_completions(
                                 reasoning_text if reasoning_text else None
                             )
                         ),
-                        finish_reason=(
-                            None
-                            if finish_reason_type
-                            and len(finish_reason_type) == 0
-                            else finish_reason_type
-                        ),
+                        finish_reason=finish_reason_type,
                     )
                     chunk = ChatCompletionStreamResponse(
                         id=content["meta_info"]["id"],
@@ -1471,12 +1454,7 @@ async def v1_chat_completions(
                         delta=DeltaMessage(
                             content=normal_text if normal_text else None
                         ),
-                        finish_reason=(
-                            None
-                            if finish_reason_type
-                            and len(finish_reason_type) == 0
-                            else finish_reason_type
-                        ),
+                        finish_reason=finish_reason_type,
                     )
                     chunk = ChatCompletionStreamResponse(
                         id=content["meta_info"]["id"],
@@ -1490,11 +1468,7 @@ async def v1_chat_completions(
                     for call_item in calls:
                         # transform call_item -> FunctionResponse + ToolCall
-                        if (
-                            content["meta_info"]["finish_reason"]
-                            and content["meta_info"]["finish_reason"]["type"]
-                            == "stop"
-                        ):
+                        if finish_reason_type == "stop":
                             latest_delta_len = 0
                             if isinstance(call_item.parameters, str):
                                 latest_delta_len = len(call_item.parameters)
@@ -1515,6 +1489,8 @@ async def v1_chat_completions(
                             )
                             call_item.parameters = remaining_call
 
+                            finish_reason_type = "tool_calls"
+
                         tool_call = ToolCall(
                             id=str(call_item.tool_index),
                             function=FunctionResponse(
@@ -1524,10 +1500,13 @@ async def v1_chat_completions(
                             )
                         )
                         choice_data = ChatCompletionResponseStreamChoice(
                             index=index,
-                            delta=DeltaMessage(
-                                role="assistant", tool_calls=[tool_call]
-                            ),
-                            finish_reason="tool_call",
+                            delta=DeltaMessage(tool_calls=[tool_call]),
+                            finish_reason=(
+                                None
+                                if request.stream_options
+                                and request.stream_options.include_usage
+                                else finish_reason_type
+                            ),  # an additional chunk will be returned
                         )
                         chunk = ChatCompletionStreamResponse(
                             id=content["meta_info"]["id"],
@@ -1542,30 +1521,44 @@ async def v1_chat_completions(
                     else:
                         # No tool calls => just treat this as normal text
-                        choice_data = ChatCompletionResponseStreamChoice(
-                            index=index,
-                            delta=DeltaMessage(content=delta if delta else None),
-                            finish_reason=(
-                                None
-                                if finish_reason_type and len(finish_reason_type) == 0
-                                else finish_reason_type
-                            ),
-                            matched_stop=(
-                                finish_reason["matched"]
-                                if finish_reason and "matched" in finish_reason
-                                else None
-                            ),
-                            logprobs=choice_logprobs,
-                        )
-                        chunk = ChatCompletionStreamResponse(
-                            id=content["meta_info"]["id"],
-                            created=created,
-                            choices=[choice_data],
-                            model=request.model,
-                        )
-                        yield f"data: {chunk.model_dump_json()}\n\n"
-                        stream_buffers[index] = new_stream_buffer
-                        is_firsts[index] = is_first
+                        if delta or not (
+                            request.stream_options
+                            and request.stream_options.include_usage
+                        ):
+                            choice_data = ChatCompletionResponseStreamChoice(
+                                index=index,
+                                delta=DeltaMessage(content=delta if delta else None),
+                                finish_reason=(
+                                    None
+                                    if request.stream_options
+                                    and request.stream_options.include_usage
+                                    else finish_reason_type
+                                ),
+                                matched_stop=(
+                                    finish_reason["matched"]
+                                    if finish_reason and "matched" in finish_reason
+                                    else None
+                                ),
+                                logprobs=choice_logprobs,
+                            )
+                            chunk = ChatCompletionStreamResponse(
+                                id=content["meta_info"]["id"],
+                                created=created,
+                                choices=[choice_data],
+                                model=request.model,
+                            )
+                            yield f"data: {chunk.model_dump_json()}\n\n"
+                        stream_buffers[index] = new_stream_buffer
+                        is_firsts[index] = is_first
+
+            if finish_reason_type == "stop" and request.tool_choice != "none":
+                parser = FunctionCallParser(
+                    tools=request.tools,
+                    tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
+                )
+                if parser.has_tool_call(new_stream_buffer):
+                    # if the stream ends with an empty string after tool calls
+                    finish_reason_type = "tool_calls"
 
             if request.stream_options and request.stream_options.include_usage:
                 total_prompt_tokens = sum(
                     tokens
@@ -1590,17 +1583,22 @@ async def v1_chat_completions(
                     prompt_tokens_details=prompt_tokens_details,
                 )
-                final_usage_chunk = ChatCompletionStreamResponse(
-                    id=content["meta_info"]["id"],
-                    created=created,
-                    choices=[],
-                    model=request.model,
-                    usage=usage,
-                )
-                final_usage_data = final_usage_chunk.model_dump_json(
-                    exclude_none=True
-                )
-                yield f"data: {final_usage_data}\n\n"
+            else:
+                usage = None
+            final_usage_chunk = ChatCompletionStreamResponse(
+                id=content["meta_info"]["id"],
+                created=created,
+                choices=[
+                    ChatCompletionResponseStreamChoice(
+                        index=index,
+                        delta=DeltaMessage(),
+                        finish_reason=finish_reason_type,
+                    )
+                ],
+                model=request.model,
+                usage=usage,
+            )
+            yield f"data: {final_usage_chunk.model_dump_json()}\n\n"
         except ValueError as e:
             error = create_streaming_error_response(str(e))
             yield f"data: {error}\n\n"
@@ -187,7 +187,7 @@ class CompletionResponseChoice(BaseModel):
     index: int
     text: str
     logprobs: Optional[LogProbs] = None
-    finish_reason: Optional[str] = None
+    finish_reason: Literal["stop", "length", "content_filter"]
     matched_stop: Union[None, int, str] = None
@@ -204,7 +204,7 @@ class CompletionResponseStreamChoice(BaseModel):
     index: int
     text: str
     logprobs: Optional[LogProbs] = None
-    finish_reason: Optional[str] = None
+    finish_reason: Optional[Literal["stop", "length", "content_filter"]] = None
     matched_stop: Union[None, int, str] = None
@@ -387,7 +387,9 @@ class ChatCompletionResponseChoice(BaseModel):
     index: int
     message: ChatMessage
     logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
-    finish_reason: str
+    finish_reason: Literal[
+        "stop", "length", "tool_calls", "content_filter", "function_call"
+    ]
     matched_stop: Union[None, int, str] = None
@@ -411,7 +413,9 @@ class ChatCompletionResponseStreamChoice(BaseModel):
     index: int
     delta: DeltaMessage
     logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
-    finish_reason: Optional[str] = None
+    finish_reason: Optional[
+        Literal["stop", "length", "tool_calls", "content_filter", "function_call"]
+    ] = None
     matched_stop: Union[None, int, str] = None
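
The protocol-side change narrows `finish_reason` from a free-form `str` to the closed set the OpenAI API documents, so an out-of-range value (including the old `""` sentinel) now fails Pydantic validation when the response object is built. A minimal standalone illustration (`StreamChoice` is a hypothetical stand-in, not a class from this diff):

```python
from typing import Literal, Optional

from pydantic import BaseModel, ValidationError


class StreamChoice(BaseModel):  # hypothetical stand-in for the classes above
    index: int
    finish_reason: Optional[
        Literal["stop", "length", "tool_calls", "content_filter", "function_call"]
    ] = None


StreamChoice(index=0, finish_reason=None)          # ok: stream still running
StreamChoice(index=0, finish_reason="tool_calls")  # ok: terminal chunk
try:
    StreamChoice(index=0, finish_reason="")        # the old "" sentinel now fails
except ValidationError as e:
    print(e)
```
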
@@ -258,7 +258,12 @@ class TestOpenAIServer(CustomTestCase):
                     ret_num_top_logprobs == logprobs
                 ), f"{ret_num_top_logprobs} vs {logprobs}"
-            assert isinstance(data.content, str) or response.choices[0].finish_reason
+            assert (
+                isinstance(data.content, str)
+                or isinstance(data.reasoning_content, str)
+                or len(data.tool_calls) > 0
+                or response.choices[0].finish_reason
+            )
             assert response.id
             assert response.created