"vscode:/vscode.git/clone" did not exist on "fee1d5813c3dff0d31d4c221bcb6252a69c80236"
Unverified commit d9ab1ad9, authored by Harry Mellor, committed by GitHub
Browse files

`reasoning_content` -> `reasoning` (#27752)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 608bb144
...@@ -28,49 +28,49 @@ def seedoss_tokenizer(): ...@@ -28,49 +28,49 @@ def seedoss_tokenizer():
SIMPLE_REASONING: dict[str, Any] = { SIMPLE_REASONING: dict[str, Any] = {
"output": "This is a reasoning section</seed:think>This is the rest", "output": "This is a reasoning section</seed:think>This is the rest",
"reasoning_content": "This is a reasoning section", "reasoning": "This is a reasoning section",
"content": "This is the rest", "content": "This is the rest",
"is_reasoning_end": True, "is_reasoning_end": True,
} }
COMPLETE_REASONING: dict[str, Any] = { COMPLETE_REASONING: dict[str, Any] = {
"output": "This is a reasoning section</seed:think>", "output": "This is a reasoning section</seed:think>",
"reasoning_content": "This is a reasoning section", "reasoning": "This is a reasoning section",
"content": None, "content": None,
"is_reasoning_end": True, "is_reasoning_end": True,
} }
NO_CONTENT: dict[str, Any] = { NO_CONTENT: dict[str, Any] = {
"output": "This is content", "output": "This is content",
"reasoning_content": "This is content", "reasoning": "This is content",
"content": None, "content": None,
"is_reasoning_end": False, "is_reasoning_end": False,
} }
NO_REASONING_STREAMING: dict[str, Any] = { NO_REASONING_STREAMING: dict[str, Any] = {
"output": "This is a reasoning section", "output": "This is a reasoning section",
"reasoning_content": "This is a reasoning section", "reasoning": "This is a reasoning section",
"content": None, "content": None,
"is_reasoning_end": False, "is_reasoning_end": False,
} }
MULTIPLE_LINES: dict[str, Any] = { MULTIPLE_LINES: dict[str, Any] = {
"output": "This\nThat</seed:think>This is the rest\nThat", "output": "This\nThat</seed:think>This is the rest\nThat",
"reasoning_content": "This\nThat", "reasoning": "This\nThat",
"content": "This is the rest\nThat", "content": "This is the rest\nThat",
"is_reasoning_end": True, "is_reasoning_end": True,
} }
WITH_START_TOKEN: dict[str, Any] = { WITH_START_TOKEN: dict[str, Any] = {
"output": ("<seed:think>This is a reasoning section</seed:think>This is the rest"), "output": ("<seed:think>This is a reasoning section</seed:think>This is the rest"),
"reasoning_content": "This is a reasoning section", "reasoning": "This is a reasoning section",
"content": "This is the rest", "content": "This is the rest",
"is_reasoning_end": True, "is_reasoning_end": True,
} }
ONLY_END_TOKEN: dict[str, Any] = { ONLY_END_TOKEN: dict[str, Any] = {
"output": "Some reasoning</seed:think>This is the rest", "output": "Some reasoning</seed:think>This is the rest",
"reasoning_content": "Some reasoning", "reasoning": "Some reasoning",
"content": "This is the rest", "content": "This is the rest",
"is_reasoning_end": True, "is_reasoning_end": True,
} }
NO_TOKENS: dict[str, Any] = { NO_TOKENS: dict[str, Any] = {
"output": "This is just content without any reasoning tokens", "output": "This is just content without any reasoning tokens",
"reasoning_content": "This is just content without any reasoning tokens", "reasoning": "This is just content without any reasoning tokens",
"content": None, "content": None,
"is_reasoning_end": False, "is_reasoning_end": False,
} }
...@@ -95,7 +95,7 @@ def test_simple_reasoning(seedoss_tokenizer, streaming): ...@@ -95,7 +95,7 @@ def test_simple_reasoning(seedoss_tokenizer, streaming):
parser, [cast(str, SIMPLE_REASONING["output"])], streaming=streaming parser, [cast(str, SIMPLE_REASONING["output"])], streaming=streaming
) )
assert reasoning == SIMPLE_REASONING["reasoning_content"] assert reasoning == SIMPLE_REASONING["reasoning"]
assert content == SIMPLE_REASONING["content"] assert content == SIMPLE_REASONING["content"]
...@@ -109,7 +109,7 @@ def test_complete_reasoning(seedoss_tokenizer, streaming): ...@@ -109,7 +109,7 @@ def test_complete_reasoning(seedoss_tokenizer, streaming):
parser, [cast(str, COMPLETE_REASONING["output"])], streaming=streaming parser, [cast(str, COMPLETE_REASONING["output"])], streaming=streaming
) )
assert reasoning == COMPLETE_REASONING["reasoning_content"] assert reasoning == COMPLETE_REASONING["reasoning"]
assert content == COMPLETE_REASONING["content"] assert content == COMPLETE_REASONING["content"]
...@@ -123,7 +123,7 @@ def test_no_content(seedoss_tokenizer, streaming): ...@@ -123,7 +123,7 @@ def test_no_content(seedoss_tokenizer, streaming):
parser, [cast(str, NO_CONTENT["output"])], streaming=streaming parser, [cast(str, NO_CONTENT["output"])], streaming=streaming
) )
assert reasoning == NO_CONTENT["reasoning_content"] assert reasoning == NO_CONTENT["reasoning"]
assert content == NO_CONTENT["content"] assert content == NO_CONTENT["content"]
...@@ -137,7 +137,7 @@ def test_multiple_lines(seedoss_tokenizer, streaming): ...@@ -137,7 +137,7 @@ def test_multiple_lines(seedoss_tokenizer, streaming):
parser, [cast(str, MULTIPLE_LINES["output"])], streaming=streaming parser, [cast(str, MULTIPLE_LINES["output"])], streaming=streaming
) )
assert reasoning == MULTIPLE_LINES["reasoning_content"] assert reasoning == MULTIPLE_LINES["reasoning"]
assert content == MULTIPLE_LINES["content"] assert content == MULTIPLE_LINES["content"]
...@@ -151,7 +151,7 @@ def test_with_start_token(seedoss_tokenizer, streaming): ...@@ -151,7 +151,7 @@ def test_with_start_token(seedoss_tokenizer, streaming):
parser, [cast(str, WITH_START_TOKEN["output"])], streaming=streaming parser, [cast(str, WITH_START_TOKEN["output"])], streaming=streaming
) )
assert reasoning == WITH_START_TOKEN["reasoning_content"] assert reasoning == WITH_START_TOKEN["reasoning"]
assert content == WITH_START_TOKEN["content"] assert content == WITH_START_TOKEN["content"]
...@@ -168,7 +168,7 @@ def test_only_end_token(seedoss_tokenizer, streaming): ...@@ -168,7 +168,7 @@ def test_only_end_token(seedoss_tokenizer, streaming):
parser, [cast(str, ONLY_END_TOKEN["output"])], streaming=streaming parser, [cast(str, ONLY_END_TOKEN["output"])], streaming=streaming
) )
assert reasoning == ONLY_END_TOKEN["reasoning_content"] assert reasoning == ONLY_END_TOKEN["reasoning"]
assert content == ONLY_END_TOKEN["content"] assert content == ONLY_END_TOKEN["content"]
...@@ -182,7 +182,7 @@ def test_no_tokens(seedoss_tokenizer, streaming): ...@@ -182,7 +182,7 @@ def test_no_tokens(seedoss_tokenizer, streaming):
parser, [cast(str, NO_TOKENS["output"])], streaming=streaming parser, [cast(str, NO_TOKENS["output"])], streaming=streaming
) )
assert reasoning == NO_TOKENS["reasoning_content"] assert reasoning == NO_TOKENS["reasoning"]
assert content == NO_TOKENS["content"] assert content == NO_TOKENS["content"]
......
...@@ -9,25 +9,28 @@ from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer ...@@ -9,25 +9,28 @@ from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
class StreamingReasoningReconstructor: class StreamingReasoningReconstructor:
def __init__(self): def __init__(self):
self.reasoning_content = None self.reasoning = None
self.other_content = None self.other_content = None
def append_delta(self, delta: DeltaMessage): def append_delta(self, delta: DeltaMessage):
# content and the reasoning content should not be present # content and the reasoning content should not be present
# at the same time # at the same time
assert delta.content is None or delta.reasoning_content is None, ( assert delta.content is None or delta.reasoning is None, (
"Both content and reasoning content are present in the delta message" "Both content and reasoning content are present in the delta message"
) )
assert delta.reasoning == delta.reasoning_content, (
"reasoning_content should be present for backwards compatibility"
)
if delta.content is not None: if delta.content is not None:
if self.other_content is None: if self.other_content is None:
self.other_content = delta.content self.other_content = delta.content
else: else:
self.other_content += delta.content self.other_content += delta.content
else: else:
if self.reasoning_content is None: if self.reasoning is None:
self.reasoning_content = delta.reasoning_content self.reasoning = delta.reasoning
else: else:
self.reasoning_content += delta.reasoning_content self.reasoning += delta.reasoning
def run_reasoning_extraction( def run_reasoning_extraction(
...@@ -43,7 +46,7 @@ def run_reasoning_extraction( ...@@ -43,7 +46,7 @@ def run_reasoning_extraction(
request, request,
) )
return ( return (
reconstructor.reasoning_content, reconstructor.reasoning,
reconstructor.other_content or None, reconstructor.other_content or None,
) )
else: else:
...@@ -69,7 +72,7 @@ def run_reasoning_extraction_mistral( ...@@ -69,7 +72,7 @@ def run_reasoning_extraction_mistral(
request, request,
) )
return ( return (
reconstructor.reasoning_content, reconstructor.reasoning,
reconstructor.other_content or None, reconstructor.other_content or None,
) )
else: else:
...@@ -88,7 +91,7 @@ def run_reasoning_extraction_nonstreaming( ...@@ -88,7 +91,7 @@ def run_reasoning_extraction_nonstreaming(
request: ChatCompletionRequest | None = None, request: ChatCompletionRequest | None = None,
) -> tuple[str | None, str | None]: ) -> tuple[str | None, str | None]:
request = request or ChatCompletionRequest(messages=[], model="test-model") request = request or ChatCompletionRequest(messages=[], model="test-model")
return reasoning_parser.extract_reasoning_content( return reasoning_parser.extract_reasoning(
model_output="".join(model_output), request=request model_output="".join(model_output), request=request
) )
...@@ -110,7 +113,7 @@ def run_reasoning_extraction_streaming( ...@@ -110,7 +113,7 @@ def run_reasoning_extraction_streaming(
] ]
current_text = previous_text + delta current_text = previous_text + delta
current_tokens = previous_tokens + token_delta current_tokens = previous_tokens + token_delta
delta_message = reasoning_parser.extract_reasoning_content_streaming( delta_message = reasoning_parser.extract_reasoning_streaming(
previous_text, previous_text,
current_text, current_text,
delta, delta,
...@@ -142,7 +145,7 @@ def run_reasoning_extraction_streaming_mistral( ...@@ -142,7 +145,7 @@ def run_reasoning_extraction_streaming_mistral(
delta = reasoning_parser.model_tokenizer.convert_ids_to_tokens([model_delta])[0] delta = reasoning_parser.model_tokenizer.convert_ids_to_tokens([model_delta])[0]
current_text = previous_text + delta current_text = previous_text + delta
current_tokens = previous_tokens + token_delta current_tokens = previous_tokens + token_delta
delta_message = reasoning_parser.extract_reasoning_content_streaming( delta_message = reasoning_parser.extract_reasoning_streaming(
previous_text, previous_text,
current_text, current_text,
delta, delta,
......
...@@ -102,7 +102,7 @@ def test_prepare_apply_chat_template_tools_and_messages( ...@@ -102,7 +102,7 @@ def test_prepare_apply_chat_template_tools_and_messages(
assert actual_request == expected_mistral_output assert actual_request == expected_mistral_output
# Tool use with list content and reasoning_content # Tool use with list content and reasoning
@pytest.mark.parametrize( @pytest.mark.parametrize(
"openai_request,expected_mistral_output", "openai_request,expected_mistral_output",
[ [
...@@ -115,7 +115,7 @@ def test_prepare_apply_chat_template_tools_and_messages( ...@@ -115,7 +115,7 @@ def test_prepare_apply_chat_template_tools_and_messages(
}, },
{ {
"role": "assistant", "role": "assistant",
"reasoning_content": None, "reasoning": None,
"content": None, "content": None,
"tool_calls": [ "tool_calls": [
{ {
......
...@@ -337,7 +337,7 @@ def test_extract_tool_calls_streaming_incremental( ...@@ -337,7 +337,7 @@ def test_extract_tool_calls_streaming_incremental(
if ( if (
delta_message.role is None delta_message.role is None
and delta_message.content is None and delta_message.content is None
and delta_message.reasoning_content is None and delta_message.reasoning is None
and len(delta_message.tool_calls) == 0 and len(delta_message.tool_calls) == 0
): ):
continue continue
......
...@@ -674,10 +674,10 @@ def test_structured_output_with_reasoning_matrices( ...@@ -674,10 +674,10 @@ def test_structured_output_with_reasoning_matrices(
assert output is not None and isinstance(output, RequestOutput) assert output is not None and isinstance(output, RequestOutput)
prompt = output.prompt prompt = output.prompt
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
reasoning_content, content = run_reasoning_extraction(reasoner, [generated_text]) reasoning, content = run_reasoning_extraction(reasoner, [generated_text])
print(f"Prompt: {prompt!r}\nReasoning: {reasoning_content!r}\nContent: {content!r}") print(f"Prompt: {prompt!r}\nReasoning: {reasoning!r}\nContent: {content!r}")
assert content is not None and reasoning_content is not None assert content is not None and reasoning is not None
output_json = json.loads(content) output_json = json.loads(content)
jsonschema.validate(instance=output_json, schema=reasoning_schema) jsonschema.validate(instance=output_json, schema=reasoning_schema)
......
...@@ -521,15 +521,15 @@ def parse_chat_output( ...@@ -521,15 +521,15 @@ def parse_chat_output(
is_tool_call = False # TODO: update this when tool call is supported is_tool_call = False # TODO: update this when tool call is supported
if len(output_msgs) == 0: if len(output_msgs) == 0:
# The generation has stopped during reasoning. # The generation has stopped during reasoning.
reasoning_content = parser.current_content reasoning = parser.current_content
final_content = None final_content = None
elif len(output_msgs) == 1: elif len(output_msgs) == 1:
# The generation has stopped during final message. # The generation has stopped during final message.
reasoning_content = output_msgs[0].content[0].text reasoning = output_msgs[0].content[0].text
final_content = parser.current_content final_content = parser.current_content
else: else:
reasoning_msg = output_msgs[:-1] reasoning_msg = output_msgs[:-1]
final_msg = output_msgs[-1] final_msg = output_msgs[-1]
reasoning_content = "\n".join([msg.content[0].text for msg in reasoning_msg]) reasoning = "\n".join([msg.content[0].text for msg in reasoning_msg])
final_content = final_msg.content[0].text final_content = final_msg.content[0].text
return reasoning_content, final_content, is_tool_call return reasoning, final_content, is_tool_call
...@@ -2102,7 +2102,15 @@ class ChatMessage(OpenAIBaseModel): ...@@ -2102,7 +2102,15 @@ class ChatMessage(OpenAIBaseModel):
tool_calls: list[ToolCall] = Field(default_factory=list) tool_calls: list[ToolCall] = Field(default_factory=list)
# vLLM-specific fields that are not in OpenAI spec # vLLM-specific fields that are not in OpenAI spec
reasoning: str | None = None
reasoning_content: str | None = None reasoning_content: str | None = None
"""Deprecated: use `reasoning` instead."""
@model_validator(mode="after")
def handle_deprecated_reasoning_content(self):
"""Copy reasoning to reasoning_content for backward compatibility."""
self.reasoning_content = self.reasoning
return self
class ChatCompletionLogProb(OpenAIBaseModel): class ChatCompletionLogProb(OpenAIBaseModel):
...@@ -2156,9 +2164,17 @@ class ChatCompletionResponse(OpenAIBaseModel): ...@@ -2156,9 +2164,17 @@ class ChatCompletionResponse(OpenAIBaseModel):
class DeltaMessage(OpenAIBaseModel): class DeltaMessage(OpenAIBaseModel):
role: str | None = None role: str | None = None
content: str | None = None content: str | None = None
reasoning: str | None = None
reasoning_content: str | None = None reasoning_content: str | None = None
"""Deprecated: use `reasoning` instead."""
tool_calls: list[DeltaToolCall] = Field(default_factory=list) tool_calls: list[DeltaToolCall] = Field(default_factory=list)
@model_validator(mode="after")
def handle_deprecated_reasoning_content(self):
"""Copy reasoning to reasoning_content for backward compatibility."""
self.reasoning_content = self.reasoning
return self
class ChatCompletionResponseStreamChoice(OpenAIBaseModel): class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
index: int index: int
......
...@@ -759,9 +759,7 @@ class OpenAIServingChat(OpenAIServing): ...@@ -759,9 +759,7 @@ class OpenAIServingChat(OpenAIServing):
delta_message = DeltaMessage(content=delta_text) delta_message = DeltaMessage(content=delta_text)
elif cur_channel == "analysis": elif cur_channel == "analysis":
if request.include_reasoning: if request.include_reasoning:
delta_message = DeltaMessage( delta_message = DeltaMessage(reasoning=delta_text)
reasoning_content=delta_text
)
else: else:
delta_message = None delta_message = None
elif ( elif (
...@@ -823,7 +821,7 @@ class OpenAIServingChat(OpenAIServing): ...@@ -823,7 +821,7 @@ class OpenAIServingChat(OpenAIServing):
): ):
assert reasoning_parser is not None assert reasoning_parser is not None
delta_message = ( delta_message = (
reasoning_parser.extract_reasoning_content_streaming( reasoning_parser.extract_reasoning_streaming(
previous_text, previous_text,
current_text, current_text,
delta_text, delta_text,
...@@ -836,7 +834,7 @@ class OpenAIServingChat(OpenAIServing): ...@@ -836,7 +834,7 @@ class OpenAIServingChat(OpenAIServing):
# or think end id in prompt_token_ids # or think end id in prompt_token_ids
# i.e {"enable_thinking": False}, # i.e {"enable_thinking": False},
# set reasoning status to end. # set reasoning status to end.
# Only keep 'content', remove 'reasoning_content'. # Only keep 'content', remove 'reasoning'.
if reasoning_parser.is_reasoning_end( if reasoning_parser.is_reasoning_end(
as_list(output.token_ids) as_list(output.token_ids)
) or ( ) or (
...@@ -899,7 +897,7 @@ class OpenAIServingChat(OpenAIServing): ...@@ -899,7 +897,7 @@ class OpenAIServingChat(OpenAIServing):
if self.reasoning_parser and not reasoning_end_arr[i]: if self.reasoning_parser and not reasoning_end_arr[i]:
delta_message = ( delta_message = (
reasoning_parser.extract_reasoning_content_streaming( reasoning_parser.extract_reasoning_streaming(
previous_text, previous_text,
current_text, current_text,
delta_text, delta_text,
...@@ -948,7 +946,7 @@ class OpenAIServingChat(OpenAIServing): ...@@ -948,7 +946,7 @@ class OpenAIServingChat(OpenAIServing):
output_token_ids = as_list(output.token_ids) output_token_ids = as_list(output.token_ids)
if not reasoning_end_arr[i]: if not reasoning_end_arr[i]:
delta_message = ( delta_message = (
reasoning_parser.extract_reasoning_content_streaming( reasoning_parser.extract_reasoning_streaming(
previous_text, previous_text,
current_text, current_text,
delta_text, delta_text,
...@@ -961,7 +959,7 @@ class OpenAIServingChat(OpenAIServing): ...@@ -961,7 +959,7 @@ class OpenAIServingChat(OpenAIServing):
# i.e {"enable_thinking": False}, # i.e {"enable_thinking": False},
# set reasoning status to end. # set reasoning status to end.
# Remove the text and token ids related # Remove the text and token ids related
# to 'reasoning_content'. # to 'reasoning'.
if ( if (
res.prompt_token_ids res.prompt_token_ids
and reasoning_parser.is_reasoning_end( and reasoning_parser.is_reasoning_end(
...@@ -978,7 +976,7 @@ class OpenAIServingChat(OpenAIServing): ...@@ -978,7 +976,7 @@ class OpenAIServingChat(OpenAIServing):
# When encountering think end id in delta_token_ids, # When encountering think end id in delta_token_ids,
# set reasoning status to end. # set reasoning status to end.
# Remove the text and token ids related # Remove the text and token ids related
# to 'reasoning_content'. # to 'reasoning'.
if reasoning_parser.is_reasoning_end(output_token_ids): if reasoning_parser.is_reasoning_end(output_token_ids):
reasoning_end_arr[i] = True reasoning_end_arr[i] = True
current_token_ids = ( current_token_ids = (
...@@ -1033,15 +1031,13 @@ class OpenAIServingChat(OpenAIServing): ...@@ -1033,15 +1031,13 @@ class OpenAIServingChat(OpenAIServing):
# when only reasoning # when only reasoning
elif self.reasoning_parser: elif self.reasoning_parser:
delta_message = ( delta_message = reasoning_parser.extract_reasoning_streaming(
reasoning_parser.extract_reasoning_content_streaming( previous_text,
previous_text, current_text,
current_text, delta_text,
delta_text, previous_token_ids,
previous_token_ids, current_token_ids,
current_token_ids, output.token_ids,
output.token_ids,
)
) )
# handle streaming just a content delta # handle streaming just a content delta
else: else:
...@@ -1334,9 +1330,9 @@ class OpenAIServingChat(OpenAIServing): ...@@ -1334,9 +1330,9 @@ class OpenAIServingChat(OpenAIServing):
logprobs = None logprobs = None
if self.use_harmony: if self.use_harmony:
reasoning_content, content, _ = parse_chat_output(token_ids) reasoning, content, _ = parse_chat_output(token_ids)
if not request.include_reasoning: if not request.include_reasoning:
reasoning_content = None reasoning = None
if self.tool_parser is not None: if self.tool_parser is not None:
tool_parser = self.tool_parser(tokenizer) tool_parser = self.tool_parser(tokenizer)
...@@ -1349,14 +1345,14 @@ class OpenAIServingChat(OpenAIServing): ...@@ -1349,14 +1345,14 @@ class OpenAIServingChat(OpenAIServing):
content = tool_call_info.content content = tool_call_info.content
message = ChatMessage( message = ChatMessage(
role=role, role=role,
reasoning_content=reasoning_content, reasoning=reasoning,
content=content, content=content,
tool_calls=tool_call_info.tool_calls, tool_calls=tool_call_info.tool_calls,
) )
else: else:
message = ChatMessage( message = ChatMessage(
role=role, role=role,
reasoning_content=reasoning_content, reasoning=reasoning,
content=content, content=content,
) )
...@@ -1390,13 +1386,13 @@ class OpenAIServingChat(OpenAIServing): ...@@ -1390,13 +1386,13 @@ class OpenAIServingChat(OpenAIServing):
return self.create_error_response(str(e)) return self.create_error_response(str(e))
# If the reasoning parser is enabled, # If the reasoning parser is enabled,
# tool calls are extracted exclusively from the content. # tool calls are extracted exclusively from the content.
reasoning_content, content = reasoning_parser.extract_reasoning_content( reasoning, content = reasoning_parser.extract_reasoning(
output.text, request=request output.text, request=request
) )
if not request.include_reasoning: if not request.include_reasoning:
reasoning_content = None reasoning = None
else: else:
reasoning_content = None reasoning = None
content = output.text content = output.text
auto_tools_called = False auto_tools_called = False
...@@ -1416,9 +1412,7 @@ class OpenAIServingChat(OpenAIServing): ...@@ -1416,9 +1412,7 @@ class OpenAIServingChat(OpenAIServing):
not isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam) not isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam)
and request.tool_choice != "required" and request.tool_choice != "required"
): ):
message = ChatMessage( message = ChatMessage(role=role, reasoning=reasoning, content=content)
role=role, reasoning_content=reasoning_content, content=content
)
# if the request uses tools and specified a tool choice # if the request uses tools and specified a tool choice
elif ( elif (
...@@ -1428,7 +1422,7 @@ class OpenAIServingChat(OpenAIServing): ...@@ -1428,7 +1422,7 @@ class OpenAIServingChat(OpenAIServing):
assert tool_calls is not None and len(tool_calls) > 0 assert tool_calls is not None and len(tool_calls) > 0
message = ChatMessage( message = ChatMessage(
role=role, role=role,
reasoning_content=reasoning_content, reasoning=reasoning,
content="", content="",
tool_calls=[tool_call_class(function=tc) for tc in tool_calls], tool_calls=[tool_call_class(function=tc) for tc in tool_calls],
) )
...@@ -1452,15 +1446,13 @@ class OpenAIServingChat(OpenAIServing): ...@@ -1452,15 +1446,13 @@ class OpenAIServingChat(OpenAIServing):
role=role, role=role,
content="", content="",
tool_calls=tool_call_class_items, tool_calls=tool_call_class_items,
reasoning_content=reasoning_content, reasoning=reasoning,
) )
# if the request doesn't use tool choice # if the request doesn't use tool choice
# OR specifies to not use a tool # OR specifies to not use a tool
elif not request.tool_choice or request.tool_choice == "none": elif not request.tool_choice or request.tool_choice == "none":
message = ChatMessage( message = ChatMessage(role=role, reasoning=reasoning, content=content)
role=role, reasoning_content=reasoning_content, content=content
)
# handle when there are tools and tool choice is auto # handle when there are tools and tool choice is auto
elif ( elif (
...@@ -1476,7 +1468,7 @@ class OpenAIServingChat(OpenAIServing): ...@@ -1476,7 +1468,7 @@ class OpenAIServingChat(OpenAIServing):
if tool_calls: if tool_calls:
message = ChatMessage( message = ChatMessage(
role=role, role=role,
reasoning_content=reasoning_content, reasoning=reasoning,
content=content, content=content,
tool_calls=[ tool_calls=[
ToolCall( ToolCall(
...@@ -1498,7 +1490,7 @@ class OpenAIServingChat(OpenAIServing): ...@@ -1498,7 +1490,7 @@ class OpenAIServingChat(OpenAIServing):
ret_content = content ret_content = content
message = ChatMessage( message = ChatMessage(
role=role, role=role,
reasoning_content=reasoning_content, reasoning=reasoning,
content=ret_content, content=ret_content,
) )
...@@ -1509,9 +1501,7 @@ class OpenAIServingChat(OpenAIServing): ...@@ -1509,9 +1501,7 @@ class OpenAIServingChat(OpenAIServing):
" if tools should be extracted. Returning a standard chat " " if tools should be extracted. Returning a standard chat "
"completion." "completion."
) )
message = ChatMessage( message = ChatMessage(role=role, reasoning=reasoning, content=content)
role=role, reasoning_content=reasoning_content, content=content
)
# In OpenAI's API, when a tool is called, the finish_reason is: # In OpenAI's API, when a tool is called, the finish_reason is:
# "tool_calls" for "auto" or "required" tool calls, # "tool_calls" for "auto" or "required" tool calls,
# and "stop" for named tool calls. # and "stop" for named tool calls.
......
...@@ -778,11 +778,11 @@ class OpenAIServingResponses(OpenAIServing): ...@@ -778,11 +778,11 @@ class OpenAIServingResponses(OpenAIServing):
logger.exception("Error in reasoning parser creation.") logger.exception("Error in reasoning parser creation.")
raise e raise e
reasoning_content, content = reasoning_parser.extract_reasoning_content( reasoning, content = reasoning_parser.extract_reasoning(
final_output.text, request=request final_output.text, request=request
) )
else: else:
reasoning_content = None reasoning = None
content = final_output.text content = final_output.text
# Log complete response if output logging is enabled # Log complete response if output logging is enabled
...@@ -790,8 +790,8 @@ class OpenAIServingResponses(OpenAIServing): ...@@ -790,8 +790,8 @@ class OpenAIServingResponses(OpenAIServing):
output_text = "" output_text = ""
if content: if content:
output_text = content output_text = content
elif reasoning_content: elif reasoning:
output_text = f"[reasoning: {reasoning_content}]" output_text = f"[reasoning: {reasoning}]"
if output_text: if output_text:
self.request_logger.log_outputs( self.request_logger.log_outputs(
...@@ -805,15 +805,13 @@ class OpenAIServingResponses(OpenAIServing): ...@@ -805,15 +805,13 @@ class OpenAIServingResponses(OpenAIServing):
reasoning_item = None reasoning_item = None
message_item = None message_item = None
if reasoning_content: if reasoning:
reasoning_item = ResponseReasoningItem( reasoning_item = ResponseReasoningItem(
id=f"rs_{random_uuid()}", id=f"rs_{random_uuid()}",
summary=[], summary=[],
type="reasoning", type="reasoning",
content=[ content=[
ResponseReasoningTextContent( ResponseReasoningTextContent(text=reasoning, type="reasoning_text")
text=reasoning_content, type="reasoning_text"
)
], ],
status=None, # NOTE: Only the last output item has status. status=None, # NOTE: Only the last output item has status.
) )
...@@ -1208,15 +1206,13 @@ class OpenAIServingResponses(OpenAIServing): ...@@ -1208,15 +1206,13 @@ class OpenAIServingResponses(OpenAIServing):
if ctx.last_output.outputs: if ctx.last_output.outputs:
output = ctx.last_output.outputs[0] output = ctx.last_output.outputs[0]
if reasoning_parser: if reasoning_parser:
delta_message = ( delta_message = reasoning_parser.extract_reasoning_streaming(
reasoning_parser.extract_reasoning_content_streaming( previous_text=previous_text,
previous_text=previous_text, current_text=previous_text + output.text,
current_text=previous_text + output.text, delta_text=output.text,
delta_text=output.text, previous_token_ids=previous_token_ids,
previous_token_ids=previous_token_ids, current_token_ids=previous_token_ids + output.token_ids,
current_token_ids=previous_token_ids + output.token_ids, delta_token_ids=output.token_ids,
delta_token_ids=output.token_ids,
)
) )
else: else:
delta_message = DeltaMessage( delta_message = DeltaMessage(
...@@ -1228,7 +1224,7 @@ class OpenAIServingResponses(OpenAIServing): ...@@ -1228,7 +1224,7 @@ class OpenAIServingResponses(OpenAIServing):
continue continue
if not first_delta_sent: if not first_delta_sent:
current_item_id = str(uuid.uuid4()) current_item_id = str(uuid.uuid4())
if delta_message.reasoning_content: if delta_message.reasoning:
yield _increment_sequence_number_and_return( yield _increment_sequence_number_and_return(
ResponseOutputItemAddedEvent( ResponseOutputItemAddedEvent(
type="response.output_item.added", type="response.output_item.added",
...@@ -1280,15 +1276,15 @@ class OpenAIServingResponses(OpenAIServing): ...@@ -1280,15 +1276,15 @@ class OpenAIServingResponses(OpenAIServing):
# same as content or reasoning content # same as content or reasoning content
if ( if (
previous_delta_messages previous_delta_messages
and previous_delta_messages[-1].reasoning_content is not None and previous_delta_messages[-1].reasoning is not None
and delta_message.content is not None and delta_message.content is not None
): ):
# from reasoning to normal content, send done # from reasoning to normal content, send done
# event for reasoning # event for reasoning
reason_content = "".join( reason_content = "".join(
pm.reasoning_content pm.reasoning
for pm in previous_delta_messages for pm in previous_delta_messages
if pm.reasoning_content is not None if pm.reasoning is not None
) )
yield _increment_sequence_number_and_return( yield _increment_sequence_number_and_return(
ResponseReasoningTextDoneEvent( ResponseReasoningTextDoneEvent(
...@@ -1356,7 +1352,7 @@ class OpenAIServingResponses(OpenAIServing): ...@@ -1356,7 +1352,7 @@ class OpenAIServingResponses(OpenAIServing):
# reset previous delta messages # reset previous delta messages
previous_delta_messages = [] previous_delta_messages = []
if delta_message.reasoning_content is not None: if delta_message.reasoning is not None:
yield _increment_sequence_number_and_return( yield _increment_sequence_number_and_return(
ResponseReasoningTextDeltaEvent( ResponseReasoningTextDeltaEvent(
type="response.reasoning_text.delta", type="response.reasoning_text.delta",
...@@ -1364,7 +1360,7 @@ class OpenAIServingResponses(OpenAIServing): ...@@ -1364,7 +1360,7 @@ class OpenAIServingResponses(OpenAIServing):
content_index=current_content_index, content_index=current_content_index,
output_index=current_output_index, output_index=current_output_index,
item_id=current_item_id, item_id=current_item_id,
delta=delta_message.reasoning_content, delta=delta_message.reasoning,
) )
) )
elif delta_message.content is not None: elif delta_message.content is not None:
...@@ -1392,11 +1388,11 @@ class OpenAIServingResponses(OpenAIServing): ...@@ -1392,11 +1388,11 @@ class OpenAIServingResponses(OpenAIServing):
previous_delta_messages.append(delta_message) previous_delta_messages.append(delta_message)
if previous_delta_messages: if previous_delta_messages:
if previous_delta_messages[-1].reasoning_content is not None: if previous_delta_messages[-1].reasoning is not None:
reason_content = "".join( reason_content = "".join(
pm.reasoning_content pm.reasoning
for pm in previous_delta_messages for pm in previous_delta_messages
if pm.reasoning_content is not None if pm.reasoning is not None
) )
yield _increment_sequence_number_and_return( yield _increment_sequence_number_and_return(
ResponseReasoningTextDoneEvent( ResponseReasoningTextDoneEvent(
......
...@@ -279,7 +279,7 @@ class StreamingXMLToolCallParser: ...@@ -279,7 +279,7 @@ class StreamingXMLToolCallParser:
final_delta = DeltaMessage( final_delta = DeltaMessage(
role=None, role=None,
content=None, content=None,
reasoning_content=None, reasoning=None,
tool_calls=[ tool_calls=[
DeltaToolCall( DeltaToolCall(
index=self.tool_call_index - 1, index=self.tool_call_index - 1,
......
...@@ -76,7 +76,7 @@ class ReasoningParser: ...@@ -76,7 +76,7 @@ class ReasoningParser:
""" """
@abstractmethod @abstractmethod
def extract_reasoning_content( def extract_reasoning(
self, self,
model_output: str, model_output: str,
request: ChatCompletionRequest | ResponsesRequest, request: ChatCompletionRequest | ResponsesRequest,
...@@ -100,7 +100,7 @@ class ReasoningParser: ...@@ -100,7 +100,7 @@ class ReasoningParser:
""" """
@abstractmethod @abstractmethod
def extract_reasoning_content_streaming( def extract_reasoning_streaming(
self, self,
previous_text: str, previous_text: str,
current_text: str, current_text: str,
......
...@@ -76,7 +76,7 @@ class BaseThinkingReasoningParser(ReasoningParser): ...@@ -76,7 +76,7 @@ class BaseThinkingReasoningParser(ReasoningParser):
else: else:
return input_ids[input_ids.index(self.end_token_id) + 1 :] return input_ids[input_ids.index(self.end_token_id) + 1 :]
def extract_reasoning_content_streaming( def extract_reasoning_streaming(
self, self,
previous_text: str, previous_text: str,
current_text: str, current_text: str,
...@@ -103,11 +103,10 @@ class BaseThinkingReasoningParser(ReasoningParser): ...@@ -103,11 +103,10 @@ class BaseThinkingReasoningParser(ReasoningParser):
# start token in previous, end token in delta, # start token in previous, end token in delta,
# extract reasoning content # extract reasoning content
end_index = delta_text.find(self.end_token) end_index = delta_text.find(self.end_token)
reasoning_content = delta_text[:end_index] reasoning = delta_text[:end_index]
content = delta_text[end_index + len(self.end_token) :] content = delta_text[end_index + len(self.end_token) :]
return DeltaMessage( return DeltaMessage(
reasoning_content=reasoning_content, reasoning=reasoning, content=content if content else None
content=content if content else None,
) )
elif self.end_token_id in previous_token_ids: elif self.end_token_id in previous_token_ids:
# start token in previous, end token in previous, # start token in previous, end token in previous,
...@@ -116,30 +115,27 @@ class BaseThinkingReasoningParser(ReasoningParser): ...@@ -116,30 +115,27 @@ class BaseThinkingReasoningParser(ReasoningParser):
else: else:
# start token in previous, no end token in previous or delta, # start token in previous, no end token in previous or delta,
# reasoning content continues # reasoning content continues
return DeltaMessage(reasoning_content=delta_text) return DeltaMessage(reasoning=delta_text)
elif self.start_token_id in delta_token_ids: elif self.start_token_id in delta_token_ids:
if self.end_token_id in delta_token_ids: if self.end_token_id in delta_token_ids:
# start token in delta, end token in delta, # start token in delta, end token in delta,
# extract reasoning content # extract reasoning content
start_index = delta_text.find(self.start_token) start_index = delta_text.find(self.start_token)
end_index = delta_text.find(self.end_token) end_index = delta_text.find(self.end_token)
reasoning_content = delta_text[ reasoning = delta_text[start_index + len(self.start_token) : end_index]
start_index + len(self.start_token) : end_index
]
content = delta_text[end_index + len(self.end_token) :] content = delta_text[end_index + len(self.end_token) :]
return DeltaMessage( return DeltaMessage(
reasoning_content=reasoning_content, reasoning=reasoning, content=content if content else None
content=content if content else None,
) )
else: else:
# start token in delta, no end token in delta, # start token in delta, no end token in delta,
# reasoning content continues # reasoning content continues
return DeltaMessage(reasoning_content=delta_text) return DeltaMessage(reasoning=delta_text)
else: else:
# thinking start token not found # thinking start token not found
return DeltaMessage(content=delta_text) return DeltaMessage(content=delta_text)
def extract_reasoning_content( def extract_reasoning(
self, model_output: str, request: ChatCompletionRequest | ResponsesRequest self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
) -> tuple[str | None, str | None]: ) -> tuple[str | None, str | None]:
""" """
...@@ -160,7 +156,7 @@ class BaseThinkingReasoningParser(ReasoningParser): ...@@ -160,7 +156,7 @@ class BaseThinkingReasoningParser(ReasoningParser):
if self.end_token not in model_output: if self.end_token not in model_output:
return model_output, None return model_output, None
else: else:
reasoning_content, _, content = model_output.partition(self.end_token) reasoning, _, content = model_output.partition(self.end_token)
# If generation stops right after end-of-think, return null content # If generation stops right after end-of-think, return null content
final_content = content or None final_content = content or None
return reasoning_content, final_content return reasoning, final_content
...@@ -25,7 +25,7 @@ class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser): ...@@ -25,7 +25,7 @@ class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser):
"""The token that ends reasoning content.""" """The token that ends reasoning content."""
return "</think>" return "</think>"
def extract_reasoning_content_streaming( def extract_reasoning_streaming(
self, self,
previous_text: str, previous_text: str,
current_text: str, current_text: str,
...@@ -34,7 +34,7 @@ class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser): ...@@ -34,7 +34,7 @@ class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser):
current_token_ids: Sequence[int], current_token_ids: Sequence[int],
delta_token_ids: Sequence[int], delta_token_ids: Sequence[int],
) -> DeltaMessage | None: ) -> DeltaMessage | None:
ret = super().extract_reasoning_content_streaming( ret = super().extract_reasoning_streaming(
previous_text, previous_text,
current_text, current_text,
delta_text, delta_text,
...@@ -51,10 +51,10 @@ class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser): ...@@ -51,10 +51,10 @@ class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser):
# end token in delta with more tokens, # end token in delta with more tokens,
# extract reasoning content and content # extract reasoning content and content
end_index = delta_text.find(self.end_token) end_index = delta_text.find(self.end_token)
reasoning_content = delta_text[:end_index] reasoning = delta_text[:end_index]
content = delta_text[end_index + len(self.end_token) :] content = delta_text[end_index + len(self.end_token) :]
return DeltaMessage( return DeltaMessage(
reasoning_content=reasoning_content, reasoning=reasoning,
content=content if content else None, content=content if content else None,
) )
elif self.end_token_id in previous_token_ids: elif self.end_token_id in previous_token_ids:
...@@ -62,6 +62,6 @@ class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser): ...@@ -62,6 +62,6 @@ class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser):
return DeltaMessage(content=delta_text) return DeltaMessage(content=delta_text)
else: else:
# no end token in previous or delta, reasoning content continues # no end token in previous or delta, reasoning content continues
return DeltaMessage(reasoning_content=delta_text) return DeltaMessage(reasoning=delta_text)
return ret return ret
...@@ -38,12 +38,12 @@ class DeepSeekV3ReasoningParser(ReasoningParser): ...@@ -38,12 +38,12 @@ class DeepSeekV3ReasoningParser(ReasoningParser):
def extract_content_ids(self, input_ids: list[int]) -> list[int]: def extract_content_ids(self, input_ids: list[int]) -> list[int]:
return self._parser.extract_content_ids(input_ids) return self._parser.extract_content_ids(input_ids)
def extract_reasoning_content( def extract_reasoning(
self, model_output: str, request: ChatCompletionRequest self, model_output: str, request: ChatCompletionRequest
) -> tuple[str | None, str | None]: ) -> tuple[str | None, str | None]:
return self._parser.extract_reasoning_content(model_output, request) return self._parser.extract_reasoning(model_output, request)
def extract_reasoning_content_streaming( def extract_reasoning_streaming(
self, self,
previous_text: str, previous_text: str,
current_text: str, current_text: str,
...@@ -52,7 +52,7 @@ class DeepSeekV3ReasoningParser(ReasoningParser): ...@@ -52,7 +52,7 @@ class DeepSeekV3ReasoningParser(ReasoningParser):
current_token_ids: Sequence[int], current_token_ids: Sequence[int],
delta_token_ids: Sequence[int], delta_token_ids: Sequence[int],
) -> DeltaMessage | None: ) -> DeltaMessage | None:
return self._parser.extract_reasoning_content_streaming( return self._parser.extract_reasoning_streaming(
previous_text, previous_text,
current_text, current_text,
delta_text, delta_text,
......
...@@ -57,7 +57,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser): ...@@ -57,7 +57,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
"tokens in the tokenizer!" "tokens in the tokenizer!"
) )
def extract_reasoning_content_streaming( def extract_reasoning_streaming(
self, self,
previous_text: str, previous_text: str,
current_text: str, current_text: str,
...@@ -73,7 +73,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser): ...@@ -73,7 +73,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
The Ernie45 thinking model output format is The Ernie45 thinking model output format is
abc\n</think>\n\n<response>\ndef\n</response>\n abc\n</think>\n\n<response>\ndef\n</response>\n
or abc\n</think>\ndef or abc\n</think>\ndef
- 'abc' goes to reasoning_content - 'abc' goes to reasoning
- 'def' goes to content - 'def' goes to content
""" """
# Skip single special tokens # Skip single special tokens
...@@ -94,7 +94,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser): ...@@ -94,7 +94,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
# </think> in delta with more tokens, # </think> in delta with more tokens,
# extract reasoning content and content # extract reasoning content and content
think_end_index = delta_text.find(self.end_token) think_end_index = delta_text.find(self.end_token)
reasoning_content = delta_text[:think_end_index] reasoning = delta_text[:think_end_index]
content = delta_text[think_end_index + len(self.end_token) :] content = delta_text[think_end_index + len(self.end_token) :]
content = content.lstrip("\n") content = content.lstrip("\n")
response_start_idx = content.find(self.response_start_token) response_start_idx = content.find(self.response_start_token)
...@@ -104,7 +104,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser): ...@@ -104,7 +104,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
if response_end_idx != -1: if response_end_idx != -1:
content = content[:response_end_idx] content = content[:response_end_idx]
return DeltaMessage( return DeltaMessage(
reasoning_content=reasoning_content, reasoning=reasoning,
content=content if content else None, content=content if content else None,
) )
elif self.end_token_id in previous_token_ids: elif self.end_token_id in previous_token_ids:
...@@ -138,9 +138,9 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser): ...@@ -138,9 +138,9 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
return DeltaMessage(content=content if content else None) return DeltaMessage(content=content if content else None)
else: else:
# no </think> in previous or delta, reasoning content continues # no </think> in previous or delta, reasoning content continues
return DeltaMessage(reasoning_content=delta_text) return DeltaMessage(reasoning=delta_text)
def extract_reasoning_content( def extract_reasoning(
self, model_output: str, request: ChatCompletionRequest self, model_output: str, request: ChatCompletionRequest
) -> tuple[str | None, str | None]: ) -> tuple[str | None, str | None]:
""" """
...@@ -148,14 +148,12 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser): ...@@ -148,14 +148,12 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
The Ernie45 thinking model output format is The Ernie45 thinking model output format is
abc\n</think>\n\n\n<response>\ndef\n</response>\n abc\n</think>\n\n\n<response>\ndef\n</response>\n
or abc\n</think>\ndef or abc\n</think>\ndef
- 'abc' goes to reasoning_content - 'abc' goes to reasoning
- 'def' goes to content - 'def' goes to content
Returns: Returns:
tuple[Optional[str], Optional[str]]: reasoning content and content tuple[Optional[str], Optional[str]]: reasoning content and content
""" """
reasoning_content, content = super().extract_reasoning_content( reasoning, content = super().extract_reasoning(model_output, request)
model_output, request
)
if content: if content:
start_idx = content.find(self.response_start_token) start_idx = content.find(self.response_start_token)
end_idx = content.rfind(self.response_end_token) end_idx = content.rfind(self.response_end_token)
...@@ -164,4 +162,4 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser): ...@@ -164,4 +162,4 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
content = content[start_idx + len(self.response_start_token) : end_idx] content = content[start_idx + len(self.response_start_token) : end_idx]
final_content = content or None final_content = content or None
return reasoning_content, final_content return reasoning, final_content
...@@ -70,7 +70,7 @@ class Glm4MoeModelReasoningParser(ReasoningParser): ...@@ -70,7 +70,7 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
else: else:
return input_ids[input_ids.index(self.think_end_token_id) + 1 :] return input_ids[input_ids.index(self.think_end_token_id) + 1 :]
def extract_reasoning_content_streaming( def extract_reasoning_streaming(
self, self,
previous_text: str, previous_text: str,
current_text: str, current_text: str,
...@@ -84,7 +84,7 @@ class Glm4MoeModelReasoningParser(ReasoningParser): ...@@ -84,7 +84,7 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
Handles streaming output where previous + delta = current. Handles streaming output where previous + delta = current.
Uses token IDs for faster processing. Uses token IDs for faster processing.
For text <think>abc</think>xyz: For text <think>abc</think>xyz:
- 'abc' goes to reasoning_content - 'abc' goes to reasoning
- 'xyz' goes to content - 'xyz' goes to content
""" """
# Skip single special tokens # Skip single special tokens
...@@ -98,10 +98,10 @@ class Glm4MoeModelReasoningParser(ReasoningParser): ...@@ -98,10 +98,10 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
# <think> in previous, </think> in delta, # <think> in previous, </think> in delta,
# extract reasoning content # extract reasoning content
end_index = delta_text.find(self.think_end_token) end_index = delta_text.find(self.think_end_token)
reasoning_content = delta_text[:end_index] reasoning = delta_text[:end_index]
content = delta_text[end_index + len(self.think_end_token) :] content = delta_text[end_index + len(self.think_end_token) :]
return DeltaMessage( return DeltaMessage(
reasoning_content=reasoning_content, reasoning=reasoning,
content=content if content else None, content=content if content else None,
) )
elif self.think_end_token_id in previous_token_ids: elif self.think_end_token_id in previous_token_ids:
...@@ -111,36 +111,36 @@ class Glm4MoeModelReasoningParser(ReasoningParser): ...@@ -111,36 +111,36 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
else: else:
# <think> in previous, no </think> in previous or delta, # <think> in previous, no </think> in previous or delta,
# reasoning content continues # reasoning content continues
return DeltaMessage(reasoning_content=delta_text) return DeltaMessage(reasoning=delta_text)
elif self.think_start_token_id in delta_token_ids: elif self.think_start_token_id in delta_token_ids:
if self.think_end_token_id in delta_token_ids: if self.think_end_token_id in delta_token_ids:
# <think> in delta, </think> in delta, extract reasoning content # <think> in delta, </think> in delta, extract reasoning content
start_index = delta_text.find(self.think_start_token) start_index = delta_text.find(self.think_start_token)
end_index = delta_text.find(self.think_end_token) end_index = delta_text.find(self.think_end_token)
reasoning_content = delta_text[ reasoning = delta_text[
start_index + len(self.think_start_token) : end_index start_index + len(self.think_start_token) : end_index
] ]
content = delta_text[end_index + len(self.think_end_token) :] content = delta_text[end_index + len(self.think_end_token) :]
return DeltaMessage( return DeltaMessage(
reasoning_content=reasoning_content, reasoning=reasoning,
content=content if content else None, content=content if content else None,
) )
else: else:
# <think> in delta, no </think> in delta, # <think> in delta, no </think> in delta,
# reasoning content continues # reasoning content continues
return DeltaMessage(reasoning_content=delta_text) return DeltaMessage(reasoning=delta_text)
else: else:
# thinking is disabled, just content # thinking is disabled, just content
return DeltaMessage(content=delta_text) return DeltaMessage(content=delta_text)
def extract_reasoning_content( def extract_reasoning(
self, model_output: str, request: ChatCompletionRequest self, model_output: str, request: ChatCompletionRequest
) -> tuple[str | None, str | None]: ) -> tuple[str | None, str | None]:
""" """
Extract reasoning content from the model output. Extract reasoning content from the model output.
For text <think>abc</think>xyz: For text <think>abc</think>xyz:
- 'abc' goes to reasoning_content - 'abc' goes to reasoning
- 'xyz' goes to content - 'xyz' goes to content
Returns: Returns:
...@@ -165,7 +165,7 @@ class Glm4MoeModelReasoningParser(ReasoningParser): ...@@ -165,7 +165,7 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
return None, model_output return None, model_output
# Extract reasoning content from the model output. # Extract reasoning content from the model output.
reasoning_content, _, content = model_output.partition(self.think_end_token) reasoning, _, content = model_output.partition(self.think_end_token)
final_content = content or None final_content = content or None
return reasoning_content, final_content return reasoning, final_content
...@@ -104,7 +104,7 @@ class GptOssReasoningParser(ReasoningParser): ...@@ -104,7 +104,7 @@ class GptOssReasoningParser(ReasoningParser):
return [] return []
return self.model_tokenizer.encode(content) return self.model_tokenizer.encode(content)
def extract_reasoning_content_streaming( def extract_reasoning_streaming(
self, self,
previous_text: str, previous_text: str,
current_text: str, current_text: str,
...@@ -131,9 +131,9 @@ class GptOssReasoningParser(ReasoningParser): ...@@ -131,9 +131,9 @@ class GptOssReasoningParser(ReasoningParser):
content_delta = cur_content content_delta = cur_content
if reasoning_delta is None and content_delta is None: if reasoning_delta is None and content_delta is None:
return None return None
return DeltaMessage(reasoning_content=reasoning_delta, content=content_delta) return DeltaMessage(reasoning=reasoning_delta, content=content_delta)
def extract_reasoning_content( def extract_reasoning(
self, self,
model_output: str, model_output: str,
request: ChatCompletionRequest, request: ChatCompletionRequest,
......
...@@ -49,7 +49,7 @@ class GraniteReasoningParser(ReasoningParser): ...@@ -49,7 +49,7 @@ class GraniteReasoningParser(ReasoningParser):
len(think_start) for think_start in self.valid_think_starts len(think_start) for think_start in self.valid_think_starts
) )
def extract_reasoning_content( def extract_reasoning(
self, model_output: str, request: ChatCompletionRequest self, model_output: str, request: ChatCompletionRequest
) -> tuple[str | None, str | None]: ) -> tuple[str | None, str | None]:
"""Extract the reasoning content & content sections, respectively. """Extract the reasoning content & content sections, respectively.
...@@ -67,12 +67,12 @@ class GraniteReasoningParser(ReasoningParser): ...@@ -67,12 +67,12 @@ class GraniteReasoningParser(ReasoningParser):
re_match = self.reasoning_regex.findall(model_output) re_match = self.reasoning_regex.findall(model_output)
if not re_match: if not re_match:
return None, model_output return None, model_output
reasoning_content, response_content = re_match[0] reasoning, response_content = re_match[0]
if not response_content: if not response_content:
return reasoning_content, None return reasoning, None
return reasoning_content, response_content return reasoning, response_content
def extract_reasoning_content_streaming( def extract_reasoning_streaming(
self, self,
previous_text: str, previous_text: str,
current_text: str, current_text: str,
...@@ -107,12 +107,10 @@ class GraniteReasoningParser(ReasoningParser): ...@@ -107,12 +107,10 @@ class GraniteReasoningParser(ReasoningParser):
Union[DeltaMessage, None] Union[DeltaMessage, None]
DeltaMessage with either reasoning content or content, or None. DeltaMessage with either reasoning content or content, or None.
""" """
reasoning_content, resp_seq_len, content = self._get_content_sections( reasoning, resp_seq_len, content = self._get_content_sections(current_text)
current_text
)
# Either we haven't finished the start of the reasoning sequence, # Either we haven't finished the start of the reasoning sequence,
# or the model is generating something unexpected. # or the model is generating something unexpected.
if not reasoning_content: if not reasoning:
delta_message = self._get_delta_message_with_no_reasoning_bounds( delta_message = self._get_delta_message_with_no_reasoning_bounds(
current_text, delta_text current_text, delta_text
) )
...@@ -120,16 +118,16 @@ class GraniteReasoningParser(ReasoningParser): ...@@ -120,16 +118,16 @@ class GraniteReasoningParser(ReasoningParser):
# the start of response sequence. # the start of response sequence.
elif not content: elif not content:
delta_message = self._get_delta_message_with_no_response_bounds( delta_message = self._get_delta_message_with_no_response_bounds(
current_text, reasoning_content, delta_text current_text, reasoning, delta_text
) )
# We've finished both the start of reasoning and start of response seq. # We've finished both the start of reasoning and start of response seq.
else: else:
# This should never happen since we matched on the response # This should never happen since we matched on the response
assert resp_seq_len is not None assert resp_seq_len is not None
delta_message = self._get_delta_message_with_both_bounds( delta_message = self._get_delta_message_with_both_bounds(
delta_text, reasoning_content, content, current_text, resp_seq_len delta_text, reasoning, content, current_text, resp_seq_len
) )
if not delta_message.content and not delta_message.reasoning_content: if not delta_message.content and not delta_message.reasoning:
return None return None
return delta_message return delta_message
...@@ -185,20 +183,20 @@ class GraniteReasoningParser(ReasoningParser): ...@@ -185,20 +183,20 @@ class GraniteReasoningParser(ReasoningParser):
# message and append everything to content in the future. # message and append everything to content in the future.
if was_substr and not is_substr: if was_substr and not is_substr:
return DeltaMessage( return DeltaMessage(
reasoning_content=None, reasoning=None,
content=current_text, content=current_text,
) )
if is_substr: if is_substr:
# Might still be in the special token sequence; return nothing # Might still be in the special token sequence; return nothing
return DeltaMessage(reasoning_content=None, content=None) return DeltaMessage(reasoning=None, content=None)
# Otherwise the sequence has already been broken and we already # Otherwise the sequence has already been broken and we already
# corrected; just return the delta text as normal content. # corrected; just return the delta text as normal content.
return DeltaMessage(reasoning_content=None, content=delta_text) return DeltaMessage(reasoning=None, content=delta_text)
def _get_delta_message_with_no_response_bounds( def _get_delta_message_with_no_response_bounds(
self, self,
current_text: str, current_text: str,
reasoning_content: str, reasoning: str,
delta_text: str, delta_text: str,
) -> DeltaMessage: ) -> DeltaMessage:
"""Parse the delta message when the current text has both reasoning """Parse the delta message when the current text has both reasoning
...@@ -208,7 +206,7 @@ class GraniteReasoningParser(ReasoningParser): ...@@ -208,7 +206,7 @@ class GraniteReasoningParser(ReasoningParser):
Args: Args:
current_text (str): The full previous + delta text. current_text (str): The full previous + delta text.
reasoning_content (str): reasoning content from current_text. reasoning (str): reasoning content from current_text.
delta_text (str): Text to consider and parse content from. delta_text (str): Text to consider and parse content from.
Returns: Returns:
...@@ -222,12 +220,12 @@ class GraniteReasoningParser(ReasoningParser): ...@@ -222,12 +220,12 @@ class GraniteReasoningParser(ReasoningParser):
current_text.endswith(response_start) current_text.endswith(response_start)
for response_start in self.valid_response_starts for response_start in self.valid_response_starts
) )
if reasoning_content is None or ends_with_start_response_seq: if reasoning is None or ends_with_start_response_seq:
return DeltaMessage(reasoning_content=None, content=None) return DeltaMessage(reasoning=None, content=None)
# Consider previous / current text only within context of the reasoning # Consider previous / current text only within context of the reasoning
previous_text = reasoning_content[: -len(delta_text)] previous_text = reasoning[: -len(delta_text)]
current_text = reasoning_content current_text = reasoning
# We need to be careful about adding unfinished response sequences; # We need to be careful about adding unfinished response sequences;
# Find the place at which we MIGHT be starting a response sequence # Find the place at which we MIGHT be starting a response sequence
...@@ -253,32 +251,30 @@ class GraniteReasoningParser(ReasoningParser): ...@@ -253,32 +251,30 @@ class GraniteReasoningParser(ReasoningParser):
# Delta only contains potential continued response sequence text. # Delta only contains potential continued response sequence text.
if delta_continues_substr: if delta_continues_substr:
return DeltaMessage(reasoning_content=None, content=None) return DeltaMessage(reasoning=None, content=None)
if not prev_was_substr: if not prev_was_substr:
# Delta may be starting a new response seq but has other text too. # Delta may be starting a new response seq but has other text too.
if delta_new_substr: if delta_new_substr:
return DeltaMessage( return DeltaMessage(reasoning=delta_text[:delta_idx], content=None)
reasoning_content=delta_text[:delta_idx], content=None
)
# Normal case for most reasoning text (no potential special seqs). # Normal case for most reasoning text (no potential special seqs).
return DeltaMessage(reasoning_content=delta_text, content=None) return DeltaMessage(reasoning=delta_text, content=None)
# The substring that previously seemed to be a potential response # The substring that previously seemed to be a potential response
# seq wasn't one; we need to add the content to the delta message, # seq wasn't one; we need to add the content to the delta message,
# and also slice off the potential response sequence # and also slice off the potential response sequence
elif delta_new_substr: elif delta_new_substr:
reasoning_content = previous_text[prev_idx:] + delta_text[:delta_idx] reasoning = previous_text[prev_idx:] + delta_text[:delta_idx]
return DeltaMessage(reasoning_content=reasoning_content, content=None) return DeltaMessage(reasoning=reasoning, content=None)
# No new substring yet, and we broke our old one; take the whole delta # No new substring yet, and we broke our old one; take the whole delta
return DeltaMessage( return DeltaMessage(
reasoning_content=previous_text[prev_idx:] + delta_text, reasoning=previous_text[prev_idx:] + delta_text,
content=None, content=None,
) )
def _get_delta_message_with_both_bounds( def _get_delta_message_with_both_bounds(
self, self,
delta_text: str, delta_text: str,
reasoning_content: str, reasoning: str,
response_content: str, response_content: str,
current_text: str, current_text: str,
response_seq_len: int, response_seq_len: int,
...@@ -288,7 +284,7 @@ class GraniteReasoningParser(ReasoningParser): ...@@ -288,7 +284,7 @@ class GraniteReasoningParser(ReasoningParser):
Args: Args:
delta_text: Text to consider and parse content from. delta_text: Text to consider and parse content from.
reasoning_content: reasoning content from current_text. reasoning: reasoning content from current_text.
response_content: response content from current_text. response_content: response content from current_text.
current_text: The full previous + delta text. current_text: The full previous + delta text.
response_seq_len: Len of the complete response sequence used. response_seq_len: Len of the complete response sequence used.
...@@ -301,20 +297,20 @@ class GraniteReasoningParser(ReasoningParser): ...@@ -301,20 +297,20 @@ class GraniteReasoningParser(ReasoningParser):
reasoning_end_idx = len(delta_text) - (len(response_content) + response_seq_len) reasoning_end_idx = len(delta_text) - (len(response_content) + response_seq_len)
if reasoning_end_idx < 0: if reasoning_end_idx < 0:
delta_reasoning_content = None delta_reasoning = None
else: else:
# Get the starting offset # Get the starting offset
start_reasoning_content_idx = ( start_reasoning_idx = (
len(reasoning_content) + response_seq_len + len(response_content) - 1 len(reasoning) + response_seq_len + len(response_content) - 1
) )
delta_offset = len(current_text) - len(delta_text) delta_offset = len(current_text) - len(delta_text)
start_offset = start_reasoning_content_idx - delta_offset start_offset = start_reasoning_idx - delta_offset
if start_offset < 0: if start_offset < 0:
start_offset = 0 start_offset = 0
delta_reasoning_content = delta_text[start_offset:reasoning_end_idx] delta_reasoning = delta_text[start_offset:reasoning_end_idx]
return DeltaMessage( return DeltaMessage(
reasoning_content=delta_reasoning_content, reasoning=delta_reasoning,
content=delta_content, content=delta_content,
) )
...@@ -333,7 +329,7 @@ class GraniteReasoningParser(ReasoningParser): ...@@ -333,7 +329,7 @@ class GraniteReasoningParser(ReasoningParser):
(if there is one) and the non-reasoning content. (if there is one) and the non-reasoning content.
""" """
current_chunk_start = 0 current_chunk_start = 0
start_reasoning_content = None start_reasoning = None
parsed_content = False parsed_content = False
delimiter_idxs = [ delimiter_idxs = [
idx idx
...@@ -344,10 +340,10 @@ class GraniteReasoningParser(ReasoningParser): ...@@ -344,10 +340,10 @@ class GraniteReasoningParser(ReasoningParser):
for current_chunk_end in delimiter_idxs: for current_chunk_end in delimiter_idxs:
current_chunk = current_text[current_chunk_start:current_chunk_end] current_chunk = current_text[current_chunk_start:current_chunk_end]
# Check to see if the start of reasoning seq is complete # Check to see if the start of reasoning seq is complete
if start_reasoning_content is None: if start_reasoning is None:
for think_start in self.valid_think_starts: for think_start in self.valid_think_starts:
if current_chunk == think_start[:-1]: if current_chunk == think_start[:-1]:
start_reasoning_content = current_chunk_end + 1 start_reasoning = current_chunk_end + 1
current_chunk_start = current_chunk_end + 1 current_chunk_start = current_chunk_end + 1
break break
...@@ -357,13 +353,11 @@ class GraniteReasoningParser(ReasoningParser): ...@@ -357,13 +353,11 @@ class GraniteReasoningParser(ReasoningParser):
if current_chunk[-len(response_start) + 1 :] == response_start[:-1]: if current_chunk[-len(response_start) + 1 :] == response_start[:-1]:
# Mark end of reasoning and start response content # Mark end of reasoning and start response content
# after the start of response sequence. # after the start of response sequence.
end_reasoning_content = current_chunk_end - len(response_start) end_reasoning = current_chunk_end - len(response_start)
reasoning_content = current_text[ reasoning = current_text[start_reasoning:end_reasoning]
start_reasoning_content:end_reasoning_content
]
response_content = current_text[current_chunk_end + 1 :] response_content = current_text[current_chunk_end + 1 :]
return reasoning_content, len(response_start), response_content return reasoning, len(response_start), response_content
if start_reasoning_content and not parsed_content: if start_reasoning and not parsed_content:
return current_text[start_reasoning_content:], None, None return current_text[start_reasoning:], None, None
return None, None, None return None, None, None
...@@ -86,7 +86,7 @@ class HunyuanA13BReasoningParser(ReasoningParser): ...@@ -86,7 +86,7 @@ class HunyuanA13BReasoningParser(ReasoningParser):
# this id is not part of content, so just return [] here. # this id is not part of content, so just return [] here.
return [] return []
def extract_reasoning_content( def extract_reasoning(
self, model_output: str, request: ChatCompletionRequest self, model_output: str, request: ChatCompletionRequest
) -> tuple[str | None, str | None]: ) -> tuple[str | None, str | None]:
"""Extract the reasoning content & content sections, respectively. """Extract the reasoning content & content sections, respectively.
...@@ -104,27 +104,27 @@ class HunyuanA13BReasoningParser(ReasoningParser): ...@@ -104,27 +104,27 @@ class HunyuanA13BReasoningParser(ReasoningParser):
re_match = self.full_match_reasoning_regex.findall(model_output) re_match = self.full_match_reasoning_regex.findall(model_output)
if re_match: if re_match:
reasoning_content, response_content = re_match[0] reasoning, response_content = re_match[0]
if len(reasoning_content) == 0: if len(reasoning) == 0:
reasoning_content = None reasoning = None
if len(response_content) == 0: if len(response_content) == 0:
response_content = None response_content = None
return reasoning_content, response_content return reasoning, response_content
fallback_regex = self.half_match_reasoning_regex fallback_regex = self.half_match_reasoning_regex
fallback_match = fallback_regex.findall(model_output) fallback_match = fallback_regex.findall(model_output)
if fallback_match: if fallback_match:
reasoning_content, response_content = fallback_match[0] reasoning, response_content = fallback_match[0]
if response_content.endswith(self.response_end_expr): if response_content.endswith(self.response_end_expr):
response_content = response_content[: -len(self.response_end_expr)] response_content = response_content[: -len(self.response_end_expr)]
if len(reasoning_content) == 0: if len(reasoning) == 0:
reasoning_content = None reasoning = None
if len(response_content) == 0: if len(response_content) == 0:
response_content = None response_content = None
return reasoning_content, response_content return reasoning, response_content
return None, model_output return None, model_output
...@@ -140,7 +140,7 @@ class HunyuanA13BReasoningParser(ReasoningParser): ...@@ -140,7 +140,7 @@ class HunyuanA13BReasoningParser(ReasoningParser):
sub_idx += 1 sub_idx += 1
return sub_idx == len(subsequence) return sub_idx == len(subsequence)
def extract_reasoning_content_streaming( def extract_reasoning_streaming(
self, self,
previous_text: str, previous_text: str,
current_text: str, current_text: str,
...@@ -223,19 +223,15 @@ class HunyuanA13BReasoningParser(ReasoningParser): ...@@ -223,19 +223,15 @@ class HunyuanA13BReasoningParser(ReasoningParser):
# Return content based on current state # Return content based on current state
if self.current_state == "think": if self.current_state == "think":
return DeltaMessage( return DeltaMessage(reasoning=buffered_content, content=None)
reasoning_content=buffered_content, content=None
)
else: else:
return DeltaMessage( return DeltaMessage(reasoning=None, content=buffered_content)
reasoning_content=None, content=buffered_content
)
else: else:
# No buffered content, send normally # No buffered content, send normally
if self.current_state == "think": if self.current_state == "think":
return DeltaMessage(reasoning_content=delta_text, content=None) return DeltaMessage(reasoning=delta_text, content=None)
else: else:
return DeltaMessage(reasoning_content=None, content=delta_text) return DeltaMessage(reasoning=None, content=delta_text)
# If no content to send in this delta # If no content to send in this delta
return None return None
...@@ -36,7 +36,7 @@ class IdentityReasoningParser(ReasoningParser): ...@@ -36,7 +36,7 @@ class IdentityReasoningParser(ReasoningParser):
# Identity: return all tokens as content # Identity: return all tokens as content
return input_ids return input_ids
def extract_reasoning_content_streaming( def extract_reasoning_streaming(
self, self,
previous_text: str, previous_text: str,
current_text: str, current_text: str,
...@@ -50,9 +50,9 @@ class IdentityReasoningParser(ReasoningParser): ...@@ -50,9 +50,9 @@ class IdentityReasoningParser(ReasoningParser):
return DeltaMessage(content=delta_text) return DeltaMessage(content=delta_text)
return None return None
def extract_reasoning_content( def extract_reasoning(
self, model_output: str, request: ChatCompletionRequest self, model_output: str, request: ChatCompletionRequest
) -> tuple[str | None, str | None]: ) -> tuple[str | None, str | None]:
# No reasoning separation: return None for reasoning_content, # No reasoning separation: return None for reasoning,
# and full model_output as content # and full model_output as content
return None, model_output return None, model_output
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment