Unverified commit d9ab1ad9, authored by Harry Mellor, committed by GitHub
Browse files

`reasoning_content` -> `reasoning` (#27752)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 608bb144
......@@ -28,49 +28,49 @@ def seedoss_tokenizer():
SIMPLE_REASONING: dict[str, Any] = {
"output": "This is a reasoning section</seed:think>This is the rest",
"reasoning_content": "This is a reasoning section",
"reasoning": "This is a reasoning section",
"content": "This is the rest",
"is_reasoning_end": True,
}
COMPLETE_REASONING: dict[str, Any] = {
"output": "This is a reasoning section</seed:think>",
"reasoning_content": "This is a reasoning section",
"reasoning": "This is a reasoning section",
"content": None,
"is_reasoning_end": True,
}
NO_CONTENT: dict[str, Any] = {
"output": "This is content",
"reasoning_content": "This is content",
"reasoning": "This is content",
"content": None,
"is_reasoning_end": False,
}
NO_REASONING_STREAMING: dict[str, Any] = {
"output": "This is a reasoning section",
"reasoning_content": "This is a reasoning section",
"reasoning": "This is a reasoning section",
"content": None,
"is_reasoning_end": False,
}
MULTIPLE_LINES: dict[str, Any] = {
"output": "This\nThat</seed:think>This is the rest\nThat",
"reasoning_content": "This\nThat",
"reasoning": "This\nThat",
"content": "This is the rest\nThat",
"is_reasoning_end": True,
}
WITH_START_TOKEN: dict[str, Any] = {
"output": ("<seed:think>This is a reasoning section</seed:think>This is the rest"),
"reasoning_content": "This is a reasoning section",
"reasoning": "This is a reasoning section",
"content": "This is the rest",
"is_reasoning_end": True,
}
ONLY_END_TOKEN: dict[str, Any] = {
"output": "Some reasoning</seed:think>This is the rest",
"reasoning_content": "Some reasoning",
"reasoning": "Some reasoning",
"content": "This is the rest",
"is_reasoning_end": True,
}
NO_TOKENS: dict[str, Any] = {
"output": "This is just content without any reasoning tokens",
"reasoning_content": "This is just content without any reasoning tokens",
"reasoning": "This is just content without any reasoning tokens",
"content": None,
"is_reasoning_end": False,
}
......@@ -95,7 +95,7 @@ def test_simple_reasoning(seedoss_tokenizer, streaming):
parser, [cast(str, SIMPLE_REASONING["output"])], streaming=streaming
)
assert reasoning == SIMPLE_REASONING["reasoning_content"]
assert reasoning == SIMPLE_REASONING["reasoning"]
assert content == SIMPLE_REASONING["content"]
......@@ -109,7 +109,7 @@ def test_complete_reasoning(seedoss_tokenizer, streaming):
parser, [cast(str, COMPLETE_REASONING["output"])], streaming=streaming
)
assert reasoning == COMPLETE_REASONING["reasoning_content"]
assert reasoning == COMPLETE_REASONING["reasoning"]
assert content == COMPLETE_REASONING["content"]
......@@ -123,7 +123,7 @@ def test_no_content(seedoss_tokenizer, streaming):
parser, [cast(str, NO_CONTENT["output"])], streaming=streaming
)
assert reasoning == NO_CONTENT["reasoning_content"]
assert reasoning == NO_CONTENT["reasoning"]
assert content == NO_CONTENT["content"]
......@@ -137,7 +137,7 @@ def test_multiple_lines(seedoss_tokenizer, streaming):
parser, [cast(str, MULTIPLE_LINES["output"])], streaming=streaming
)
assert reasoning == MULTIPLE_LINES["reasoning_content"]
assert reasoning == MULTIPLE_LINES["reasoning"]
assert content == MULTIPLE_LINES["content"]
......@@ -151,7 +151,7 @@ def test_with_start_token(seedoss_tokenizer, streaming):
parser, [cast(str, WITH_START_TOKEN["output"])], streaming=streaming
)
assert reasoning == WITH_START_TOKEN["reasoning_content"]
assert reasoning == WITH_START_TOKEN["reasoning"]
assert content == WITH_START_TOKEN["content"]
......@@ -168,7 +168,7 @@ def test_only_end_token(seedoss_tokenizer, streaming):
parser, [cast(str, ONLY_END_TOKEN["output"])], streaming=streaming
)
assert reasoning == ONLY_END_TOKEN["reasoning_content"]
assert reasoning == ONLY_END_TOKEN["reasoning"]
assert content == ONLY_END_TOKEN["content"]
......@@ -182,7 +182,7 @@ def test_no_tokens(seedoss_tokenizer, streaming):
parser, [cast(str, NO_TOKENS["output"])], streaming=streaming
)
assert reasoning == NO_TOKENS["reasoning_content"]
assert reasoning == NO_TOKENS["reasoning"]
assert content == NO_TOKENS["content"]
......
......@@ -9,25 +9,28 @@ from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
class StreamingReasoningReconstructor:
def __init__(self):
self.reasoning_content = None
self.reasoning = None
self.other_content = None
def append_delta(self, delta: DeltaMessage):
# content and the reasoning content should not be present
# at the same time
assert delta.content is None or delta.reasoning_content is None, (
assert delta.content is None or delta.reasoning is None, (
"Both content and reasoning content are present in the delta message"
)
assert delta.reasoning == delta.reasoning_content, (
"reasoning_content should be present for backwards compatibility"
)
if delta.content is not None:
if self.other_content is None:
self.other_content = delta.content
else:
self.other_content += delta.content
else:
if self.reasoning_content is None:
self.reasoning_content = delta.reasoning_content
if self.reasoning is None:
self.reasoning = delta.reasoning
else:
self.reasoning_content += delta.reasoning_content
self.reasoning += delta.reasoning
def run_reasoning_extraction(
......@@ -43,7 +46,7 @@ def run_reasoning_extraction(
request,
)
return (
reconstructor.reasoning_content,
reconstructor.reasoning,
reconstructor.other_content or None,
)
else:
......@@ -69,7 +72,7 @@ def run_reasoning_extraction_mistral(
request,
)
return (
reconstructor.reasoning_content,
reconstructor.reasoning,
reconstructor.other_content or None,
)
else:
......@@ -88,7 +91,7 @@ def run_reasoning_extraction_nonstreaming(
request: ChatCompletionRequest | None = None,
) -> tuple[str | None, str | None]:
request = request or ChatCompletionRequest(messages=[], model="test-model")
return reasoning_parser.extract_reasoning_content(
return reasoning_parser.extract_reasoning(
model_output="".join(model_output), request=request
)
......@@ -110,7 +113,7 @@ def run_reasoning_extraction_streaming(
]
current_text = previous_text + delta
current_tokens = previous_tokens + token_delta
delta_message = reasoning_parser.extract_reasoning_content_streaming(
delta_message = reasoning_parser.extract_reasoning_streaming(
previous_text,
current_text,
delta,
......@@ -142,7 +145,7 @@ def run_reasoning_extraction_streaming_mistral(
delta = reasoning_parser.model_tokenizer.convert_ids_to_tokens([model_delta])[0]
current_text = previous_text + delta
current_tokens = previous_tokens + token_delta
delta_message = reasoning_parser.extract_reasoning_content_streaming(
delta_message = reasoning_parser.extract_reasoning_streaming(
previous_text,
current_text,
delta,
......
......@@ -102,7 +102,7 @@ def test_prepare_apply_chat_template_tools_and_messages(
assert actual_request == expected_mistral_output
# Tool use with list content and reasoning_content
# Tool use with list content and reasoning
@pytest.mark.parametrize(
"openai_request,expected_mistral_output",
[
......@@ -115,7 +115,7 @@ def test_prepare_apply_chat_template_tools_and_messages(
},
{
"role": "assistant",
"reasoning_content": None,
"reasoning": None,
"content": None,
"tool_calls": [
{
......
......@@ -337,7 +337,7 @@ def test_extract_tool_calls_streaming_incremental(
if (
delta_message.role is None
and delta_message.content is None
and delta_message.reasoning_content is None
and delta_message.reasoning is None
and len(delta_message.tool_calls) == 0
):
continue
......
......@@ -674,10 +674,10 @@ def test_structured_output_with_reasoning_matrices(
assert output is not None and isinstance(output, RequestOutput)
prompt = output.prompt
generated_text = output.outputs[0].text
reasoning_content, content = run_reasoning_extraction(reasoner, [generated_text])
print(f"Prompt: {prompt!r}\nReasoning: {reasoning_content!r}\nContent: {content!r}")
reasoning, content = run_reasoning_extraction(reasoner, [generated_text])
print(f"Prompt: {prompt!r}\nReasoning: {reasoning!r}\nContent: {content!r}")
assert content is not None and reasoning_content is not None
assert content is not None and reasoning is not None
output_json = json.loads(content)
jsonschema.validate(instance=output_json, schema=reasoning_schema)
......
......@@ -521,15 +521,15 @@ def parse_chat_output(
is_tool_call = False # TODO: update this when tool call is supported
if len(output_msgs) == 0:
# The generation has stopped during reasoning.
reasoning_content = parser.current_content
reasoning = parser.current_content
final_content = None
elif len(output_msgs) == 1:
# The generation has stopped during final message.
reasoning_content = output_msgs[0].content[0].text
reasoning = output_msgs[0].content[0].text
final_content = parser.current_content
else:
reasoning_msg = output_msgs[:-1]
final_msg = output_msgs[-1]
reasoning_content = "\n".join([msg.content[0].text for msg in reasoning_msg])
reasoning = "\n".join([msg.content[0].text for msg in reasoning_msg])
final_content = final_msg.content[0].text
return reasoning_content, final_content, is_tool_call
return reasoning, final_content, is_tool_call
......@@ -2102,7 +2102,15 @@ class ChatMessage(OpenAIBaseModel):
tool_calls: list[ToolCall] = Field(default_factory=list)
# vLLM-specific fields that are not in OpenAI spec
reasoning: str | None = None
reasoning_content: str | None = None
"""Deprecated: use `reasoning` instead."""
@model_validator(mode="after")
def handle_deprecated_reasoning_content(self):
"""Copy reasoning to reasoning_content for backward compatibility."""
self.reasoning_content = self.reasoning
return self
class ChatCompletionLogProb(OpenAIBaseModel):
......@@ -2156,9 +2164,17 @@ class ChatCompletionResponse(OpenAIBaseModel):
class DeltaMessage(OpenAIBaseModel):
role: str | None = None
content: str | None = None
reasoning: str | None = None
reasoning_content: str | None = None
"""Deprecated: use `reasoning` instead."""
tool_calls: list[DeltaToolCall] = Field(default_factory=list)
@model_validator(mode="after")
def handle_deprecated_reasoning_content(self):
"""Copy reasoning to reasoning_content for backward compatibility."""
self.reasoning_content = self.reasoning
return self
class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
index: int
......
......@@ -759,9 +759,7 @@ class OpenAIServingChat(OpenAIServing):
delta_message = DeltaMessage(content=delta_text)
elif cur_channel == "analysis":
if request.include_reasoning:
delta_message = DeltaMessage(
reasoning_content=delta_text
)
delta_message = DeltaMessage(reasoning=delta_text)
else:
delta_message = None
elif (
......@@ -823,7 +821,7 @@ class OpenAIServingChat(OpenAIServing):
):
assert reasoning_parser is not None
delta_message = (
reasoning_parser.extract_reasoning_content_streaming(
reasoning_parser.extract_reasoning_streaming(
previous_text,
current_text,
delta_text,
......@@ -836,7 +834,7 @@ class OpenAIServingChat(OpenAIServing):
# or think end id in prompt_token_ids
# i.e {"enable_thinking": False},
# set reasoning status to end.
# Only keep 'content', remove 'reasoning_content'.
# Only keep 'content', remove 'reasoning'.
if reasoning_parser.is_reasoning_end(
as_list(output.token_ids)
) or (
......@@ -899,7 +897,7 @@ class OpenAIServingChat(OpenAIServing):
if self.reasoning_parser and not reasoning_end_arr[i]:
delta_message = (
reasoning_parser.extract_reasoning_content_streaming(
reasoning_parser.extract_reasoning_streaming(
previous_text,
current_text,
delta_text,
......@@ -948,7 +946,7 @@ class OpenAIServingChat(OpenAIServing):
output_token_ids = as_list(output.token_ids)
if not reasoning_end_arr[i]:
delta_message = (
reasoning_parser.extract_reasoning_content_streaming(
reasoning_parser.extract_reasoning_streaming(
previous_text,
current_text,
delta_text,
......@@ -961,7 +959,7 @@ class OpenAIServingChat(OpenAIServing):
# i.e {"enable_thinking": False},
# set reasoning status to end.
# Remove the text and token ids related
# to 'reasoning_content'.
# to 'reasoning'.
if (
res.prompt_token_ids
and reasoning_parser.is_reasoning_end(
......@@ -978,7 +976,7 @@ class OpenAIServingChat(OpenAIServing):
# When encountering think end id in delta_token_ids,
# set reasoning status to end.
# Remove the text and token ids related
# to 'reasoning_content'.
# to 'reasoning'.
if reasoning_parser.is_reasoning_end(output_token_ids):
reasoning_end_arr[i] = True
current_token_ids = (
......@@ -1033,15 +1031,13 @@ class OpenAIServingChat(OpenAIServing):
# when only reasoning
elif self.reasoning_parser:
delta_message = (
reasoning_parser.extract_reasoning_content_streaming(
previous_text,
current_text,
delta_text,
previous_token_ids,
current_token_ids,
output.token_ids,
)
delta_message = reasoning_parser.extract_reasoning_streaming(
previous_text,
current_text,
delta_text,
previous_token_ids,
current_token_ids,
output.token_ids,
)
# handle streaming just a content delta
else:
......@@ -1334,9 +1330,9 @@ class OpenAIServingChat(OpenAIServing):
logprobs = None
if self.use_harmony:
reasoning_content, content, _ = parse_chat_output(token_ids)
reasoning, content, _ = parse_chat_output(token_ids)
if not request.include_reasoning:
reasoning_content = None
reasoning = None
if self.tool_parser is not None:
tool_parser = self.tool_parser(tokenizer)
......@@ -1349,14 +1345,14 @@ class OpenAIServingChat(OpenAIServing):
content = tool_call_info.content
message = ChatMessage(
role=role,
reasoning_content=reasoning_content,
reasoning=reasoning,
content=content,
tool_calls=tool_call_info.tool_calls,
)
else:
message = ChatMessage(
role=role,
reasoning_content=reasoning_content,
reasoning=reasoning,
content=content,
)
......@@ -1390,13 +1386,13 @@ class OpenAIServingChat(OpenAIServing):
return self.create_error_response(str(e))
# If the reasoning parser is enabled,
# tool calls are extracted exclusively from the content.
reasoning_content, content = reasoning_parser.extract_reasoning_content(
reasoning, content = reasoning_parser.extract_reasoning(
output.text, request=request
)
if not request.include_reasoning:
reasoning_content = None
reasoning = None
else:
reasoning_content = None
reasoning = None
content = output.text
auto_tools_called = False
......@@ -1416,9 +1412,7 @@ class OpenAIServingChat(OpenAIServing):
not isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam)
and request.tool_choice != "required"
):
message = ChatMessage(
role=role, reasoning_content=reasoning_content, content=content
)
message = ChatMessage(role=role, reasoning=reasoning, content=content)
# if the request uses tools and specified a tool choice
elif (
......@@ -1428,7 +1422,7 @@ class OpenAIServingChat(OpenAIServing):
assert tool_calls is not None and len(tool_calls) > 0
message = ChatMessage(
role=role,
reasoning_content=reasoning_content,
reasoning=reasoning,
content="",
tool_calls=[tool_call_class(function=tc) for tc in tool_calls],
)
......@@ -1452,15 +1446,13 @@ class OpenAIServingChat(OpenAIServing):
role=role,
content="",
tool_calls=tool_call_class_items,
reasoning_content=reasoning_content,
reasoning=reasoning,
)
# if the request doesn't use tool choice
# OR specifies to not use a tool
elif not request.tool_choice or request.tool_choice == "none":
message = ChatMessage(
role=role, reasoning_content=reasoning_content, content=content
)
message = ChatMessage(role=role, reasoning=reasoning, content=content)
# handle when there are tools and tool choice is auto
elif (
......@@ -1476,7 +1468,7 @@ class OpenAIServingChat(OpenAIServing):
if tool_calls:
message = ChatMessage(
role=role,
reasoning_content=reasoning_content,
reasoning=reasoning,
content=content,
tool_calls=[
ToolCall(
......@@ -1498,7 +1490,7 @@ class OpenAIServingChat(OpenAIServing):
ret_content = content
message = ChatMessage(
role=role,
reasoning_content=reasoning_content,
reasoning=reasoning,
content=ret_content,
)
......@@ -1509,9 +1501,7 @@ class OpenAIServingChat(OpenAIServing):
" if tools should be extracted. Returning a standard chat "
"completion."
)
message = ChatMessage(
role=role, reasoning_content=reasoning_content, content=content
)
message = ChatMessage(role=role, reasoning=reasoning, content=content)
# In OpenAI's API, when a tool is called, the finish_reason is:
# "tool_calls" for "auto" or "required" tool calls,
# and "stop" for named tool calls.
......
......@@ -778,11 +778,11 @@ class OpenAIServingResponses(OpenAIServing):
logger.exception("Error in reasoning parser creation.")
raise e
reasoning_content, content = reasoning_parser.extract_reasoning_content(
reasoning, content = reasoning_parser.extract_reasoning(
final_output.text, request=request
)
else:
reasoning_content = None
reasoning = None
content = final_output.text
# Log complete response if output logging is enabled
......@@ -790,8 +790,8 @@ class OpenAIServingResponses(OpenAIServing):
output_text = ""
if content:
output_text = content
elif reasoning_content:
output_text = f"[reasoning: {reasoning_content}]"
elif reasoning:
output_text = f"[reasoning: {reasoning}]"
if output_text:
self.request_logger.log_outputs(
......@@ -805,15 +805,13 @@ class OpenAIServingResponses(OpenAIServing):
reasoning_item = None
message_item = None
if reasoning_content:
if reasoning:
reasoning_item = ResponseReasoningItem(
id=f"rs_{random_uuid()}",
summary=[],
type="reasoning",
content=[
ResponseReasoningTextContent(
text=reasoning_content, type="reasoning_text"
)
ResponseReasoningTextContent(text=reasoning, type="reasoning_text")
],
status=None, # NOTE: Only the last output item has status.
)
......@@ -1208,15 +1206,13 @@ class OpenAIServingResponses(OpenAIServing):
if ctx.last_output.outputs:
output = ctx.last_output.outputs[0]
if reasoning_parser:
delta_message = (
reasoning_parser.extract_reasoning_content_streaming(
previous_text=previous_text,
current_text=previous_text + output.text,
delta_text=output.text,
previous_token_ids=previous_token_ids,
current_token_ids=previous_token_ids + output.token_ids,
delta_token_ids=output.token_ids,
)
delta_message = reasoning_parser.extract_reasoning_streaming(
previous_text=previous_text,
current_text=previous_text + output.text,
delta_text=output.text,
previous_token_ids=previous_token_ids,
current_token_ids=previous_token_ids + output.token_ids,
delta_token_ids=output.token_ids,
)
else:
delta_message = DeltaMessage(
......@@ -1228,7 +1224,7 @@ class OpenAIServingResponses(OpenAIServing):
continue
if not first_delta_sent:
current_item_id = str(uuid.uuid4())
if delta_message.reasoning_content:
if delta_message.reasoning:
yield _increment_sequence_number_and_return(
ResponseOutputItemAddedEvent(
type="response.output_item.added",
......@@ -1280,15 +1276,15 @@ class OpenAIServingResponses(OpenAIServing):
# same as content or reasoning content
if (
previous_delta_messages
and previous_delta_messages[-1].reasoning_content is not None
and previous_delta_messages[-1].reasoning is not None
and delta_message.content is not None
):
# from reasoning to normal content, send done
# event for reasoning
reason_content = "".join(
pm.reasoning_content
pm.reasoning
for pm in previous_delta_messages
if pm.reasoning_content is not None
if pm.reasoning is not None
)
yield _increment_sequence_number_and_return(
ResponseReasoningTextDoneEvent(
......@@ -1356,7 +1352,7 @@ class OpenAIServingResponses(OpenAIServing):
# reset previous delta messages
previous_delta_messages = []
if delta_message.reasoning_content is not None:
if delta_message.reasoning is not None:
yield _increment_sequence_number_and_return(
ResponseReasoningTextDeltaEvent(
type="response.reasoning_text.delta",
......@@ -1364,7 +1360,7 @@ class OpenAIServingResponses(OpenAIServing):
content_index=current_content_index,
output_index=current_output_index,
item_id=current_item_id,
delta=delta_message.reasoning_content,
delta=delta_message.reasoning,
)
)
elif delta_message.content is not None:
......@@ -1392,11 +1388,11 @@ class OpenAIServingResponses(OpenAIServing):
previous_delta_messages.append(delta_message)
if previous_delta_messages:
if previous_delta_messages[-1].reasoning_content is not None:
if previous_delta_messages[-1].reasoning is not None:
reason_content = "".join(
pm.reasoning_content
pm.reasoning
for pm in previous_delta_messages
if pm.reasoning_content is not None
if pm.reasoning is not None
)
yield _increment_sequence_number_and_return(
ResponseReasoningTextDoneEvent(
......
......@@ -279,7 +279,7 @@ class StreamingXMLToolCallParser:
final_delta = DeltaMessage(
role=None,
content=None,
reasoning_content=None,
reasoning=None,
tool_calls=[
DeltaToolCall(
index=self.tool_call_index - 1,
......
......@@ -76,7 +76,7 @@ class ReasoningParser:
"""
@abstractmethod
def extract_reasoning_content(
def extract_reasoning(
self,
model_output: str,
request: ChatCompletionRequest | ResponsesRequest,
......@@ -100,7 +100,7 @@ class ReasoningParser:
"""
@abstractmethod
def extract_reasoning_content_streaming(
def extract_reasoning_streaming(
self,
previous_text: str,
current_text: str,
......
......@@ -76,7 +76,7 @@ class BaseThinkingReasoningParser(ReasoningParser):
else:
return input_ids[input_ids.index(self.end_token_id) + 1 :]
def extract_reasoning_content_streaming(
def extract_reasoning_streaming(
self,
previous_text: str,
current_text: str,
......@@ -103,11 +103,10 @@ class BaseThinkingReasoningParser(ReasoningParser):
# start token in previous, end token in delta,
# extract reasoning content
end_index = delta_text.find(self.end_token)
reasoning_content = delta_text[:end_index]
reasoning = delta_text[:end_index]
content = delta_text[end_index + len(self.end_token) :]
return DeltaMessage(
reasoning_content=reasoning_content,
content=content if content else None,
reasoning=reasoning, content=content if content else None
)
elif self.end_token_id in previous_token_ids:
# start token in previous, end token in previous,
......@@ -116,30 +115,27 @@ class BaseThinkingReasoningParser(ReasoningParser):
else:
# start token in previous, no end token in previous or delta,
# reasoning content continues
return DeltaMessage(reasoning_content=delta_text)
return DeltaMessage(reasoning=delta_text)
elif self.start_token_id in delta_token_ids:
if self.end_token_id in delta_token_ids:
# start token in delta, end token in delta,
# extract reasoning content
start_index = delta_text.find(self.start_token)
end_index = delta_text.find(self.end_token)
reasoning_content = delta_text[
start_index + len(self.start_token) : end_index
]
reasoning = delta_text[start_index + len(self.start_token) : end_index]
content = delta_text[end_index + len(self.end_token) :]
return DeltaMessage(
reasoning_content=reasoning_content,
content=content if content else None,
reasoning=reasoning, content=content if content else None
)
else:
# start token in delta, no end token in delta,
# reasoning content continues
return DeltaMessage(reasoning_content=delta_text)
return DeltaMessage(reasoning=delta_text)
else:
# not find thinking start token
return DeltaMessage(content=delta_text)
def extract_reasoning_content(
def extract_reasoning(
self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
) -> tuple[str | None, str | None]:
"""
......@@ -160,7 +156,7 @@ class BaseThinkingReasoningParser(ReasoningParser):
if self.end_token not in model_output:
return model_output, None
else:
reasoning_content, _, content = model_output.partition(self.end_token)
reasoning, _, content = model_output.partition(self.end_token)
# If generation stops right after end-of-think, return null content
final_content = content or None
return reasoning_content, final_content
return reasoning, final_content
......@@ -25,7 +25,7 @@ class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser):
"""The token that ends reasoning content."""
return "</think>"
def extract_reasoning_content_streaming(
def extract_reasoning_streaming(
self,
previous_text: str,
current_text: str,
......@@ -34,7 +34,7 @@ class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser):
current_token_ids: Sequence[int],
delta_token_ids: Sequence[int],
) -> DeltaMessage | None:
ret = super().extract_reasoning_content_streaming(
ret = super().extract_reasoning_streaming(
previous_text,
current_text,
delta_text,
......@@ -51,10 +51,10 @@ class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser):
# end token in delta with more tokens,
# extract reasoning content and content
end_index = delta_text.find(self.end_token)
reasoning_content = delta_text[:end_index]
reasoning = delta_text[:end_index]
content = delta_text[end_index + len(self.end_token) :]
return DeltaMessage(
reasoning_content=reasoning_content,
reasoning=reasoning,
content=content if content else None,
)
elif self.end_token_id in previous_token_ids:
......@@ -62,6 +62,6 @@ class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser):
return DeltaMessage(content=delta_text)
else:
# no end token in previous or delta, reasoning content continues
return DeltaMessage(reasoning_content=delta_text)
return DeltaMessage(reasoning=delta_text)
return ret
......@@ -38,12 +38,12 @@ class DeepSeekV3ReasoningParser(ReasoningParser):
def extract_content_ids(self, input_ids: list[int]) -> list[int]:
return self._parser.extract_content_ids(input_ids)
def extract_reasoning_content(
def extract_reasoning(
self, model_output: str, request: ChatCompletionRequest
) -> tuple[str | None, str | None]:
return self._parser.extract_reasoning_content(model_output, request)
return self._parser.extract_reasoning(model_output, request)
def extract_reasoning_content_streaming(
def extract_reasoning_streaming(
self,
previous_text: str,
current_text: str,
......@@ -52,7 +52,7 @@ class DeepSeekV3ReasoningParser(ReasoningParser):
current_token_ids: Sequence[int],
delta_token_ids: Sequence[int],
) -> DeltaMessage | None:
return self._parser.extract_reasoning_content_streaming(
return self._parser.extract_reasoning_streaming(
previous_text,
current_text,
delta_text,
......
......@@ -57,7 +57,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
"tokens in the tokenizer!"
)
def extract_reasoning_content_streaming(
def extract_reasoning_streaming(
self,
previous_text: str,
current_text: str,
......@@ -73,7 +73,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
The Ernie45 thinking model output format is
abc\n</think>\n\n<response>\ndef\n</response>\n
or abc\n</think>\ndef
- 'abc' goes to reasoning_content
- 'abc' goes to reasoning
- 'def' goes to content
"""
# Skip single special tokens
......@@ -94,7 +94,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
# </think> in delta with more tokens,
# extract reasoning content and content
think_end_index = delta_text.find(self.end_token)
reasoning_content = delta_text[:think_end_index]
reasoning = delta_text[:think_end_index]
content = delta_text[think_end_index + len(self.end_token) :]
content = content.lstrip("\n")
response_start_idx = content.find(self.response_start_token)
......@@ -104,7 +104,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
if response_end_idx != -1:
content = content[:response_end_idx]
return DeltaMessage(
reasoning_content=reasoning_content,
reasoning=reasoning,
content=content if content else None,
)
elif self.end_token_id in previous_token_ids:
......@@ -138,9 +138,9 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
return DeltaMessage(content=content if content else None)
else:
# no </think> in previous or delta, reasoning content continues
return DeltaMessage(reasoning_content=delta_text)
return DeltaMessage(reasoning=delta_text)
def extract_reasoning_content(
def extract_reasoning(
self, model_output: str, request: ChatCompletionRequest
) -> tuple[str | None, str | None]:
"""
......@@ -148,14 +148,12 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
The Ernie45 thinking model output format is
abc\n</think>\n\n\n<response>\ndef\n</response>\n
or abc\n</think>\ndef
- 'abc' goes to reasoning_content
- 'abc' goes to reasoning
- 'def' goes to content
Returns:
tuple[Optional[str], Optional[str]]: reasoning content and content
"""
reasoning_content, content = super().extract_reasoning_content(
model_output, request
)
reasoning, content = super().extract_reasoning(model_output, request)
if content:
start_idx = content.find(self.response_start_token)
end_idx = content.rfind(self.response_end_token)
......@@ -164,4 +162,4 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
content = content[start_idx + len(self.response_start_token) : end_idx]
final_content = content or None
return reasoning_content, final_content
return reasoning, final_content
......@@ -70,7 +70,7 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
else:
return input_ids[input_ids.index(self.think_end_token_id) + 1 :]
def extract_reasoning_content_streaming(
def extract_reasoning_streaming(
self,
previous_text: str,
current_text: str,
......@@ -84,7 +84,7 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
Handles streaming output where previous + delta = current.
Uses token IDs for faster processing.
For text <think>abc</think>xyz:
- 'abc' goes to reasoning_content
- 'abc' goes to reasoning
- 'xyz' goes to content
"""
# Skip single special tokens
......@@ -98,10 +98,10 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
# <think> in previous, </think> in delta,
# extract reasoning content
end_index = delta_text.find(self.think_end_token)
reasoning_content = delta_text[:end_index]
reasoning = delta_text[:end_index]
content = delta_text[end_index + len(self.think_end_token) :]
return DeltaMessage(
reasoning_content=reasoning_content,
reasoning=reasoning,
content=content if content else None,
)
elif self.think_end_token_id in previous_token_ids:
......@@ -111,36 +111,36 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
else:
# <think> in previous, no </think> in previous or delta,
# reasoning content continues
return DeltaMessage(reasoning_content=delta_text)
return DeltaMessage(reasoning=delta_text)
elif self.think_start_token_id in delta_token_ids:
if self.think_end_token_id in delta_token_ids:
# <think> in delta, </think> in delta, extract reasoning content
start_index = delta_text.find(self.think_start_token)
end_index = delta_text.find(self.think_end_token)
reasoning_content = delta_text[
reasoning = delta_text[
start_index + len(self.think_start_token) : end_index
]
content = delta_text[end_index + len(self.think_end_token) :]
return DeltaMessage(
reasoning_content=reasoning_content,
reasoning=reasoning,
content=content if content else None,
)
else:
# <think> in delta, no </think> in delta,
# reasoning content continues
return DeltaMessage(reasoning_content=delta_text)
return DeltaMessage(reasoning=delta_text)
else:
# thinking is disabled, just content
return DeltaMessage(content=delta_text)
def extract_reasoning_content(
def extract_reasoning(
self, model_output: str, request: ChatCompletionRequest
) -> tuple[str | None, str | None]:
"""
Extract reasoning content from the model output.
For text <think>abc</think>xyz:
- 'abc' goes to reasoning_content
- 'abc' goes to reasoning
- 'xyz' goes to content
Returns:
......@@ -165,7 +165,7 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
return None, model_output
# Extract reasoning content from the model output.
reasoning_content, _, content = model_output.partition(self.think_end_token)
reasoning, _, content = model_output.partition(self.think_end_token)
final_content = content or None
return reasoning_content, final_content
return reasoning, final_content
......@@ -104,7 +104,7 @@ class GptOssReasoningParser(ReasoningParser):
return []
return self.model_tokenizer.encode(content)
def extract_reasoning_content_streaming(
def extract_reasoning_streaming(
self,
previous_text: str,
current_text: str,
......@@ -131,9 +131,9 @@ class GptOssReasoningParser(ReasoningParser):
content_delta = cur_content
if reasoning_delta is None and content_delta is None:
return None
return DeltaMessage(reasoning_content=reasoning_delta, content=content_delta)
return DeltaMessage(reasoning=reasoning_delta, content=content_delta)
def extract_reasoning_content(
def extract_reasoning(
self,
model_output: str,
request: ChatCompletionRequest,
......
......@@ -49,7 +49,7 @@ class GraniteReasoningParser(ReasoningParser):
len(think_start) for think_start in self.valid_think_starts
)
def extract_reasoning_content(
def extract_reasoning(
self, model_output: str, request: ChatCompletionRequest
) -> tuple[str | None, str | None]:
"""Extract the reasoning content & content sections, respectively.
......@@ -67,12 +67,12 @@ class GraniteReasoningParser(ReasoningParser):
re_match = self.reasoning_regex.findall(model_output)
if not re_match:
return None, model_output
reasoning_content, response_content = re_match[0]
reasoning, response_content = re_match[0]
if not response_content:
return reasoning_content, None
return reasoning_content, response_content
return reasoning, None
return reasoning, response_content
def extract_reasoning_content_streaming(
def extract_reasoning_streaming(
self,
previous_text: str,
current_text: str,
......@@ -107,12 +107,10 @@ class GraniteReasoningParser(ReasoningParser):
Union[DeltaMessage, None]
DeltaMessage with either reasoning content or content, or None.
"""
reasoning_content, resp_seq_len, content = self._get_content_sections(
current_text
)
reasoning, resp_seq_len, content = self._get_content_sections(current_text)
# Either we haven't finished the start of the reasoning sequence,
# or the model is generating something unexpected.
if not reasoning_content:
if not reasoning:
delta_message = self._get_delta_message_with_no_reasoning_bounds(
current_text, delta_text
)
......@@ -120,16 +118,16 @@ class GraniteReasoningParser(ReasoningParser):
# the start of response sequence.
elif not content:
delta_message = self._get_delta_message_with_no_response_bounds(
current_text, reasoning_content, delta_text
current_text, reasoning, delta_text
)
# We've finished both the start of reasoning and start of response seq.
else:
# This should never happen since we matched on the response
assert resp_seq_len is not None
delta_message = self._get_delta_message_with_both_bounds(
delta_text, reasoning_content, content, current_text, resp_seq_len
delta_text, reasoning, content, current_text, resp_seq_len
)
if not delta_message.content and not delta_message.reasoning_content:
if not delta_message.content and not delta_message.reasoning:
return None
return delta_message
......@@ -185,20 +183,20 @@ class GraniteReasoningParser(ReasoningParser):
# message and append everything to content in the future.
if was_substr and not is_substr:
return DeltaMessage(
reasoning_content=None,
reasoning=None,
content=current_text,
)
if is_substr:
# Might still be in the special token sequence; return nothing
return DeltaMessage(reasoning_content=None, content=None)
return DeltaMessage(reasoning=None, content=None)
# Otherwise the sequence has already been broken and we already
# corrected; just return the delta text as normal content.
return DeltaMessage(reasoning_content=None, content=delta_text)
return DeltaMessage(reasoning=None, content=delta_text)
def _get_delta_message_with_no_response_bounds(
self,
current_text: str,
reasoning_content: str,
reasoning: str,
delta_text: str,
) -> DeltaMessage:
"""Parse the delta message when the current text has both reasoning
......@@ -208,7 +206,7 @@ class GraniteReasoningParser(ReasoningParser):
Args:
current_text (str): The full previous + delta text.
reasoning_content (str): reasoning content from current_text.
reasoning (str): reasoning content from current_text.
delta_text (str): Text to consider and parse content from.
Returns:
......@@ -222,12 +220,12 @@ class GraniteReasoningParser(ReasoningParser):
current_text.endswith(response_start)
for response_start in self.valid_response_starts
)
if reasoning_content is None or ends_with_start_response_seq:
return DeltaMessage(reasoning_content=None, content=None)
if reasoning is None or ends_with_start_response_seq:
return DeltaMessage(reasoning=None, content=None)
# Consider previous / current text only within context of the reasoning
previous_text = reasoning_content[: -len(delta_text)]
current_text = reasoning_content
previous_text = reasoning[: -len(delta_text)]
current_text = reasoning
# We need to be careful about adding unfinished response sequences;
# Find the place at which we MIGHT be starting a response sequence
......@@ -253,32 +251,30 @@ class GraniteReasoningParser(ReasoningParser):
# Delta only contains potential continued response sequence text.
if delta_continues_substr:
return DeltaMessage(reasoning_content=None, content=None)
return DeltaMessage(reasoning=None, content=None)
if not prev_was_substr:
# Delta may be starting a new response seq but has other text too.
if delta_new_substr:
return DeltaMessage(
reasoning_content=delta_text[:delta_idx], content=None
)
return DeltaMessage(reasoning=delta_text[:delta_idx], content=None)
# Normal case for most reasoning text (no potential special seqs).
return DeltaMessage(reasoning_content=delta_text, content=None)
return DeltaMessage(reasoning=delta_text, content=None)
# The substring that previously seemed to be a potential response
# seq wasn't one; we need to add the content to the delta message,
# and also slice off the potential response sequence
elif delta_new_substr:
reasoning_content = previous_text[prev_idx:] + delta_text[:delta_idx]
return DeltaMessage(reasoning_content=reasoning_content, content=None)
reasoning = previous_text[prev_idx:] + delta_text[:delta_idx]
return DeltaMessage(reasoning=reasoning, content=None)
# No new substring yet, and we broke our old one; take the whole delta
return DeltaMessage(
reasoning_content=previous_text[prev_idx:] + delta_text,
reasoning=previous_text[prev_idx:] + delta_text,
content=None,
)
def _get_delta_message_with_both_bounds(
self,
delta_text: str,
reasoning_content: str,
reasoning: str,
response_content: str,
current_text: str,
response_seq_len: int,
......@@ -288,7 +284,7 @@ class GraniteReasoningParser(ReasoningParser):
Args:
delta_text: Text to consider and parse content from.
reasoning_content: reasoning content from current_text.
reasoning: reasoning content from current_text.
response_content: response content from current_text.
current_text: The full previous + delta text.
response_seq_len: Len of the complete response sequence used.
......@@ -301,20 +297,20 @@ class GraniteReasoningParser(ReasoningParser):
reasoning_end_idx = len(delta_text) - (len(response_content) + response_seq_len)
if reasoning_end_idx < 0:
delta_reasoning_content = None
delta_reasoning = None
else:
# Get the starting offset
start_reasoning_content_idx = (
len(reasoning_content) + response_seq_len + len(response_content) - 1
start_reasoning_idx = (
len(reasoning) + response_seq_len + len(response_content) - 1
)
delta_offset = len(current_text) - len(delta_text)
start_offset = start_reasoning_content_idx - delta_offset
start_offset = start_reasoning_idx - delta_offset
if start_offset < 0:
start_offset = 0
delta_reasoning_content = delta_text[start_offset:reasoning_end_idx]
delta_reasoning = delta_text[start_offset:reasoning_end_idx]
return DeltaMessage(
reasoning_content=delta_reasoning_content,
reasoning=delta_reasoning,
content=delta_content,
)
......@@ -333,7 +329,7 @@ class GraniteReasoningParser(ReasoningParser):
(if there is one) and the non-reasoning content.
"""
current_chunk_start = 0
start_reasoning_content = None
start_reasoning = None
parsed_content = False
delimiter_idxs = [
idx
......@@ -344,10 +340,10 @@ class GraniteReasoningParser(ReasoningParser):
for current_chunk_end in delimiter_idxs:
current_chunk = current_text[current_chunk_start:current_chunk_end]
# Check to see if the start of reasoning seq is complete
if start_reasoning_content is None:
if start_reasoning is None:
for think_start in self.valid_think_starts:
if current_chunk == think_start[:-1]:
start_reasoning_content = current_chunk_end + 1
start_reasoning = current_chunk_end + 1
current_chunk_start = current_chunk_end + 1
break
......@@ -357,13 +353,11 @@ class GraniteReasoningParser(ReasoningParser):
if current_chunk[-len(response_start) + 1 :] == response_start[:-1]:
# Mark end of reasoning and start response content
# after the start of response sequence.
end_reasoning_content = current_chunk_end - len(response_start)
reasoning_content = current_text[
start_reasoning_content:end_reasoning_content
]
end_reasoning = current_chunk_end - len(response_start)
reasoning = current_text[start_reasoning:end_reasoning]
response_content = current_text[current_chunk_end + 1 :]
return reasoning_content, len(response_start), response_content
return reasoning, len(response_start), response_content
if start_reasoning_content and not parsed_content:
return current_text[start_reasoning_content:], None, None
if start_reasoning and not parsed_content:
return current_text[start_reasoning:], None, None
return None, None, None
......@@ -86,7 +86,7 @@ class HunyuanA13BReasoningParser(ReasoningParser):
# this id is not part of content, so just return [] here.
return []
def extract_reasoning_content(
def extract_reasoning(
self, model_output: str, request: ChatCompletionRequest
) -> tuple[str | None, str | None]:
"""Extract the reasoning content & content sections, respectively.
......@@ -104,27 +104,27 @@ class HunyuanA13BReasoningParser(ReasoningParser):
re_match = self.full_match_reasoning_regex.findall(model_output)
if re_match:
reasoning_content, response_content = re_match[0]
if len(reasoning_content) == 0:
reasoning_content = None
reasoning, response_content = re_match[0]
if len(reasoning) == 0:
reasoning = None
if len(response_content) == 0:
response_content = None
return reasoning_content, response_content
return reasoning, response_content
fallback_regex = self.half_match_reasoning_regex
fallback_match = fallback_regex.findall(model_output)
if fallback_match:
reasoning_content, response_content = fallback_match[0]
reasoning, response_content = fallback_match[0]
if response_content.endswith(self.response_end_expr):
response_content = response_content[: -len(self.response_end_expr)]
if len(reasoning_content) == 0:
reasoning_content = None
if len(reasoning) == 0:
reasoning = None
if len(response_content) == 0:
response_content = None
return reasoning_content, response_content
return reasoning, response_content
return None, model_output
......@@ -140,7 +140,7 @@ class HunyuanA13BReasoningParser(ReasoningParser):
sub_idx += 1
return sub_idx == len(subsequence)
def extract_reasoning_content_streaming(
def extract_reasoning_streaming(
self,
previous_text: str,
current_text: str,
......@@ -223,19 +223,15 @@ class HunyuanA13BReasoningParser(ReasoningParser):
# Return content based on current state
if self.current_state == "think":
return DeltaMessage(
reasoning_content=buffered_content, content=None
)
return DeltaMessage(reasoning=buffered_content, content=None)
else:
return DeltaMessage(
reasoning_content=None, content=buffered_content
)
return DeltaMessage(reasoning=None, content=buffered_content)
else:
# No buffered content, send normally
if self.current_state == "think":
return DeltaMessage(reasoning_content=delta_text, content=None)
return DeltaMessage(reasoning=delta_text, content=None)
else:
return DeltaMessage(reasoning_content=None, content=delta_text)
return DeltaMessage(reasoning=None, content=delta_text)
# If no content to send in this delta
return None
......@@ -36,7 +36,7 @@ class IdentityReasoningParser(ReasoningParser):
# Identity: return all tokens as content
return input_ids
def extract_reasoning_content_streaming(
def extract_reasoning_streaming(
self,
previous_text: str,
current_text: str,
......@@ -50,9 +50,9 @@ class IdentityReasoningParser(ReasoningParser):
return DeltaMessage(content=delta_text)
return None
def extract_reasoning_content(
def extract_reasoning(
self, model_output: str, request: ChatCompletionRequest
) -> tuple[str | None, str | None]:
# No reasoning separation: return None for reasoning_content,
# No reasoning separation: return None for reasoning,
# and full model_output as content
return None, model_output
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment