Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4815b00f
Unverified
Commit
4815b00f
authored
Aug 07, 2025
by
Chen Zhang
Committed by
GitHub
Aug 07, 2025
Browse files
[gpt-oss] Generate ResponseOutputItem from Harmony Message (#22410)
Signed-off-by:
Chen Zhang
<
zhangch99@outlook.com
>
parent
4da8bf20
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
292 additions
and
82 deletions
+292
-82
tests/v1/entrypoints/openai/responses/test_basic.py
tests/v1/entrypoints/openai/responses/test_basic.py
+1
-1
vllm/entrypoints/harmony_utils.py
vllm/entrypoints/harmony_utils.py
+150
-3
vllm/entrypoints/openai/protocol.py
vllm/entrypoints/openai/protocol.py
+19
-12
vllm/entrypoints/openai/serving_responses.py
vllm/entrypoints/openai/serving_responses.py
+122
-66
No files found.
tests/v1/entrypoints/openai/responses/test_basic.py
View file @
4815b00f
...
@@ -17,7 +17,7 @@ async def test_simple_input(client: openai.AsyncOpenAI):
...
@@ -17,7 +17,7 @@ async def test_simple_input(client: openai.AsyncOpenAI):
# Whether the output contains the reasoning.
# Whether the output contains the reasoning.
assert
outputs
[
0
].
type
==
"reasoning"
assert
outputs
[
0
].
type
==
"reasoning"
assert
outputs
[
0
].
text
!=
""
assert
outputs
[
0
].
content
[
0
].
text
!=
""
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
...
vllm/entrypoints/harmony_utils.py
View file @
4815b00f
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
datetime
import
datetime
import
json
from
collections.abc
import
Iterable
,
Sequence
from
collections.abc
import
Iterable
,
Sequence
from
typing
import
Literal
,
Optional
,
Union
from
typing
import
Literal
,
Optional
,
Union
from
openai.types.responses
import
ResponseFunctionToolCall
,
ResponseOutputItem
from
openai.types.responses
import
(
ResponseFunctionToolCall
,
ResponseOutputItem
,
ResponseOutputMessage
,
ResponseOutputText
,
ResponseReasoningItem
)
from
openai.types.responses.response_function_web_search
import
(
ActionFind
,
ActionOpenPage
,
ActionSearch
,
ResponseFunctionWebSearch
)
from
openai.types.responses.response_reasoning_item
import
(
Content
as
ResponseReasoningTextContent
)
from
openai.types.responses.tool
import
Tool
from
openai.types.responses.tool
import
Tool
from
openai_harmony
import
(
Author
,
Conversation
,
DeveloperContent
,
from
openai_harmony
import
(
Author
,
Conversation
,
DeveloperContent
,
HarmonyEncodingName
,
Message
,
ReasoningEffort
,
HarmonyEncodingName
,
Message
,
ReasoningEffort
,
Role
,
StreamableParser
,
SystemContent
,
TextContent
,
Role
,
StreamableParser
,
SystemContent
,
TextContent
,
ToolDescription
,
load_harmony_encoding
)
ToolDescription
,
load_harmony_encoding
)
from
vllm.entrypoints.openai.protocol
import
(
ResponseInputOutputItem
,
from
vllm.entrypoints.openai.protocol
import
ResponseInputOutputItem
ResponseReasoningItem
)
from
vllm.utils
import
random_uuid
REASONING_EFFORT
=
{
REASONING_EFFORT
=
{
"high"
:
ReasoningEffort
.
HIGH
,
"high"
:
ReasoningEffort
.
HIGH
,
...
@@ -160,6 +167,146 @@ def render_for_completion(messages: list[Message]) -> list[int]:
...
@@ -160,6 +167,146 @@ def render_for_completion(messages: list[Message]) -> list[int]:
return
token_ids
return
token_ids
def
parse_output_message
(
message
:
Message
)
->
list
[
ResponseOutputItem
]:
"""
Parse a Harmony message into a list of output response items.
"""
if
message
.
author
.
role
!=
"assistant"
:
# This is a message from a tool to the assistant (e.g., search result).
# Don't include it in the final output for now. This aligns with
# OpenAI's behavior on models like o4-mini.
return
[]
output_items
:
list
[
ResponseOutputItem
]
=
[]
recipient
=
message
.
recipient
if
recipient
is
not
None
and
recipient
.
startswith
(
"browser."
):
if
len
(
message
.
content
)
!=
1
:
raise
ValueError
(
"Invalid number of contents in browser message"
)
content
=
message
.
content
[
0
]
browser_call
=
json
.
loads
(
content
.
text
)
# TODO: translate to url properly!
if
recipient
==
"browser.search"
:
action
=
ActionSearch
(
query
=
f
"cursor:
{
browser_call
.
get
(
'query'
,
''
)
}
"
,
type
=
"search"
)
elif
recipient
==
"browser.open"
:
action
=
ActionOpenPage
(
url
=
f
"cursor:
{
browser_call
.
get
(
'url'
,
''
)
}
"
,
type
=
"open_page"
)
elif
recipient
==
"browser.find"
:
action
=
ActionFind
(
pattern
=
browser_call
[
"pattern"
],
url
=
f
"cursor:
{
browser_call
.
get
(
'url'
,
''
)
}
"
,
type
=
"find"
)
else
:
raise
ValueError
(
f
"Unknown browser action:
{
recipient
}
"
)
web_search_item
=
ResponseFunctionWebSearch
(
id
=
f
"ws_
{
random_uuid
()
}
"
,
action
=
action
,
status
=
"completed"
,
type
=
"web_search_call"
,
)
output_items
.
append
(
web_search_item
)
elif
message
.
channel
==
"analysis"
:
for
content
in
message
.
content
:
reasoning_item
=
ResponseReasoningItem
(
id
=
f
"rs_
{
random_uuid
()
}
"
,
summary
=
[],
type
=
"reasoning"
,
content
=
[
ResponseReasoningTextContent
(
text
=
content
.
text
,
type
=
"reasoning_text"
)
],
status
=
None
,
)
output_items
.
append
(
reasoning_item
)
elif
message
.
channel
==
"commentary"
:
if
message
.
recipient
.
startswith
(
"functions."
):
function_name
=
message
.
recipient
.
split
(
"."
)[
-
1
]
for
content
in
message
.
content
:
random_id
=
random_uuid
()
response_item
=
ResponseFunctionToolCall
(
arguments
=
content
.
text
,
call_id
=
f
"call_
{
random_id
}
"
,
type
=
"function_call"
,
name
=
function_name
,
id
=
f
"ft_
{
random_id
}
"
,
)
output_items
.
append
(
response_item
)
elif
message
.
recipient
.
startswith
(
"python"
)
or
message
.
recipient
.
startswith
(
"browser"
):
for
content
in
message
.
content
:
reasoning_item
=
ResponseReasoningItem
(
id
=
f
"rs_
{
random_uuid
()
}
"
,
summary
=
[],
type
=
"reasoning"
,
text
=
content
.
text
,
status
=
None
,
)
output_items
.
append
(
reasoning_item
)
else
:
raise
ValueError
(
f
"Unknown recipient:
{
message
.
recipient
}
"
)
elif
message
.
channel
==
"final"
:
contents
=
[]
for
content
in
message
.
content
:
output_text
=
ResponseOutputText
(
text
=
content
.
text
,
annotations
=
[],
# TODO
type
=
"output_text"
,
logprobs
=
None
,
# TODO
)
contents
.
append
(
output_text
)
text_item
=
ResponseOutputMessage
(
id
=
f
"msg_
{
random_uuid
()
}
"
,
content
=
contents
,
role
=
message
.
author
.
role
,
status
=
"completed"
,
type
=
"message"
,
)
output_items
.
append
(
text_item
)
else
:
raise
ValueError
(
f
"Unknown channel:
{
message
.
channel
}
"
)
return
output_items
def
parse_remaining_state
(
parser
:
StreamableParser
)
->
list
[
ResponseOutputItem
]:
if
not
parser
.
current_content
:
return
[]
if
parser
.
current_role
!=
Role
.
ASSISTANT
:
return
[]
current_recipient
=
parser
.
current_recipient
if
(
current_recipient
is
not
None
and
current_recipient
.
startswith
(
"browser."
)):
return
[]
if
parser
.
current_channel
==
"analysis"
:
reasoning_item
=
ResponseReasoningItem
(
id
=
f
"rs_
{
random_uuid
()
}
"
,
summary
=
[],
type
=
"reasoning"
,
content
=
[
ResponseReasoningTextContent
(
text
=
parser
.
current_content
,
type
=
"reasoning_text"
)
],
status
=
None
,
)
return
[
reasoning_item
]
elif
parser
.
current_channel
==
"final"
:
output_text
=
ResponseOutputText
(
text
=
parser
.
current_content
,
annotations
=
[],
# TODO
type
=
"output_text"
,
logprobs
=
None
,
# TODO
)
text_item
=
ResponseOutputMessage
(
id
=
f
"msg_
{
random_uuid
()
}
"
,
content
=
[
output_text
],
role
=
"assistant"
,
status
=
"completed"
,
type
=
"message"
,
)
return
[
text_item
]
return
[]
def
get_stop_tokens_for_assistant_actions
()
->
list
[
int
]:
def
get_stop_tokens_for_assistant_actions
()
->
list
[
int
]:
return
get_encoding
().
stop_tokens_for_assistant_actions
()
return
get_encoding
().
stop_tokens_for_assistant_actions
()
...
...
vllm/entrypoints/openai/protocol.py
View file @
4815b00f
...
@@ -19,8 +19,8 @@ from openai.types.chat.chat_completion_message import (
...
@@ -19,8 +19,8 @@ from openai.types.chat.chat_completion_message import (
# yapf: enable
# yapf: enable
from
openai.types.responses
import
(
ResponseFunctionToolCall
,
from
openai.types.responses
import
(
ResponseFunctionToolCall
,
ResponseInputItemParam
,
ResponseOutputItem
,
ResponseInputItemParam
,
ResponseOutputItem
,
Response
OutputMessage
,
Response
Prompt
,
Response
Prompt
,
Response
Status
,
ResponseStatus
,
ResponseTextConfig
)
ResponseTextConfig
)
from
openai.types.responses.response
import
ToolChoice
from
openai.types.responses.response
import
ToolChoice
from
openai.types.responses.tool
import
Tool
from
openai.types.responses.tool
import
Tool
from
openai.types.shared
import
Metadata
,
Reasoning
from
openai.types.shared
import
Metadata
,
Reasoning
...
@@ -1729,13 +1729,20 @@ class TranscriptionStreamResponse(OpenAIBaseModel):
...
@@ -1729,13 +1729,20 @@ class TranscriptionStreamResponse(OpenAIBaseModel):
usage
:
Optional
[
UsageInfo
]
=
Field
(
default
=
None
)
usage
:
Optional
[
UsageInfo
]
=
Field
(
default
=
None
)
class
ResponseReasoningItem
(
OpenAIBaseModel
):
class
InputTokensDetails
(
OpenAIBaseModel
):
id
:
str
=
Field
(
default_factory
=
lambda
:
f
"rs_
{
random_uuid
()
}
"
)
cached_tokens
:
int
text
:
str
summary
:
list
=
Field
(
default_factory
=
list
)
type
:
Literal
[
"reasoning"
]
=
"reasoning"
class
OutputTokensDetails
(
OpenAIBaseModel
):
encrypted_content
:
Optional
[
str
]
=
None
reasoning_tokens
:
int
status
:
Optional
[
Literal
[
"in_progress"
,
"completed"
,
"incomplete"
]]
class
ResponseUsage
(
OpenAIBaseModel
):
input_tokens
:
int
input_tokens_details
:
InputTokensDetails
output_tokens
:
int
output_tokens_details
:
OutputTokensDetails
total_tokens
:
int
class
ResponsesResponse
(
OpenAIBaseModel
):
class
ResponsesResponse
(
OpenAIBaseModel
):
...
@@ -1747,7 +1754,7 @@ class ResponsesResponse(OpenAIBaseModel):
...
@@ -1747,7 +1754,7 @@ class ResponsesResponse(OpenAIBaseModel):
metadata
:
Optional
[
Metadata
]
=
None
metadata
:
Optional
[
Metadata
]
=
None
model
:
str
model
:
str
object
:
Literal
[
"response"
]
=
"response"
object
:
Literal
[
"response"
]
=
"response"
output
:
list
[
Union
[
ResponseOutput
Message
,
ResponseReasoning
Item
]
]
output
:
list
[
ResponseOutputItem
]
parallel_tool_calls
:
bool
parallel_tool_calls
:
bool
temperature
:
float
temperature
:
float
tool_choice
:
ToolChoice
tool_choice
:
ToolChoice
...
@@ -1764,7 +1771,7 @@ class ResponsesResponse(OpenAIBaseModel):
...
@@ -1764,7 +1771,7 @@ class ResponsesResponse(OpenAIBaseModel):
text
:
Optional
[
ResponseTextConfig
]
=
None
text
:
Optional
[
ResponseTextConfig
]
=
None
top_logprobs
:
int
top_logprobs
:
int
truncation
:
Literal
[
"auto"
,
"disabled"
]
truncation
:
Literal
[
"auto"
,
"disabled"
]
usage
:
Optional
[
Usage
Info
]
=
None
usage
:
Optional
[
Response
Usage
]
=
None
user
:
Optional
[
str
]
=
None
user
:
Optional
[
str
]
=
None
@
classmethod
@
classmethod
...
@@ -1776,7 +1783,7 @@ class ResponsesResponse(OpenAIBaseModel):
...
@@ -1776,7 +1783,7 @@ class ResponsesResponse(OpenAIBaseModel):
created_time
:
int
,
created_time
:
int
,
output
:
list
[
ResponseOutputItem
],
output
:
list
[
ResponseOutputItem
],
status
:
ResponseStatus
,
status
:
ResponseStatus
,
usage
:
Optional
[
Usage
Info
]
=
None
,
usage
:
Optional
[
Response
Usage
]
=
None
,
)
->
"ResponsesResponse"
:
)
->
"ResponsesResponse"
:
return
cls
(
return
cls
(
id
=
request
.
request_id
,
id
=
request
.
request_id
,
...
...
vllm/entrypoints/openai/serving_responses.py
View file @
4815b00f
...
@@ -6,12 +6,15 @@ import time
...
@@ -6,12 +6,15 @@ import time
from
collections.abc
import
AsyncGenerator
,
AsyncIterator
from
collections.abc
import
AsyncGenerator
,
AsyncIterator
from
copy
import
copy
from
copy
import
copy
from
http
import
HTTPStatus
from
http
import
HTTPStatus
from
typing
import
Callable
,
Final
,
Optional
,
Union
from
typing
import
Any
,
Callable
,
Final
,
Optional
,
Union
import
jinja2
import
jinja2
from
fastapi
import
Request
from
fastapi
import
Request
from
openai.types.responses
import
(
ResponseFunctionToolCall
,
from
openai.types.responses
import
(
ResponseFunctionToolCall
,
ResponseOutputMessage
,
ResponseOutputText
)
ResponseOutputItem
,
ResponseOutputMessage
,
ResponseOutputText
,
ResponseReasoningItem
)
from
openai.types.responses.response_reasoning_item
import
(
Content
as
ResponseReasoningTextContent
)
from
openai_harmony
import
Message
as
OpenAIHarmonyMessage
from
openai_harmony
import
Message
as
OpenAIHarmonyMessage
from
vllm
import
envs
from
vllm
import
envs
...
@@ -19,26 +22,28 @@ from vllm.config import ModelConfig
...
@@ -19,26 +22,28 @@ from vllm.config import ModelConfig
from
vllm.engine.protocol
import
EngineClient
from
vllm.engine.protocol
import
EngineClient
from
vllm.entrypoints.chat_utils
import
(
ChatCompletionMessageParam
,
from
vllm.entrypoints.chat_utils
import
(
ChatCompletionMessageParam
,
ChatTemplateContentFormatOption
)
ChatTemplateContentFormatOption
)
from
vllm.entrypoints.context
import
ConversationContext
,
SimpleContext
from
vllm.entrypoints.context
import
(
ConversationContext
,
HarmonyContext
,
SimpleContext
,
StreamingHarmonyContext
)
from
vllm.entrypoints.harmony_utils
import
(
from
vllm.entrypoints.harmony_utils
import
(
get_developer_message
,
get_stop_tokens_for_assistant_actions
,
get_developer_message
,
get_stop_tokens_for_assistant_actions
,
get_system_message
,
get_user_message
,
parse_
response_input
,
get_system_message
,
get_user_message
,
parse_
output_message
,
render_for_completion
)
parse_remaining_state
,
parse_response_input
,
render_for_completion
)
from
vllm.entrypoints.logger
import
RequestLogger
from
vllm.entrypoints.logger
import
RequestLogger
# yapf conflicts with isort for this block
# yapf conflicts with isort for this block
# yapf: disable
# yapf: disable
from
vllm.entrypoints.openai.protocol
import
(
ErrorResponse
,
from
vllm.entrypoints.openai.protocol
import
(
ErrorResponse
,
PromptTokenUsageInfo
,
InputTokensDetails
,
OutputTokensDetails
,
RequestResponseMetadata
,
RequestResponseMetadata
,
ResponseReasoningItem
,
ResponsesRequest
,
ResponsesRequest
,
ResponsesResponse
,
Usage
Info
)
ResponsesResponse
,
Response
Usage
)
# yapf: enable
# yapf: enable
from
vllm.entrypoints.openai.serving_engine
import
OpenAIServing
from
vllm.entrypoints.openai.serving_engine
import
OpenAIServing
from
vllm.entrypoints.openai.serving_models
import
OpenAIServingModels
from
vllm.entrypoints.openai.serving_models
import
OpenAIServingModels
from
vllm.entrypoints.tool_server
import
ToolServer
from
vllm.entrypoints.tool_server
import
ToolServer
from
vllm.inputs.data
import
TokensPrompt
as
EngineTokensPrompt
from
vllm.inputs.data
import
TokensPrompt
as
EngineTokensPrompt
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.outputs
import
CompletionOutput
from
vllm.reasoning
import
ReasoningParser
,
ReasoningParserManager
from
vllm.reasoning
import
ReasoningParser
,
ReasoningParserManager
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
...
@@ -222,6 +227,7 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -222,6 +227,7 @@ class OpenAIServingResponses(OpenAIServing):
# Schedule the request and get the result generator.
# Schedule the request and get the result generator.
generators
:
list
[
AsyncGenerator
[
ConversationContext
,
None
]]
=
[]
generators
:
list
[
AsyncGenerator
[
ConversationContext
,
None
]]
=
[]
try
:
try
:
tool_sessions
:
dict
[
str
,
Any
]
=
{}
for
i
,
engine_prompt
in
enumerate
(
engine_prompts
):
for
i
,
engine_prompt
in
enumerate
(
engine_prompts
):
default_max_tokens
=
self
.
max_model_len
-
len
(
default_max_tokens
=
self
.
max_model_len
-
len
(
engine_prompt
[
"prompt_token_ids"
])
engine_prompt
[
"prompt_token_ids"
])
...
@@ -231,6 +237,14 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -231,6 +237,14 @@ class OpenAIServingResponses(OpenAIServing):
trace_headers
=
(
None
if
raw_request
is
None
else
await
trace_headers
=
(
None
if
raw_request
is
None
else
await
self
.
_get_trace_headers
(
raw_request
.
headers
))
self
.
_get_trace_headers
(
raw_request
.
headers
))
context
:
ConversationContext
if
self
.
use_harmony
:
if
request
.
stream
:
context
=
StreamingHarmonyContext
(
messages
,
tool_sessions
)
else
:
context
=
HarmonyContext
(
messages
,
tool_sessions
)
else
:
context
=
SimpleContext
()
context
=
SimpleContext
()
generator
=
self
.
_generate_with_builtin_tools
(
generator
=
self
.
_generate_with_builtin_tools
(
request_id
=
request
.
request_id
,
request_id
=
request
.
request_id
,
...
@@ -274,6 +288,7 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -274,6 +288,7 @@ class OpenAIServingResponses(OpenAIServing):
request
,
request
,
sampling_params
,
sampling_params
,
result_generator
,
result_generator
,
context
,
model_name
,
model_name
,
tokenizer
,
tokenizer
,
request_metadata
,
request_metadata
,
...
@@ -297,6 +312,7 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -297,6 +312,7 @@ class OpenAIServingResponses(OpenAIServing):
request
,
request
,
sampling_params
,
sampling_params
,
result_generator
,
result_generator
,
context
,
model_name
,
model_name
,
tokenizer
,
tokenizer
,
request_metadata
,
request_metadata
,
...
@@ -344,6 +360,7 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -344,6 +360,7 @@ class OpenAIServingResponses(OpenAIServing):
request
:
ResponsesRequest
,
request
:
ResponsesRequest
,
sampling_params
:
SamplingParams
,
sampling_params
:
SamplingParams
,
result_generator
:
AsyncIterator
[
ConversationContext
],
result_generator
:
AsyncIterator
[
ConversationContext
],
context
:
ConversationContext
,
model_name
:
str
,
model_name
:
str
,
tokenizer
:
AnyTokenizer
,
tokenizer
:
AnyTokenizer
,
request_metadata
:
RequestResponseMetadata
,
request_metadata
:
RequestResponseMetadata
,
...
@@ -352,9 +369,8 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -352,9 +369,8 @@ class OpenAIServingResponses(OpenAIServing):
if
created_time
is
None
:
if
created_time
is
None
:
created_time
=
int
(
time
.
time
())
created_time
=
int
(
time
.
time
())
context
:
Optional
[
ConversationContext
]
=
None
try
:
try
:
async
for
context
in
result_generator
:
async
for
_
in
result_generator
:
pass
pass
except
asyncio
.
CancelledError
:
except
asyncio
.
CancelledError
:
return
self
.
create_error_response
(
"Client disconnected"
)
return
self
.
create_error_response
(
"Client disconnected"
)
...
@@ -362,64 +378,40 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -362,64 +378,40 @@ class OpenAIServingResponses(OpenAIServing):
# TODO: Use a vllm-specific Validation Error
# TODO: Use a vllm-specific Validation Error
return
self
.
create_error_response
(
str
(
e
))
return
self
.
create_error_response
(
str
(
e
))
assert
context
is
not
None
if
self
.
use_harmony
:
assert
isinstance
(
context
,
HarmonyContext
)
output
=
self
.
_make_response_output_items_with_harmony
(
context
)
# TODO: these are all 0 for now!
num_prompt_tokens
=
context
.
num_prompt_tokens
num_generated_tokens
=
context
.
num_output_tokens
num_cached_tokens
=
context
.
num_cached_tokens
num_reasoning_tokens
=
context
.
num_reasoning_tokens
else
:
assert
isinstance
(
context
,
SimpleContext
)
assert
isinstance
(
context
,
SimpleContext
)
final_res
=
context
.
last_output
final_res
=
context
.
last_output
assert
final_res
is
not
None
assert
final_res
is
not
None
assert
len
(
final_res
.
outputs
)
==
1
assert
len
(
final_res
.
outputs
)
==
1
final_output
=
final_res
.
outputs
[
0
]
final_output
=
final_res
.
outputs
[
0
]
if
self
.
reasoning_parser
:
output
=
self
.
_make_response_output_items
(
request
,
final_output
,
try
:
tokenizer
)
reasoning_parser
=
self
.
reasoning_parser
(
tokenizer
)
except
RuntimeError
as
e
:
logger
.
exception
(
"Error in reasoning parser creation."
)
return
self
.
create_error_response
(
str
(
e
))
reasoning_content
,
content
=
(
reasoning_parser
.
extract_reasoning_content
(
final_output
.
text
,
request
=
request
))
else
:
reasoning_content
=
None
content
=
final_output
.
text
output
=
[]
if
reasoning_content
:
reasoning_item
=
ResponseReasoningItem
(
text
=
reasoning_content
,
status
=
None
,
# NOTE: Only the last output item has status.
)
output
.
append
(
reasoning_item
)
if
content
:
output_text
=
ResponseOutputText
(
text
=
content
,
annotations
=
[],
# TODO
type
=
"output_text"
,
logprobs
=
None
,
# TODO
)
message
=
ResponseOutputMessage
(
id
=
f
"msg_
{
random_uuid
()
}
"
,
content
=
[
output_text
],
role
=
"assistant"
,
status
=
"completed"
,
type
=
"message"
,
)
output
.
append
(
message
)
# Calculate usage.
# Calculate usage.
assert
final_res
.
prompt_token_ids
is
not
None
assert
final_res
.
prompt_token_ids
is
not
None
num_prompt_tokens
=
len
(
final_res
.
prompt_token_ids
)
num_prompt_tokens
=
len
(
final_res
.
prompt_token_ids
)
num_generated_tokens
=
len
(
final_output
.
token_ids
)
num_generated_tokens
=
len
(
final_output
.
token_ids
)
usage
=
UsageInfo
(
num_cached_tokens
=
final_res
.
num_cached_tokens
prompt_tokens
=
num_prompt_tokens
,
num_reasoning_tokens
=
0
completion_tokens
=
num_generated_tokens
,
usage
=
ResponseUsage
(
input_tokens
=
num_prompt_tokens
,
output_tokens
=
num_generated_tokens
,
total_tokens
=
num_prompt_tokens
+
num_generated_tokens
,
total_tokens
=
num_prompt_tokens
+
num_generated_tokens
,
input_tokens_details
=
InputTokensDetails
(
cached_tokens
=
num_cached_tokens
),
output_tokens_details
=
OutputTokensDetails
(
reasoning_tokens
=
num_reasoning_tokens
),
)
)
if
self
.
enable_prompt_tokens_details
and
final_res
.
num_cached_tokens
:
usage
.
prompt_tokens_details
=
PromptTokenUsageInfo
(
cached_tokens
=
final_res
.
num_cached_tokens
)
request_metadata
.
final_usage_info
=
usage
response
=
ResponsesResponse
.
from_request
(
response
=
ResponsesResponse
.
from_request
(
request
,
request
,
sampling_params
,
sampling_params
,
...
@@ -457,6 +449,70 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -457,6 +449,70 @@ class OpenAIServingResponses(OpenAIServing):
self
.
response_store
[
response
.
id
]
=
response
self
.
response_store
[
response
.
id
]
=
response
return
response
return
response
def
_make_response_output_items
(
self
,
request
:
ResponsesRequest
,
final_output
:
CompletionOutput
,
tokenizer
:
AnyTokenizer
,
)
->
list
[
ResponseOutputItem
]:
if
self
.
reasoning_parser
:
try
:
reasoning_parser
=
self
.
reasoning_parser
(
tokenizer
)
except
RuntimeError
as
e
:
logger
.
exception
(
"Error in reasoning parser creation."
)
raise
e
reasoning_content
,
content
=
(
reasoning_parser
.
extract_reasoning_content
(
final_output
.
text
,
request
=
request
))
else
:
reasoning_content
=
None
content
=
final_output
.
text
output
=
[]
if
reasoning_content
:
reasoning_item
=
ResponseReasoningItem
(
id
=
f
"rs_
{
random_uuid
()
}
"
,
summary
=
[],
type
=
"reasoning"
,
content
=
[
ResponseReasoningTextContent
(
text
=
reasoning_content
,
type
=
"reasoning_text"
)
],
status
=
None
,
# NOTE: Only the last output item has status.
)
output
.
append
(
reasoning_item
)
if
content
:
output_text
=
ResponseOutputText
(
text
=
content
,
annotations
=
[],
# TODO
type
=
"output_text"
,
logprobs
=
None
,
# TODO
)
message
=
ResponseOutputMessage
(
id
=
f
"msg_
{
random_uuid
()
}
"
,
content
=
[
output_text
],
role
=
"assistant"
,
status
=
"completed"
,
type
=
"message"
,
)
output
.
append
(
message
)
return
output
def
_make_response_output_items_with_harmony
(
self
,
context
:
HarmonyContext
,
)
->
list
[
ResponseOutputItem
]:
output_items
=
[]
num_init_messages
=
context
.
num_init_messages
for
msg
in
context
.
messages
[
num_init_messages
:]:
output_items
.
extend
(
parse_output_message
(
msg
))
# Handle the generation stopped in the middle (if any).
last_items
=
parse_remaining_state
(
context
.
parser
)
if
last_items
:
output_items
.
extend
(
last_items
)
return
output_items
def
_construct_input_messages
(
def
_construct_input_messages
(
self
,
self
,
request
:
ResponsesRequest
,
request
:
ResponsesRequest
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment