Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
25aba2b6
Unverified
Commit
25aba2b6
authored
Sep 15, 2025
by
Andrew Xia
Committed by
GitHub
Sep 15, 2025
Browse files
[gpt-oss] Add IncompleteDetails to ResponsesResponse (#24561)
Signed-off-by:
Andrew Xia
<
axia@meta.com
>
parent
94b03f88
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
60 additions
and
18 deletions
+60
-18
tests/entrypoints/openai/test_response_api_with_harmony.py
tests/entrypoints/openai/test_response_api_with_harmony.py
+14
-0
vllm/entrypoints/context.py
vllm/entrypoints/context.py
+15
-10
vllm/entrypoints/harmony_utils.py
vllm/entrypoints/harmony_utils.py
+3
-1
vllm/entrypoints/openai/protocol.py
vllm/entrypoints/openai/protocol.py
+13
-4
vllm/entrypoints/openai/serving_responses.py
vllm/entrypoints/openai/serving_responses.py
+15
-3
No files found.
tests/entrypoints/openai/test_response_api_with_harmony.py
View file @
25aba2b6
...
...
@@ -74,6 +74,20 @@ async def test_basic_with_reasoning_effort(client: OpenAI, model_name: str):
assert
response
.
status
==
"completed"
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_max_tokens
(
client
:
OpenAI
,
model_name
:
str
):
response
=
await
client
.
responses
.
create
(
model
=
model_name
,
input
=
"What is the first paragraph of Moby Dick?"
,
reasoning
=
{
"effort"
:
"low"
},
max_output_tokens
=
30
,
)
assert
response
is
not
None
assert
response
.
status
==
"incomplete"
assert
response
.
incomplete_details
.
reason
==
"max_output_tokens"
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_chat
(
client
:
OpenAI
,
model_name
:
str
):
...
...
vllm/entrypoints/context.py
View file @
25aba2b6
...
...
@@ -112,6 +112,7 @@ class HarmonyContext(ConversationContext):
available_tools
:
list
[
str
],
):
self
.
_messages
=
messages
self
.
finish_reason
:
Optional
[
str
]
=
None
self
.
available_tools
=
available_tools
self
.
_tool_sessions
:
dict
[
str
,
Union
[
ClientSession
,
Tool
]]
=
{}
self
.
called_tools
:
set
[
str
]
=
set
()
...
...
@@ -135,7 +136,8 @@ class HarmonyContext(ConversationContext):
if
self
.
parser
.
current_channel
in
{
"analysis"
,
"commentary"
}:
self
.
num_reasoning_tokens
+=
1
def
append_output
(
self
,
output
)
->
None
:
def
append_output
(
self
,
output
:
Union
[
RequestOutput
,
list
[
Message
]])
->
None
:
if
isinstance
(
output
,
RequestOutput
):
output_token_ids
=
output
.
outputs
[
0
].
token_ids
self
.
parser
=
get_streamable_parser_for_assistant
()
...
...
@@ -150,6 +152,8 @@ class HarmonyContext(ConversationContext):
# Move current turn to previous turn for next turn's calculations
self
.
previous_turn
=
self
.
current_turn
.
copy
()
output_msgs
=
self
.
parser
.
messages
# The responses finish reason is set in the last message
self
.
finish_reason
=
output
.
outputs
[
0
].
finish_reason
else
:
# Tool output.
output_msgs
=
output
...
...
@@ -385,7 +389,8 @@ class StreamingHarmonyContext(HarmonyContext):
def
messages
(
self
)
->
list
:
return
self
.
parser
.
messages
def
append_output
(
self
,
output
)
->
None
:
def
append_output
(
self
,
output
:
Union
[
RequestOutput
,
list
[
Message
]])
->
None
:
if
isinstance
(
output
,
RequestOutput
):
# append_output is called for each output token in streaming case,
# so we only want to add the prompt tokens once for each message.
...
...
vllm/entrypoints/harmony_utils.py
View file @
25aba2b6
...
...
@@ -387,7 +387,9 @@ def parse_remaining_state(
id
=
f
"msg_
{
random_uuid
()
}
"
,
content
=
[
output_text
],
role
=
"assistant"
,
status
=
"completed"
,
# if the parser still has messages (i.e., the generator was cut off
# abruptly), this should be incomplete
status
=
"incomplete"
,
type
=
"message"
,
)
return
[
text_item
]
...
...
vllm/entrypoints/openai/protocol.py
View file @
25aba2b6
...
...
@@ -30,7 +30,7 @@ except ImportError: # For newer openai versions (>= 1.100.0)
from
openai.types.responses
import
(
ResponseFormatTextConfig
as
ResponseTextConfig
)
from
openai.types.responses.response
import
ToolChoice
from
openai.types.responses.response
import
IncompleteDetails
,
ToolChoice
from
openai.types.responses.tool
import
Tool
from
openai.types.shared
import
Metadata
,
Reasoning
from
pydantic
import
(
BaseModel
,
ConfigDict
,
Field
,
TypeAdapter
,
...
...
@@ -1868,7 +1868,7 @@ class ResponsesResponse(OpenAIBaseModel):
id
:
str
=
Field
(
default_factory
=
lambda
:
f
"resp_
{
random_uuid
()
}
"
)
created_at
:
int
=
Field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
# error: Optional[ResponseError] = None
#
incomplete_details: Optional[IncompleteDetails] = None
incomplete_details
:
Optional
[
IncompleteDetails
]
=
None
instructions
:
Optional
[
str
]
=
None
metadata
:
Optional
[
Metadata
]
=
None
model
:
str
...
...
@@ -1904,9 +1904,18 @@ class ResponsesResponse(OpenAIBaseModel):
status
:
ResponseStatus
,
usage
:
Optional
[
ResponseUsage
]
=
None
,
)
->
"ResponsesResponse"
:
incomplete_details
:
Optional
[
IncompleteDetails
]
=
None
if
status
==
'incomplete'
:
incomplete_details
=
IncompleteDetails
(
reason
=
'max_output_tokens'
)
# TODO: implement the other reason for incomplete_details,
# which is content_filter
# incomplete_details = IncompleteDetails(reason='content_filter')
return
cls
(
id
=
request
.
request_id
,
created_at
=
created_time
,
incomplete_details
=
incomplete_details
,
instructions
=
request
.
instructions
,
metadata
=
request
.
metadata
,
model
=
model_name
,
...
...
vllm/entrypoints/openai/serving_responses.py
View file @
25aba2b6
...
...
@@ -27,7 +27,7 @@ from openai.types.responses import (ResponseCreatedEvent,
ResponseReasoningItem
,
ResponseReasoningTextDeltaEvent
,
ResponseReasoningTextDoneEvent
,
response_text_delta_event
)
ResponseStatus
,
response_text_delta_event
)
from
openai.types.responses.response_output_text
import
(
Logprob
,
LogprobTopLogprob
)
# yapf: enable
...
...
@@ -461,10 +461,22 @@ class OpenAIServingResponses(OpenAIServing):
# TODO: Use a vllm-specific Validation Error
return
self
.
create_error_response
(
str
(
e
))
# NOTE: Implementation of status is still WIP, but for now
# we guarantee that if the status is not "completed", it is accurate.
# "completed" is implemented as the "catch-all" for now.
status
:
ResponseStatus
=
"completed"
if
self
.
use_harmony
:
assert
isinstance
(
context
,
HarmonyContext
)
output
=
self
.
_make_response_output_items_with_harmony
(
context
)
num_tool_output_tokens
=
context
.
num_tool_output_tokens
if
len
(
output
)
>
0
:
if
context
.
finish_reason
==
"length"
:
status
=
"incomplete"
elif
context
.
finish_reason
==
"abort"
:
status
=
"cancelled"
else
:
status
=
"incomplete"
else
:
assert
isinstance
(
context
,
SimpleContext
)
final_res
=
context
.
last_output
...
...
@@ -501,7 +513,7 @@ class OpenAIServingResponses(OpenAIServing):
model_name
=
model_name
,
created_time
=
created_time
,
output
=
output
,
status
=
"completed"
,
status
=
status
,
usage
=
usage
,
)
...
...
@@ -658,7 +670,7 @@ class OpenAIServingResponses(OpenAIServing):
self
,
context
:
HarmonyContext
,
)
->
list
[
ResponseOutputItem
]:
output_items
=
[]
output_items
:
list
[
ResponseOutputItem
]
=
[]
num_init_messages
=
context
.
num_init_messages
for
msg
in
context
.
messages
[
num_init_messages
:]:
output_items
.
extend
(
parse_output_message
(
msg
))
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment