Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
25aba2b6
Unverified
Commit
25aba2b6
authored
Sep 15, 2025
by
Andrew Xia
Committed by
GitHub
Sep 15, 2025
Browse files
[gpt-oss] Add IncompleteDetails to ResponsesResponse (#24561)
Signed-off-by:
Andrew Xia
<
axia@meta.com
>
parent
94b03f88
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
67 additions
and
25 deletions
+67
-25
tests/entrypoints/openai/test_response_api_with_harmony.py
tests/entrypoints/openai/test_response_api_with_harmony.py
+14
-0
vllm/entrypoints/context.py
vllm/entrypoints/context.py
+15
-10
vllm/entrypoints/harmony_utils.py
vllm/entrypoints/harmony_utils.py
+3
-1
vllm/entrypoints/openai/protocol.py
vllm/entrypoints/openai/protocol.py
+13
-4
vllm/entrypoints/openai/serving_responses.py
vllm/entrypoints/openai/serving_responses.py
+15
-3
vllm/v1/core/sched/utils.py
vllm/v1/core/sched/utils.py
+4
-4
vllm/v1/engine/output_processor.py
vllm/v1/engine/output_processor.py
+3
-3
No files found.
tests/entrypoints/openai/test_response_api_with_harmony.py
View file @
25aba2b6
...
...
@@ -74,6 +74,20 @@ async def test_basic_with_reasoning_effort(client: OpenAI, model_name: str):
assert
response
.
status
==
"completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_max_tokens(client: OpenAI, model_name: str):
    """Check that a response truncated by ``max_output_tokens`` is reported
    as incomplete.

    The request caps generation at 30 output tokens, and the test expects
    the response to come back with status ``"incomplete"`` and
    ``incomplete_details.reason == "max_output_tokens"``.
    """
    response = await client.responses.create(
        model=model_name,
        input="What is the first paragraph of Moby Dick?",
        reasoning={"effort": "low"},
        max_output_tokens=30,
    )
    assert response is not None
    assert response.status == "incomplete"
    # incomplete_details is an optional field; assert it is populated first
    # so a missing value fails with a clear assertion instead of an
    # AttributeError on None.
    assert response.incomplete_details is not None
    assert response.incomplete_details.reason == "max_output_tokens"
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_chat
(
client
:
OpenAI
,
model_name
:
str
):
...
...
vllm/entrypoints/context.py
View file @
25aba2b6
...
...
@@ -112,6 +112,7 @@ class HarmonyContext(ConversationContext):
available_tools
:
list
[
str
],
):
self
.
_messages
=
messages
self
.
finish_reason
:
Optional
[
str
]
=
None
self
.
available_tools
=
available_tools
self
.
_tool_sessions
:
dict
[
str
,
Union
[
ClientSession
,
Tool
]]
=
{}
self
.
called_tools
:
set
[
str
]
=
set
()
...
...
@@ -135,7 +136,8 @@ class HarmonyContext(ConversationContext):
if
self
.
parser
.
current_channel
in
{
"analysis"
,
"commentary"
}:
self
.
num_reasoning_tokens
+=
1
def
append_output
(
self
,
output
)
->
None
:
def
append_output
(
self
,
output
:
Union
[
RequestOutput
,
list
[
Message
]])
->
None
:
if
isinstance
(
output
,
RequestOutput
):
output_token_ids
=
output
.
outputs
[
0
].
token_ids
self
.
parser
=
get_streamable_parser_for_assistant
()
...
...
@@ -150,6 +152,8 @@ class HarmonyContext(ConversationContext):
# Move current turn to previous turn for next turn's calculations
self
.
previous_turn
=
self
.
current_turn
.
copy
()
output_msgs
=
self
.
parser
.
messages
# The responses finish reason is set in the last message
self
.
finish_reason
=
output
.
outputs
[
0
].
finish_reason
else
:
# Tool output.
output_msgs
=
output
...
...
@@ -157,18 +161,18 @@ class HarmonyContext(ConversationContext):
def
_update_prefill_token_usage
(
self
,
output
:
RequestOutput
)
->
None
:
"""Update token usage statistics for the prefill phase of generation.
The prefill phase processes the input prompt tokens. This method:
1. Counts the prompt tokens for this turn
2. Calculates tool output tokens for multi-turn conversations
3. Updates cached token counts
4. Tracks state for next turn calculations
Tool output tokens are calculated as:
current_prompt_tokens - last_turn_prompt_tokens -
current_prompt_tokens - last_turn_prompt_tokens -
last_turn_output_tokens
This represents tokens added between turns (typically tool responses).
Args:
output: The RequestOutput containing prompt token information
"""
...
...
@@ -214,18 +218,18 @@ class HarmonyContext(ConversationContext):
def
_update_decode_token_usage
(
self
,
output
:
RequestOutput
)
->
int
:
"""Update token usage statistics for the decode phase of generation.
The decode phase processes the generated output tokens. This method:
1. Counts output tokens from all completion outputs
2. Updates the total output token count
3. Tracks tokens generated in the current turn
In streaming mode, this is called for each token generated.
In non-streaming mode, this is called once with all output tokens.
Args:
output: The RequestOutput containing generated token information
Returns:
int: Number of output tokens processed in this call
"""
...
...
@@ -385,7 +389,8 @@ class StreamingHarmonyContext(HarmonyContext):
def
messages
(
self
)
->
list
:
return
self
.
parser
.
messages
def
append_output
(
self
,
output
)
->
None
:
def
append_output
(
self
,
output
:
Union
[
RequestOutput
,
list
[
Message
]])
->
None
:
if
isinstance
(
output
,
RequestOutput
):
# append_output is called for each output token in streaming case,
# so we only want to add the prompt tokens once for each message.
...
...
vllm/entrypoints/harmony_utils.py
View file @
25aba2b6
...
...
@@ -387,7 +387,9 @@ def parse_remaining_state(
id
=
f
"msg_
{
random_uuid
()
}
"
,
content
=
[
output_text
],
role
=
"assistant"
,
status
=
"completed"
,
# if the parser still has messages (i.e. the generator was cut off
# abruptly), this should be incomplete
status
=
"incomplete"
,
type
=
"message"
,
)
return
[
text_item
]
...
...
vllm/entrypoints/openai/protocol.py
View file @
25aba2b6
...
...
@@ -30,7 +30,7 @@ except ImportError: # For newer openai versions (>= 1.100.0)
from
openai.types.responses
import
(
ResponseFormatTextConfig
as
ResponseTextConfig
)
from
openai.types.responses.response
import
ToolChoice
from
openai.types.responses.response
import
IncompleteDetails
,
ToolChoice
from
openai.types.responses.tool
import
Tool
from
openai.types.shared
import
Metadata
,
Reasoning
from
pydantic
import
(
BaseModel
,
ConfigDict
,
Field
,
TypeAdapter
,
...
...
@@ -1868,7 +1868,7 @@ class ResponsesResponse(OpenAIBaseModel):
id
:
str
=
Field
(
default_factory
=
lambda
:
f
"resp_
{
random_uuid
()
}
"
)
created_at
:
int
=
Field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
# error: Optional[ResponseError] = None
#
incomplete_details: Optional[IncompleteDetails] = None
incomplete_details
:
Optional
[
IncompleteDetails
]
=
None
instructions
:
Optional
[
str
]
=
None
metadata
:
Optional
[
Metadata
]
=
None
model
:
str
...
...
@@ -1904,9 +1904,18 @@ class ResponsesResponse(OpenAIBaseModel):
status
:
ResponseStatus
,
usage
:
Optional
[
ResponseUsage
]
=
None
,
)
->
"ResponsesResponse"
:
incomplete_details
:
Optional
[
IncompleteDetails
]
=
None
if
status
==
'incomplete'
:
incomplete_details
=
IncompleteDetails
(
reason
=
'max_output_tokens'
)
# TODO: implement the other reason for incomplete_details,
# which is content_filter
# incomplete_details = IncompleteDetails(reason='content_filter')
return
cls
(
id
=
request
.
request_id
,
created_at
=
created_time
,
incomplete_details
=
incomplete_details
,
instructions
=
request
.
instructions
,
metadata
=
request
.
metadata
,
model
=
model_name
,
...
...
@@ -2109,7 +2118,7 @@ class DetokenizeResponse(OpenAIBaseModel):
class
TokenizerInfoResponse
(
OpenAIBaseModel
):
"""
Response containing tokenizer configuration
Response containing tokenizer configuration
equivalent to tokenizer_config.json
"""
...
...
@@ -2199,7 +2208,7 @@ class TranscriptionRequest(OpenAIBaseModel):
to_language
:
Optional
[
str
]
=
None
"""The language of the output audio we transcribe to.
Please note that this is not currently used by supported models at this
Please note that this is not currently used by supported models at this
time, but it is a placeholder for future use, matching translation api.
"""
...
...
vllm/entrypoints/openai/serving_responses.py
View file @
25aba2b6
...
...
@@ -27,7 +27,7 @@ from openai.types.responses import (ResponseCreatedEvent,
ResponseReasoningItem
,
ResponseReasoningTextDeltaEvent
,
ResponseReasoningTextDoneEvent
,
response_text_delta_event
)
ResponseStatus
,
response_text_delta_event
)
from
openai.types.responses.response_output_text
import
(
Logprob
,
LogprobTopLogprob
)
# yapf: enable
...
...
@@ -461,10 +461,22 @@ class OpenAIServingResponses(OpenAIServing):
# TODO: Use a vllm-specific Validation Error
return
self
.
create_error_response
(
str
(
e
))
# NOTE: Implementation of status is still WIP, but for now
# we guarantee that if the status is not "completed", it is accurate.
# "completed" is implemented as the "catch-all" for now.
status
:
ResponseStatus
=
"completed"
if
self
.
use_harmony
:
assert
isinstance
(
context
,
HarmonyContext
)
output
=
self
.
_make_response_output_items_with_harmony
(
context
)
num_tool_output_tokens
=
context
.
num_tool_output_tokens
if
len
(
output
)
>
0
:
if
context
.
finish_reason
==
"length"
:
status
=
"incomplete"
elif
context
.
finish_reason
==
"abort"
:
status
=
"cancelled"
else
:
status
=
"incomplete"
else
:
assert
isinstance
(
context
,
SimpleContext
)
final_res
=
context
.
last_output
...
...
@@ -501,7 +513,7 @@ class OpenAIServingResponses(OpenAIServing):
model_name
=
model_name
,
created_time
=
created_time
,
output
=
output
,
status
=
"completed"
,
status
=
status
,
usage
=
usage
,
)
...
...
@@ -658,7 +670,7 @@ class OpenAIServingResponses(OpenAIServing):
self
,
context
:
HarmonyContext
,
)
->
list
[
ResponseOutputItem
]:
output_items
=
[]
output_items
:
list
[
ResponseOutputItem
]
=
[]
num_init_messages
=
context
.
num_init_messages
for
msg
in
context
.
messages
[
num_init_messages
:]:
output_items
.
extend
(
parse_output_message
(
msg
))
...
...
vllm/v1/core/sched/utils.py
View file @
25aba2b6
...
...
@@ -10,19 +10,19 @@ from vllm.v1.request import Request, RequestStatus
def
remove_all
(
lst
:
list
,
items_to_remove
:
set
)
->
list
:
"""Remove all items from a list that are in the items_to_remove set.
This method optimizes for the common case of removing a single item,
falling back to list comprehension for multiple items.
Args:
lst: The list to remove items from
items_to_remove: Set of items to remove
Returns:
Either the modified original list (for single item removal) or
a new list (for multiple item removal). Callers should use the
returned value.
Note:
For single item removal, this modifies the original list in-place
and returns it. For multiple items, it creates and returns a new list.
...
...
vllm/v1/engine/output_processor.py
View file @
25aba2b6
...
...
@@ -373,17 +373,17 @@ class OutputProcessor:
1) Compute stats for logging
2) Detokenize
3) Create and handle RequestOutput objects:
* If there is a queue (for usage with AsyncLLM),
* If there is a queue (for usage with AsyncLLM),
put the RequestOutput objects into the queue for
handling by the per-request generate() tasks.
* If there is no queue (for usage with LLMEngine),
* If there is no queue (for usage with LLMEngine),
return a list of RequestOutput objects.
NOTE FOR DEVELOPERS
vLLM V1 minimizes the number of python loops over the full
batch to ensure system overheads are minimized. This is the
batch to ensure system overheads are minimized. This is the
only function that should loop over EngineCoreOutputs.
If you need to touch every element of the batch, do it from
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment