Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
df850c49
Unverified
Commit
df850c49
authored
Oct 14, 2025
by
Chauncey
Committed by
GitHub
Oct 14, 2025
Browse files
[Feature][Responses API] Stream Function Call - harmony (#24317)
Signed-off-by:
chaunceyjiang
<
chaunceyjiang@gmail.com
>
parent
720394de
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
213 additions
and
70 deletions
+213
-70
tests/entrypoints/openai/test_response_api_with_harmony.py
tests/entrypoints/openai/test_response_api_with_harmony.py
+136
-66
vllm/entrypoints/openai/serving_responses.py
vllm/entrypoints/openai/serving_responses.py
+77
-4
No files found.
tests/entrypoints/openai/test_response_api_with_harmony.py
View file @
df850c49
...
@@ -16,6 +16,22 @@ from ...utils import RemoteOpenAIServer
...
@@ -16,6 +16,22 @@ from ...utils import RemoteOpenAIServer
MODEL_NAME
=
"openai/gpt-oss-20b"
MODEL_NAME
=
"openai/gpt-oss-20b"
GET_WEATHER_SCHEMA
=
{
"type"
:
"function"
,
"name"
:
"get_weather"
,
"description"
:
"Get current temperature for provided coordinates in celsius."
,
# noqa
"parameters"
:
{
"type"
:
"object"
,
"properties"
:
{
"latitude"
:
{
"type"
:
"number"
},
"longitude"
:
{
"type"
:
"number"
},
},
"required"
:
[
"latitude"
,
"longitude"
],
"additionalProperties"
:
False
,
},
"strict"
:
True
,
}
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
def
server
():
...
@@ -305,6 +321,54 @@ async def test_streaming_types(client: OpenAI, model_name: str):
...
@@ -305,6 +321,54 @@ async def test_streaming_types(client: OpenAI, model_name: str):
assert
len
(
stack_of_event_types
)
==
0
assert
len
(
stack_of_event_types
)
==
0
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_function_calling_with_streaming_types
(
client
:
OpenAI
,
model_name
:
str
):
# this links the "done" type with the "start" type
# so every "done" type should have a corresponding "start" type
# and every open block should be closed by the end of the stream
pairs_of_event_types
=
{
"response.completed"
:
"response.created"
,
"response.output_item.done"
:
"response.output_item.added"
,
"response.output_text.done"
:
"response.output_text.delta"
,
"response.reasoning_text.done"
:
"response.reasoning_text.delta"
,
"response.reasoning_part.done"
:
"response.reasoning_part.added"
,
"response.function_call_arguments.done"
:
"response.function_call_arguments.delta"
,
# noqa
}
tools
=
[
GET_WEATHER_SCHEMA
]
input_list
=
[
{
"role"
:
"user"
,
"content"
:
"What's the weather like in Paris today?"
,
}
]
stream_response
=
await
client
.
responses
.
create
(
model
=
model_name
,
input
=
input_list
,
tools
=
tools
,
stream
=
True
,
)
stack_of_event_types
=
[]
async
for
event
in
stream_response
:
if
event
.
type
==
"response.created"
:
stack_of_event_types
.
append
(
event
.
type
)
elif
event
.
type
==
"response.completed"
:
assert
stack_of_event_types
[
-
1
]
==
pairs_of_event_types
[
event
.
type
]
stack_of_event_types
.
pop
()
if
event
.
type
.
endswith
(
"added"
):
stack_of_event_types
.
append
(
event
.
type
)
elif
event
.
type
.
endswith
(
"delta"
):
if
stack_of_event_types
[
-
1
]
==
event
.
type
:
continue
stack_of_event_types
.
append
(
event
.
type
)
elif
event
.
type
.
endswith
(
"done"
):
assert
stack_of_event_types
[
-
1
]
==
pairs_of_event_types
[
event
.
type
]
stack_of_event_types
.
pop
()
assert
len
(
stack_of_event_types
)
==
0
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"background"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"background"
,
[
True
,
False
])
...
@@ -483,23 +547,7 @@ def call_function(name, args):
...
@@ -483,23 +547,7 @@ def call_function(name, args):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_function_calling
(
client
:
OpenAI
,
model_name
:
str
):
async
def
test_function_calling
(
client
:
OpenAI
,
model_name
:
str
):
tools
=
[
tools
=
[
GET_WEATHER_SCHEMA
]
{
"type"
:
"function"
,
"name"
:
"get_weather"
,
"description"
:
"Get current temperature for provided coordinates in celsius."
,
# noqa
"parameters"
:
{
"type"
:
"object"
,
"properties"
:
{
"latitude"
:
{
"type"
:
"number"
},
"longitude"
:
{
"type"
:
"number"
},
},
"required"
:
[
"latitude"
,
"longitude"
],
"additionalProperties"
:
False
,
},
"strict"
:
True
,
}
]
response
=
await
client
.
responses
.
create
(
response
=
await
client
.
responses
.
create
(
model
=
model_name
,
model
=
model_name
,
...
@@ -565,21 +613,7 @@ async def test_function_calling_multi_turn(client: OpenAI, model_name: str):
...
@@ -565,21 +613,7 @@ async def test_function_calling_multi_turn(client: OpenAI, model_name: str):
},
},
"strict"
:
True
,
"strict"
:
True
,
},
},
{
GET_WEATHER_SCHEMA
,
"type"
:
"function"
,
"name"
:
"get_weather"
,
"description"
:
"Get current temperature for provided coordinates in celsius."
,
# noqa
"parameters"
:
{
"type"
:
"object"
,
"properties"
:
{
"latitude"
:
{
"type"
:
"number"
},
"longitude"
:
{
"type"
:
"number"
},
},
"required"
:
[
"latitude"
,
"longitude"
],
"additionalProperties"
:
False
,
},
"strict"
:
True
,
},
]
]
response
=
await
client
.
responses
.
create
(
response
=
await
client
.
responses
.
create
(
...
@@ -643,23 +677,7 @@ async def test_function_calling_multi_turn(client: OpenAI, model_name: str):
...
@@ -643,23 +677,7 @@ async def test_function_calling_multi_turn(client: OpenAI, model_name: str):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_function_calling_required
(
client
:
OpenAI
,
model_name
:
str
):
async
def
test_function_calling_required
(
client
:
OpenAI
,
model_name
:
str
):
tools
=
[
tools
=
[
GET_WEATHER_SCHEMA
]
{
"type"
:
"function"
,
"name"
:
"get_weather"
,
"description"
:
"Get current temperature for provided coordinates in celsius."
,
# noqa
"parameters"
:
{
"type"
:
"object"
,
"properties"
:
{
"latitude"
:
{
"type"
:
"number"
},
"longitude"
:
{
"type"
:
"number"
},
},
"required"
:
[
"latitude"
,
"longitude"
],
"additionalProperties"
:
False
,
},
"strict"
:
True
,
}
]
with
pytest
.
raises
(
BadRequestError
):
with
pytest
.
raises
(
BadRequestError
):
await
client
.
responses
.
create
(
await
client
.
responses
.
create
(
...
@@ -689,23 +707,7 @@ async def test_system_message_with_tools(client: OpenAI, model_name: str):
...
@@ -689,23 +707,7 @@ async def test_system_message_with_tools(client: OpenAI, model_name: str):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_function_calling_full_history
(
client
:
OpenAI
,
model_name
:
str
):
async
def
test_function_calling_full_history
(
client
:
OpenAI
,
model_name
:
str
):
tools
=
[
tools
=
[
GET_WEATHER_SCHEMA
]
{
"type"
:
"function"
,
"name"
:
"get_weather"
,
"description"
:
"Get current temperature for provided coordinates in celsius."
,
# noqa
"parameters"
:
{
"type"
:
"object"
,
"properties"
:
{
"latitude"
:
{
"type"
:
"number"
},
"longitude"
:
{
"type"
:
"number"
},
},
"required"
:
[
"latitude"
,
"longitude"
],
"additionalProperties"
:
False
,
},
"strict"
:
True
,
}
]
input_messages
=
[
input_messages
=
[
{
"role"
:
"user"
,
"content"
:
"What's the weather like in Paris today?"
}
{
"role"
:
"user"
,
"content"
:
"What's the weather like in Paris today?"
}
...
@@ -745,6 +747,74 @@ async def test_function_calling_full_history(client: OpenAI, model_name: str):
...
@@ -745,6 +747,74 @@ async def test_function_calling_full_history(client: OpenAI, model_name: str):
assert
response_2
.
output_text
is
not
None
assert
response_2
.
output_text
is
not
None
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_function_calling_with_stream
(
client
:
OpenAI
,
model_name
:
str
):
tools
=
[
GET_WEATHER_SCHEMA
]
input_list
=
[
{
"role"
:
"user"
,
"content"
:
"What's the weather like in Paris today?"
,
}
]
stream_response
=
await
client
.
responses
.
create
(
model
=
model_name
,
input
=
input_list
,
tools
=
tools
,
stream
=
True
,
)
assert
stream_response
is
not
None
final_tool_calls
=
{}
final_tool_calls_named
=
{}
async
for
event
in
stream_response
:
if
event
.
type
==
"response.output_item.added"
:
if
event
.
item
.
type
!=
"function_call"
:
continue
final_tool_calls
[
event
.
output_index
]
=
event
.
item
final_tool_calls_named
[
event
.
item
.
name
]
=
event
.
item
elif
event
.
type
==
"response.function_call_arguments.delta"
:
index
=
event
.
output_index
tool_call
=
final_tool_calls
[
index
]
if
tool_call
:
tool_call
.
arguments
+=
event
.
delta
final_tool_calls_named
[
tool_call
.
name
]
=
tool_call
elif
event
.
type
==
"response.function_call_arguments.done"
:
assert
event
.
arguments
==
final_tool_calls_named
[
event
.
name
].
arguments
for
tool_call
in
final_tool_calls
.
values
():
if
(
tool_call
and
tool_call
.
type
==
"function_call"
and
tool_call
.
name
==
"get_weather"
):
args
=
json
.
loads
(
tool_call
.
arguments
)
result
=
call_function
(
tool_call
.
name
,
args
)
input_list
+=
[
tool_call
]
break
assert
result
is
not
None
response
=
await
client
.
responses
.
create
(
model
=
model_name
,
input
=
input_list
+
[
{
"type"
:
"function_call_output"
,
"call_id"
:
tool_call
.
call_id
,
"output"
:
str
(
result
),
}
],
tools
=
tools
,
stream
=
True
,
)
assert
response
is
not
None
async
for
event
in
response
:
# check that no function call events in the stream
assert
event
.
type
!=
"response.function_call_arguments.delta"
assert
event
.
type
!=
"response.function_call_arguments.done"
# check that the response contains output text
if
event
.
type
==
"response.completed"
:
assert
len
(
event
.
response
.
output
)
>
0
assert
event
.
response
.
output_text
is
not
None
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_output_messages_enabled
(
client
:
OpenAI
,
model_name
:
str
,
server
):
async
def
test_output_messages_enabled
(
client
:
OpenAI
,
model_name
:
str
,
server
):
...
...
vllm/entrypoints/openai/serving_responses.py
View file @
df850c49
...
@@ -23,6 +23,8 @@ from openai.types.responses import (
...
@@ -23,6 +23,8 @@ from openai.types.responses import (
ResponseCodeInterpreterToolCallParam
,
ResponseCodeInterpreterToolCallParam
,
ResponseContentPartAddedEvent
,
ResponseContentPartAddedEvent
,
ResponseContentPartDoneEvent
,
ResponseContentPartDoneEvent
,
ResponseFunctionCallArgumentsDeltaEvent
,
ResponseFunctionCallArgumentsDoneEvent
,
ResponseFunctionToolCall
,
ResponseFunctionToolCall
,
ResponseFunctionWebSearch
,
ResponseFunctionWebSearch
,
ResponseOutputItem
,
ResponseOutputItem
,
...
@@ -927,6 +929,11 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -927,6 +929,11 @@ class OpenAIServingResponses(OpenAIServing):
# to add the tool call request to prev_outputs so that the
# to add the tool call request to prev_outputs so that the
# parse_response_input can find the tool call request when
# parse_response_input can find the tool call request when
# parsing the tool call output.
# parsing the tool call output.
if
(
isinstance
(
response_msg
,
dict
)
and
response_msg
.
get
(
"type"
)
==
"function_call"
):
response_msg
=
ResponseFunctionToolCall
.
model_validate
(
response_msg
)
if
isinstance
(
response_msg
,
ResponseFunctionToolCall
):
if
isinstance
(
response_msg
,
ResponseFunctionToolCall
):
prev_outputs
.
append
(
response_msg
)
prev_outputs
.
append
(
response_msg
)
return
messages
return
messages
...
@@ -1398,19 +1405,48 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -1398,19 +1405,48 @@ class OpenAIServingResponses(OpenAIServing):
current_output_index
=
0
current_output_index
=
0
current_item_id
:
str
=
""
current_item_id
:
str
=
""
sent_output_item_added
=
False
sent_output_item_added
=
False
is_first_function_call_delta
=
False
async
for
ctx
in
result_generator
:
async
for
ctx
in
result_generator
:
assert
isinstance
(
ctx
,
StreamingHarmonyContext
)
assert
isinstance
(
ctx
,
StreamingHarmonyContext
)
if
ctx
.
is_expecting_start
():
if
ctx
.
is_expecting_start
():
current_output_index
+=
1
current_output_index
+=
1
sent_output_item_added
=
False
sent_output_item_added
=
False
is_first_function_call_delta
=
False
if
len
(
ctx
.
parser
.
messages
)
>
0
:
if
len
(
ctx
.
parser
.
messages
)
>
0
:
previous_item
=
ctx
.
parser
.
messages
[
-
1
]
previous_item
=
ctx
.
parser
.
messages
[
-
1
]
if
previous_item
.
recipient
is
not
None
:
if
previous_item
.
recipient
is
not
None
:
# Deal with tool call here
# Deal with tool call
pass
if
previous_item
.
recipient
.
startswith
(
"functions."
):
function_name
=
previous_item
.
recipient
[
len
(
"functions."
)
:]
yield
_increment_sequence_number_and_return
(
ResponseFunctionCallArgumentsDoneEvent
(
type
=
"response.function_call_arguments.done"
,
arguments
=
previous_item
.
content
[
0
].
text
,
name
=
function_name
,
item_id
=
current_item_id
,
output_index
=
current_output_index
,
sequence_number
=-
1
,
)
)
function_call_item
=
ResponseFunctionToolCall
(
type
=
"function_call"
,
arguments
=
previous_item
.
content
[
0
].
text
,
name
=
function_name
,
item_id
=
current_item_id
,
output_index
=
current_output_index
,
sequence_number
=-
1
,
call_id
=
f
"fc_
{
random_uuid
()
}
"
,
status
=
"completed"
,
)
yield
_increment_sequence_number_and_return
(
ResponseOutputItemDoneEvent
(
type
=
"response.output_item.done"
,
sequence_number
=-
1
,
output_index
=
current_output_index
,
item
=
function_call_item
,
)
)
elif
previous_item
.
channel
==
"analysis"
:
elif
previous_item
.
channel
==
"analysis"
:
content
=
ResponseReasoningTextContent
(
content
=
ResponseReasoningTextContent
(
text
=
previous_item
.
content
[
0
].
text
,
text
=
previous_item
.
content
[
0
].
text
,
...
@@ -1766,6 +1802,43 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -1766,6 +1802,43 @@ class OpenAIServingResponses(OpenAIServing):
),
),
)
)
)
)
# developer tools will be triggered on the commentary channel
# and recipient starts with "functions.TOOL_NAME"
if
(
ctx
.
parser
.
current_channel
==
"commentary"
and
ctx
.
parser
.
current_recipient
and
ctx
.
parser
.
current_recipient
.
startswith
(
"functions."
)
):
if
is_first_function_call_delta
is
False
:
is_first_function_call_delta
=
True
fc_name
=
ctx
.
parser
.
current_recipient
[
len
(
"functions."
)
:]
tool_call_item
=
ResponseFunctionToolCall
(
name
=
fc_name
,
type
=
"function_call"
,
id
=
current_item_id
,
call_id
=
f
"call_
{
random_uuid
()
}
"
,
arguments
=
""
,
status
=
"in_progress"
,
)
current_item_id
=
f
"fc_
{
random_uuid
()
}
"
yield
_increment_sequence_number_and_return
(
ResponseOutputItemAddedEvent
(
type
=
"response.output_item.added"
,
sequence_number
=-
1
,
output_index
=
current_output_index
,
item
=
tool_call_item
,
)
)
else
:
yield
_increment_sequence_number_and_return
(
ResponseFunctionCallArgumentsDeltaEvent
(
item_id
=
current_item_id
,
delta
=
ctx
.
parser
.
last_content_delta
,
output_index
=
current_output_index
,
sequence_number
=-
1
,
type
=
"response.function_call_arguments.delta"
,
)
)
async
def
responses_stream_generator
(
async
def
responses_stream_generator
(
self
,
self
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment