Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
8f423e5f
Unverified
Commit
8f423e5f
authored
Sep 04, 2025
by
Kebe
Committed by
GitHub
Sep 04, 2025
Browse files
[Feature][Response API] Add streaming support for non-harmony (#23741)
Signed-off-by:
Kebe
<
mail@kebe7jun.com
>
parent
369a0795
Changes
3
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
407 additions
and
77 deletions
+407
-77
tests/v1/entrypoints/openai/responses/test_basic.py
tests/v1/entrypoints/openai/responses/test_basic.py
+16
-0
vllm/entrypoints/context.py
vllm/entrypoints/context.py
+10
-0
vllm/entrypoints/openai/serving_responses.py
vllm/entrypoints/openai/serving_responses.py
+381
-77
No files found.
tests/v1/entrypoints/openai/responses/test_basic.py
View file @
8f423e5f
...
...
@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
openai
# use the official client for correctness check
import
openai.types.responses
as
openai_responses_types
import
pytest
...
...
@@ -86,3 +87,18 @@ async def test_logprobs(client: openai.AsyncOpenAI):
outputs
=
response
.
output
assert
outputs
[
-
1
].
content
[
-
1
].
logprobs
assert
len
(
outputs
[
-
1
].
content
[
-
1
].
logprobs
[
0
].
top_logprobs
)
==
5
@pytest.mark.asyncio
async def test_streaming(client: openai.AsyncOpenAI):
    """Check the event sequence produced by a streamed Responses API call.

    The collected stream must start with a ``ResponseCreatedEvent``, contain
    at least one ``ResponseTextDeltaEvent`` and end with a
    ``ResponseCompletedEvent``.
    """
    stream = await client.responses.create(
        input="What is 13 * 24?",
        stream=True,
    )

    # Drain the stream up front so the full sequence can be inspected.
    events = []
    async for event in stream:
        events.append(event)

    assert isinstance(events[0], openai_responses_types.ResponseCreatedEvent)

    saw_text_delta = any(
        isinstance(ev, openai_responses_types.ResponseTextDeltaEvent)
        for ev in events
    )
    assert saw_text_delta

    assert isinstance(events[-1],
                      openai_responses_types.ResponseCompletedEvent)
vllm/entrypoints/context.py
View file @
8f423e5f
...
...
@@ -49,9 +49,19 @@ class SimpleContext(ConversationContext):
def __init__(self):
    """Start with no recorded output and every token counter at zero."""
    # Nothing has been appended yet.
    self.last_output = None

    # Token accounting; all counters start from zero.
    self.num_prompt_tokens = self.num_output_tokens = 0
    self.num_cached_tokens = 0

    # TODO: num_reasoning_tokens is not implemented yet; it is only
    # initialized here.
    self.num_reasoning_tokens = 0
def append_output(self, output) -> None:
    """Record ``output`` as the latest result and update the token counters.

    The prompt-token and cached-token counts are overwritten from
    ``output``; the output-token count is incremented by the number of token
    ids in ``output.outputs[0]``.

    Args:
        output: The object to record; must be a ``RequestOutput``.

    Raises:
        ValueError: If ``output`` is not a ``RequestOutput``. The context is
            left unmodified in that case.
    """
    # Validate before mutating so a rejected object is never left behind in
    # ``last_output``.
    if not isinstance(output, RequestOutput):
        raise ValueError("SimpleContext only supports RequestOutput.")

    self.last_output = output
    self.num_prompt_tokens = len(output.prompt_token_ids or [])
    self.num_cached_tokens = output.num_cached_tokens or 0
    # Accumulated rather than assigned.
    # NOTE(review): presumably each call carries only newly generated
    # tokens -- confirm against the caller.
    self.num_output_tokens += len(output.outputs[0].token_ids or [])
def need_builtin_tool_call(self) -> bool:
    """Report whether a built-in tool call is needed; always ``False`` here."""
    needs_tool_call = False
    return needs_tool_call
...
...
vllm/entrypoints/openai/serving_responses.py
View file @
8f423e5f
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment