Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1823a00d
Unverified
Commit
1823a00d
authored
Sep 08, 2025
by
Ming Yang
Committed by
GitHub
Sep 08, 2025
Browse files
[Misc] Support bench serve long context (#24373)
Signed-off-by:
Ming Yang
<
minos.future@gmail.com
>
parent
ed16d0f2
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
167 additions
and
84 deletions
+167
-84
tests/benchmarks/test_serve_cli.py
tests/benchmarks/test_serve_cli.py
+31
-0
vllm/benchmarks/lib/endpoint_request_func.py
vllm/benchmarks/lib/endpoint_request_func.py
+136
-84
No files found.
tests/benchmarks/test_serve_cli.py
View file @
1823a00d
...
...
@@ -45,3 +45,34 @@ def test_bench_serve(server):
print
(
result
.
stderr
)
assert
result
.
returncode
==
0
,
f
"Benchmark failed:
{
result
.
stderr
}
"
@pytest.mark.benchmark
def test_bench_serve_chat(server):
    """Smoke-test `vllm bench serve` against the chat completions endpoint.

    Launches the benchmark CLI as a subprocess pointed at the `server`
    fixture, using a tiny random dataset (5 prompts, 32 input tokens,
    4 output tokens) with the `openai-chat` endpoint type, and requires the
    process to exit with status 0.
    """
    cli_args = [
        "vllm", "bench", "serve",
        "--model", MODEL_NAME,
        "--host", server.host,
        "--port", str(server.port),
        "--dataset-name", "random",
        "--random-input-len", "32",
        "--random-output-len", "4",
        "--num-prompts", "5",
        "--endpoint", "/v1/chat/completions",
        "--endpoint-type", "openai-chat",
    ]

    completed = subprocess.run(cli_args, capture_output=True, text=True)

    # Echo both streams so a failing run is diagnosable from the pytest log.
    for captured in (completed.stdout, completed.stderr):
        print(captured)

    assert completed.returncode == 0, f"Benchmark failed: {completed.stderr}"
vllm/benchmarks/lib/endpoint_request_func.py
View file @
1823a00d
...
...
@@ -17,6 +17,47 @@ from tqdm.asyncio import tqdm
AIOHTTP_TIMEOUT
=
aiohttp
.
ClientTimeout
(
total
=
6
*
60
*
60
)
class StreamedResponseHandler:
    """Handles streaming HTTP responses by accumulating chunks until complete
    messages are available.

    Raw byte chunks are fed in through `add_chunk`; the handler keeps any
    incomplete tail in `self.buffer` (a `str`) and hands back only whole
    server-sent-event messages.
    """

    def __init__(self):
        # Function-scope import keeps this class self-contained.
        import codecs

        # Text received so far that does not yet form a complete message.
        self.buffer = ""
        # Stateful decoder: a multi-byte UTF-8 character may be split across
        # two network chunks, and decoding each chunk on its own would raise
        # UnicodeDecodeError. The incremental decoder holds back the partial
        # byte sequence until the rest of the character arrives.
        self._decoder = codecs.getincrementaldecoder("utf-8")()

    def add_chunk(self, chunk_bytes: bytes) -> list[str]:
        """Add a chunk of bytes to the buffer and return any complete
        messages.

        Args:
            chunk_bytes: The next raw bytes of the response body. Chunk
                boundaries may fall anywhere, including inside a multi-byte
                UTF-8 character.

        Returns:
            The complete messages now available, in arrival order, each
            stripped of surrounding whitespace (any ``data: `` prefix is
            kept). The list is empty when no message has been completed yet.

        Raises:
            UnicodeDecodeError: If the stream contains bytes that are not
                valid UTF-8.
        """
        self.buffer += self._decoder.decode(chunk_bytes)

        # Split by double newlines (SSE message separator). Every part except
        # the last is terminated and therefore complete; the last part is the
        # (possibly empty) unfinished tail and stays in the buffer.
        *finished, self.buffer = self.buffer.split("\n\n")
        messages = [text for part in finished if (text := part.strip())]

        # The tail may still be a complete message whose terminating blank
        # line is missing from the chunk. Treat it as complete when, after
        # removing the "data: " prefix, it is "[DONE]" or parses as JSON.
        if self.buffer.startswith("data: "):
            message_content = self.buffer.removeprefix("data: ").strip()
            if message_content == "[DONE]":
                messages.append(self.buffer.strip())
                self.buffer = ""
            elif message_content:
                try:
                    json.loads(message_content)
                except json.JSONDecodeError:
                    # Incomplete JSON, wait for more chunks.
                    pass
                else:
                    messages.append(self.buffer.strip())
                    self.buffer = ""

        return messages
@
dataclass
class
RequestFuncInput
:
"""The input for the request function."""
...
...
@@ -102,18 +143,22 @@ async def async_request_openai_completions(
headers
=
headers
)
as
response
:
if
response
.
status
==
200
:
first_chunk_received
=
False
async
for
chunk_bytes
in
response
.
content
:
handler
=
StreamedResponseHandler
()
async
for
chunk_bytes
in
response
.
content
.
iter_any
():
chunk_bytes
=
chunk_bytes
.
strip
()
if
not
chunk_bytes
:
continue
chunk_bytes
=
chunk_bytes
.
decode
(
"utf-8"
)
messages
=
handler
.
add_chunk
(
chunk_bytes
)
for
message
in
messages
:
# NOTE: SSE comments (often used as pings) start with
# a colon. These are not JSON data payload and should
# be skipped.
if
chunk_bytes
.
startswith
(
":"
):
if
message
.
startswith
(
":"
):
continue
chunk
=
chunk_bytes
.
removeprefix
(
"data: "
)
chunk
=
message
.
removeprefix
(
"data: "
)
if
chunk
!=
"[DONE]"
:
data
=
json
.
loads
(
chunk
)
...
...
@@ -227,18 +272,21 @@ async def async_request_openai_chat_completions(
async
with
session
.
post
(
url
=
api_url
,
json
=
payload
,
headers
=
headers
)
as
response
:
if
response
.
status
==
200
:
async
for
chunk_bytes
in
response
.
content
:
handler
=
StreamedResponseHandler
()
async
for
chunk_bytes
in
response
.
content
.
iter_any
():
chunk_bytes
=
chunk_bytes
.
strip
()
if
not
chunk_bytes
:
continue
chunk_bytes
=
chunk_bytes
.
decode
(
"utf-8"
)
messages
=
handler
.
add_chunk
(
chunk_bytes
)
for
message
in
messages
:
# NOTE: SSE comments (often used as pings) start with
# a colon. These are not JSON data payload and should
# be skipped.
if
chunk_bytes
.
startswith
(
":"
):
if
message
.
startswith
(
":"
):
continue
chunk
=
chunk_bytes
.
removeprefix
(
"data: "
)
chunk
=
message
.
removeprefix
(
"data: "
)
if
chunk
!=
"[DONE]"
:
timestamp
=
time
.
perf_counter
()
...
...
@@ -347,12 +395,16 @@ async def async_request_openai_audio(
data
=
form
,
headers
=
headers
)
as
response
:
if
response
.
status
==
200
:
async
for
chunk_bytes
in
response
.
content
:
handler
=
StreamedResponseHandler
()
async
for
chunk_bytes
in
response
.
content
.
iter_any
():
chunk_bytes
=
chunk_bytes
.
strip
()
if
not
chunk_bytes
:
continue
chunk
=
chunk_bytes
.
decode
(
"utf-8"
).
removeprefix
(
messages
=
handler
.
add_chunk
(
chunk_bytes
)
for
message
in
messages
:
chunk
=
message
.
decode
(
"utf-8"
).
removeprefix
(
"data: "
)
if
chunk
!=
"[DONE]"
:
timestamp
=
time
.
perf_counter
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment