Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
3380543b
Unverified
Commit
3380543b
authored
Nov 11, 2025
by
Ido Segev
Committed by
GitHub
Nov 11, 2025
Browse files
Add request timeout override for multi-turn benchmarks (#28386)
Signed-off-by:
Ido Segev
<
idos@pliops.com
>
parent
afffd3cc
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
35 additions
and
5 deletions
+35
-5
benchmarks/multi_turn/benchmark_serving_multi_turn.py
benchmarks/multi_turn/benchmark_serving_multi_turn.py
+35
-5
No files found.
benchmarks/multi_turn/benchmark_serving_multi_turn.py
View file @
3380543b
...
...
@@ -63,6 +63,7 @@ class RequestArgs(NamedTuple):
stream
:
bool
limit_min_tokens
:
int
# Use negative value for no limit
limit_max_tokens
:
int
# Use negative value for no limit
timeout_sec
:
int
class
BenchmarkArgs
(
NamedTuple
):
...
...
@@ -214,6 +215,7 @@ async def send_request(
stream
:
bool
=
True
,
min_tokens
:
int
|
None
=
None
,
max_tokens
:
int
|
None
=
None
,
timeout_sec
:
int
=
120
,
)
->
ServerResponse
:
payload
=
{
"model"
:
model
,
...
...
@@ -235,10 +237,16 @@ async def send_request(
headers
=
{
"Content-Type"
:
"application/json"
}
# Calculate the timeout for the request
timeout_sec
=
120
if
max_tokens
is
not
None
:
# Assume TPOT of 200ms and use max_tokens to determine timeout
timeout_sec
=
max
(
timeout_sec
,
int
(
max_tokens
*
0.2
))
token_based_timeout
=
int
(
max_tokens
*
0.2
)
if
token_based_timeout
>
timeout_sec
:
timeout_sec
=
token_based_timeout
logger
.
info
(
"Using timeout of %ds based on max_tokens %d"
,
timeout_sec
,
max_tokens
,
)
timeout
=
aiohttp
.
ClientTimeout
(
total
=
timeout_sec
)
valid_response
=
True
...
...
@@ -409,6 +417,7 @@ async def send_turn(
req_args
.
stream
,
min_tokens
,
max_tokens
,
req_args
.
timeout_sec
,
)
if
response
.
valid
is
False
:
...
...
@@ -676,8 +685,18 @@ async def client_main(
except
asyncio
.
exceptions
.
TimeoutError
:
num_failures
+=
1
logger
.
exception
(
f
"
{
Color
.
RED
}
Client
{
client_id
}
- Timeout during conversation ID
{
conv_id
}
(turn:
{
current_turn
}
)
{
Color
.
RESET
}
"
# noqa: E501
logger
.
error
(
"%sClient %d - Timeout during conversation ID %s (turn: %d). "
"Base timeout is %ss (set with --request-timeout-sec), but the "
"effective timeout may be longer based on max_tokens. If this "
"is unexpected, consider increasing the timeout or checking "
"model performance.%s"
,
Color
.
RED
,
client_id
,
conv_id
,
current_turn
,
req_args
.
timeout_sec
,
Color
.
RESET
,
)
break
# Exit gracefully instead of raising an error
...
...
@@ -815,6 +834,9 @@ def get_client_config(
"Invalid min/max tokens limits (min should not be larger than max)"
)
if
args
.
request_timeout_sec
<=
0
:
raise
ValueError
(
"Request timeout must be a positive number"
)
# Arguments for API requests
chat_url
=
f
"
{
args
.
url
}
/v1/chat/completions"
model_name
=
args
.
served_model_name
if
args
.
served_model_name
else
args
.
model
...
...
@@ -825,6 +847,7 @@ def get_client_config(
stream
=
not
args
.
no_stream
,
limit_min_tokens
=
args
.
limit_min_tokens
,
limit_max_tokens
=
args
.
limit_max_tokens
,
timeout_sec
=
args
.
request_timeout_sec
,
)
return
client_args
,
req_args
...
...
@@ -968,7 +991,7 @@ async def main_mp(
f
"(is alive:
{
client
.
is_alive
()
}
)
{
Color
.
RESET
}
"
)
client
.
join
(
timeout
=
120
)
client
.
join
(
timeout
=
req_args
.
timeout_sec
+
1
)
if
client
.
is_alive
():
logger
.
warning
(
...
...
@@ -1351,6 +1374,13 @@ async def main() -> None:
action
=
"store_true"
,
help
=
"Verify the LLM output (compare to the answers in the input JSON file)"
,
)
parser
.
add_argument
(
"--request-timeout-sec"
,
type
=
int
,
default
=
120
,
help
=
"Timeout in seconds for each API request (default: 120). "
"Automatically increased if max tokens imply longer decoding."
,
)
parser
.
add_argument
(
"--no-stream"
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment