Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0ce743f4
Unverified
Commit
0ce743f4
authored
Nov 03, 2025
by
Vensen
Committed by
GitHub
Nov 02, 2025
Browse files
Fix(llm): Abort orphaned requests when llm.chat() batch fails. Fixes #26081 (#27420)
Signed-off-by:
vensenmu
<
vensenmu@gmail.com
>
parent
6c317a65
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
75 additions
and
14 deletions
+75
-14
tests/entrypoints/llm/test_chat.py
tests/entrypoints/llm/test_chat.py
+53
-0
vllm/entrypoints/llm.py
vllm/entrypoints/llm.py
+22
-14
No files found.
tests/entrypoints/llm/test_chat.py
View file @
0ce743f4
...
...
@@ -6,6 +6,7 @@ import pytest
from
vllm
import
LLM
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.sampling_params
import
SamplingParams
from
..openai.test_vision
import
TEST_IMAGE_ASSETS
...
...
@@ -23,6 +24,29 @@ def text_llm():
cleanup_dist_env_and_memory
()
@pytest.fixture(scope="function")
def llm_for_failure_test():
    """
    Provide a small-context LLM for the issue #26081 regression test.

    ``max_model_len`` is kept at 128 so that an over-long prompt easily
    triggers a length error.
    """
    engine_kwargs = dict(
        model="meta-llama/Llama-3.2-1B-Instruct",
        enforce_eager=True,
        seed=0,
        max_model_len=128,
        disable_log_stats=True,
    )
    engine = LLM(**engine_kwargs)

    # pytest caches whatever the fixture yields, so only a weak proxy is
    # handed out; that way the engine can still be garbage collected.
    yield weakref.proxy(engine)

    # Teardown: drop the strong reference created above, then clean up the
    # distributed environment and memory.
    del engine
    cleanup_dist_env_and_memory()
def
test_chat
(
text_llm
):
prompt1
=
"Explain the concept of entropy."
messages
=
[
...
...
@@ -157,3 +181,32 @@ def test_chat_extra_kwargs(thinking_llm, enable_thinking):
else
:
# The chat template includes dummy thinking process
assert
think_id
in
prompt_token_ids
def test_chat_batch_failure_cleanup(llm_for_failure_test):
    """
    Regression test for issue #26081.

    When a batched llm.chat() call fails part-way through (for example
    because one prompt is invalid), the requests that were already
    enqueued must be aborted so that they do not pollute the queue for
    later calls.
    """
    llm = llm_for_failure_test

    # One conversation that fits the context window, and one built from a
    # sentence repeated 50 times, which is expected to be rejected.
    ok_conversation = [{"role": "user", "content": "Hello"}]
    oversized_content = "This is a very long text to test the error " * 50
    oversized_conversation = [{"role": "user", "content": oversized_content}]

    # The invalid conversation comes last, so the two valid ones are
    # submitted ahead of it in the failing batch.
    failing_batch = [ok_conversation, ok_conversation, oversized_conversation]
    clean_batch = [ok_conversation] * 2

    greedy_params = SamplingParams(temperature=0, max_tokens=10)

    # First call: the whole batch must fail with the length error.
    expected_error = "longer than the maximum model length"
    with pytest.raises(ValueError, match=expected_error):
        llm.chat(failing_batch, sampling_params=greedy_params)

    # Second call: exactly one output per conversation is expected, and
    # the engine must report no unfinished requests afterwards.
    follow_up_outputs = llm.chat(clean_batch, sampling_params=greedy_params)
    assert len(follow_up_outputs) == len(clean_batch)

    pending = llm.llm_engine.get_num_unfinished_requests()
    assert pending == 0
vllm/entrypoints/llm.py
View file @
0ce743f4
...
...
@@ -1588,20 +1588,27 @@ class LLM:
tqdm_func
=
use_tqdm
if
callable
(
use_tqdm
)
else
tqdm
it
=
tqdm_func
(
it
,
desc
=
"Adding requests"
)
for
i
,
prompt
in
enumerate
(
it
):
if
isinstance
(
prompt
,
dict
):
self
.
_validate_mm_data_and_uuids
(
prompt
.
get
(
"multi_modal_data"
),
prompt
.
get
(
"multi_modal_uuids"
)
)
added_request_ids
:
list
[
str
]
=
[]
self
.
_add_request
(
prompt
,
params
[
i
]
if
isinstance
(
params
,
Sequence
)
else
params
,
lora_request
=
lora_request
[
i
]
if
isinstance
(
lora_request
,
Sequence
)
else
lora_request
,
priority
=
priority
[
i
]
if
priority
else
0
,
)
try
:
for
i
,
prompt
in
enumerate
(
it
):
if
isinstance
(
prompt
,
dict
):
self
.
_validate_mm_data_and_uuids
(
prompt
.
get
(
"multi_modal_data"
),
prompt
.
get
(
"multi_modal_uuids"
)
)
request_id
=
self
.
_add_request
(
prompt
,
params
[
i
]
if
isinstance
(
params
,
Sequence
)
else
params
,
lora_request
=
lora_request
[
i
]
if
isinstance
(
lora_request
,
Sequence
)
else
lora_request
,
priority
=
priority
[
i
]
if
priority
else
0
,
)
added_request_ids
.
append
(
request_id
)
except
Exception
as
e
:
if
added_request_ids
:
self
.
llm_engine
.
abort_request
(
added_request_ids
)
raise
e
def
_validate_mm_data_and_uuids
(
self
,
...
...
@@ -1684,7 +1691,7 @@ class LLM:
params
:
SamplingParams
|
PoolingParams
,
lora_request
:
LoRARequest
|
None
=
None
,
priority
:
int
=
0
,
)
->
None
:
)
->
str
:
prompt_text
,
_
,
_
=
get_prompt_components
(
prompt
)
request_id
=
str
(
next
(
self
.
request_counter
))
...
...
@@ -1705,6 +1712,7 @@ class LLM:
priority
=
priority
,
prompt_text
=
prompt_text
,
)
return
request_id
def
_run_engine
(
self
,
*
,
use_tqdm
:
bool
|
Callable
[...,
tqdm
]
=
True
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment