Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
baed180a
Unverified
Commit
baed180a
authored
Jul 08, 2025
by
kourosh hakhamaneshi
Committed by
GitHub
Jul 09, 2025
Browse files
[tech debt] Revisit lora request model checker (#20636)
Signed-off-by:
Kourosh Hakhamaneshi
<
kourosh@anyscale.com
>
parent
0b407479
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
65 additions
and
62 deletions
+65
-62
tests/entrypoints/openai/test_serving_models.py
tests/entrypoints/openai/test_serving_models.py
+2
-1
vllm/entrypoints/openai/serving_engine.py
vllm/entrypoints/openai/serving_engine.py
+3
-6
vllm/entrypoints/openai/serving_models.py
vllm/entrypoints/openai/serving_models.py
+60
-55
No files found.
tests/entrypoints/openai/test_serving_models.py
View file @
baed180a
...
...
@@ -57,7 +57,8 @@ async def test_load_lora_adapter_success():
response
=
await
serving_models
.
load_lora_adapter
(
request
)
assert
response
==
LORA_LOADING_SUCCESS_MESSAGE
.
format
(
lora_name
=
'adapter'
)
assert
len
(
serving_models
.
lora_requests
)
==
1
assert
serving_models
.
lora_requests
[
0
].
lora_name
==
"adapter"
assert
"adapter"
in
serving_models
.
lora_requests
assert
serving_models
.
lora_requests
[
"adapter"
].
lora_name
==
"adapter"
@
pytest
.
mark
.
asyncio
...
...
vllm/entrypoints/openai/serving_engine.py
View file @
baed180a
...
...
@@ -438,9 +438,7 @@ class OpenAIServing:
if
self
.
_is_model_supported
(
request
.
model
):
return
None
if
request
.
model
in
[
lora
.
lora_name
for
lora
in
self
.
models
.
lora_requests
]:
if
request
.
model
in
self
.
models
.
lora_requests
:
return
None
if
envs
.
VLLM_ALLOW_RUNTIME_LORA_UPDATING
and
request
.
model
and
(
load_result
:
=
await
self
.
models
.
resolve_lora
(
request
.
model
)):
...
...
@@ -466,9 +464,8 @@ class OpenAIServing:
None
,
PromptAdapterRequest
]]:
if
self
.
_is_model_supported
(
request
.
model
):
return
None
,
None
for
lora
in
self
.
models
.
lora_requests
:
if
request
.
model
==
lora
.
lora_name
:
return
lora
,
None
if
request
.
model
in
self
.
models
.
lora_requests
:
return
self
.
models
.
lora_requests
[
request
.
model
],
None
for
prompt_adapter
in
self
.
models
.
prompt_adapter_requests
:
if
request
.
model
==
prompt_adapter
.
prompt_adapter_name
:
return
None
,
prompt_adapter
...
...
vllm/entrypoints/openai/serving_models.py
View file @
baed180a
...
...
@@ -65,12 +65,13 @@ class OpenAIServingModels:
super
().
__init__
()
self
.
base_model_paths
=
base_model_paths
self
.
max_model_len
=
model_config
.
max_model_len
self
.
engine_client
=
engine_client
self
.
model_config
=
model_config
self
.
static_lora_modules
=
lora_modules
self
.
lora_requests
:
list
[
LoRARequest
]
=
[]
self
.
lora_requests
:
dict
[
str
,
LoRARequest
]
=
{}
self
.
lora_id_counter
=
AtomicCounter
(
0
)
self
.
lora_resolvers
:
list
[
LoRAResolver
]
=
[]
...
...
@@ -138,7 +139,7 @@ class OpenAIServingModels:
parent
=
lora
.
base_model_name
if
lora
.
base_model_name
else
self
.
base_model_paths
[
0
].
name
,
permission
=
[
ModelPermission
()])
for
lora
in
self
.
lora_requests
for
lora
in
self
.
lora_requests
.
values
()
]
prompt_adapter_cards
=
[
ModelCard
(
id
=
prompt_adapter
.
prompt_adapter_name
,
...
...
@@ -155,53 +156,60 @@ class OpenAIServingModels:
request
:
LoadLoRAAdapterRequest
,
base_model_name
:
Optional
[
str
]
=
None
)
->
Union
[
ErrorResponse
,
str
]:
error_check_ret
=
await
self
.
_check_load_lora_adapter_request
(
request
)
if
error_check_ret
is
not
None
:
return
error_check_ret
lora_name
,
lora_path
=
request
.
lora_name
,
request
.
lora_path
unique_id
=
self
.
lora_id_counter
.
inc
(
1
)
lora_request
=
LoRARequest
(
lora_name
=
lora_name
,
lora_int_id
=
unique_id
,
lora_path
=
lora_path
)
if
base_model_name
is
not
None
and
self
.
is_base_model
(
base_model_name
):
lora_request
.
base_model_name
=
base_model_name
# Validate that the adapter can be loaded into the engine
# This will also pre-load it for incoming requests
try
:
await
self
.
engine_client
.
add_lora
(
lora_request
)
except
BaseException
as
e
:
error_type
=
"BadRequestError"
status_code
=
HTTPStatus
.
BAD_REQUEST
if
"No adapter found"
in
str
(
e
):
error_type
=
"NotFoundError"
status_code
=
HTTPStatus
.
NOT_FOUND
return
create_error_response
(
message
=
str
(
e
),
err_type
=
error_type
,
status_code
=
status_code
)
self
.
lora_requests
.
append
(
lora_request
)
logger
.
info
(
"Loaded new LoRA adapter: name '%s', path '%s'"
,
lora_name
,
lora_path
)
return
f
"Success: LoRA adapter '
{
lora_name
}
' added successfully."
lora_name
=
request
.
lora_name
# Ensure atomicity based on the lora name
async
with
self
.
lora_resolver_lock
[
lora_name
]:
error_check_ret
=
await
self
.
_check_load_lora_adapter_request
(
request
)
if
error_check_ret
is
not
None
:
return
error_check_ret
lora_path
=
request
.
lora_path
unique_id
=
self
.
lora_id_counter
.
inc
(
1
)
lora_request
=
LoRARequest
(
lora_name
=
lora_name
,
lora_int_id
=
unique_id
,
lora_path
=
lora_path
)
if
base_model_name
is
not
None
and
self
.
is_base_model
(
base_model_name
):
lora_request
.
base_model_name
=
base_model_name
# Validate that the adapter can be loaded into the engine
# This will also pre-load it for incoming requests
try
:
await
self
.
engine_client
.
add_lora
(
lora_request
)
except
Exception
as
e
:
error_type
=
"BadRequestError"
status_code
=
HTTPStatus
.
BAD_REQUEST
if
"No adapter found"
in
str
(
e
):
error_type
=
"NotFoundError"
status_code
=
HTTPStatus
.
NOT_FOUND
return
create_error_response
(
message
=
str
(
e
),
err_type
=
error_type
,
status_code
=
status_code
)
self
.
lora_requests
[
lora_name
]
=
lora_request
logger
.
info
(
"Loaded new LoRA adapter: name '%s', path '%s'"
,
lora_name
,
lora_path
)
return
f
"Success: LoRA adapter '
{
lora_name
}
' added successfully."
async
def
unload_lora_adapter
(
self
,
request
:
UnloadLoRAAdapterRequest
)
->
Union
[
ErrorResponse
,
str
]:
error_check_ret
=
await
self
.
_check_unload_lora_adapter_request
(
request
)
if
error_check_ret
is
not
None
:
return
error_check_ret
lora_name
=
request
.
lora_name
self
.
lora_requests
=
[
lora_request
for
lora_request
in
self
.
lora_requests
if
lora_request
.
lora_name
!=
lora_name
]
logger
.
info
(
"Removed LoRA adapter: name '%s'"
,
lora_name
)
return
f
"Success: LoRA adapter '
{
lora_name
}
' removed successfully."
# Ensure atomicity based on the lora name
async
with
self
.
lora_resolver_lock
[
lora_name
]:
error_check_ret
=
await
self
.
_check_unload_lora_adapter_request
(
request
)
if
error_check_ret
is
not
None
:
return
error_check_ret
# Safe to delete now since we hold the lock
del
self
.
lora_requests
[
lora_name
]
logger
.
info
(
"Removed LoRA adapter: name '%s'"
,
lora_name
)
return
f
"Success: LoRA adapter '
{
lora_name
}
' removed successfully."
async
def
_check_load_lora_adapter_request
(
self
,
request
:
LoadLoRAAdapterRequest
)
->
Optional
[
ErrorResponse
]:
...
...
@@ -213,8 +221,7 @@ class OpenAIServingModels:
status_code
=
HTTPStatus
.
BAD_REQUEST
)
# Check if the lora adapter with the given name already exists
if
any
(
lora_request
.
lora_name
==
request
.
lora_name
for
lora_request
in
self
.
lora_requests
):
if
request
.
lora_name
in
self
.
lora_requests
:
return
create_error_response
(
message
=
f
"The lora adapter '
{
request
.
lora_name
}
' has already been "
...
...
@@ -227,17 +234,16 @@ class OpenAIServingModels:
async
def
_check_unload_lora_adapter_request
(
self
,
request
:
UnloadLoRAAdapterRequest
)
->
Optional
[
ErrorResponse
]:
# Check if
either
'lora_name'
or 'lora_int_id' is provided
if
not
request
.
lora_name
and
not
request
.
lora_int_id
:
# Check if 'lora_name'
is not provided return an error
if
not
request
.
lora_name
:
return
create_error_response
(
message
=
"
either
'lora_name'
and 'lora_int_id'
needs to be provided."
,
"'lora_name' needs to be provided
to unload a LoRA adapter
."
,
err_type
=
"InvalidUserInput"
,
status_code
=
HTTPStatus
.
BAD_REQUEST
)
# Check if the lora adapter with the given name exists
if
not
any
(
lora_request
.
lora_name
==
request
.
lora_name
for
lora_request
in
self
.
lora_requests
):
if
request
.
lora_name
not
in
self
.
lora_requests
:
return
create_error_response
(
message
=
f
"The lora adapter '
{
request
.
lora_name
}
' cannot be found."
,
...
...
@@ -260,9 +266,8 @@ class OpenAIServingModels:
"""
async
with
self
.
lora_resolver_lock
[
lora_name
]:
# First check if this LoRA is already loaded
for
existing
in
self
.
lora_requests
:
if
existing
.
lora_name
==
lora_name
:
return
existing
if
lora_name
in
self
.
lora_requests
:
return
self
.
lora_requests
[
lora_name
]
base_model_name
=
self
.
model_config
.
model
unique_id
=
self
.
lora_id_counter
.
inc
(
1
)
...
...
@@ -279,7 +284,7 @@ class OpenAIServingModels:
try
:
await
self
.
engine_client
.
add_lora
(
lora_request
)
self
.
lora_requests
.
append
(
lora_request
)
self
.
lora_requests
[
lora_name
]
=
lora_request
logger
.
info
(
"Resolved and loaded LoRA adapter '%s' using %s"
,
lora_name
,
resolver
.
__class__
.
__name__
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment