Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
baed180a
Unverified
Commit baed180a authored Jul 08, 2025 by kourosh hakhamaneshi
Committed by GitHub Jul 09, 2025
Browse files
[tech debt] Revisit lora request model checker (#20636)
Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
parent
0b407479
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing 3 changed files with 65 additions and 62 deletions
+65
-62
tests/entrypoints/openai/test_serving_models.py
tests/entrypoints/openai/test_serving_models.py
+2
-1
vllm/entrypoints/openai/serving_engine.py
vllm/entrypoints/openai/serving_engine.py
+3
-6
vllm/entrypoints/openai/serving_models.py
vllm/entrypoints/openai/serving_models.py
+60
-55
No files found.
tests/entrypoints/openai/test_serving_models.py
View file @
baed180a
...
@@ -57,7 +57,8 @@ async def test_load_lora_adapter_success():
...
@@ -57,7 +57,8 @@ async def test_load_lora_adapter_success():
response
=
await
serving_models
.
load_lora_adapter
(
request
)
response
=
await
serving_models
.
load_lora_adapter
(
request
)
assert
response
==
LORA_LOADING_SUCCESS_MESSAGE
.
format
(
lora_name
=
'adapter'
)
assert
response
==
LORA_LOADING_SUCCESS_MESSAGE
.
format
(
lora_name
=
'adapter'
)
assert
len
(
serving_models
.
lora_requests
)
==
1
assert
len
(
serving_models
.
lora_requests
)
==
1
assert
serving_models
.
lora_requests
[
0
].
lora_name
==
"adapter"
assert
"adapter"
in
serving_models
.
lora_requests
assert
serving_models
.
lora_requests
[
"adapter"
].
lora_name
==
"adapter"
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
...
vllm/entrypoints/openai/serving_engine.py
View file @
baed180a
...
@@ -438,9 +438,7 @@ class OpenAIServing:
...
@@ -438,9 +438,7 @@ class OpenAIServing:
if
self
.
_is_model_supported
(
request
.
model
):
if
self
.
_is_model_supported
(
request
.
model
):
return
None
return
None
if
request
.
model
in
[
if
request
.
model
in
self
.
models
.
lora_requests
:
lora
.
lora_name
for
lora
in
self
.
models
.
lora_requests
]:
return
None
return
None
if
envs
.
VLLM_ALLOW_RUNTIME_LORA_UPDATING
and
request
.
model
and
(
if
envs
.
VLLM_ALLOW_RUNTIME_LORA_UPDATING
and
request
.
model
and
(
load_result
:
=
await
self
.
models
.
resolve_lora
(
request
.
model
)):
load_result
:
=
await
self
.
models
.
resolve_lora
(
request
.
model
)):
...
@@ -466,9 +464,8 @@ class OpenAIServing:
...
@@ -466,9 +464,8 @@ class OpenAIServing:
None
,
PromptAdapterRequest
]]:
None
,
PromptAdapterRequest
]]:
if
self
.
_is_model_supported
(
request
.
model
):
if
self
.
_is_model_supported
(
request
.
model
):
return
None
,
None
return
None
,
None
for
lora
in
self
.
models
.
lora_requests
:
if
request
.
model
in
self
.
models
.
lora_requests
:
if
request
.
model
==
lora
.
lora_name
:
return
self
.
models
.
lora_requests
[
request
.
model
],
None
return
lora
,
None
for
prompt_adapter
in
self
.
models
.
prompt_adapter_requests
:
for
prompt_adapter
in
self
.
models
.
prompt_adapter_requests
:
if
request
.
model
==
prompt_adapter
.
prompt_adapter_name
:
if
request
.
model
==
prompt_adapter
.
prompt_adapter_name
:
return
None
,
prompt_adapter
return
None
,
prompt_adapter
...
...
vllm/entrypoints/openai/serving_models.py
View file @
baed180a
...
@@ -65,12 +65,13 @@ class OpenAIServingModels:
...
@@ -65,12 +65,13 @@ class OpenAIServingModels:
super
().
__init__
()
super
().
__init__
()
self
.
base_model_paths
=
base_model_paths
self
.
base_model_paths
=
base_model_paths
self
.
max_model_len
=
model_config
.
max_model_len
self
.
max_model_len
=
model_config
.
max_model_len
self
.
engine_client
=
engine_client
self
.
engine_client
=
engine_client
self
.
model_config
=
model_config
self
.
model_config
=
model_config
self
.
static_lora_modules
=
lora_modules
self
.
static_lora_modules
=
lora_modules
self
.
lora_requests
:
list
[
LoRARequest
]
=
[]
self
.
lora_requests
:
dict
[
str
,
LoRARequest
]
=
{}
self
.
lora_id_counter
=
AtomicCounter
(
0
)
self
.
lora_id_counter
=
AtomicCounter
(
0
)
self
.
lora_resolvers
:
list
[
LoRAResolver
]
=
[]
self
.
lora_resolvers
:
list
[
LoRAResolver
]
=
[]
...
@@ -138,7 +139,7 @@ class OpenAIServingModels:
...
@@ -138,7 +139,7 @@ class OpenAIServingModels:
parent
=
lora
.
base_model_name
if
lora
.
base_model_name
else
parent
=
lora
.
base_model_name
if
lora
.
base_model_name
else
self
.
base_model_paths
[
0
].
name
,
self
.
base_model_paths
[
0
].
name
,
permission
=
[
ModelPermission
()])
permission
=
[
ModelPermission
()])
for
lora
in
self
.
lora_requests
for
lora
in
self
.
lora_requests
.
values
()
]
]
prompt_adapter_cards
=
[
prompt_adapter_cards
=
[
ModelCard
(
id
=
prompt_adapter
.
prompt_adapter_name
,
ModelCard
(
id
=
prompt_adapter
.
prompt_adapter_name
,
...
@@ -155,23 +156,29 @@ class OpenAIServingModels:
...
@@ -155,23 +156,29 @@ class OpenAIServingModels:
request
:
LoadLoRAAdapterRequest
,
request
:
LoadLoRAAdapterRequest
,
base_model_name
:
Optional
[
str
]
=
None
base_model_name
:
Optional
[
str
]
=
None
)
->
Union
[
ErrorResponse
,
str
]:
)
->
Union
[
ErrorResponse
,
str
]:
error_check_ret
=
await
self
.
_check_load_lora_adapter_request
(
request
)
lora_name
=
request
.
lora_name
# Ensure atomicity based on the lora name
async
with
self
.
lora_resolver_lock
[
lora_name
]:
error_check_ret
=
await
self
.
_check_load_lora_adapter_request
(
request
)
if
error_check_ret
is
not
None
:
if
error_check_ret
is
not
None
:
return
error_check_ret
return
error_check_ret
lora_name
,
lora_path
=
request
.
lora_name
,
request
.
lora_path
lora_path
=
request
.
lora_path
unique_id
=
self
.
lora_id_counter
.
inc
(
1
)
unique_id
=
self
.
lora_id_counter
.
inc
(
1
)
lora_request
=
LoRARequest
(
lora_name
=
lora_name
,
lora_request
=
LoRARequest
(
lora_name
=
lora_name
,
lora_int_id
=
unique_id
,
lora_int_id
=
unique_id
,
lora_path
=
lora_path
)
lora_path
=
lora_path
)
if
base_model_name
is
not
None
and
self
.
is_base_model
(
base_model_name
):
if
base_model_name
is
not
None
and
self
.
is_base_model
(
base_model_name
):
lora_request
.
base_model_name
=
base_model_name
lora_request
.
base_model_name
=
base_model_name
# Validate that the adapter can be loaded into the engine
# Validate that the adapter can be loaded into the engine
# This will also pre-load it for incoming requests
# This will also pre-load it for incoming requests
try
:
try
:
await
self
.
engine_client
.
add_lora
(
lora_request
)
await
self
.
engine_client
.
add_lora
(
lora_request
)
except
Base
Exception
as
e
:
except
Exception
as
e
:
error_type
=
"BadRequestError"
error_type
=
"BadRequestError"
status_code
=
HTTPStatus
.
BAD_REQUEST
status_code
=
HTTPStatus
.
BAD_REQUEST
if
"No adapter found"
in
str
(
e
):
if
"No adapter found"
in
str
(
e
):
...
@@ -182,24 +189,25 @@ class OpenAIServingModels:
...
@@ -182,24 +189,25 @@ class OpenAIServingModels:
err_type
=
error_type
,
err_type
=
error_type
,
status_code
=
status_code
)
status_code
=
status_code
)
self
.
lora_requests
.
append
(
lora_request
)
self
.
lora_requests
[
lora_name
]
=
lora_request
logger
.
info
(
"Loaded new LoRA adapter: name '%s', path '%s'"
,
lora_name
,
logger
.
info
(
"Loaded new LoRA adapter: name '%s', path '%s'"
,
lora_path
)
lora_name
,
lora_path
)
return
f
"Success: LoRA adapter '
{
lora_name
}
' added successfully."
return
f
"Success: LoRA adapter '
{
lora_name
}
' added successfully."
async
def
unload_lora_adapter
(
async
def
unload_lora_adapter
(
self
,
self
,
request
:
UnloadLoRAAdapterRequest
)
->
Union
[
ErrorResponse
,
str
]:
request
:
UnloadLoRAAdapterRequest
)
->
Union
[
ErrorResponse
,
str
]:
error_check_ret
=
await
self
.
_check_unload_lora_adapter_request
(
request
lora_name
=
request
.
lora_name
)
# Ensure atomicity based on the lora name
async
with
self
.
lora_resolver_lock
[
lora_name
]:
error_check_ret
=
await
self
.
_check_unload_lora_adapter_request
(
request
)
if
error_check_ret
is
not
None
:
if
error_check_ret
is
not
None
:
return
error_check_ret
return
error_check_ret
lora_name
=
request
.
lora_name
# Safe to delete now since we hold the lock
self
.
lora_requests
=
[
del
self
.
lora_requests
[
lora_name
]
lora_request
for
lora_request
in
self
.
lora_requests
if
lora_request
.
lora_name
!=
lora_name
]
logger
.
info
(
"Removed LoRA adapter: name '%s'"
,
lora_name
)
logger
.
info
(
"Removed LoRA adapter: name '%s'"
,
lora_name
)
return
f
"Success: LoRA adapter '
{
lora_name
}
' removed successfully."
return
f
"Success: LoRA adapter '
{
lora_name
}
' removed successfully."
...
@@ -213,8 +221,7 @@ class OpenAIServingModels:
...
@@ -213,8 +221,7 @@ class OpenAIServingModels:
status_code
=
HTTPStatus
.
BAD_REQUEST
)
status_code
=
HTTPStatus
.
BAD_REQUEST
)
# Check if the lora adapter with the given name already exists
# Check if the lora adapter with the given name already exists
if
any
(
lora_request
.
lora_name
==
request
.
lora_name
if
request
.
lora_name
in
self
.
lora_requests
:
for
lora_request
in
self
.
lora_requests
):
return
create_error_response
(
return
create_error_response
(
message
=
message
=
f
"The lora adapter '
{
request
.
lora_name
}
' has already been "
f
"The lora adapter '
{
request
.
lora_name
}
' has already been "
...
@@ -227,17 +234,16 @@ class OpenAIServingModels:
...
@@ -227,17 +234,16 @@ class OpenAIServingModels:
async
def
_check_unload_lora_adapter_request
(
async
def
_check_unload_lora_adapter_request
(
self
,
self
,
request
:
UnloadLoRAAdapterRequest
)
->
Optional
[
ErrorResponse
]:
request
:
UnloadLoRAAdapterRequest
)
->
Optional
[
ErrorResponse
]:
# Check if
either
'lora_name'
or 'lora_int_id' is provided
# Check if 'lora_name'
is not provided return an error
if
not
request
.
lora_name
and
not
request
.
lora_int_id
:
if
not
request
.
lora_name
:
return
create_error_response
(
return
create_error_response
(
message
=
message
=
"
either
'lora_name'
and 'lora_int_id'
needs to be provided."
,
"'lora_name' needs to be provided
to unload a LoRA adapter
."
,
err_type
=
"InvalidUserInput"
,
err_type
=
"InvalidUserInput"
,
status_code
=
HTTPStatus
.
BAD_REQUEST
)
status_code
=
HTTPStatus
.
BAD_REQUEST
)
# Check if the lora adapter with the given name exists
# Check if the lora adapter with the given name exists
if
not
any
(
lora_request
.
lora_name
==
request
.
lora_name
if
request
.
lora_name
not
in
self
.
lora_requests
:
for
lora_request
in
self
.
lora_requests
):
return
create_error_response
(
return
create_error_response
(
message
=
message
=
f
"The lora adapter '
{
request
.
lora_name
}
' cannot be found."
,
f
"The lora adapter '
{
request
.
lora_name
}
' cannot be found."
,
...
@@ -260,9 +266,8 @@ class OpenAIServingModels:
...
@@ -260,9 +266,8 @@ class OpenAIServingModels:
"""
"""
async
with
self
.
lora_resolver_lock
[
lora_name
]:
async
with
self
.
lora_resolver_lock
[
lora_name
]:
# First check if this LoRA is already loaded
# First check if this LoRA is already loaded
for
existing
in
self
.
lora_requests
:
if
lora_name
in
self
.
lora_requests
:
if
existing
.
lora_name
==
lora_name
:
return
self
.
lora_requests
[
lora_name
]
return
existing
base_model_name
=
self
.
model_config
.
model
base_model_name
=
self
.
model_config
.
model
unique_id
=
self
.
lora_id_counter
.
inc
(
1
)
unique_id
=
self
.
lora_id_counter
.
inc
(
1
)
...
@@ -279,7 +284,7 @@ class OpenAIServingModels:
...
@@ -279,7 +284,7 @@ class OpenAIServingModels:
try
:
try
:
await
self
.
engine_client
.
add_lora
(
lora_request
)
await
self
.
engine_client
.
add_lora
(
lora_request
)
self
.
lora_requests
.
append
(
lora_request
)
self
.
lora_requests
[
lora_name
]
=
lora_request
logger
.
info
(
logger
.
info
(
"Resolved and loaded LoRA adapter '%s' using %s"
,
"Resolved and loaded LoRA adapter '%s' using %s"
,
lora_name
,
resolver
.
__class__
.
__name__
)
lora_name
,
resolver
.
__class__
.
__name__
)
...
...
Write
Preview
Markdown
is supported
0%
Try again or attach a new file.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment.