Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
3c2b72b0
Unverified
Commit
3c2b72b0
authored
Dec 11, 2025
by
Biswa Panda
Committed by
GitHub
Dec 11, 2025
Browse files
fix: [lora] refactor test and clean up examples (#4884)
parent
242a4d5b
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
109 additions
and
121 deletions
+109
-121
examples/backends/vllm/launch/lora/agg_lora.sh
examples/backends/vllm/launch/lora/agg_lora.sh
+1
-11
examples/backends/vllm/launch/lora/agg_lora_router.sh
examples/backends/vllm/launch/lora/agg_lora_router.sh
+1
-9
tests/serve/conftest.py
tests/serve/conftest.py
+2
-2
tests/serve/lora_utils.py
tests/serve/lora_utils.py
+14
-10
tests/serve/test_vllm.py
tests/serve/test_vllm.py
+2
-89
tests/utils/payloads.py
tests/utils/payloads.py
+89
-0
No files found.
examples/backends/vllm/launch/lora/agg_lora.sh
View file @
3c2b72b0
...
...
@@ -4,14 +4,6 @@
set
-e
trap
'echo Cleaning up...; kill 0'
EXIT
# Follow the README.md instructions to setup MinIO or upload the LoRA to s3/minio
# Adjust these values to match your local MinIO or S3 setup
# load math lora to minio
# LORA_NAME=Neural-Hacker/Qwen3-Math-Reasoning-LoRA HF_LORA_REPO=Neural-Hacker/Qwen3-Math-Reasoning-LoRA ./setup_minio.sh
export
AWS_ENDPOINT
=
http://localhost:9000
export
AWS_ACCESS_KEY_ID
=
minioadmin
export
AWS_SECRET_ACCESS_KEY
=
minioadmin
...
...
@@ -21,8 +13,6 @@ export AWS_ALLOW_HTTP=true
# Dynamo LoRA Configuration
export
DYN_LORA_ENABLED
=
true
export
DYN_LORA_PATH
=
/tmp/dynamo_loras_minio
export
DYN_LOG
=
debug
# export DYN_LOG_LEVEL=debug
mkdir
-p
$DYN_LORA_PATH
...
...
@@ -63,7 +53,7 @@ curl -X POST http://localhost:8000/v1/chat/completions \
-H
"Content-Type: application/json"
\
-d
'{
"model": "Qwen/Qwen3-0.6B",
"messages": [{"role": "user", "content": "
Solve (x*x - x + 1 = 0) for x
"}],
"messages": [{"role": "user", "content": "
What is deep learning?
"}],
"max_tokens": 300,
"temperature": 0.0
}'
...
...
examples/backends/vllm/launch/lora/agg_lora_router.sh
View file @
3c2b72b0
...
...
@@ -4,12 +4,6 @@
set
-e
trap
'echo Cleaning up...; kill 0'
EXIT
# Follow the README.md instructions to setup MinIO or upload the LoRA to s3/minio
# Adjust these values to match your local MinIO or S3 setup
# load math lora to minio
# LORA_NAME=Neural-Hacker/Qwen3-Math-Reasoning-LoRA HF_LORA_REPO=Neural-Hacker/Qwen3-Math-Reasoning-LoRA ./setup_minio.sh
export
AWS_ENDPOINT
=
http://localhost:9000
export
AWS_ACCESS_KEY_ID
=
minioadmin
export
AWS_SECRET_ACCESS_KEY
=
minioadmin
...
...
@@ -19,8 +13,6 @@ export AWS_ALLOW_HTTP=true
# Dynamo LoRA Configuration
export
DYN_LORA_ENABLED
=
true
export
DYN_LORA_PATH
=
/tmp/dynamo_loras_minio
export
DYN_LOG
=
debug
# export DYN_LOG_LEVEL=debug
mkdir
-p
$DYN_LORA_PATH
...
...
@@ -118,7 +110,7 @@ curl localhost:8000/v1/chat/completions \
"total_tokens"
: 226,
"prompt_tokens_details"
:
{
"audio_tokens"
: null,
"cached_tokens"
: 192
"cached_tokens"
: 192
# tokens that were cached from the previous request.
}
}
,
"nvext"
:
{
...
...
tests/serve/conftest.py
View file @
3c2b72b0
...
...
@@ -86,8 +86,8 @@ def minio_lora_service():
local_path
=
service
.
download_lora
()
service
.
upload_lora
(
local_path
)
# Clean up downloaded files (keep MinIO
running
)
service
.
cleanup_
temp
()
# Clean up downloaded files (keep MinIO
data intact
)
service
.
cleanup_
download
()
yield
config
...
...
tests/serve/lora_utils.py
View file @
3c2b72b0
...
...
@@ -61,7 +61,7 @@ class MinioService:
def
__init__
(
self
,
config
:
MinioLoraConfig
):
self
.
config
=
config
self
.
_logger
=
logging
.
getLogger
(
self
.
__class__
.
__name__
)
self
.
_temp_dir
:
Optional
[
str
]
=
None
self
.
_temp_
download_
dir
:
Optional
[
str
]
=
None
def
start
(
self
)
->
None
:
"""Start MinIO container"""
...
...
@@ -183,9 +183,9 @@ class MinioService:
def
download_lora
(
self
)
->
str
:
"""Download LoRA from Hugging Face Hub, returns temp directory path"""
self
.
_temp_dir
=
tempfile
.
mkdtemp
(
prefix
=
"lora_download_"
)
self
.
_temp_
download_
dir
=
tempfile
.
mkdtemp
(
prefix
=
"lora_download_"
)
self
.
_logger
.
info
(
f
"Downloading LoRA
{
self
.
config
.
lora_repo
}
to
{
self
.
_temp_dir
}
"
f
"Downloading LoRA
{
self
.
config
.
lora_repo
}
to
{
self
.
_temp_
download_
dir
}
"
)
result
=
subprocess
.
run
(
...
...
@@ -194,7 +194,7 @@ class MinioService:
"download"
,
self
.
config
.
lora_repo
,
"--local-dir"
,
self
.
_temp_dir
,
self
.
_temp_
download_
dir
,
"--local-dir-use-symlinks"
,
"False"
,
],
...
...
@@ -206,11 +206,11 @@ class MinioService:
raise
RuntimeError
(
f
"Failed to download LoRA:
{
result
.
stderr
}
"
)
# Clean up cache directory
cache_dir
=
os
.
path
.
join
(
self
.
_temp_dir
,
".cache"
)
cache_dir
=
os
.
path
.
join
(
self
.
_temp_
download_
dir
,
".cache"
)
if
os
.
path
.
exists
(
cache_dir
):
shutil
.
rmtree
(
cache_dir
)
return
self
.
_temp_dir
return
self
.
_temp_
download_
dir
def
upload_lora
(
self
,
local_path
:
str
)
->
None
:
"""Upload LoRA to MinIO"""
...
...
@@ -246,11 +246,15 @@ class MinioService:
if
result
.
returncode
!=
0
:
raise
RuntimeError
(
f
"Failed to upload LoRA:
{
result
.
stderr
}
"
)
def
cleanup_download
(
self
)
->
None
:
"""Clean up temporary download directory only"""
if
self
.
_temp_download_dir
and
os
.
path
.
exists
(
self
.
_temp_download_dir
):
shutil
.
rmtree
(
self
.
_temp_download_dir
)
self
.
_temp_download_dir
=
None
def
cleanup_temp
(
self
)
->
None
:
"""Clean up temporary directories"""
if
self
.
_temp_dir
and
os
.
path
.
exists
(
self
.
_temp_dir
):
shutil
.
rmtree
(
self
.
_temp_dir
)
self
.
_temp_dir
=
None
"""Clean up all temporary directories including MinIO data dir"""
self
.
cleanup_download
()
if
self
.
config
.
data_dir
and
os
.
path
.
exists
(
self
.
config
.
data_dir
):
shutil
.
rmtree
(
self
.
config
.
data_dir
,
ignore_errors
=
True
)
...
...
tests/serve/test_vllm.py
View file @
3c2b72b0
...
...
@@ -16,7 +16,7 @@ from tests.serve.common import (
run_serve_deployment
,
)
from
tests.serve.conftest
import
MULTIMODAL_IMG_PATH
,
MULTIMODAL_IMG_URL
from
tests.serve.lora_utils
import
MinioLoraConfig
,
load_lora_adapter
from
tests.serve.lora_utils
import
MinioLoraConfig
from
tests.utils.engine_process
import
EngineConfig
from
tests.utils.payload_builder
import
(
chat_payload
,
...
...
@@ -26,7 +26,7 @@ from tests.utils.payload_builder import (
completion_payload_with_logprobs
,
metric_payload_default
,
)
from
tests.utils.payloads
import
ChatPayload
,
ToolCallingChatPayload
from
tests.utils.payloads
import
LoraTest
ChatPayload
,
ToolCallingChatPayload
logger
=
logging
.
getLogger
(
__name__
)
...
...
@@ -614,93 +614,6 @@ def test_multimodal_b64(request, runtime_services, predownload_models):
lora_dir
=
os
.
path
.
join
(
vllm_dir
,
"launch/lora"
)
class
LoraTestChatPayload
(
ChatPayload
):
"""
Chat payload that loads a LoRA adapter before sending inference requests.
This payload first loads the specified LoRA adapter via the system API,
then sends chat completion requests using the LoRA model.
"""
def
__init__
(
self
,
body
:
dict
,
lora_name
:
str
,
s3_uri
:
str
,
system_port
:
int
=
8081
,
repeat_count
:
int
=
1
,
expected_response
:
Optional
[
list
]
=
None
,
expected_log
:
Optional
[
list
]
=
None
,
timeout
:
int
=
60
,
):
super
().
__init__
(
body
=
body
,
repeat_count
=
repeat_count
,
expected_response
=
expected_response
or
[],
expected_log
=
expected_log
or
[],
timeout
=
timeout
,
)
self
.
system_port
=
system_port
self
.
lora_name
=
lora_name
self
.
s3_uri
=
s3_uri
self
.
_lora_loaded
=
False
def
_ensure_lora_loaded
(
self
)
->
None
:
"""Ensure the LoRA adapter is loaded before making inference requests"""
if
not
self
.
_lora_loaded
:
import
time
import
requests
load_lora_adapter
(
system_port
=
self
.
system_port
,
lora_name
=
self
.
lora_name
,
s3_uri
=
self
.
s3_uri
,
timeout
=
self
.
timeout
,
)
# Wait for the LoRA model to appear in /v1/models
models_url
=
f
"http://
{
self
.
host
}
:
{
self
.
port
}
/v1/models"
start_time
=
time
.
time
()
max_wait
=
60
# 1 minute timeout
logger
.
info
(
f
"Waiting for LoRA model '
{
self
.
lora_name
}
' to appear in /v1/models..."
)
while
time
.
time
()
-
start_time
<
max_wait
:
try
:
response
=
requests
.
get
(
models_url
,
timeout
=
5
)
if
response
.
status_code
==
200
:
data
=
response
.
json
()
models
=
data
.
get
(
"data"
,
[])
model_ids
=
[
m
.
get
(
"id"
,
""
)
for
m
in
models
]
if
self
.
lora_name
in
model_ids
:
logger
.
info
(
f
"LoRA model '
{
self
.
lora_name
}
' is now available"
)
self
.
_lora_loaded
=
True
return
logger
.
debug
(
f
"Available models:
{
model_ids
}
, waiting for '
{
self
.
lora_name
}
'..."
)
except
requests
.
RequestException
as
e
:
logger
.
debug
(
f
"Error checking /v1/models:
{
e
}
"
)
time
.
sleep
(
1
)
raise
RuntimeError
(
f
"Timeout: LoRA model '
{
self
.
lora_name
}
' did not appear in /v1/models within
{
max_wait
}
s"
)
def
url
(
self
)
->
str
:
"""Load LoRA before first request, then return URL"""
self
.
_ensure_lora_loaded
()
return
super
().
url
()
def
lora_chat_payload
(
lora_name
:
str
,
s3_uri
:
str
,
...
...
tests/utils/payloads.py
View file @
3c2b72b0
...
...
@@ -21,6 +21,8 @@ from copy import deepcopy
from
dataclasses
import
dataclass
from
typing
import
Any
,
Callable
,
Dict
,
List
,
Optional
import
requests
from
dynamo
import
prometheus_names
# type: ignore[attr-defined]
logger
=
logging
.
getLogger
(
__name__
)
...
...
@@ -240,6 +242,93 @@ class ToolCallingChatPayload(ChatPayload):
logger
.
info
(
f
"Expected tool '
{
self
.
expected_tool_name
}
' was called"
)
@
dataclass
class
LoraTestChatPayload
(
ChatPayload
):
"""
Chat payload that loads a LoRA adapter before sending inference requests.
This payload first loads the specified LoRA adapter via the system API,
then sends chat completion requests using the LoRA model.
"""
def
__init__
(
self
,
body
:
dict
,
lora_name
:
str
,
s3_uri
:
str
,
system_port
:
int
=
8081
,
repeat_count
:
int
=
1
,
expected_response
:
Optional
[
list
]
=
None
,
expected_log
:
Optional
[
list
]
=
None
,
timeout
:
int
=
60
,
):
super
().
__init__
(
body
=
body
,
repeat_count
=
repeat_count
,
expected_response
=
expected_response
or
[],
expected_log
=
expected_log
or
[],
timeout
=
timeout
,
)
self
.
system_port
=
system_port
self
.
lora_name
=
lora_name
self
.
s3_uri
=
s3_uri
self
.
_lora_loaded
=
False
def
_ensure_lora_loaded
(
self
)
->
None
:
"""Ensure the LoRA adapter is loaded before making inference requests"""
if
not
self
.
_lora_loaded
:
# Import the load_lora_adapter function
# Note: This import is done here to avoid circular dependencies
from
tests.serve.lora_utils
import
load_lora_adapter
load_lora_adapter
(
system_port
=
self
.
system_port
,
lora_name
=
self
.
lora_name
,
s3_uri
=
self
.
s3_uri
,
timeout
=
self
.
timeout
,
)
# Wait for the LoRA model to appear in /v1/models
models_url
=
f
"http://
{
self
.
host
}
:
{
self
.
port
}
/v1/models"
start_time
=
time
.
time
()
logger
.
info
(
f
"Waiting for LoRA model '
{
self
.
lora_name
}
' to appear in /v1/models..."
)
while
time
.
time
()
-
start_time
<
self
.
timeout
:
try
:
response
=
requests
.
get
(
models_url
,
timeout
=
5
)
if
response
.
status_code
==
200
:
data
=
response
.
json
()
models
=
data
.
get
(
"data"
,
[])
model_ids
=
[
m
.
get
(
"id"
,
""
)
for
m
in
models
]
if
self
.
lora_name
in
model_ids
:
logger
.
info
(
f
"LoRA model '
{
self
.
lora_name
}
' is now available"
)
self
.
_lora_loaded
=
True
return
logger
.
debug
(
f
"Available models:
{
model_ids
}
, waiting for '
{
self
.
lora_name
}
'..."
)
except
requests
.
RequestException
as
e
:
logger
.
debug
(
f
"Error checking /v1/models:
{
e
}
"
)
time
.
sleep
(
1
)
raise
RuntimeError
(
f
"Timeout: LoRA model '
{
self
.
lora_name
}
' did not appear in /v1/models within
{
self
.
timeout
}
s"
)
def
url
(
self
)
->
str
:
"""Load LoRA before first request, then return URL"""
self
.
_ensure_lora_loaded
()
return
super
().
url
()
@
dataclass
class
CompletionPayload
(
BasePayload
):
"""Payload for completions endpoint."""
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment