Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
95a6568b
Unverified
Commit
95a6568b
authored
Jun 09, 2025
by
Jee Jee Li
Committed by
GitHub
Jun 09, 2025
Browse files
[CI/Build] Fix LoRA test (#19350)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
parent
0eca5eac
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing 5 changed files with 11 additions and 63 deletions
+11
-63
tests/lora/conftest.py
tests/lora/conftest.py
+0
-5
tests/lora/test_llama_tp.py
tests/lora/test_llama_tp.py
+0
-34
tests/lora/test_lora_functions.py
tests/lora/test_lora_functions.py
+0
-7
tests/lora/test_phi.py
tests/lora/test_phi.py
+1
-9
tests/lora/test_worker.py
tests/lora/test_worker.py
+10
-8
No files found.
tests/lora/conftest.py
View file @
95a6568b
...
...
@@ -164,11 +164,6 @@ def mixtral_lora_files():
     return snapshot_download(repo_id="SangBinCho/mixtral-lora")

-@pytest.fixture(scope="session")
-def gemma_lora_files():
-    return snapshot_download(repo_id="wskwon/gemma-7b-test-lora")

 @pytest.fixture(scope="session")
 def chatglm3_lora_files():
     return snapshot_download(repo_id="jeeejeee/chatglm3-text2sql-spider")
...
...
tests/lora/test_llama_tp.py
View file @
95a6568b
...
...
@@ -4,9 +4,6 @@ import subprocess
import
sys
from
typing
import
Union
import
pytest
import
ray
import
vllm
from
vllm
import
LLM
from
vllm.lora.request
import
LoRARequest
...
...
@@ -121,37 +118,6 @@ def test_llama_lora(sql_lora_files):
generate_and_test
(
llm
,
sql_lora_files
)
# Skipping for v1 as v1 doesn't have a good way to expose the num_gpu_blocks
# used by the engine yet.
@
pytest
.
mark
.
skip_v1
@
create_new_process_for_each_test
()
def
test_llama_lora_warmup
(
sql_lora_files
):
"""Test that the LLM initialization works with a warmup LORA path and
is more conservative"""
@
ray
.
remote
(
num_gpus
=
1
)
def
get_num_gpu_blocks_lora
():
llm
=
vllm
.
LLM
(
MODEL_PATH
,
enable_lora
=
True
,
max_num_seqs
=
16
)
num_gpu_blocks_lora_warmup
=
llm
.
llm_engine
.
cache_config
.
num_gpu_blocks
return
num_gpu_blocks_lora_warmup
@
ray
.
remote
(
num_gpus
=
1
)
def
get_num_gpu_blocks_no_lora
():
llm
=
vllm
.
LLM
(
MODEL_PATH
,
max_num_seqs
=
16
)
num_gpu_blocks_no_lora_warmup
=
(
llm
.
llm_engine
.
cache_config
.
num_gpu_blocks
)
return
num_gpu_blocks_no_lora_warmup
num_gpu_blocks_lora_warmup
=
ray
.
get
(
get_num_gpu_blocks_lora
.
remote
())
num_gpu_blocks_no_lora_warmup
=
ray
.
get
(
get_num_gpu_blocks_no_lora
.
remote
())
assert
num_gpu_blocks_lora_warmup
<
num_gpu_blocks_no_lora_warmup
,
(
"The warmup with lora should be more "
"conservative than without lora, therefore the number of "
"memory blocks for the KV cache should be "
"less when using lora than when not using lora"
)
@
multi_gpu_test
(
num_gpus
=
4
)
@
create_new_process_for_each_test
()
def
test_llama_lora_tp4
(
sql_lora_files
):
...
...
tests/lora/test_lora_functions.py
View file @
95a6568b
...
...
@@ -15,13 +15,6 @@ MODEL_PATH = "meta-llama/Llama-2-7b-hf"
LORA_MODULE_PATH
=
"yard1/llama-2-7b-sql-lora-test"
LORA_RANK
=
8
# @pytest.fixture(autouse=True)
# def v1(run_with_both_engines_lora):
# # Simple autouse wrapper to run both engines for each test
# # This can be promoted up to conftest.py to run for every
# # test in a package
# pass
def
make_lora_request
(
lora_id
:
int
):
return
LoRARequest
(
lora_name
=
f
"
{
lora_id
}
"
,
...
...
tests/lora/test_phi.py
View file @
95a6568b
...
...
@@ -11,14 +11,6 @@ MODEL_PATH = "microsoft/phi-2"
PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:"  # noqa: E501
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
run_with_both_engines_lora
):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
def
do_sample
(
llm
:
vllm
.
LLM
,
lora_path
:
str
,
lora_id
:
int
)
->
list
[
str
]:
prompts
=
[
PROMPT_TEMPLATE
.
format
(
...
...
@@ -59,7 +51,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
# Skipping for V1 for now as we are hitting,
# "Head size 80 is not supported by FlashAttention." error.
-@pytest.mark.skip_v1
+@pytest.mark.skip(reason="Head size 80 is not supported by FlashAttention")
def
test_phi2_lora
(
phi2_lora_files
):
# We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
# Otherwise, the lora-test will fail due to CUDA OOM.
...
...
tests/lora/test_worker.py
View file @
95a6568b
...
...
@@ -16,6 +16,8 @@ from vllm.lora.request import LoRARequest
from
vllm.v1.worker.gpu_worker
import
Worker
as
V1Worker
from
vllm.worker.worker
import
Worker
NUM_LORAS
=
16
@
patch
.
dict
(
os
.
environ
,
{
"RANK"
:
"0"
})
def
test_worker_apply_lora
(
sql_lora_files
):
...
...
@@ -58,12 +60,12 @@ def test_worker_apply_lora(sql_lora_files):
device_config
=
DeviceConfig
(
"cuda"
),
cache_config
=
CacheConfig
(
block_size
=
16
,
gpu_memory_utilization
=
1.0
,
swap_space
=
0
,
cache_dtype
=
"auto"
,
),
-        lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32, max_loras=32),
+        lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=NUM_LORAS, max_loras=NUM_LORAS),
)
worker
=
worker_cls
(
vllm_config
=
vllm_config
,
...
...
@@ -78,9 +80,9 @@ def test_worker_apply_lora(sql_lora_files):
set_active_loras
(
worker
,
[])
assert
worker
.
list_loras
()
==
set
()
-    n_loras = 32
     lora_requests = [
-        LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(n_loras)
+        LoRARequest(str(i + 1), i + 1, sql_lora_files)
+        for i in range(NUM_LORAS)
     ]
set_active_loras
(
worker
,
lora_requests
)
...
...
@@ -89,12 +91,12 @@ def test_worker_apply_lora(sql_lora_files):
for
lora_request
in
lora_requests
}
-    for i in range(32):
+    for i in range(NUM_LORAS):
         random.seed(i)
         iter_lora_requests = random.choices(lora_requests,
-                                            k=random.randint(1, n_loras))
+                                            k=random.randint(1, NUM_LORAS))
         random.shuffle(iter_lora_requests)
-        iter_lora_requests = iter_lora_requests[:-random.randint(0, n_loras)]
+        iter_lora_requests = iter_lora_requests[:-random.randint(0, NUM_LORAS)]
set_active_loras
(
worker
,
lora_requests
)
assert
worker
.
list_loras
().
issuperset
(
{
lora_request
.
lora_int_id
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment