Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
95a6568b
Unverified
Commit
95a6568b
authored
Jun 09, 2025
by
Jee Jee Li
Committed by
GitHub
Jun 09, 2025
Browse files
[CI/Build] Fix LoRA test (#19350)
Signed-off-by:
Jee Jee Li
<
pandaleefree@gmail.com
>
parent
0eca5eac
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
11 additions
and
63 deletions
+11
-63
tests/lora/conftest.py
tests/lora/conftest.py
+0
-5
tests/lora/test_llama_tp.py
tests/lora/test_llama_tp.py
+0
-34
tests/lora/test_lora_functions.py
tests/lora/test_lora_functions.py
+0
-7
tests/lora/test_phi.py
tests/lora/test_phi.py
+1
-9
tests/lora/test_worker.py
tests/lora/test_worker.py
+10
-8
No files found.
tests/lora/conftest.py
View file @
95a6568b
...
@@ -164,11 +164,6 @@ def mixtral_lora_files():
...
@@ -164,11 +164,6 @@ def mixtral_lora_files():
return
snapshot_download
(
repo_id
=
"SangBinCho/mixtral-lora"
)
return
snapshot_download
(
repo_id
=
"SangBinCho/mixtral-lora"
)
@
pytest
.
fixture
(
scope
=
"session"
)
def
gemma_lora_files
():
return
snapshot_download
(
repo_id
=
"wskwon/gemma-7b-test-lora"
)
@
pytest
.
fixture
(
scope
=
"session"
)
@
pytest
.
fixture
(
scope
=
"session"
)
def
chatglm3_lora_files
():
def
chatglm3_lora_files
():
return
snapshot_download
(
repo_id
=
"jeeejeee/chatglm3-text2sql-spider"
)
return
snapshot_download
(
repo_id
=
"jeeejeee/chatglm3-text2sql-spider"
)
...
...
tests/lora/test_llama_tp.py
View file @
95a6568b
...
@@ -4,9 +4,6 @@ import subprocess
...
@@ -4,9 +4,6 @@ import subprocess
import
sys
import
sys
from
typing
import
Union
from
typing
import
Union
import
pytest
import
ray
import
vllm
import
vllm
from
vllm
import
LLM
from
vllm
import
LLM
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
...
@@ -121,37 +118,6 @@ def test_llama_lora(sql_lora_files):
...
@@ -121,37 +118,6 @@ def test_llama_lora(sql_lora_files):
generate_and_test
(
llm
,
sql_lora_files
)
generate_and_test
(
llm
,
sql_lora_files
)
# Skipping for v1 as v1 doesn't have a good way to expose the num_gpu_blocks
# used by the engine yet.
@
pytest
.
mark
.
skip_v1
@
create_new_process_for_each_test
()
def
test_llama_lora_warmup
(
sql_lora_files
):
"""Test that the LLM initialization works with a warmup LORA path and
is more conservative"""
@
ray
.
remote
(
num_gpus
=
1
)
def
get_num_gpu_blocks_lora
():
llm
=
vllm
.
LLM
(
MODEL_PATH
,
enable_lora
=
True
,
max_num_seqs
=
16
)
num_gpu_blocks_lora_warmup
=
llm
.
llm_engine
.
cache_config
.
num_gpu_blocks
return
num_gpu_blocks_lora_warmup
@
ray
.
remote
(
num_gpus
=
1
)
def
get_num_gpu_blocks_no_lora
():
llm
=
vllm
.
LLM
(
MODEL_PATH
,
max_num_seqs
=
16
)
num_gpu_blocks_no_lora_warmup
=
(
llm
.
llm_engine
.
cache_config
.
num_gpu_blocks
)
return
num_gpu_blocks_no_lora_warmup
num_gpu_blocks_lora_warmup
=
ray
.
get
(
get_num_gpu_blocks_lora
.
remote
())
num_gpu_blocks_no_lora_warmup
=
ray
.
get
(
get_num_gpu_blocks_no_lora
.
remote
())
assert
num_gpu_blocks_lora_warmup
<
num_gpu_blocks_no_lora_warmup
,
(
"The warmup with lora should be more "
"conservative than without lora, therefore the number of "
"memory blocks for the KV cache should be "
"less when using lora than when not using lora"
)
@
multi_gpu_test
(
num_gpus
=
4
)
@
multi_gpu_test
(
num_gpus
=
4
)
@
create_new_process_for_each_test
()
@
create_new_process_for_each_test
()
def
test_llama_lora_tp4
(
sql_lora_files
):
def
test_llama_lora_tp4
(
sql_lora_files
):
...
...
tests/lora/test_lora_functions.py
View file @
95a6568b
...
@@ -15,13 +15,6 @@ MODEL_PATH = "meta-llama/Llama-2-7b-hf"
...
@@ -15,13 +15,6 @@ MODEL_PATH = "meta-llama/Llama-2-7b-hf"
LORA_MODULE_PATH
=
"yard1/llama-2-7b-sql-lora-test"
LORA_MODULE_PATH
=
"yard1/llama-2-7b-sql-lora-test"
LORA_RANK
=
8
LORA_RANK
=
8
# @pytest.fixture(autouse=True)
# def v1(run_with_both_engines_lora):
# # Simple autouse wrapper to run both engines for each test
# # This can be promoted up to conftest.py to run for every
# # test in a package
# pass
def
make_lora_request
(
lora_id
:
int
):
def
make_lora_request
(
lora_id
:
int
):
return
LoRARequest
(
lora_name
=
f
"
{
lora_id
}
"
,
return
LoRARequest
(
lora_name
=
f
"
{
lora_id
}
"
,
...
...
tests/lora/test_phi.py
View file @
95a6568b
...
@@ -11,14 +11,6 @@ MODEL_PATH = "microsoft/phi-2"
...
@@ -11,14 +11,6 @@ MODEL_PATH = "microsoft/phi-2"
PROMPT_TEMPLATE
=
"### Instruct: {sql_prompt}
\n\n
### Context: {context}
\n\n
### Output:"
# noqa: E501
PROMPT_TEMPLATE
=
"### Instruct: {sql_prompt}
\n\n
### Context: {context}
\n\n
### Output:"
# noqa: E501
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
run_with_both_engines_lora
):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
def
do_sample
(
llm
:
vllm
.
LLM
,
lora_path
:
str
,
lora_id
:
int
)
->
list
[
str
]:
def
do_sample
(
llm
:
vllm
.
LLM
,
lora_path
:
str
,
lora_id
:
int
)
->
list
[
str
]:
prompts
=
[
prompts
=
[
PROMPT_TEMPLATE
.
format
(
PROMPT_TEMPLATE
.
format
(
...
@@ -59,7 +51,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
...
@@ -59,7 +51,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
# Skipping for V1 for now as we are hitting,
# Skipping for V1 for now as we are hitting,
# "Head size 80 is not supported by FlashAttention." error.
# "Head size 80 is not supported by FlashAttention." error.
@
pytest
.
mark
.
skip
_v1
@
pytest
.
mark
.
skip
(
reason
=
"Head size 80 is not supported by FlashAttention"
)
def
test_phi2_lora
(
phi2_lora_files
):
def
test_phi2_lora
(
phi2_lora_files
):
# We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
# We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
# Otherwise, the lora-test will fail due to CUDA OOM.
# Otherwise, the lora-test will fail due to CUDA OOM.
...
...
tests/lora/test_worker.py
View file @
95a6568b
...
@@ -16,6 +16,8 @@ from vllm.lora.request import LoRARequest
...
@@ -16,6 +16,8 @@ from vllm.lora.request import LoRARequest
from
vllm.v1.worker.gpu_worker
import
Worker
as
V1Worker
from
vllm.v1.worker.gpu_worker
import
Worker
as
V1Worker
from
vllm.worker.worker
import
Worker
from
vllm.worker.worker
import
Worker
NUM_LORAS
=
16
@
patch
.
dict
(
os
.
environ
,
{
"RANK"
:
"0"
})
@
patch
.
dict
(
os
.
environ
,
{
"RANK"
:
"0"
})
def
test_worker_apply_lora
(
sql_lora_files
):
def
test_worker_apply_lora
(
sql_lora_files
):
...
@@ -58,12 +60,12 @@ def test_worker_apply_lora(sql_lora_files):
...
@@ -58,12 +60,12 @@ def test_worker_apply_lora(sql_lora_files):
device_config
=
DeviceConfig
(
"cuda"
),
device_config
=
DeviceConfig
(
"cuda"
),
cache_config
=
CacheConfig
(
cache_config
=
CacheConfig
(
block_size
=
16
,
block_size
=
16
,
gpu_memory_utilization
=
1.0
,
swap_space
=
0
,
swap_space
=
0
,
cache_dtype
=
"auto"
,
cache_dtype
=
"auto"
,
),
),
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_cpu_loras
=
32
,
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
32
),
max_cpu_loras
=
NUM_LORAS
,
max_loras
=
NUM_LORAS
),
)
)
worker
=
worker_cls
(
worker
=
worker_cls
(
vllm_config
=
vllm_config
,
vllm_config
=
vllm_config
,
...
@@ -78,9 +80,9 @@ def test_worker_apply_lora(sql_lora_files):
...
@@ -78,9 +80,9 @@ def test_worker_apply_lora(sql_lora_files):
set_active_loras
(
worker
,
[])
set_active_loras
(
worker
,
[])
assert
worker
.
list_loras
()
==
set
()
assert
worker
.
list_loras
()
==
set
()
n_loras
=
32
lora_requests
=
[
lora_requests
=
[
LoRARequest
(
str
(
i
+
1
),
i
+
1
,
sql_lora_files
)
for
i
in
range
(
n_loras
)
LoRARequest
(
str
(
i
+
1
),
i
+
1
,
sql_lora_files
)
for
i
in
range
(
NUM_LORAS
)
]
]
set_active_loras
(
worker
,
lora_requests
)
set_active_loras
(
worker
,
lora_requests
)
...
@@ -89,12 +91,12 @@ def test_worker_apply_lora(sql_lora_files):
...
@@ -89,12 +91,12 @@ def test_worker_apply_lora(sql_lora_files):
for
lora_request
in
lora_requests
for
lora_request
in
lora_requests
}
}
for
i
in
range
(
32
):
for
i
in
range
(
NUM_LORAS
):
random
.
seed
(
i
)
random
.
seed
(
i
)
iter_lora_requests
=
random
.
choices
(
lora_requests
,
iter_lora_requests
=
random
.
choices
(
lora_requests
,
k
=
random
.
randint
(
1
,
n_loras
))
k
=
random
.
randint
(
1
,
NUM_LORAS
))
random
.
shuffle
(
iter_lora_requests
)
random
.
shuffle
(
iter_lora_requests
)
iter_lora_requests
=
iter_lora_requests
[:
-
random
.
randint
(
0
,
n_loras
)]
iter_lora_requests
=
iter_lora_requests
[:
-
random
.
randint
(
0
,
NUM_LORAS
)]
set_active_loras
(
worker
,
lora_requests
)
set_active_loras
(
worker
,
lora_requests
)
assert
worker
.
list_loras
().
issuperset
(
assert
worker
.
list_loras
().
issuperset
(
{
lora_request
.
lora_int_id
{
lora_request
.
lora_int_id
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment