Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
86c3369e
Unverified
Commit
86c3369e
authored
Apr 09, 2025
by
Jee Jee Li
Committed by
GitHub
Apr 09, 2025
Browse files
[CI/Build] Fix CI LoRA failure (#16270)
Signed-off-by:
Jee Jee Li
<
pandaleefree@gmail.com
>
parent
2755c34a
Changes
8
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
19 additions
and
13 deletions
+19
-13
tests/lora/conftest.py
tests/lora/conftest.py
+12
-0
tests/lora/test_baichuan.py
tests/lora/test_baichuan.py
+0
-1
tests/lora/test_chatglm3_tp.py
tests/lora/test_chatglm3_tp.py
+0
-1
tests/lora/test_layers.py
tests/lora/test_layers.py
+1
-1
tests/lora/test_llama_tp.py
tests/lora/test_llama_tp.py
+0
-1
tests/lora/test_punica_ops.py
tests/lora/test_punica_ops.py
+5
-0
tests/lora/test_quant_model.py
tests/lora/test_quant_model.py
+1
-8
tests/lora/test_transfomers_model.py
tests/lora/test_transfomers_model.py
+0
-1
No files found.
tests/lora/conftest.py
View file @
86c3369e
...
@@ -256,3 +256,15 @@ def run_with_both_engines_lora(request, monkeypatch):
...
@@ -256,3 +256,15 @@ def run_with_both_engines_lora(request, monkeypatch):
monkeypatch
.
setenv
(
'VLLM_USE_V1'
,
'0'
)
monkeypatch
.
setenv
(
'VLLM_USE_V1'
,
'0'
)
yield
yield
@
pytest
.
fixture
def
reset_default_device
():
"""
Some tests, such as `test_punica_ops.py`, explicitly set the
default device, which can affect subsequent tests. Adding this fixture
helps avoid this problem.
"""
original_device
=
torch
.
get_default_device
()
yield
torch
.
set_default_device
(
original_device
)
tests/lora/test_baichuan.py
View file @
86c3369e
...
@@ -73,7 +73,6 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files,
...
@@ -73,7 +73,6 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files,
max_num_seqs
=
16
,
max_num_seqs
=
16
,
max_loras
=
4
,
max_loras
=
4
,
max_lora_rank
=
64
,
max_lora_rank
=
64
,
tensor_parallel_size
=
1
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
fully_sharded_loras
=
fully_sharded
)
fully_sharded_loras
=
fully_sharded
)
output_tp1
=
do_sample
(
llm_tp1
,
baichuan_lora_files
,
lora_id
=
1
)
output_tp1
=
do_sample
(
llm_tp1
,
baichuan_lora_files
,
lora_id
=
1
)
...
...
tests/lora/test_chatglm3_tp.py
View file @
86c3369e
...
@@ -61,7 +61,6 @@ def test_chatglm3_lora(chatglm3_lora_files):
...
@@ -61,7 +61,6 @@ def test_chatglm3_lora(chatglm3_lora_files):
enable_lora
=
True
,
enable_lora
=
True
,
max_loras
=
4
,
max_loras
=
4
,
max_lora_rank
=
64
,
max_lora_rank
=
64
,
tensor_parallel_size
=
1
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
enable_chunked_prefill
=
True
)
enable_chunked_prefill
=
True
)
...
...
tests/lora/test_layers.py
View file @
86c3369e
...
@@ -65,7 +65,7 @@ VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128
...
@@ -65,7 +65,7 @@ VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128
@
pytest
.
fixture
(
autouse
=
True
)
@
pytest
.
fixture
(
autouse
=
True
)
def
clean_cache
(
):
def
clean_cache
_reset_device
(
reset_default_device
):
# Release any memory we might be holding on to. CI runs OOMs otherwise.
# Release any memory we might be holding on to. CI runs OOMs otherwise.
from
vllm.lora.ops.triton_ops.utils
import
(
_LORA_A_PTR_DICT
,
from
vllm.lora.ops.triton_ops.utils
import
(
_LORA_A_PTR_DICT
,
_LORA_B_PTR_DICT
)
_LORA_B_PTR_DICT
)
...
...
tests/lora/test_llama_tp.py
View file @
86c3369e
...
@@ -88,7 +88,6 @@ def test_llama_lora(sql_lora_files):
...
@@ -88,7 +88,6 @@ def test_llama_lora(sql_lora_files):
# also test odd max_num_seqs
# also test odd max_num_seqs
max_num_seqs
=
13
,
max_num_seqs
=
13
,
max_loras
=
4
,
max_loras
=
4
,
tensor_parallel_size
=
1
,
enable_chunked_prefill
=
True
)
enable_chunked_prefill
=
True
)
generate_and_test
(
llm
,
sql_lora_files
)
generate_and_test
(
llm
,
sql_lora_files
)
...
...
tests/lora/test_punica_ops.py
View file @
86c3369e
...
@@ -13,6 +13,11 @@ from vllm.platforms import current_platform
...
@@ -13,6 +13,11 @@ from vllm.platforms import current_platform
from
.utils
import
PunicaTensors
,
assert_close
,
generate_data_for_nslices
from
.utils
import
PunicaTensors
,
assert_close
,
generate_data_for_nslices
@
pytest
.
fixture
(
autouse
=
True
)
def
reset_device
(
reset_default_device
):
pass
# Utility shrink and expand operations used as reference implementations.
# Utility shrink and expand operations used as reference implementations.
def
sgmv_shrink_for_nslices
(
def
sgmv_shrink_for_nslices
(
nslices
:
int
,
inputs_tensor
:
torch
.
Tensor
,
nslices
:
int
,
inputs_tensor
:
torch
.
Tensor
,
...
...
tests/lora/test_quant_model.py
View file @
86c3369e
...
@@ -78,12 +78,7 @@ def do_sample(llm: vllm.LLM,
...
@@ -78,12 +78,7 @@ def do_sample(llm: vllm.LLM,
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
])
def
test_quant_model_lora
(
tinyllama_lora_files
,
model
):
def
test_quant_model_lora
(
tinyllama_lora_files
,
num_gpus_available
,
model
,
tp_size
):
if
num_gpus_available
<
tp_size
and
\
tp_size
>
1
and
current_platform
.
is_cuda_alike
():
pytest
.
skip
(
f
"Not enough GPUs for tensor parallelism
{
tp_size
}
"
)
llm
=
vllm
.
LLM
(
llm
=
vllm
.
LLM
(
model
=
model
.
model_path
,
model
=
model
.
model_path
,
...
@@ -91,7 +86,6 @@ def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
...
@@ -91,7 +86,6 @@ def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
max_num_seqs
=
16
,
max_num_seqs
=
16
,
max_loras
=
4
,
max_loras
=
4
,
max_model_len
=
400
,
max_model_len
=
400
,
tensor_parallel_size
=
tp_size
,
gpu_memory_utilization
=
0.2
,
#avoid OOM
gpu_memory_utilization
=
0.2
,
#avoid OOM
quantization
=
model
.
quantization
,
quantization
=
model
.
quantization
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
...
@@ -185,7 +179,6 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
...
@@ -185,7 +179,6 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
enable_lora
=
True
,
enable_lora
=
True
,
max_num_seqs
=
16
,
max_num_seqs
=
16
,
max_loras
=
4
,
max_loras
=
4
,
tensor_parallel_size
=
1
,
gpu_memory_utilization
=
0.2
,
#avoid OOM
gpu_memory_utilization
=
0.2
,
#avoid OOM
quantization
=
model
.
quantization
,
quantization
=
model
.
quantization
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
...
...
tests/lora/test_transfomers_model.py
View file @
86c3369e
...
@@ -53,7 +53,6 @@ def test_ilama_lora(ilama_lora_files):
...
@@ -53,7 +53,6 @@ def test_ilama_lora(ilama_lora_files):
enable_lora
=
True
,
enable_lora
=
True
,
max_loras
=
4
,
max_loras
=
4
,
max_lora_rank
=
16
,
max_lora_rank
=
16
,
tensor_parallel_size
=
1
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
enable_chunked_prefill
=
True
)
enable_chunked_prefill
=
True
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment