Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d1e21a97
Unverified
Commit
d1e21a97
authored
Dec 12, 2024
by
Cyrus Leung
Committed by
GitHub
Dec 12, 2024
Browse files
[CI/Build] Split up VLM tests (#11083)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
72ff3a96
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
94 additions
and
50 deletions
+94
-50
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+21
-11
pyproject.toml
pyproject.toml
+2
-1
tests/models/decoder_only/vision_language/test_models.py
tests/models/decoder_only/vision_language/test_models.py
+46
-26
tests/utils.py
tests/utils.py
+25
-12
No files found.
.buildkite/test-pipeline.yaml
View file @
d1e21a97
...
@@ -321,7 +321,7 @@ steps:
...
@@ -321,7 +321,7 @@ steps:
##### models test #####
##### models test #####
-
label
:
Basic Models Test
#
30
min
-
label
:
Basic Models Test
#
24
min
source_file_dependencies
:
source_file_dependencies
:
-
vllm/
-
vllm/
-
tests/models
-
tests/models
...
@@ -331,7 +331,7 @@ steps:
...
@@ -331,7 +331,7 @@ steps:
-
pytest -v -s models/test_registry.py
-
pytest -v -s models/test_registry.py
-
pytest -v -s models/test_initialization.py
-
pytest -v -s models/test_initialization.py
-
label
:
Language Models Test (Standard)
#
4
2min
-
label
:
Language Models Test (Standard)
#
3
2min
#mirror_hardwares: [amd]
#mirror_hardwares: [amd]
source_file_dependencies
:
source_file_dependencies
:
-
vllm/
-
vllm/
...
@@ -342,7 +342,7 @@ steps:
...
@@ -342,7 +342,7 @@ steps:
-
pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
-
pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
-
pytest -v -s models/embedding/language -m core_model
-
pytest -v -s models/embedding/language -m core_model
-
label
:
Language Models Test (Extended)
#
5
0min
-
label
:
Language Models Test (Extended)
#
1h1
0min
optional
:
true
optional
:
true
source_file_dependencies
:
source_file_dependencies
:
-
vllm/
-
vllm/
...
@@ -353,7 +353,7 @@ steps:
...
@@ -353,7 +353,7 @@ steps:
-
pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
-
pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
-
pytest -v -s models/embedding/language -m 'not core_model'
-
pytest -v -s models/embedding/language -m 'not core_model'
-
label
:
Multi-Modal Models Test (Standard)
# 2
6
min
-
label
:
Multi-Modal Models Test (Standard)
# 2
8
min
#mirror_hardwares: [amd]
#mirror_hardwares: [amd]
source_file_dependencies
:
source_file_dependencies
:
-
vllm/
-
vllm/
...
@@ -369,7 +369,7 @@ steps:
...
@@ -369,7 +369,7 @@ steps:
-
pytest -v -s models/encoder_decoder/language -m core_model
-
pytest -v -s models/encoder_decoder/language -m core_model
-
pytest -v -s models/encoder_decoder/vision_language -m core_model
-
pytest -v -s models/encoder_decoder/vision_language -m core_model
-
label
:
Multi-Modal Models Test (Extended)
# 1h1
5
m
-
label
:
Multi-Modal Models Test (Extended)
1
# 1h1
6
m
optional
:
true
optional
:
true
source_file_dependencies
:
source_file_dependencies
:
-
vllm/
-
vllm/
...
@@ -380,14 +380,24 @@ steps:
...
@@ -380,14 +380,24 @@ steps:
commands
:
commands
:
-
pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-
pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-
pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
-
pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
-
pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
# HACK - run phi3v tests separately to sidestep this transformers bug
# HACK - run phi3v tests separately to sidestep this transformers bug
# https://github.com/huggingface/transformers/issues/34307
# https://github.com/huggingface/transformers/issues/34307
-
pytest -v -s models/decoder_only/vision_language/test_phi3v.py
-
pytest -v -s models/decoder_only/vision_language/test_phi3v.py
-
pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
-
pytest -v -s --ignore
models/decoder_only/vision_language/test_models.py --ignore
models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
-
pytest -v -s models/embedding/vision_language -m 'not core_model'
-
pytest -v -s models/embedding/vision_language -m 'not core_model'
-
pytest -v -s models/encoder_decoder/language -m 'not core_model'
-
pytest -v -s models/encoder_decoder/language -m 'not core_model'
-
pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
-
pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
- label: Multi-Modal Models Test (Extended) 2 # 38m
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/decoder_only/vision_language
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
# This test is used only in PR development phase to test individual models and should never run on main
# This test is used only in PR development phase to test individual models and should never run on main
-
label
:
Custom Models Test
-
label
:
Custom Models Test
optional
:
true
optional
:
true
...
@@ -446,11 +456,11 @@ steps:
...
@@ -446,11 +456,11 @@ steps:
-
pytest -v -s ./compile/test_basic_correctness.py
-
pytest -v -s ./compile/test_basic_correctness.py
-
pytest -v -s ./compile/test_wrapper.py
-
pytest -v -s ./compile/test_wrapper.py
-
VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
-
VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
-
TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed
_2
_gpus
-
TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m
'
distributed
(num
_gpus
=2)'
# Avoid importing model tests that cause CUDA reinitialization error
# Avoid importing model tests that cause CUDA reinitialization error
-
pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed
_2
_gpus
-
pytest models/encoder_decoder/language/test_bart.py -v -s -m
'
distributed
(num
_gpus
=2)'
-
pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed
_2
_gpus
-
pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m
'
distributed
(num
_gpus
=2)'
-
pytest models/decoder_only/vision_language/test_models.py -v -s -m distributed
_2
_gpus
-
pytest models/decoder_only/vision_language/test_models.py -v -s -m
'
distributed
(num
_gpus
=2)'
-
pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
-
pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
-
pip install -e ./plugins/vllm_add_dummy_model
-
pip install -e ./plugins/vllm_add_dummy_model
-
pytest -v -s distributed/test_distributed_oot.py
-
pytest -v -s distributed/test_distributed_oot.py
...
@@ -540,7 +550,7 @@ steps:
...
@@ -540,7 +550,7 @@ steps:
# see https://github.com/vllm-project/vllm/pull/5689 for details
# see https://github.com/vllm-project/vllm/pull/5689 for details
-
pytest -v -s distributed/test_custom_all_reduce.py
-
pytest -v -s distributed/test_custom_all_reduce.py
-
torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
-
torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
-
TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed
_2
_gpus
-
TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m
'
distributed
(num
_gpus
=2)'
-
pytest -v -s -x lora/test_mixtral.py
-
pytest -v -s -x lora/test_mixtral.py
-
label
:
LM Eval Large Models
# optional
-
label
:
LM Eval Large Models
# optional
...
...
pyproject.toml
View file @
d1e21a97
...
@@ -96,7 +96,8 @@ markers = [
...
@@ -96,7 +96,8 @@ markers = [
"core_model: enable this model test in each PR instead of only nightly"
,
"core_model: enable this model test in each PR instead of only nightly"
,
"cpu_model: enable this model test in CPU tests"
,
"cpu_model: enable this model test in CPU tests"
,
"quant_model: run this model test under Quantized category"
,
"quant_model: run this model test under Quantized category"
,
"distributed_2_gpus: run this test only in distributed tests for 2 GPUs"
,
"split: run this test as part of a split"
,
"distributed: run this test only in distributed GPU tests"
,
"skip_v1: do not run this test with v1"
,
"skip_v1: do not run this test with v1"
,
"optional: optional tests that are automatically skipped, include --optional to run them"
,
"optional: optional tests that are automatically skipped, include --optional to run them"
,
]
]
tests/models/decoder_only/vision_language/test_models.py
View file @
d1e21a97
"""Common tests for testing .generate() functionality for single / multiple
"""Common tests for testing .generate() functionality for single / multiple
image, embedding, and video support for different VLMs in vLLM.
image, embedding, and video support for different VLMs in vLLM.
"""
"""
import
math
import
os
import
os
from
collections
import
defaultdict
from
pathlib
import
PosixPath
from
pathlib
import
PosixPath
from
typing
import
Type
from
typing
import
Type
...
@@ -10,11 +12,12 @@ from transformers import AutoModelForVision2Seq
...
@@ -10,11 +12,12 @@ from transformers import AutoModelForVision2Seq
from
transformers.utils
import
is_flash_attn_2_available
from
transformers.utils
import
is_flash_attn_2_available
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils
import
cuda_device_count_stateless
,
identity
from
vllm.utils
import
identity
from
....conftest
import
(
IMAGE_ASSETS
,
HfRunner
,
VllmRunner
,
_ImageAssets
,
from
....conftest
import
(
IMAGE_ASSETS
,
HfRunner
,
VllmRunner
,
_ImageAssets
,
_VideoAssets
)
_VideoAssets
)
from
....utils
import
fork_new_process_for_each_test
,
large_gpu_mark
from
....utils
import
(
fork_new_process_for_each_test
,
large_gpu_mark
,
multi_gpu_marks
)
from
...utils
import
check_outputs_equal
from
...utils
import
check_outputs_equal
from
.vlm_utils
import
custom_inputs
,
model_utils
,
runners
from
.vlm_utils
import
custom_inputs
,
model_utils
,
runners
from
.vlm_utils.case_filtering
import
get_parametrized_options
from
.vlm_utils.case_filtering
import
get_parametrized_options
...
@@ -382,7 +385,7 @@ VLM_TEST_SETTINGS = {
...
@@ -382,7 +385,7 @@ VLM_TEST_SETTINGS = {
prompt_path_encoder
=
model_utils
.
qwen_prompt_path_encoder
,
prompt_path_encoder
=
model_utils
.
qwen_prompt_path_encoder
,
),
),
### Tensor parallel / multi-gpu broadcast tests
### Tensor parallel / multi-gpu broadcast tests
"
broadcast-chameleon
"
:
VLMTestInfo
(
"
chameleon-broadcast
"
:
VLMTestInfo
(
models
=
[
"facebook/chameleon-7b"
],
models
=
[
"facebook/chameleon-7b"
],
prompt_formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
,
prompt_formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
,
max_model_len
=
4096
,
max_model_len
=
4096
,
...
@@ -393,43 +396,25 @@ VLM_TEST_SETTINGS = {
...
@@ -393,43 +396,25 @@ VLM_TEST_SETTINGS = {
vllm_output_post_proc
=
lambda
vllm_output
,
model
:
vllm_output
[:
2
],
vllm_output_post_proc
=
lambda
vllm_output
,
model
:
vllm_output
[:
2
],
hf_output_post_proc
=
lambda
hf_output
,
model
:
hf_output
[:
2
],
hf_output_post_proc
=
lambda
hf_output
,
model
:
hf_output
[:
2
],
comparator
=
check_outputs_equal
,
comparator
=
check_outputs_equal
,
marks
=
[
marks
=
multi_gpu_marks
(
num_gpus
=
2
),
pytest
.
mark
.
distributed_2_gpus
,
pytest
.
mark
.
skipif
(
cuda_device_count_stateless
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
,
),
],
**
COMMON_BROADCAST_SETTINGS
# type: ignore
**
COMMON_BROADCAST_SETTINGS
# type: ignore
),
),
"broadcast
-llava
"
:
VLMTestInfo
(
"
llava-
broadcast"
:
VLMTestInfo
(
models
=
[
"llava-hf/llava-1.5-7b-hf"
],
models
=
[
"llava-hf/llava-1.5-7b-hf"
],
prompt_formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
,
prompt_formatter
=
lambda
img_prompt
:
f
"USER:
{
img_prompt
}
\n
ASSISTANT:"
,
max_model_len
=
4096
,
max_model_len
=
4096
,
auto_cls
=
AutoModelForVision2Seq
,
auto_cls
=
AutoModelForVision2Seq
,
vllm_output_post_proc
=
model_utils
.
llava_image_vllm_to_hf_output
,
vllm_output_post_proc
=
model_utils
.
llava_image_vllm_to_hf_output
,
marks
=
[
marks
=
multi_gpu_marks
(
num_gpus
=
2
),
pytest
.
mark
.
distributed_2_gpus
,
pytest
.
mark
.
skipif
(
cuda_device_count_stateless
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
,
)
],
**
COMMON_BROADCAST_SETTINGS
# type: ignore
**
COMMON_BROADCAST_SETTINGS
# type: ignore
),
),
"
broadcast-llava_nex
t"
:
VLMTestInfo
(
"
llava_next-broadcas
t"
:
VLMTestInfo
(
models
=
[
"llava-hf/llava-v1.6-mistral-7b-hf"
],
models
=
[
"llava-hf/llava-v1.6-mistral-7b-hf"
],
prompt_formatter
=
lambda
img_prompt
:
f
"[INST]
{
img_prompt
}
[/INST]"
,
prompt_formatter
=
lambda
img_prompt
:
f
"[INST]
{
img_prompt
}
[/INST]"
,
max_model_len
=
10240
,
max_model_len
=
10240
,
auto_cls
=
AutoModelForVision2Seq
,
auto_cls
=
AutoModelForVision2Seq
,
vllm_output_post_proc
=
model_utils
.
llava_image_vllm_to_hf_output
,
vllm_output_post_proc
=
model_utils
.
llava_image_vllm_to_hf_output
,
marks
=
[
marks
=
multi_gpu_marks
(
num_gpus
=
2
),
pytest
.
mark
.
distributed_2_gpus
,
pytest
.
mark
.
skipif
(
cuda_device_count_stateless
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
,
)
],
**
COMMON_BROADCAST_SETTINGS
# type: ignore
**
COMMON_BROADCAST_SETTINGS
# type: ignore
),
),
### Custom input edge-cases for specific models
### Custom input edge-cases for specific models
...
@@ -468,6 +453,41 @@ VLM_TEST_SETTINGS = {
...
@@ -468,6 +453,41 @@ VLM_TEST_SETTINGS = {
# yapf: enable
# yapf: enable
def _mark_splits(
    test_settings: dict[str, VLMTestInfo],
    *,
    num_groups: int,
) -> dict[str, VLMTestInfo]:
    """Tag every test setting with a ``split(group=i)`` pytest mark.

    The distinct model names referenced by ``test_settings`` are sorted and
    cut into ``num_groups`` contiguous chunks of at most
    ``ceil(len(models) / num_groups)`` models each. Every test info that
    lists a model from chunk ``i`` gets ``pytest.mark.split(group=i)``
    appended to its existing marks, so a test run can select a single chunk
    with a marker expression such as ``-m 'split(group=0)'``.

    A test info whose models fall into different chunks keeps only the mark
    of the last chunk processed (later groups overwrite earlier ones), so
    each entry ends up in exactly one group.

    Args:
        test_settings: Mapping from test name to its ``VLMTestInfo``.
            It is not modified; updated copies are made via ``_replace``.
        num_groups: Number of groups to distribute the models over.
            Must be at least 1.

    Returns:
        A new mapping with the same keys as ``test_settings`` whose values
        carry the additional ``split`` mark.

    Raises:
        ValueError: If ``num_groups`` is smaller than 1.
        AssertionError: If some entry was not assigned to any group
            (for example because it lists no models).
    """
    if num_groups < 1:
        raise ValueError(f"num_groups must be at least 1, got {num_groups}")

    # Group (name, info) pairs by model. Carrying the name along avoids a
    # reverse lookup keyed on id(info), which would lose a name whenever the
    # same VLMTestInfo object is registered under more than one key.
    entries_by_model = defaultdict[str, list[tuple[str, VLMTestInfo]]](list)
    for name, info in test_settings.items():
        for model in info.models:
            entries_by_model[model].append((name, info))

    # Sorting makes the model -> group assignment deterministic across runs.
    models = sorted(entries_by_model)
    split_size = math.ceil(len(models) / num_groups)

    new_test_settings = dict[str, VLMTestInfo]()
    for group in range(num_groups):
        start = group * split_size
        for model in models[start:start + split_size]:
            for name, info in entries_by_model[model]:
                new_marks = [
                    *(info.marks or ()),
                    pytest.mark.split(group=group),
                ]
                new_test_settings[name] = info._replace(marks=new_marks)

    # Sanity check: every original entry must have landed in some group.
    missing_keys = test_settings.keys() - new_test_settings.keys()
    assert not missing_keys, f"Missing keys: {missing_keys}"

    return new_test_settings


# Split the test matrix into two groups, selectable with
# `-m 'split(group=0)'` and `-m 'split(group=1)'`.
VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
### Test wrappers
### Test wrappers
# Wrappers around the core test running func for:
# Wrappers around the core test running func for:
# - single image
# - single image
...
...
tests/utils.py
View file @
d1e21a97
...
@@ -682,10 +682,12 @@ def fork_new_process_for_each_test(
...
@@ -682,10 +682,12 @@ def fork_new_process_for_each_test(
def
large_gpu_mark
(
min_gb
:
int
)
->
pytest
.
MarkDecorator
:
def
large_gpu_mark
(
min_gb
:
int
)
->
pytest
.
MarkDecorator
:
"""Gets a pytest skipif mark, which triggers ig the the device doesn't have
"""
meet a minimum memory requirement in gb; can be leveraged via
Get a pytest mark, which skips the test if the GPU doesn't meet
@large_gpu_test to skip tests in environments without enough resources, or
a minimum memory requirement in GB.
called when filtering tests to run directly.
This can be leveraged via `@large_gpu_test` to skip tests in environments
without enough resources, or called when filtering tests to run directly.
"""
"""
try
:
try
:
if
current_platform
.
is_cpu
():
if
current_platform
.
is_cpu
():
...
@@ -712,26 +714,37 @@ def large_gpu_test(*, min_gb: int):
...
@@ -712,26 +714,37 @@ def large_gpu_test(*, min_gb: int):
Currently, the CI machine uses L4 GPU which has 24 GB VRAM.
Currently, the CI machine uses L4 GPU which has 24 GB VRAM.
"""
"""
test_skipif
=
large_gpu_mark
(
min_gb
)
mark
=
large_gpu_mark
(
min_gb
)
def
wrapper
(
f
:
Callable
[
_P
,
None
])
->
Callable
[
_P
,
None
]:
def
wrapper
(
f
:
Callable
[
_P
,
None
])
->
Callable
[
_P
,
None
]:
return
test_skipif
(
f
)
return
mark
(
f
)
return
wrapper
return
wrapper
def
multi_gpu_test
(
*
,
num_gpus
:
int
):
def
multi_gpu_marks
(
*
,
num_gpus
:
int
):
"""
"""Get a collection of pytest marks to apply for `@multi_gpu_test`."""
Decorate a test to be run only when multiple GPUs are available.
test_selector
=
pytest
.
mark
.
distributed
(
num_gpus
=
num_gpus
)
"""
test_selector
=
getattr
(
pytest
.
mark
,
f
"distributed_
{
num_gpus
}
_gpus"
)
test_skipif
=
pytest
.
mark
.
skipif
(
test_skipif
=
pytest
.
mark
.
skipif
(
cuda_device_count_stateless
()
<
num_gpus
,
cuda_device_count_stateless
()
<
num_gpus
,
reason
=
f
"Need at least
{
num_gpus
}
GPUs to run the test."
,
reason
=
f
"Need at least
{
num_gpus
}
GPUs to run the test."
,
)
)
return
[
test_selector
,
test_skipif
]
def
multi_gpu_test
(
*
,
num_gpus
:
int
):
"""
Decorate a test to be run only when multiple GPUs are available.
"""
marks
=
multi_gpu_marks
(
num_gpus
=
num_gpus
)
def
wrapper
(
f
:
Callable
[
_P
,
None
])
->
Callable
[
_P
,
None
]:
def
wrapper
(
f
:
Callable
[
_P
,
None
])
->
Callable
[
_P
,
None
]:
return
test_selector
(
test_skipif
(
fork_new_process_for_each_test
(
f
)))
func
=
fork_new_process_for_each_test
(
f
)
for
mark
in
reversed
(
marks
):
func
=
mark
(
func
)
return
func
return
wrapper
return
wrapper
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment