[CI/Build] Split up VLM tests (#11083)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

[CI/Build] Split up VLM tests (#11083)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
d1e21a97 · Cyrus Leung · GitHub · 72ff3a96 · d1e21a97 · d1e21a97
Unverified Commit d1e21a97 authored Dec 12, 2024 by Cyrus Leung Committed by GitHub Dec 12, 2024
4 changed files
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -321,7 +321,7 @@ steps:

 #####  models test  #####

- label: Basic Models Test # 30min
+- label: Basic Models Test # 24min
  source_file_dependencies:
  - vllm/
  - tests/models
@@ -331,7 +331,7 @@ steps:
    - pytest -v -s models/test_registry.py
    - pytest -v -s models/test_initialization.py

- label: Language Models Test (Standard) # 42min
+- label: Language Models Test (Standard) # 32min
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
@@ -342,7 +342,7 @@ steps:
    - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
    - pytest -v -s models/embedding/language -m core_model

- label: Language Models Test (Extended) # 50min
+- label: Language Models Test (Extended) # 1h10min
  optional: true
  source_file_dependencies:
  - vllm/
@@ -353,7 +353,7 @@ steps:
    - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
    - pytest -v -s models/embedding/language -m 'not core_model'

- label: Multi-Modal Models Test (Standard) # 26min
+- label: Multi-Modal Models Test (Standard) # 28min
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
@@ -369,7 +369,7 @@ steps:
    - pytest -v -s models/encoder_decoder/language -m core_model
    - pytest -v -s models/encoder_decoder/vision_language -m core_model

- label: Multi-Modal Models Test (Extended) # 1h15m
+- label: Multi-Modal Models Test (Extended) 1 # 1h16m
  optional: true
  source_file_dependencies:
  - vllm/
@@ -380,14 +380,24 @@ steps:
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
+    - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
    # HACK - run phi3v tests separately to sidestep this transformers bug
    # https://github.com/huggingface/transformers/issues/34307
    - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
-    - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
+    - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
    - pytest -v -s models/embedding/vision_language -m 'not core_model'
    - pytest -v -s models/encoder_decoder/language -m 'not core_model'
    - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'

+- label: Multi-Modal Models Test (Extended) 2 # 38m
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/vision_language
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
+
 # This test is used only in PR development phase to test individual models and should never run on main
 - label: Custom Models Test
  optional: true
@@ -446,11 +456,11 @@ steps:
  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
-  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
+  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  # Avoid importing model tests that cause CUDA reinitialization error
-  - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
-  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
-  - pytest models/decoder_only/vision_language/test_models.py -v -s -m distributed_2_gpus
+  - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
  - pip install -e ./plugins/vllm_add_dummy_model
  - pytest -v -s distributed/test_distributed_oot.py
@@ -540,7 +550,7 @@ steps:
  # see https://github.com/vllm-project/vllm/pull/5689 for details
  - pytest -v -s distributed/test_custom_all_reduce.py
  - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
-  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus
+  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s -x lora/test_mixtral.py

 - label: LM Eval Large Models # optional

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -96,7 +96,8 @@ markers = [
    "core_model: enable this model test in each PR instead of only nightly",
    "cpu_model: enable this model test in CPU tests",
    "quant_model: run this model test under Quantized category",
-    "distributed_2_gpus: run this test only in distributed tests for 2 GPUs",
+    "split: run this test as part of a split",
+    "distributed: run this test only in distributed GPU tests",
    "skip_v1: do not run this test with v1",
    "optional: optional tests that are automatically skipped, include --optional to run them",
 ]
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
 """Common tests for testing .generate() functionality for single / multiple
 image, embedding, and video support for different VLMs in vLLM.
 """
+import math
 import os
+from collections import defaultdict
 from pathlib import PosixPath
 from typing import Type

@@ -10,11 +12,12 @@ from transformers import AutoModelForVision2Seq
 from transformers.utils import is_flash_attn_2_available

 from vllm.platforms import current_platform
-from vllm.utils import cuda_device_count_stateless, identity
+from vllm.utils import identity

 from ....conftest import (IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets,
                          _VideoAssets)
-from ....utils import fork_new_process_for_each_test, large_gpu_mark
+from ....utils import (fork_new_process_for_each_test, large_gpu_mark,
+                       multi_gpu_marks)
 from ...utils import check_outputs_equal
 from .vlm_utils import custom_inputs, model_utils, runners
 from .vlm_utils.case_filtering import get_parametrized_options
@@ -382,7 +385,7 @@ VLM_TEST_SETTINGS = {
        prompt_path_encoder=model_utils.qwen_prompt_path_encoder,
    ),
    ### Tensor parallel / multi-gpu broadcast tests
-    "broadcast-chameleon": VLMTestInfo(
+    "chameleon-broadcast": VLMTestInfo(
        models=["facebook/chameleon-7b"],
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
@@ -393,43 +396,25 @@ VLM_TEST_SETTINGS = {
        vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
        hf_output_post_proc = lambda hf_output, model: hf_output[:2],
        comparator=check_outputs_equal,
-        marks=[
-            pytest.mark.distributed_2_gpus,
-            pytest.mark.skipif(
-                cuda_device_count_stateless() < 2,
-                reason="Need at least 2 GPUs to run the test.",
-            ),
-        ],
+        marks=multi_gpu_marks(num_gpus=2),
        **COMMON_BROADCAST_SETTINGS # type: ignore
    ),
-    "broadcast-llava": VLMTestInfo(
+    "llava-broadcast": VLMTestInfo(
        models=["llava-hf/llava-1.5-7b-hf"],
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
-        marks=[
-            pytest.mark.distributed_2_gpus,
-            pytest.mark.skipif(
-                cuda_device_count_stateless() < 2,
-                reason="Need at least 2 GPUs to run the test.",
-            )
-        ],
+        marks=multi_gpu_marks(num_gpus=2),
        **COMMON_BROADCAST_SETTINGS # type: ignore
    ),
-    "broadcast-llava_next": VLMTestInfo(
+    "llava_next-broadcast": VLMTestInfo(
        models=["llava-hf/llava-v1.6-mistral-7b-hf"],
        prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
        max_model_len=10240,
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
-        marks=[
-            pytest.mark.distributed_2_gpus,
-            pytest.mark.skipif(
-                cuda_device_count_stateless() < 2,
-                reason="Need at least 2 GPUs to run the test.",
-            )
-        ],
+        marks=multi_gpu_marks(num_gpus=2),
        **COMMON_BROADCAST_SETTINGS # type: ignore
    ),
    ### Custom input edge-cases for specific models
@@ -468,6 +453,41 @@ VLM_TEST_SETTINGS = {
 # yapf: enable


+def _mark_splits(
+    test_settings: dict[str, VLMTestInfo],
+    *,
+    num_groups: int,
+) -> dict[str, VLMTestInfo]:
+    name_by_test_info_id = {id(v): k for k, v in test_settings.items()}
+    test_infos_by_model = defaultdict[str, list[VLMTestInfo]](list)
+
+    for info in test_settings.values():
+        for model in info.models:
+            test_infos_by_model[model].append(info)
+
+    models = sorted(test_infos_by_model.keys())
+    split_size = math.ceil(len(models) / num_groups)
+
+    new_test_settings = dict[str, VLMTestInfo]()
+
+    for i in range(num_groups):
+        models_in_group = models[i * split_size:(i + 1) * split_size]
+
+        for model in models_in_group:
+            for info in test_infos_by_model[model]:
+                new_marks = (info.marks or []) + [pytest.mark.split(group=i)]
+                new_info = info._replace(marks=new_marks)
+                new_test_settings[name_by_test_info_id[id(info)]] = new_info
+
+    missing_keys = test_settings.keys() - new_test_settings.keys()
+    assert not missing_keys, f"Missing keys: {missing_keys}"
+
+    return new_test_settings
+
+
+VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
+
+
 ### Test wrappers
 # Wrappers around the core test running func for:
 # - single image

--- a/tests/utils.py
+++ b/tests/utils.py
@@ -682,10 +682,12 @@ def fork_new_process_for_each_test(


 def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator:
-    """Gets a pytest skipif mark, which triggers ig the the device doesn't have
-    meet a minimum memory requirement in gb; can be leveraged via 
-    @large_gpu_test to skip tests in environments without enough resources, or
-    called when filtering tests to run directly.
+    """
+    Get a pytest mark, which skips the test if the GPU doesn't meet
+    a minimum memory requirement in GB.
+    
+    This can be leveraged via `@large_gpu_test` to skip tests in environments
+    without enough resources, or called when filtering tests to run directly.
    """
    try:
        if current_platform.is_cpu():
@@ -712,26 +714,37 @@ def large_gpu_test(*, min_gb: int):

    Currently, the CI machine uses L4 GPU which has 24 GB VRAM.
    """
-    test_skipif = large_gpu_mark(min_gb)
+    mark = large_gpu_mark(min_gb)

    def wrapper(f: Callable[_P, None]) -> Callable[_P, None]:
-        return test_skipif(f)
+        return mark(f)

    return wrapper


-def multi_gpu_test(*, num_gpus: int):
-    """
-    Decorate a test to be run only when multiple GPUs are available.
-    """
-    test_selector = getattr(pytest.mark, f"distributed_{num_gpus}_gpus")
+def multi_gpu_marks(*, num_gpus: int):
+    """Get a collection of pytest marks to apply for `@multi_gpu_test`."""
+    test_selector = pytest.mark.distributed(num_gpus=num_gpus)
    test_skipif = pytest.mark.skipif(
        cuda_device_count_stateless() < num_gpus,
        reason=f"Need at least {num_gpus} GPUs to run the test.",
    )

+    return [test_selector, test_skipif]
+
+
+def multi_gpu_test(*, num_gpus: int):
+    """
+    Decorate a test to be run only when multiple GPUs are available.
+    """
+    marks = multi_gpu_marks(num_gpus=num_gpus)
+
    def wrapper(f: Callable[_P, None]) -> Callable[_P, None]:
-        return test_selector(test_skipif(fork_new_process_for_each_test(f)))
+        func = fork_new_process_for_each_test(f)
+        for mark in reversed(marks):
+            func = mark(func)
+
+        return func

    return wrapper