[CI/Build] Reorganize models tests (#7820)

a84e598e · Cyrus Leung · GitHub · 0a4806f0 · a84e598e · a84e598e
Unverified Commit a84e598e authored Sep 14, 2024 by Cyrus Leung Committed by GitHub Sep 13, 2024
14 changed files
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -6,11 +6,9 @@ from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
 from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
-from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
+from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
                          _ImageAssets)
-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close
-pytestmark = pytest.mark.vlm
 _LIMIT_IMAGE_PER_PROMPT = 4
@@ -197,7 +195,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                dtype, max_tokens, num_logprobs) -> None:
    """Inference result should be the same between hf and vllm.
-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects
    and corresponding MultiModalConfig as input.

--- a/tests/models/test_llava_next_video.py
+++ b/tests/models/test_llava_next_video.py
@@ -8,10 +8,8 @@ from vllm.multimodal.utils import (rescale_video_size, resize_video,
                                   sample_frames_from_video)
 from vllm.sequence import SampleLogprobs
-from ..conftest import VIDEO_ASSETS, HfRunner, VllmRunner, _VideoAssets
+from ....conftest import VIDEO_ASSETS, HfRunner, VllmRunner, _VideoAssets
-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close
-pytestmark = pytest.mark.vlm
 _PREFACE = (
    "A chat between a curious human and an artificial intelligence assistant. "

--- a/tests/models/test_minicpmv.py
+++ b/tests/models/test_minicpmv.py
@@ -9,10 +9,8 @@ from transformers import BatchEncoding
 from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
-from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner
+from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner
-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close
-pytestmark = pytest.mark.vlm
 # The image token is placed before "user" on purpose so that the test can pass
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
@@ -65,7 +63,7 @@ def run_test(
 ):
    """Inference result should be the same between hf and vllm.
-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects 
    and corresponding MultiModalConfig as input.

--- a/tests/models/test_paligemma.py
+++ b/tests/models/test_paligemma.py
@@ -8,10 +8,8 @@ from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
 from vllm.utils import is_hip
-from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
+from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close
-pytestmark = pytest.mark.vlm
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
@@ -69,7 +67,7 @@ def run_test(
 ):
    """Inference result should be the same between hf and vllm.
-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects 
    and corresponding MultiModalConfig as input.

--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -9,10 +9,8 @@ from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
 from vllm.utils import is_cpu, is_hip
-from ..conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
+from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close
-pytestmark = pytest.mark.vlm
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
@@ -71,7 +69,7 @@ def run_test(
 ):
    """Inference result should be the same between hf and vllm.
-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects 
    and corresponding MultiModalConfig as input.

--- a/tests/models/test_pixtral.py
+++ b/tests/models/test_pixtral.py
@@ -5,7 +5,7 @@ Run `pytest tests/models/test_mistral.py`.
 import json
 import uuid
 from dataclasses import asdict
-from typing import Any, Dict, List, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
 import pytest
 from mistral_common.protocol.instruct.messages import ImageURLChunk
@@ -17,9 +17,11 @@ from vllm import EngineArgs, LLMEngine, SamplingParams, TokensPrompt
 from vllm.multimodal import MultiModalDataBuiltins
 from vllm.sequence import Logprob, SampleLogprobs
-from .utils import check_logprobs_close
+from ....utils import VLLM_PATH
+from ...utils import check_logprobs_close
-pytestmark = pytest.mark.vlm
+if TYPE_CHECKING:
+    from _typeshed import StrPath
 MODELS = ["mistralai/Pixtral-12B-2409"]
 IMG_URLS = [
@@ -83,14 +85,21 @@ SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
 LIMIT_MM_PER_PROMPT = dict(image=4)
 MAX_MODEL_LEN = [8192, 65536]
-FIXTURE_LOGPROBS_CHAT = "tests/models/fixtures/pixtral_chat.json"
-FIXTURE_LOGPROBS_ENGINE = "tests/models/fixtures/pixtral_chat_engine.json"
+FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
+assert FIXTURES_PATH.exists()
+FIXTURE_LOGPROBS_CHAT = FIXTURES_PATH / "pixtral_chat.json"
+FIXTURE_LOGPROBS_ENGINE = FIXTURES_PATH / "pixtral_chat_engine.json"
 OutputsLogprobs = List[Tuple[List[int], str, Optional[SampleLogprobs]]]
 # For the test author to store golden output in JSON
-def _dump_outputs_w_logprobs(outputs: OutputsLogprobs, filename: str) -> None:
+def _dump_outputs_w_logprobs(
+    outputs: OutputsLogprobs,
+    filename: "StrPath",
+) -> None:
    json_data = [(tokens, text,
                  [{k: asdict(v)
                    for k, v in token_logprobs.items()}
@@ -101,7 +110,7 @@ def _dump_outputs_w_logprobs(outputs: OutputsLogprobs, filename: str) -> None:
        json.dump(json_data, f)
-def load_outputs_w_logprobs(filename: str) -> OutputsLogprobs:
+def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
    with open(filename, "rb") as f:
        json_data = json.load(f)

--- a/tests/models/test_qwen.py
+++ b/tests/models/test_qwen.py
@@ -10,11 +10,9 @@ from vllm.inputs import InputContext, LLMInputs
 from vllm.multimodal.base import MultiModalInputs
 from vllm.multimodal.utils import cached_get_tokenizer, rescale_image_size
-from ..conftest import (IMAGE_ASSETS, HfRunner, ImageAsset, PromptImageInput,
+from ....conftest import (IMAGE_ASSETS, HfRunner, ImageAsset, PromptImageInput,
                          VllmRunner, _ImageAssets)
-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close
-pytestmark = pytest.mark.vlm
 text_only_models = [
    "Qwen/Qwen-7B-Chat"  # Has no visual component

--- a/tests/models/embedding/__init__.py
+++ b/tests/models/embedding/__init__.py
--- a/tests/models/embedding/language/__init__.py
+++ b/tests/models/embedding/language/__init__.py
--- a/tests/models/test_embedding.py
+++ b/tests/models/test_embedding.py
--- a/tests/models/encoder_decoder/__init__.py
+++ b/tests/models/encoder_decoder/__init__.py
--- a/tests/models/encoder_decoder/language/__init__.py
+++ b/tests/models/encoder_decoder/language/__init__.py
--- a/tests/models/test_bart.py
+++ b/tests/models/test_bart.py
 """Compare the outputs of HF and vLLM for BART models using greedy sampling.
-Run `pytest tests/models/test_bart.py`.
+Run `pytest tests/models/encoder_decoder/language/test_bart.py`.
 """
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Type
 from vllm.utils import is_cpu
@@ -16,8 +16,10 @@ if not is_cpu():
    from vllm.sequence import SampleLogprobs
-    from ..conftest import DecoderPromptType
+    from ....conftest import (DecoderPromptType, ExplicitEncoderDecoderPrompt,
-    from .utils import check_logprobs_close
+                              HfRunner, VllmRunner)
+    from ....utils import multi_gpu_test
+    from ...utils import check_logprobs_close
    MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"]
@@ -34,20 +36,18 @@ if not is_cpu():
        return output_ids, hf_output_str, out_logprobs
-    @pytest.mark.parametrize("model", MODELS)
+    def run_test(
-    @pytest.mark.parametrize("dtype", ["float", "bfloat16"])
+        hf_runner: Type[HfRunner],
-    @pytest.mark.parametrize("max_tokens", [64])
+        vllm_runner: Type[VllmRunner],
-    @pytest.mark.parametrize("num_logprobs", [5])
+        prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
-    @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
+        decoder_prompt_type: DecoderPromptType,
-    def test_models(
-        hf_runner,
-        vllm_runner,
-        example_encoder_decoder_prompts,
        model: str,
+        *,
        dtype: str,
        max_tokens: int,
        num_logprobs: int,
-        decoder_prompt_type: DecoderPromptType,
+        tensor_parallel_size: int,
+        distributed_executor_backend: Optional[str] = None,
    ) -> None:
        '''
        Test the vLLM BART model for a variety of encoder/decoder input prompts,
@@ -116,8 +116,29 @@ if not is_cpu():
        token during the process of validating the vLLM decoded output.
        '''
-        test_case_prompts = example_encoder_decoder_prompts[
+        # NOTE: take care of the order. run vLLM first, and then run HF.
-            decoder_prompt_type]
+        # vLLM needs a fresh new process without cuda initialization.
+        # If we run HF first, CUDA will already be initialized, which hurts
+        # the multiprocessing backend when it uses fork (the default).
+        # Note: currently encoder/decoder models are only compatible with
+        # enforce_eager=True. Normally this is not a problem because
+        # for encoder/decoder models vLLM will
+        # default to enforce_eager=True if enforce_eager
+        # is left unspecified. However, the
+        # VllmRunner test fixture (which wraps around the LLM class) defaults to
+        # enforce_eager=False (a behavior which a number of already-existing
+        # decoder-only unit tests expect), so when testing an encoder/decoder
+        # model we must explicitly specify enforce_eager=True in the VllmRunner
+        # constructor.
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                tensor_parallel_size=tensor_parallel_size,
+                distributed_executor_backend=distributed_executor_backend,
+                enforce_eager=True) as vllm_model:
+            vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
+                prompts, max_tokens, num_logprobs)
        # Configuration settings for HF baseline
        hf_kwargs = {
@@ -135,26 +156,12 @@ if not is_cpu():
                       auto_cls=AutoModelForSeq2SeqLM) as hf_model:
            hf_outputs = (
                hf_model.generate_encoder_decoder_greedy_logprobs_limit(
-                    test_case_prompts,
+                    prompts,
                    max_tokens,
                    num_logprobs,
                    **hf_kwargs,
                ))
-        # Note: currently encoder/decoder models are only compatible with
-        # enforce_eager=True. Normally this is not a problem because
-        # for encoder/decoder models vLLM will
-        # default to enforce_eager=True if enforce_eager
-        # is left unspecified. However, the
-        # VllmRunner test fixture (which wraps around the LLM class) defaults to
-        # enforce_eager=False (a behavior which a number of already-exisitng
-        # decoder-only unit tests expect), so when testing an encoder/decoder
-        # model we must explicitly specify enforce_eager=True in the VllmRunner
-        # constructor.
-        with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
-            vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
-                test_case_prompts, max_tokens, num_logprobs)
        hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE
                          else 0)
@@ -168,3 +175,49 @@ if not is_cpu():
            name_1="vllm",
            num_outputs_0_skip_tokens=hf_skip_tokens,
        )
+    @pytest.mark.parametrize("model", MODELS)
+    @pytest.mark.parametrize("dtype", ["float", "bfloat16"])
+    @pytest.mark.parametrize("max_tokens", [64])
+    @pytest.mark.parametrize("num_logprobs", [5])
+    @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
+    def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts,
+                    model, dtype, max_tokens, num_logprobs,
+                    decoder_prompt_type) -> None:
+        run_test(
+            hf_runner,
+            vllm_runner,
+            example_encoder_decoder_prompts[decoder_prompt_type],
+            decoder_prompt_type,
+            model,
+            dtype=dtype,
+            max_tokens=max_tokens,
+            num_logprobs=num_logprobs,
+            tensor_parallel_size=1,
+        )
+    @multi_gpu_test(num_gpus=2)
+    @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
+    @pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
+    @pytest.mark.parametrize("dtype", ["float"])
+    @pytest.mark.parametrize("max_tokens", [64])
+    @pytest.mark.parametrize("num_logprobs", [5])
+    @pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM])
+    def test_models_distributed(hf_runner, vllm_runner,
+                                example_encoder_decoder_prompts,
+                                distributed_executor_backend, model, dtype,
+                                max_tokens, num_logprobs,
+                                decoder_prompt_type) -> None:
+        run_test(
+            hf_runner,
+            vllm_runner,
+            example_encoder_decoder_prompts[decoder_prompt_type],
+            decoder_prompt_type,
+            model,
+            dtype=dtype,
+            max_tokens=max_tokens,
+            num_logprobs=num_logprobs,
+            tensor_parallel_size=2,
+            distributed_executor_backend=distributed_executor_backend,
+        )
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -10,6 +10,7 @@ from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional
 import openai
+import pytest
 import requests
 from openai.types.completion import Completion
 from transformers import AutoTokenizer
@@ -22,7 +23,8 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.entrypoints.openai.cli_args import make_arg_parser
 from vllm.model_executor.model_loader.loader import get_model_loader
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser, get_open_port, is_hip
+from vllm.utils import (FlexibleArgumentParser, cuda_device_count_stateless,
+                        get_open_port, is_hip)
 if current_platform.is_rocm():
    from amdsmi import (amdsmi_get_gpu_vram_usage,
@@ -452,6 +454,22 @@ def fork_new_process_for_each_test(
    return wrapper
+def multi_gpu_test(*, num_gpus: int):
+    """
+    Decorate a test to be run only when at least `num_gpus` GPUs are available.
+    """
+    test_selector = getattr(pytest.mark, f"distributed_{num_gpus}_gpus")
+    test_skipif = pytest.mark.skipif(
+        cuda_device_count_stateless() < num_gpus,
+        reason=f"Need at least {num_gpus} GPUs to run the test.",
+    )
+    def wrapper(f: Callable[_P, None]) -> Callable[_P, None]:
+        return test_selector(test_skipif(fork_new_process_for_each_test(f)))
+    return wrapper
 async def completions_with_server_args(
    prompts: List[str],
    model_name: str,