Unverified Commit a84e598e authored by Cyrus Leung's avatar Cyrus Leung Committed by GitHub
Browse files

[CI/Build] Reorganize models tests (#7820)

parent 0a4806f0
......@@ -6,11 +6,9 @@ from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs
from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
_ImageAssets)
from .utils import check_logprobs_close
pytestmark = pytest.mark.vlm
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
_ImageAssets)
from ...utils import check_logprobs_close
_LIMIT_IMAGE_PER_PROMPT = 4
......@@ -197,7 +195,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
dtype, max_tokens, num_logprobs) -> None:
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test is under tests/images.
All the image fixtures for the test are from IMAGE_ASSETS.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
......
......@@ -8,10 +8,8 @@ from vllm.multimodal.utils import (rescale_video_size, resize_video,
sample_frames_from_video)
from vllm.sequence import SampleLogprobs
from ..conftest import VIDEO_ASSETS, HfRunner, VllmRunner, _VideoAssets
from .utils import check_logprobs_close
pytestmark = pytest.mark.vlm
from ....conftest import VIDEO_ASSETS, HfRunner, VllmRunner, _VideoAssets
from ...utils import check_logprobs_close
_PREFACE = (
"A chat between a curious human and an artificial intelligence assistant. "
......
......@@ -9,10 +9,8 @@ from transformers import BatchEncoding
from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner
from .utils import check_logprobs_close
pytestmark = pytest.mark.vlm
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner
from ...utils import check_logprobs_close
# The image token is placed before "user" on purpose so that the test can pass
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
......@@ -65,7 +63,7 @@ def run_test(
):
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test is under tests/images.
All the image fixtures for the test are from IMAGE_ASSETS.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
......
......@@ -8,10 +8,8 @@ from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs
from vllm.utils import is_hip
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
from .utils import check_logprobs_close
pytestmark = pytest.mark.vlm
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
from ...utils import check_logprobs_close
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
......@@ -69,7 +67,7 @@ def run_test(
):
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test is under tests/images.
All the image fixtures for the test are from IMAGE_ASSETS.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
......
......@@ -9,10 +9,8 @@ from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs
from vllm.utils import is_cpu, is_hip
from ..conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from .utils import check_logprobs_close
pytestmark = pytest.mark.vlm
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ...utils import check_logprobs_close
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
......@@ -71,7 +69,7 @@ def run_test(
):
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test is under tests/images.
All the image fixtures for the test are from IMAGE_ASSETS.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
......
......@@ -5,7 +5,7 @@ Run `pytest tests/models/test_mistral.py`.
import json
import uuid
from dataclasses import asdict
from typing import Any, Dict, List, Optional, Tuple
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
import pytest
from mistral_common.protocol.instruct.messages import ImageURLChunk
......@@ -17,9 +17,11 @@ from vllm import EngineArgs, LLMEngine, SamplingParams, TokensPrompt
from vllm.multimodal import MultiModalDataBuiltins
from vllm.sequence import Logprob, SampleLogprobs
from .utils import check_logprobs_close
from ....utils import VLLM_PATH
from ...utils import check_logprobs_close
pytestmark = pytest.mark.vlm
if TYPE_CHECKING:
from _typeshed import StrPath
MODELS = ["mistralai/Pixtral-12B-2409"]
IMG_URLS = [
......@@ -83,14 +85,21 @@ SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
LIMIT_MM_PER_PROMPT = dict(image=4)
MAX_MODEL_LEN = [8192, 65536]
FIXTURE_LOGPROBS_CHAT = "tests/models/fixtures/pixtral_chat.json"
FIXTURE_LOGPROBS_ENGINE = "tests/models/fixtures/pixtral_chat_engine.json"
FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
assert FIXTURES_PATH.exists()
FIXTURE_LOGPROBS_CHAT = FIXTURES_PATH / "pixtral_chat.json"
FIXTURE_LOGPROBS_ENGINE = FIXTURES_PATH / "pixtral_chat_engine.json"
OutputsLogprobs = List[Tuple[List[int], str, Optional[SampleLogprobs]]]
# For the test author to store golden output in JSON
def _dump_outputs_w_logprobs(outputs: OutputsLogprobs, filename: str) -> None:
def _dump_outputs_w_logprobs(
outputs: OutputsLogprobs,
filename: "StrPath",
) -> None:
json_data = [(tokens, text,
[{k: asdict(v)
for k, v in token_logprobs.items()}
......@@ -101,7 +110,7 @@ def _dump_outputs_w_logprobs(outputs: OutputsLogprobs, filename: str) -> None:
json.dump(json_data, f)
def load_outputs_w_logprobs(filename: str) -> OutputsLogprobs:
def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
with open(filename, "rb") as f:
json_data = json.load(f)
......
......@@ -10,11 +10,9 @@ from vllm.inputs import InputContext, LLMInputs
from vllm.multimodal.base import MultiModalInputs
from vllm.multimodal.utils import cached_get_tokenizer, rescale_image_size
from ..conftest import (IMAGE_ASSETS, HfRunner, ImageAsset, PromptImageInput,
VllmRunner, _ImageAssets)
from .utils import check_logprobs_close
pytestmark = pytest.mark.vlm
from ....conftest import (IMAGE_ASSETS, HfRunner, ImageAsset, PromptImageInput,
VllmRunner, _ImageAssets)
from ...utils import check_logprobs_close
text_only_models = [
"Qwen/Qwen-7B-Chat" # Has no visual component
......
"""Compare the outputs of HF and vLLM for BART models using greedy sampling.
Run `pytest tests/models/test_bart.py`.
Run `pytest tests/models/encoder_decoder/language/test_bart.py`.
"""
from typing import List, Optional, Tuple
from typing import List, Optional, Tuple, Type
from vllm.utils import is_cpu
......@@ -16,8 +16,10 @@ if not is_cpu():
from vllm.sequence import SampleLogprobs
from ..conftest import DecoderPromptType
from .utils import check_logprobs_close
from ....conftest import (DecoderPromptType, ExplicitEncoderDecoderPrompt,
HfRunner, VllmRunner)
from ....utils import multi_gpu_test
from ...utils import check_logprobs_close
MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"]
......@@ -34,20 +36,18 @@ if not is_cpu():
return output_ids, hf_output_str, out_logprobs
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float", "bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
def test_models(
hf_runner,
vllm_runner,
example_encoder_decoder_prompts,
def run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
decoder_prompt_type: DecoderPromptType,
model: str,
*,
dtype: str,
max_tokens: int,
num_logprobs: int,
decoder_prompt_type: DecoderPromptType,
tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None,
) -> None:
'''
Test the vLLM BART model for a variety of encoder/decoder input prompts,
......@@ -116,8 +116,29 @@ if not is_cpu():
token during the process of validating the vLLM decoded output.
'''
test_case_prompts = example_encoder_decoder_prompts[
decoder_prompt_type]
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default).
# Note: currently encoder/decoder models are only compatible with
# enforce_eager=True. Normally this is not a problem because
# for encoder/decoder models vLLM will
# default to enforce_eager=True if enforce_eager
# is left unspecified. However, the
# VllmRunner test fixture (which wraps around the LLM class) defaults to
# enforce_eager=False (a behavior which a number of already-existing
# decoder-only unit tests expect), so when testing an encoder/decoder
# model we must explicitly specify enforce_eager=True in the VllmRunner
# constructor.
with vllm_runner(
model,
dtype=dtype,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True) as vllm_model:
vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
prompts, max_tokens, num_logprobs)
# Configuration settings for HF baseline
hf_kwargs = {
......@@ -135,26 +156,12 @@ if not is_cpu():
auto_cls=AutoModelForSeq2SeqLM) as hf_model:
hf_outputs = (
hf_model.generate_encoder_decoder_greedy_logprobs_limit(
test_case_prompts,
prompts,
max_tokens,
num_logprobs,
**hf_kwargs,
))
# Note: currently encoder/decoder models are only compatible with
# enforce_eager=True. Normally this is not a problem because
# for encoder/decoder models vLLM will
# default to enforce_eager=True if enforce_eager
# is left unspecified. However, the
# VllmRunner test fixture (which wraps around the LLM class) defaults to
# enforce_eager=False (a behavior which a number of already-existing
# decoder-only unit tests expect), so when testing an encoder/decoder
# model we must explicitly specify enforce_eager=True in the VllmRunner
# constructor.
with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
test_case_prompts, max_tokens, num_logprobs)
hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE
else 0)
......@@ -168,3 +175,49 @@ if not is_cpu():
name_1="vllm",
num_outputs_0_skip_tokens=hf_skip_tokens,
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float", "bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts,
                model, dtype, max_tokens, num_logprobs,
                decoder_prompt_type) -> None:
    """Run the shared BART comparison (`run_test`) with tensor parallelism 1.

    The prompt set is selected from ``example_encoder_decoder_prompts`` by
    the parametrized ``decoder_prompt_type``.
    """
    # Pick the prompts that match the decoder prompt type under test.
    prompts = example_encoder_decoder_prompts[decoder_prompt_type]

    # Keyword-only settings forwarded to the shared test body.
    run_options = {
        "dtype": dtype,
        "max_tokens": max_tokens,
        "num_logprobs": num_logprobs,
        "tensor_parallel_size": 1,
    }

    run_test(hf_runner, vllm_runner, prompts, decoder_prompt_type, model,
             **run_options)
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM])
def test_models_distributed(hf_runner, vllm_runner,
                            example_encoder_decoder_prompts,
                            distributed_executor_backend, model, dtype,
                            max_tokens, num_logprobs,
                            decoder_prompt_type) -> None:
    """Run the shared BART comparison (`run_test`) with tensor parallelism 2.

    Each parametrized ``distributed_executor_backend`` value is forwarded
    unchanged to ``run_test``.
    """
    # Pick the prompts that match the decoder prompt type under test.
    prompts = example_encoder_decoder_prompts[decoder_prompt_type]

    # Keyword-only settings forwarded to the shared test body.
    run_options = {
        "dtype": dtype,
        "max_tokens": max_tokens,
        "num_logprobs": num_logprobs,
        "tensor_parallel_size": 2,
        "distributed_executor_backend": distributed_executor_backend,
    }

    run_test(hf_runner, vllm_runner, prompts, decoder_prompt_type, model,
             **run_options)
......@@ -10,6 +10,7 @@ from pathlib import Path
from typing import Any, Callable, Dict, List, Optional
import openai
import pytest
import requests
from openai.types.completion import Completion
from transformers import AutoTokenizer
......@@ -22,7 +23,8 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.cli_args import make_arg_parser
from vllm.model_executor.model_loader.loader import get_model_loader
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser, get_open_port, is_hip
from vllm.utils import (FlexibleArgumentParser, cuda_device_count_stateless,
get_open_port, is_hip)
if current_platform.is_rocm():
from amdsmi import (amdsmi_get_gpu_vram_usage,
......@@ -452,6 +454,22 @@ def fork_new_process_for_each_test(
return wrapper
def multi_gpu_test(*, num_gpus: int):
    """
    Build a decorator that restricts a test to hosts with enough GPUs.

    The returned decorator wraps the test with
    ``fork_new_process_for_each_test``, attaches a ``skipif`` mark that
    applies when ``cuda_device_count_stateless()`` reports fewer than
    ``num_gpus`` devices, and tags the result with the
    ``distributed_{num_gpus}_gpus`` pytest mark.
    """
    marker_name = f"distributed_{num_gpus}_gpus"
    select_mark = getattr(pytest.mark, marker_name)

    # The device count is queried once, when the decorator is created,
    # not each time the decorated test is collected or run.
    too_few_gpus = cuda_device_count_stateless() < num_gpus
    skip_mark = pytest.mark.skipif(
        too_few_gpus,
        reason=f"Need at least {num_gpus} GPUs to run the test.",
    )

    def decorator(f: Callable[_P, None]) -> Callable[_P, None]:
        # Innermost first: fork wrapper, then skipif, then the selector mark.
        forked = fork_new_process_for_each_test(f)
        return select_mark(skip_mark(forked))

    return decorator
async def completions_with_server_args(
prompts: List[str],
model_name: str,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment