Unverified commit a84e598e, authored by Cyrus Leung, committed by GitHub
Browse files

[CI/Build] Reorganize models tests (#7820)

parent 0a4806f0
...@@ -6,11 +6,9 @@ from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer ...@@ -6,11 +6,9 @@ from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
from vllm.multimodal.utils import rescale_image_size from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
_ImageAssets) _ImageAssets)
from .utils import check_logprobs_close from ...utils import check_logprobs_close
pytestmark = pytest.mark.vlm
_LIMIT_IMAGE_PER_PROMPT = 4 _LIMIT_IMAGE_PER_PROMPT = 4
...@@ -197,7 +195,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, ...@@ -197,7 +195,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
dtype, max_tokens, num_logprobs) -> None: dtype, max_tokens, num_logprobs) -> None:
"""Inference result should be the same between hf and vllm. """Inference result should be the same between hf and vllm.
All the image fixtures for the test is under tests/images. All the image fixtures for the test are from IMAGE_ASSETS.
For huggingface runner, we provide the PIL images as input. For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input. and corresponding MultiModalConfig as input.
......
...@@ -8,10 +8,8 @@ from vllm.multimodal.utils import (rescale_video_size, resize_video, ...@@ -8,10 +8,8 @@ from vllm.multimodal.utils import (rescale_video_size, resize_video,
sample_frames_from_video) sample_frames_from_video)
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from ..conftest import VIDEO_ASSETS, HfRunner, VllmRunner, _VideoAssets from ....conftest import VIDEO_ASSETS, HfRunner, VllmRunner, _VideoAssets
from .utils import check_logprobs_close from ...utils import check_logprobs_close
pytestmark = pytest.mark.vlm
_PREFACE = ( _PREFACE = (
"A chat between a curious human and an artificial intelligence assistant. " "A chat between a curious human and an artificial intelligence assistant. "
......
...@@ -9,10 +9,8 @@ from transformers import BatchEncoding ...@@ -9,10 +9,8 @@ from transformers import BatchEncoding
from vllm.multimodal.utils import rescale_image_size from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner
from .utils import check_logprobs_close from ...utils import check_logprobs_close
pytestmark = pytest.mark.vlm
# The image token is placed before "user" on purpose so that the test can pass # The image token is placed before "user" on purpose so that the test can pass
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
...@@ -65,7 +63,7 @@ def run_test( ...@@ -65,7 +63,7 @@ def run_test(
): ):
"""Inference result should be the same between hf and vllm. """Inference result should be the same between hf and vllm.
All the image fixtures for the test is under tests/images. All the image fixtures for the test are from IMAGE_ASSETS.
For huggingface runner, we provide the PIL images as input. For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input. and corresponding MultiModalConfig as input.
......
...@@ -8,10 +8,8 @@ from vllm.multimodal.utils import rescale_image_size ...@@ -8,10 +8,8 @@ from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from vllm.utils import is_hip from vllm.utils import is_hip
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
from .utils import check_logprobs_close from ...utils import check_logprobs_close
pytestmark = pytest.mark.vlm
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign": "stop_sign":
...@@ -69,7 +67,7 @@ def run_test( ...@@ -69,7 +67,7 @@ def run_test(
): ):
"""Inference result should be the same between hf and vllm. """Inference result should be the same between hf and vllm.
All the image fixtures for the test is under tests/images. All the image fixtures for the test are from IMAGE_ASSETS.
For huggingface runner, we provide the PIL images as input. For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input. and corresponding MultiModalConfig as input.
......
...@@ -9,10 +9,8 @@ from vllm.multimodal.utils import rescale_image_size ...@@ -9,10 +9,8 @@ from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from vllm.utils import is_cpu, is_hip from vllm.utils import is_cpu, is_hip
from ..conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from .utils import check_logprobs_close from ...utils import check_logprobs_close
pytestmark = pytest.mark.vlm
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign": "stop_sign":
...@@ -71,7 +69,7 @@ def run_test( ...@@ -71,7 +69,7 @@ def run_test(
): ):
"""Inference result should be the same between hf and vllm. """Inference result should be the same between hf and vllm.
All the image fixtures for the test is under tests/images. All the image fixtures for the test are from IMAGE_ASSETS.
For huggingface runner, we provide the PIL images as input. For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input. and corresponding MultiModalConfig as input.
......
...@@ -5,7 +5,7 @@ Run `pytest tests/models/test_mistral.py`. ...@@ -5,7 +5,7 @@ Run `pytest tests/models/test_mistral.py`.
import json import json
import uuid import uuid
from dataclasses import asdict from dataclasses import asdict
from typing import Any, Dict, List, Optional, Tuple from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
import pytest import pytest
from mistral_common.protocol.instruct.messages import ImageURLChunk from mistral_common.protocol.instruct.messages import ImageURLChunk
...@@ -17,9 +17,11 @@ from vllm import EngineArgs, LLMEngine, SamplingParams, TokensPrompt ...@@ -17,9 +17,11 @@ from vllm import EngineArgs, LLMEngine, SamplingParams, TokensPrompt
from vllm.multimodal import MultiModalDataBuiltins from vllm.multimodal import MultiModalDataBuiltins
from vllm.sequence import Logprob, SampleLogprobs from vllm.sequence import Logprob, SampleLogprobs
from .utils import check_logprobs_close from ....utils import VLLM_PATH
from ...utils import check_logprobs_close
pytestmark = pytest.mark.vlm if TYPE_CHECKING:
from _typeshed import StrPath
MODELS = ["mistralai/Pixtral-12B-2409"] MODELS = ["mistralai/Pixtral-12B-2409"]
IMG_URLS = [ IMG_URLS = [
...@@ -83,14 +85,21 @@ SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5) ...@@ -83,14 +85,21 @@ SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
LIMIT_MM_PER_PROMPT = dict(image=4) LIMIT_MM_PER_PROMPT = dict(image=4)
MAX_MODEL_LEN = [8192, 65536] MAX_MODEL_LEN = [8192, 65536]
FIXTURE_LOGPROBS_CHAT = "tests/models/fixtures/pixtral_chat.json"
FIXTURE_LOGPROBS_ENGINE = "tests/models/fixtures/pixtral_chat_engine.json" FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
assert FIXTURES_PATH.exists()
FIXTURE_LOGPROBS_CHAT = FIXTURES_PATH / "pixtral_chat.json"
FIXTURE_LOGPROBS_ENGINE = FIXTURES_PATH / "pixtral_chat_engine.json"
OutputsLogprobs = List[Tuple[List[int], str, Optional[SampleLogprobs]]] OutputsLogprobs = List[Tuple[List[int], str, Optional[SampleLogprobs]]]
# For the test author to store golden output in JSON # For the test author to store golden output in JSON
def _dump_outputs_w_logprobs(outputs: OutputsLogprobs, filename: str) -> None: def _dump_outputs_w_logprobs(
outputs: OutputsLogprobs,
filename: "StrPath",
) -> None:
json_data = [(tokens, text, json_data = [(tokens, text,
[{k: asdict(v) [{k: asdict(v)
for k, v in token_logprobs.items()} for k, v in token_logprobs.items()}
...@@ -101,7 +110,7 @@ def _dump_outputs_w_logprobs(outputs: OutputsLogprobs, filename: str) -> None: ...@@ -101,7 +110,7 @@ def _dump_outputs_w_logprobs(outputs: OutputsLogprobs, filename: str) -> None:
json.dump(json_data, f) json.dump(json_data, f)
def load_outputs_w_logprobs(filename: str) -> OutputsLogprobs: def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
with open(filename, "rb") as f: with open(filename, "rb") as f:
json_data = json.load(f) json_data = json.load(f)
......
...@@ -10,11 +10,9 @@ from vllm.inputs import InputContext, LLMInputs ...@@ -10,11 +10,9 @@ from vllm.inputs import InputContext, LLMInputs
from vllm.multimodal.base import MultiModalInputs from vllm.multimodal.base import MultiModalInputs
from vllm.multimodal.utils import cached_get_tokenizer, rescale_image_size from vllm.multimodal.utils import cached_get_tokenizer, rescale_image_size
from ..conftest import (IMAGE_ASSETS, HfRunner, ImageAsset, PromptImageInput, from ....conftest import (IMAGE_ASSETS, HfRunner, ImageAsset, PromptImageInput,
VllmRunner, _ImageAssets) VllmRunner, _ImageAssets)
from .utils import check_logprobs_close from ...utils import check_logprobs_close
pytestmark = pytest.mark.vlm
text_only_models = [ text_only_models = [
"Qwen/Qwen-7B-Chat" # Has no visual component "Qwen/Qwen-7B-Chat" # Has no visual component
......
"""Compare the outputs of HF and vLLM for BART models using greedy sampling. """Compare the outputs of HF and vLLM for BART models using greedy sampling.
Run `pytest tests/models/test_bart.py`. Run `pytest tests/models/encoder_decoder/language/test_bart.py`.
""" """
from typing import List, Optional, Tuple from typing import List, Optional, Tuple, Type
from vllm.utils import is_cpu from vllm.utils import is_cpu
...@@ -16,8 +16,10 @@ if not is_cpu(): ...@@ -16,8 +16,10 @@ if not is_cpu():
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from ..conftest import DecoderPromptType from ....conftest import (DecoderPromptType, ExplicitEncoderDecoderPrompt,
from .utils import check_logprobs_close HfRunner, VllmRunner)
from ....utils import multi_gpu_test
from ...utils import check_logprobs_close
MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"] MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"]
...@@ -34,20 +36,18 @@ if not is_cpu(): ...@@ -34,20 +36,18 @@ if not is_cpu():
return output_ids, hf_output_str, out_logprobs return output_ids, hf_output_str, out_logprobs
@pytest.mark.parametrize("model", MODELS) def run_test(
@pytest.mark.parametrize("dtype", ["float", "bfloat16"]) hf_runner: Type[HfRunner],
@pytest.mark.parametrize("max_tokens", [64]) vllm_runner: Type[VllmRunner],
@pytest.mark.parametrize("num_logprobs", [5]) prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) decoder_prompt_type: DecoderPromptType,
def test_models(
hf_runner,
vllm_runner,
example_encoder_decoder_prompts,
model: str, model: str,
*,
dtype: str, dtype: str,
max_tokens: int, max_tokens: int,
num_logprobs: int, num_logprobs: int,
decoder_prompt_type: DecoderPromptType, tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None,
) -> None: ) -> None:
''' '''
Test the vLLM BART model for a variety of encoder/decoder input prompts, Test the vLLM BART model for a variety of encoder/decoder input prompts,
...@@ -116,8 +116,29 @@ if not is_cpu(): ...@@ -116,8 +116,29 @@ if not is_cpu():
token during the process of validating the vLLM decoded output. token during the process of validating the vLLM decoded output.
''' '''
test_case_prompts = example_encoder_decoder_prompts[ # NOTE: take care of the order. run vLLM first, and then run HF.
decoder_prompt_type] # vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default).
# Note: currently encoder/decoder models are only compatible with
# enforce_eager=True. Normally this is not a problem because
# for encoder/decoder models vLLM will
# default to enforce_eager=True if enforce_eager
# is left unspecified. However, the
# VllmRunner test fixture (which wraps around the LLM class) defaults to
# enforce_eager=False (a behavior which a number of already-existing
# decoder-only unit tests expect), so when testing an encoder/decoder
# model we must explicitly specify enforce_eager=True in the VllmRunner
# constructor.
with vllm_runner(
model,
dtype=dtype,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True) as vllm_model:
vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
prompts, max_tokens, num_logprobs)
# Configuration settings for HF baseline # Configuration settings for HF baseline
hf_kwargs = { hf_kwargs = {
...@@ -135,26 +156,12 @@ if not is_cpu(): ...@@ -135,26 +156,12 @@ if not is_cpu():
auto_cls=AutoModelForSeq2SeqLM) as hf_model: auto_cls=AutoModelForSeq2SeqLM) as hf_model:
hf_outputs = ( hf_outputs = (
hf_model.generate_encoder_decoder_greedy_logprobs_limit( hf_model.generate_encoder_decoder_greedy_logprobs_limit(
test_case_prompts, prompts,
max_tokens, max_tokens,
num_logprobs, num_logprobs,
**hf_kwargs, **hf_kwargs,
)) ))
# Note: currently encoder/decoder models are only compatible with
# enforce_eager=True. Normally this is not a problem because
# for encoder/decoder models vLLM will
# default to enforce_eager=True if enforce_eager
# is left unspecified. However, the
# VllmRunner test fixture (which wraps around the LLM class) defaults to
# enforce_eager=False (a behavior which a number of already-existing
# decoder-only unit tests expect), so when testing an encoder/decoder
# model we must explicitly specify enforce_eager=True in the VllmRunner
# constructor.
with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
test_case_prompts, max_tokens, num_logprobs)
hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE
else 0) else 0)
...@@ -168,3 +175,49 @@ if not is_cpu(): ...@@ -168,3 +175,49 @@ if not is_cpu():
name_1="vllm", name_1="vllm",
num_outputs_0_skip_tokens=hf_skip_tokens, num_outputs_0_skip_tokens=hf_skip_tokens,
) )
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float", "bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts,
                model, dtype, max_tokens, num_logprobs,
                decoder_prompt_type) -> None:
    """Delegate to ``run_test`` with ``tensor_parallel_size=1``.

    Every parametrized value is forwarded unchanged; the prompts handed to
    ``run_test`` are the entry of ``example_encoder_decoder_prompts`` keyed
    by ``decoder_prompt_type``.
    """
    # Pick the prompt set matching the parametrized decoder prompt type.
    selected_prompts = example_encoder_decoder_prompts[decoder_prompt_type]

    # Keyword-only settings expected by run_test.
    settings = {
        "dtype": dtype,
        "max_tokens": max_tokens,
        "num_logprobs": num_logprobs,
        "tensor_parallel_size": 1,
    }
    run_test(hf_runner, vllm_runner, selected_prompts, decoder_prompt_type,
             model, **settings)
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM])
def test_models_distributed(hf_runner, vllm_runner,
                            example_encoder_decoder_prompts,
                            distributed_executor_backend, model, dtype,
                            max_tokens, num_logprobs,
                            decoder_prompt_type) -> None:
    """Delegate to ``run_test`` with ``tensor_parallel_size=2``.

    The parametrized ``distributed_executor_backend`` is passed straight
    through, and the prompts are the entry of
    ``example_encoder_decoder_prompts`` keyed by ``decoder_prompt_type``.
    """
    # Pick the prompt set matching the parametrized decoder prompt type.
    selected_prompts = example_encoder_decoder_prompts[decoder_prompt_type]

    # Keyword-only settings expected by run_test.
    settings = {
        "dtype": dtype,
        "max_tokens": max_tokens,
        "num_logprobs": num_logprobs,
        "tensor_parallel_size": 2,
        "distributed_executor_backend": distributed_executor_backend,
    }
    run_test(hf_runner, vllm_runner, selected_prompts, decoder_prompt_type,
             model, **settings)
...@@ -10,6 +10,7 @@ from pathlib import Path ...@@ -10,6 +10,7 @@ from pathlib import Path
from typing import Any, Callable, Dict, List, Optional from typing import Any, Callable, Dict, List, Optional
import openai import openai
import pytest
import requests import requests
from openai.types.completion import Completion from openai.types.completion import Completion
from transformers import AutoTokenizer from transformers import AutoTokenizer
...@@ -22,7 +23,8 @@ from vllm.engine.arg_utils import AsyncEngineArgs ...@@ -22,7 +23,8 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.cli_args import make_arg_parser from vllm.entrypoints.openai.cli_args import make_arg_parser
from vllm.model_executor.model_loader.loader import get_model_loader from vllm.model_executor.model_loader.loader import get_model_loader
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser, get_open_port, is_hip from vllm.utils import (FlexibleArgumentParser, cuda_device_count_stateless,
get_open_port, is_hip)
if current_platform.is_rocm(): if current_platform.is_rocm():
from amdsmi import (amdsmi_get_gpu_vram_usage, from amdsmi import (amdsmi_get_gpu_vram_usage,
...@@ -452,6 +454,22 @@ def fork_new_process_for_each_test( ...@@ -452,6 +454,22 @@ def fork_new_process_for_each_test(
return wrapper return wrapper
def multi_gpu_test(*, num_gpus: int):
    """
    Build a decorator for tests that need at least ``num_gpus`` GPUs.

    The returned decorator wraps the test with
    ``fork_new_process_for_each_test``, applies a ``skipif`` mark that fires
    when ``cuda_device_count_stateless()`` reports fewer than ``num_gpus``
    devices, and tags the test with the ``distributed_{num_gpus}_gpus``
    pytest mark.
    """
    marker_name = f"distributed_{num_gpus}_gpus"
    select_mark = getattr(pytest.mark, marker_name)

    # The device count is queried once, when the decorator factory is called.
    skip_mark = pytest.mark.skipif(
        cuda_device_count_stateless() < num_gpus,
        reason=f"Need at least {num_gpus} GPUs to run the test.",
    )

    def wrapper(f: Callable[_P, None]) -> Callable[_P, None]:
        # Innermost first: fork wrapper, then skip condition, then marker.
        forked = fork_new_process_for_each_test(f)
        skippable = skip_mark(forked)
        return select_mark(skippable)

    return wrapper
async def completions_with_server_args( async def completions_with_server_args(
prompts: List[str], prompts: List[str],
model_name: str, model_name: str,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment