Commit 66b809cc authored by zhuwenwen
Browse files

Merge tag 'v0.7.2' into v0.7.2-dev

parents 37b63c24 0408efc6
# SPDX-License-Identifier: Apache-2.0
"""Tests for phi3v's multimodal preprocessing kwargs.""" """Tests for phi3v's multimodal preprocessing kwargs."""
import pytest import pytest
...@@ -37,7 +38,10 @@ def test_processor_override( ...@@ -37,7 +38,10 @@ def test_processor_override(
trust_remote_code=True, trust_remote_code=True,
limit_mm_per_prompt={"image": num_imgs}, limit_mm_per_prompt={"image": num_imgs},
) )
tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer) tokenizer = cached_get_tokenizer(
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
)
processor = MULTIMODAL_REGISTRY.create_processor( processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config, ctx.model_config,
tokenizer=tokenizer, tokenizer=tokenizer,
......
# SPDX-License-Identifier: Apache-2.0
import pytest import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
...@@ -31,7 +33,10 @@ def test_processor_override( ...@@ -31,7 +33,10 @@ def test_processor_override(
mm_processor_kwargs=None, mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs}, limit_mm_per_prompt={"image": num_imgs},
) )
tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer) tokenizer = cached_get_tokenizer(
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
)
processor = MULTIMODAL_REGISTRY.create_processor( processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config, ctx.model_config,
tokenizer=tokenizer, tokenizer=tokenizer,
......
# SPDX-License-Identifier: Apache-2.0
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import AbstractSet, Any, Literal, Mapping, Optional from typing import AbstractSet, Any, Literal, Mapping, Optional
...@@ -222,8 +224,7 @@ _CROSS_ENCODER_EXAMPLE_MODELS = { ...@@ -222,8 +224,7 @@ _CROSS_ENCODER_EXAMPLE_MODELS = {
_MULTIMODAL_EXAMPLE_MODELS = { _MULTIMODAL_EXAMPLE_MODELS = {
# [Decoder-only] # [Decoder-only]
"AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria", "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
min_transformers_version="4.48"),
"Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b"), # noqa: E501 "Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b"), # noqa: E501
"ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501 "ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501
"ChatGLMModel": _HfExamplesInfo("THUDM/glm-4v-9b", "ChatGLMModel": _HfExamplesInfo("THUDM/glm-4v-9b",
...@@ -263,6 +264,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -263,6 +264,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code=True), trust_remote_code=True),
"Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"), # noqa: E501 "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"), # noqa: E501
"Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501 "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501
"Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct", # noqa: E501
min_transformers_version="4.49"), # noqa: E501
"UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3", "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3",
trust_remote_code=True), trust_remote_code=True),
# [Encoder-decoder] # [Encoder-decoder]
...@@ -276,7 +279,11 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { ...@@ -276,7 +279,11 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
"MedusaModel": _HfExamplesInfo("JackFram/llama-68m", "MedusaModel": _HfExamplesInfo("JackFram/llama-68m",
speculative_model="abhigoyal/vllm-medusa-llama-68m-random"), # noqa: E501 speculative_model="abhigoyal/vllm-medusa-llama-68m-random"), # noqa: E501
"MLPSpeculatorPreTrainedModel": _HfExamplesInfo("JackFram/llama-160m", "MLPSpeculatorPreTrainedModel": _HfExamplesInfo("JackFram/llama-160m",
speculative_model="ibm-fms/llama-160m-accelerator"), # noqa: E501 speculative_model="ibm-ai-platform/llama-160m-accelerator"), # noqa: E501
}
_FALLBACK_MODEL = {
"TransformersModel": _HfExamplesInfo("ArthurZ/Ilama-3.2-1B", trust_remote_code=True), # noqa: E501
} }
_EXAMPLE_MODELS = { _EXAMPLE_MODELS = {
...@@ -285,6 +292,7 @@ _EXAMPLE_MODELS = { ...@@ -285,6 +292,7 @@ _EXAMPLE_MODELS = {
**_CROSS_ENCODER_EXAMPLE_MODELS, **_CROSS_ENCODER_EXAMPLE_MODELS,
**_MULTIMODAL_EXAMPLE_MODELS, **_MULTIMODAL_EXAMPLE_MODELS,
**_SPECULATIVE_DECODING_EXAMPLE_MODELS, **_SPECULATIVE_DECODING_EXAMPLE_MODELS,
**_FALLBACK_MODEL,
} }
......
# SPDX-License-Identifier: Apache-2.0
from unittest.mock import patch from unittest.mock import patch
import pytest import pytest
......
# SPDX-License-Identifier: Apache-2.0
import os import os
import pytest import pytest
...@@ -13,7 +15,9 @@ def test_plugin(dummy_opt_path): ...@@ -13,7 +15,9 @@ def test_plugin(dummy_opt_path):
os.environ["VLLM_PLUGINS"] = "" os.environ["VLLM_PLUGINS"] = ""
with pytest.raises(Exception) as excinfo: with pytest.raises(Exception) as excinfo:
LLM(model=dummy_opt_path, load_format="dummy") LLM(model=dummy_opt_path, load_format="dummy")
assert "are not supported for now" in str(excinfo.value) error_msg = "has no vLLM implementation and " \
"the Transformers implementation is not compatible with vLLM."
assert (error_msg in str(excinfo.value))
@fork_new_process_for_each_test @fork_new_process_for_each_test
......
# SPDX-License-Identifier: Apache-2.0
import warnings import warnings
import pytest import pytest
......
# SPDX-License-Identifier: Apache-2.0
"""Test the functionality of the Transformers backend.
Run `pytest tests/models/test_transformers.py`.
"""
from contextlib import nullcontext
from typing import Type
import pytest
from ..conftest import HfRunner, VllmRunner
from ..utils import multi_gpu_test
from .utils import check_logprobs_close
def check_implementation(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    example_prompts: list[str],
    model: str,
    **kwargs,
):
    """Compare greedy logprobs from the vLLM runner against the HF runner.

    Args:
        hf_runner: Context-manager factory for the HF reference model.
        vllm_runner: Context-manager factory for the vLLM model.
        example_prompts: Prompts fed to both runners.
        model: Model identifier passed to both runners.
        **kwargs: Forwarded to ``vllm_runner`` only; ``hf_runner`` is built
            from ``model`` alone.
    """
    # (prompts, max_tokens, num_logprobs) -- identical for both runners.
    generation_args = (example_prompts, 32, 5)

    # vLLM runs first and its context is closed before the HF model is
    # created (NOTE(review): sequential layout assumed -- confirm).
    with vllm_runner(model, **kwargs) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(*generation_args)

    with hf_runner(model) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            *generation_args)

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
@pytest.mark.parametrize(
    "model,model_impl",
    [
        ("meta-llama/Llama-3.2-1B-Instruct", "transformers"),
        ("openai-community/gpt2", "transformers"),
        ("ArthurZ/Ilama-3.2-1B", "auto"),  # CUSTOM CODE
        ("meta-llama/Llama-3.2-1B-Instruct", "auto"),
    ])  # trust_remote_code=True by default
def test_models(hf_runner, vllm_runner, example_prompts, model,
                model_impl) -> None:
    """Run ``check_implementation`` for each (model, model_impl) pair.

    The gpt2 + "transformers" combination is expected to raise a
    ``ValueError`` instead of producing outputs.
    """
    # Model is not backend compatible
    incompatible_case = ("openai-community/gpt2", "transformers")

    if (model, model_impl) == incompatible_case:
        expectation = pytest.raises(
            ValueError,
            match="The Transformers implementation.*not compatible with vLLM")
    else:
        expectation = nullcontext()

    with expectation:
        check_implementation(
            hf_runner,
            vllm_runner,
            example_prompts,
            model,
            model_impl=model_impl,
        )
@multi_gpu_test(num_gpus=2)
def test_distributed(
    hf_runner,
    vllm_runner,
    example_prompts,
):
    """Run the HF-vs-vLLM logprob comparison with tensor_parallel_size=2."""
    check_implementation(
        hf_runner,
        vllm_runner,
        example_prompts,
        "meta-llama/Llama-3.2-1B-Instruct",
        model_impl="transformers",
        tensor_parallel_size=2,
    )
# SPDX-License-Identifier: Apache-2.0
import warnings import warnings
from typing import Dict, List, Optional, Sequence, Tuple, Union from typing import Dict, List, Optional, Sequence, Tuple, Union
......
# SPDX-License-Identifier: Apache-2.0
"""Test that aborting is handled properly.""" """Test that aborting is handled properly."""
import asyncio import asyncio
......
# SPDX-License-Identifier: Apache-2.0
"""Test that various errors are handled properly.""" """Test that various errors are handled properly."""
import asyncio import asyncio
......
# SPDX-License-Identifier: Apache-2.0
"""Test that the MQLLMEngine is able to handle 10k concurrent requests.""" """Test that the MQLLMEngine is able to handle 10k concurrent requests."""
import asyncio import asyncio
......
# SPDX-License-Identifier: Apache-2.0
import asyncio import asyncio
import multiprocessing import multiprocessing
from typing import Callable, Tuple, Union from typing import Callable, Tuple, Union
......
# SPDX-License-Identifier: Apache-2.0
# Test the AsyncLLMEngine with multi-step-decoding # Test the AsyncLLMEngine with multi-step-decoding
from typing import List, Optional from typing import List, Optional
......
# SPDX-License-Identifier: Apache-2.0
# Test the LLMEngine with multi-step-decoding # Test the LLMEngine with multi-step-decoding
import copy import copy
......
# SPDX-License-Identifier: Apache-2.0
import torch import torch
from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors
......
# SPDX-License-Identifier: Apache-2.0
from contextlib import nullcontext from contextlib import nullcontext
from types import MethodType
from typing import cast from typing import cast
from unittest.mock import MagicMock from unittest.mock import MagicMock
import numpy as np import numpy as np
import pytest import pytest
from transformers import ProcessorMixin
from vllm.config import ModelConfig from vllm.config import ModelConfig
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
...@@ -634,3 +638,70 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): ...@@ -634,3 +638,70 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
mm_data=mm_data, mm_data=mm_data,
hf_processor_mm_kwargs={}, hf_processor_mm_kwargs={},
) )
class _ProcessorProxy:
    """Wrap a processor, replacing only what happens when it is called.

    Attribute lookups that miss on the proxy are delegated to the wrapped
    processor. Calling the proxy does not invoke the wrapped processor;
    it just echoes back the ``exists`` argument.
    """

    def __init__(self, processor: ProcessorMixin) -> None:
        super().__init__()

        self.__processor = processor

    def __getattr__(self, key: str):
        # Invoked only when normal attribute lookup on the proxy fails.
        wrapped = self.__processor
        return getattr(wrapped, key)

    def __call__(
        self,
        text=None,
        images=None,
        videos=None,
        exists=None,
        return_tensors=None,
    ):
        # Every argument other than ``exists`` is accepted and ignored.
        return {"exists": exists}
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-7B-Instruct"])  # Dummy
# yapf: disable
@pytest.mark.parametrize(
    ("call_kwargs", "expected_kwargs"),
    [
        # Should ignore invalid kwargs
        ({"does_not_exist": 100}, {"exists": None}),
        ({"exists": 1}, {"exists": 1}),
        ({"does_not_exist": 100, "exists": 1}, {"exists": 1}),
    ],
)
# yapf: enable
def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
    """Check which ``mm_kwargs`` end up in the HF processor call.

    ``get_hf_processor`` on the processor's info object is replaced by a
    stub that asserts it received ``call_kwargs`` unchanged and returns a
    ``_ProcessorProxy``. The proxy's ``__call__`` only accepts ``exists``
    (besides the standard inputs) and echoes it back, so the value returned
    by ``_call_hf_processor`` shows which kwargs were passed to the call.
    """
    model_config = ModelConfig(
        model=model_id,
        task="auto",
        tokenizer=model_id,
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
        dtype="half",
        revision=None,
    )

    # Forward trust_remote_code from the config, consistent with how the
    # other processor tests build their tokenizer.
    tokenizer = cached_get_tokenizer(
        model_config.tokenizer,
        trust_remote_code=model_config.trust_remote_code,
    )
    processor = MULTIMODAL_REGISTRY.create_processor(
        model_config,
        tokenizer=tokenizer,
    )

    orig_get_hf_processor = processor.info.get_hf_processor

    def get_hf_processor(self, **kwargs):
        # All kwargs, valid or not, must reach get_hf_processor untouched.
        assert kwargs == call_kwargs
        # The real processor is built without the extra kwargs.
        return _ProcessorProxy(orig_get_hf_processor())

    # Bind the stub to this info instance only.
    processor.info.get_hf_processor = MethodType(get_hf_processor,
                                                 processor.info)

    out_kwargs = processor._call_hf_processor(
        prompt="",
        mm_data={},
        mm_kwargs=call_kwargs,
    )

    assert out_kwargs == expected_kwargs
# SPDX-License-Identifier: Apache-2.0
import base64 import base64
import mimetypes import mimetypes
import os import os
......
# SPDX-License-Identifier: Apache-2.0
import numpy as np import numpy as np
from PIL import Image from PIL import Image
......
# SPDX-License-Identifier: Apache-2.0
import random import random
from typing import Optional from typing import Optional
......
# SPDX-License-Identifier: Apache-2.0
from setuptools import setup from setuptools import setup
setup(name='vllm_add_dummy_model', setup(name='vllm_add_dummy_model',
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment