Merge tag 'v0.7.2' into v0.7.2-dev

66b809cc · zhuwenwen · 37b63c24 · 0408efc6 · 66b809cc · 66b809cc
Commit 66b809cc authored Feb 08, 2025 by zhuwenwen
20 changed files
--- a/tests/models/multimodal/processing/test_phi3v.py
+++ b/tests/models/multimodal/processing/test_phi3v.py
+# SPDX-License-Identifier: Apache-2.0
 """Tests for phi3v's multimodal preprocessing kwargs."""
 import pytest
@@ -37,7 +38,10 @@ def test_processor_override(
        trust_remote_code=True,
        limit_mm_per_prompt={"image": num_imgs},
    )
-    tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer)
+    tokenizer = cached_get_tokenizer(
+        ctx.model_config.tokenizer,
+        trust_remote_code=ctx.model_config.trust_remote_code,
+    )
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
        tokenizer=tokenizer,

--- a/tests/models/multimodal/processing/test_qwen2_vl.py
+++ b/tests/models/multimodal/processing/test_qwen2_vl.py
+# SPDX-License-Identifier: Apache-2.0
 import pytest
 from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -31,7 +33,10 @@ def test_processor_override(
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": num_imgs},
    )
-    tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer)
+    tokenizer = cached_get_tokenizer(
+        ctx.model_config.tokenizer,
+        trust_remote_code=ctx.model_config.trust_remote_code,
+    )
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
        tokenizer=tokenizer,

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
+# SPDX-License-Identifier: Apache-2.0
 from dataclasses import dataclass, field
 from typing import AbstractSet, Any, Literal, Mapping, Optional
@@ -222,8 +224,7 @@ _CROSS_ENCODER_EXAMPLE_MODELS = {
 _MULTIMODAL_EXAMPLE_MODELS = {
    # [Decoder-only]
-    "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria",
+    "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
-                                                    min_transformers_version="4.48"),
    "Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b"),  # noqa: E501
    "ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"),  # noqa: E501
    "ChatGLMModel": _HfExamplesInfo("THUDM/glm-4v-9b",
@@ -263,6 +264,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                       trust_remote_code=True),
    "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"),  # noqa: E501
    "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"),  # noqa: E501
+    "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct",  # noqa: E501
+                                                          min_transformers_version="4.49"),  # noqa: E501
    "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3",
                                     trust_remote_code=True),
    # [Encoder-decoder]
@@ -276,7 +279,11 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
    "MedusaModel": _HfExamplesInfo("JackFram/llama-68m",
                                   speculative_model="abhigoyal/vllm-medusa-llama-68m-random"),  # noqa: E501
    "MLPSpeculatorPreTrainedModel": _HfExamplesInfo("JackFram/llama-160m",
-                                                    speculative_model="ibm-fms/llama-160m-accelerator"),  # noqa: E501
+                                                    speculative_model="ibm-ai-platform/llama-160m-accelerator"),  # noqa: E501
+}
+_FALLBACK_MODEL = {
+    "TransformersModel": _HfExamplesInfo("ArthurZ/Ilama-3.2-1B", trust_remote_code=True),  # noqa: E501
 }
 _EXAMPLE_MODELS = {
@@ -285,6 +292,7 @@ _EXAMPLE_MODELS = {
    **_CROSS_ENCODER_EXAMPLE_MODELS,
    **_MULTIMODAL_EXAMPLE_MODELS,
    **_SPECULATIVE_DECODING_EXAMPLE_MODELS,
+    **_FALLBACK_MODEL,
 }

--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
+# SPDX-License-Identifier: Apache-2.0
 from unittest.mock import patch
 import pytest

--- a/tests/models/test_oot_registration.py
+++ b/tests/models/test_oot_registration.py
+# SPDX-License-Identifier: Apache-2.0
 import os
 import pytest
@@ -13,7 +15,9 @@ def test_plugin(dummy_opt_path):
    os.environ["VLLM_PLUGINS"] = ""
    with pytest.raises(Exception) as excinfo:
        LLM(model=dummy_opt_path, load_format="dummy")
-    assert "are not supported for now" in str(excinfo.value)
+    error_msg = "has no vLLM implementation and " \
+                "the Transformers implementation is not compatible with vLLM."
+    assert (error_msg in str(excinfo.value))
 @fork_new_process_for_each_test

--- a/tests/models/test_registry.py
+++ b/tests/models/test_registry.py
+# SPDX-License-Identifier: Apache-2.0
 import warnings
 import pytest

--- a/tests/models/test_transformers.py
+++ b/tests/models/test_transformers.py
+# SPDX-License-Identifier: Apache-2.0
+"""Test the functionality of the Transformers backend.
+Run `pytest tests/models/test_transformers.py`.
+"""
+from contextlib import nullcontext
+from typing import Type
+import pytest
+from ..conftest import HfRunner, VllmRunner
+from ..utils import multi_gpu_test
+from .utils import check_logprobs_close
+def check_implementation(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    example_prompts: list[str],
+    model: str,
+    **kwargs,
+):
+    # Generate greedy logprobs for `example_prompts` with both runners and
+    # check that the two sets of outputs are close.
+    #
+    # hf_runner / vllm_runner: runner factories, each used as a context
+    #     manager around a single generation call.
+    # example_prompts: prompts passed unchanged to both runners.
+    # model: model identifier passed to both runners.
+    # **kwargs: forwarded to `vllm_runner` only; `hf_runner` receives just
+    #     `model`.
+
+    # Both runners use the same generation budget and logprob count.
+    max_tokens = 32
+    num_logprobs = 5
+    # Each runner is opened and closed in turn, so the vLLM context has
+    # exited before the HF runner is created.
+    with vllm_runner(model, **kwargs) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
+    with hf_runner(model) as hf_model:
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)
+    # HF outputs are passed as side 0 ("hf"), vLLM outputs as side 1 ("vllm").
+    check_logprobs_close(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
+# Run `check_implementation` for each (model, model_impl) pair below.
+# The one pair marked as not backend compatible (gpt2 with
+# model_impl="transformers") is expected to raise a ValueError instead of
+# producing comparable outputs.
+@pytest.mark.parametrize(
+    "model,model_impl",
+    [
+        ("meta-llama/Llama-3.2-1B-Instruct", "transformers"),
+        ("openai-community/gpt2", "transformers"),
+        ("ArthurZ/Ilama-3.2-1B", "auto"),  # CUSTOM CODE
+        ("meta-llama/Llama-3.2-1B-Instruct", "auto"),
+    ])  # trust_remote_code=True by default
+def test_models(hf_runner, vllm_runner, example_prompts, model,
+                model_impl) -> None:
+    # Default: no exception expected, so the `with` below is a no-op wrapper.
+    maybe_raises = nullcontext()
+    if model == "openai-community/gpt2" and model_impl == "transformers":
+        # Model is not backend compatible
+        # `match` is a regex searched against the exception message.
+        maybe_raises = pytest.raises(
+            ValueError,
+            match="The Transformers implementation.*not compatible with vLLM")
+    with maybe_raises:
+        # `model_impl` travels through **kwargs, so it reaches the vLLM
+        # runner only.
+        check_implementation(hf_runner,
+                             vllm_runner,
+                             example_prompts,
+                             model,
+                             model_impl=model_impl)
+# Same HF-vs-vLLM logprob comparison as `test_models`, but with the vLLM
+# runner configured for model_impl="transformers" and tensor_parallel_size=2.
+# The `multi_gpu_test(num_gpus=2)` decorator comes from `..utils`;
+# presumably it gates the test on two GPUs being available (confirm there).
+@multi_gpu_test(num_gpus=2)
+def test_distributed(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+):
+    # These kwargs are forwarded to the vLLM runner only.
+    kwargs = {"model_impl": "transformers", "tensor_parallel_size": 2}
+    check_implementation(hf_runner, vllm_runner, example_prompts,
+                         "meta-llama/Llama-3.2-1B-Instruct", **kwargs)
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
+# SPDX-License-Identifier: Apache-2.0
 import warnings
 from typing import Dict, List, Optional, Sequence, Tuple, Union

--- a/tests/mq_llm_engine/test_abort.py
+++ b/tests/mq_llm_engine/test_abort.py
+# SPDX-License-Identifier: Apache-2.0
 """Test that aborting is handled properly."""
 import asyncio

--- a/tests/mq_llm_engine/test_error_handling.py
+++ b/tests/mq_llm_engine/test_error_handling.py
+# SPDX-License-Identifier: Apache-2.0
 """Test that various errors are handled properly."""
 import asyncio

--- a/tests/mq_llm_engine/test_load.py
+++ b/tests/mq_llm_engine/test_load.py
+# SPDX-License-Identifier: Apache-2.0
 """Test that the MQLLMEngine is able to handle 10k concurrent requests."""
 import asyncio

--- a/tests/mq_llm_engine/utils.py
+++ b/tests/mq_llm_engine/utils.py
+# SPDX-License-Identifier: Apache-2.0
 import asyncio
 import multiprocessing
 from typing import Callable, Tuple, Union

--- a/tests/multi_step/test_correctness_async_llm.py
+++ b/tests/multi_step/test_correctness_async_llm.py
+# SPDX-License-Identifier: Apache-2.0
 # Test the AsyncLLMEngine with multi-step-decoding
 from typing import List, Optional

--- a/tests/multi_step/test_correctness_llm.py
+++ b/tests/multi_step/test_correctness_llm.py
+# SPDX-License-Identifier: Apache-2.0
 # Test the LLMEngine with multi-step-decoding
 import copy

--- a/tests/multimodal/test_inputs.py
+++ b/tests/multimodal/test_inputs.py
+# SPDX-License-Identifier: Apache-2.0
 import torch
 from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors

--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
+# SPDX-License-Identifier: Apache-2.0
 from contextlib import nullcontext
+from types import MethodType
 from typing import cast
 from unittest.mock import MagicMock
 import numpy as np
 import pytest
+from transformers import ProcessorMixin
 from vllm.config import ModelConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -634,3 +638,70 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
            mm_data=mm_data,
            hf_processor_mm_kwargs={},
        )
+# Test double that wraps a real HF processor: attribute access is delegated
+# to the wrapped processor, but calling the proxy skips any real processing
+# and simply echoes back the `exists` keyword argument it received.
+class _ProcessorProxy:
+    def __init__(self, processor: ProcessorMixin) -> None:
+        super().__init__()
+        # Double-underscore name is mangled to `_ProcessorProxy__processor`,
+        # so it is found by normal lookup and does not recurse through
+        # `__getattr__` below.
+        self.__processor = processor
+    def __getattr__(self, key: str):
+        # Only reached when normal attribute lookup on the proxy fails;
+        # forward the lookup to the wrapped processor.
+        return getattr(self.__processor, key)
+    def __call__(
+        self,
+        text=None,
+        images=None,
+        videos=None,
+        exists=None,
+        return_tensors=None,
+    ):
+        # The explicit signature (no **kwargs) defines which keyword
+        # arguments are accepted; every argument except `exists` is ignored.
+        return dict(exists=exists)
+# Check which `mm_kwargs` entries reach the HF processor's `__call__` when
+# going through `processor._call_hf_processor`. The real HF processor is
+# swapped for `_ProcessorProxy`, whose `__call__` accepts `exists` but has no
+# `does_not_exist` parameter and returns `{"exists": <value received>}`.
+@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-7B-Instruct"])  # Dummy
+# yapf: disable
+@pytest.mark.parametrize(
+    ("call_kwargs", "expected_kwargs"),
+    [
+        # Should ignore invalid kwargs
+        ({"does_not_exist": 100}, {"exists": None}),
+        ({"exists": 1}, {"exists": 1}),
+        ({"does_not_exist": 100, "exists": 1}, {"exists": 1}),
+    ],
+)
+# yapf: enable
+def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
+    model_config = ModelConfig(
+        model=model_id,
+        task="auto",
+        tokenizer=model_id,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        seed=0,
+        dtype="half",
+        revision=None,
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        model_config,
+        tokenizer=cached_get_tokenizer(model_config.tokenizer),
+    )
+    # Keep a handle on the original factory so the replacement can still
+    # build the real HF processor for the proxy to wrap.
+    orig_get_hf_processor = processor.info.get_hf_processor
+    def get_hf_processor(self, **kwargs):
+        # The factory itself must receive the caller's kwargs unfiltered.
+        assert kwargs == call_kwargs
+        return _ProcessorProxy(orig_get_hf_processor())
+    # Bind the replacement to this `processor.info` instance only.
+    processor.info.get_hf_processor = MethodType(get_hf_processor,
+                                                 processor.info)
+    out_kwargs = processor._call_hf_processor(
+        prompt="",
+        mm_data={},
+        mm_kwargs=call_kwargs,
+    )
+    # The proxy echoes back `exists`, so this shows what was passed to
+    # `__call__`: unknown keys dropped, `exists` kept (None when not given).
+    assert out_kwargs == expected_kwargs
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
+# SPDX-License-Identifier: Apache-2.0
 import base64
 import mimetypes
 import os

--- a/tests/multimodal/utils.py
+++ b/tests/multimodal/utils.py
+# SPDX-License-Identifier: Apache-2.0
 import numpy as np
 from PIL import Image

--- a/tests/neuron/test_prefix_prefill.py
+++ b/tests/neuron/test_prefix_prefill.py
+# SPDX-License-Identifier: Apache-2.0
 import random
 from typing import Optional

--- a/tests/plugins/vllm_add_dummy_model/setup.py
+++ b/tests/plugins/vllm_add_dummy_model/setup.py
+# SPDX-License-Identifier: Apache-2.0
 from setuptools import setup
 setup(name='vllm_add_dummy_model',