Merge tag 'v0.6.1.post2' into v0.6.1.post2-dev

ad58e9b3 · zhuwenwen · 408f663a · 9ba0817f · ad58e9b3 · ad58e9b3
Commit ad58e9b3 authored Sep 18, 2024 by zhuwenwen
20 changed files
--- a/tests/models/test_fp8.py
+++ b/tests/models/test_fp8.py
@@ -10,7 +10,7 @@ import pytest
 from tests.kernels.utils import override_backend_env_variable
 from tests.quantization.utils import is_quant_method_supported
-from ..models.utils import check_logprobs_close
+from ...utils import check_logprobs_close
 os.environ["TOKENIZERS_PARALLELISM"] = "true"

--- a/tests/models/test_gguf.py
+++ b/tests/models/test_gguf.py
@@ -11,7 +11,7 @@ from transformers import AutoTokenizer
 from tests.quantization.utils import is_quant_method_supported
-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close
 os.environ["TOKENIZERS_PARALLELISM"] = "true"

--- a/tests/models/test_gptq_marlin.py
+++ b/tests/models/test_gptq_marlin.py
@@ -15,7 +15,7 @@ import pytest
 from tests.quantization.utils import is_quant_method_supported
 from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT
-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close
 os.environ["TOKENIZERS_PARALLELISM"] = "true"

--- a/tests/models/test_gptq_marlin_24.py
+++ b/tests/models/test_gptq_marlin_24.py
@@ -10,9 +10,10 @@ from dataclasses import dataclass
 import pytest
-from tests.models.utils import check_logprobs_close
 from tests.quantization.utils import is_quant_method_supported
+from ...utils import check_logprobs_close
 @dataclass
 class ModelPair:

--- a/tests/models/test_granite.py
+++ b/tests/models/test_granite.py
@@ -6,7 +6,7 @@ import importlib.metadata
 import pytest
-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close
 TRANSFORMERS_VERSION = tuple(
    map(int,

--- a/tests/models/test_jamba.py
+++ b/tests/models/test_jamba.py
 import pytest
-from tests.models.utils import check_outputs_equal
 from vllm.worker.model_runner import _get_graph_batch_size
+from ...utils import check_outputs_equal
 MODELS = ["ai21labs/Jamba-tiny-random"]

--- a/tests/models/test_marlin.py
+++ b/tests/models/test_marlin.py
@@ -16,7 +16,7 @@ import pytest
 from tests.quantization.utils import is_quant_method_supported
-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close
 @dataclass

--- a/tests/models/test_mistral.py
+++ b/tests/models/test_mistral.py
@@ -4,7 +4,7 @@ Run `pytest tests/models/test_mistral.py`.
 """
 import pytest
-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close
 MODELS = [
    "mistralai/Mistral-7B-Instruct-v0.1",

--- a/tests/models/test_modelopt.py
+++ b/tests/models/test_modelopt.py
--- a/tests/models/test_models.py
+++ b/tests/models/test_models.py
@@ -7,7 +7,7 @@ Run `pytest tests/models/test_models.py`.
 """
 import pytest
-from .utils import check_outputs_equal
+from ...utils import check_outputs_equal
 MODELS = [
    "facebook/opt-125m",

--- a/tests/models/test_phimoe.py
+++ b/tests/models/test_phimoe.py
@@ -7,7 +7,7 @@ import torch
 from vllm.utils import is_cpu
-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close
 MODELS = [
    "microsoft/Phi-3.5-MoE-instruct",

--- a/tests/models/decoder_only/vision_language/__init__.py
+++ b/tests/models/decoder_only/vision_language/__init__.py
--- a/tests/models/test_blip2.py
+++ b/tests/models/test_blip2.py
@@ -6,10 +6,8 @@ from transformers import AutoModelForVision2Seq, AutoTokenizer
 from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
-from ..conftest import IMAGE_ASSETS
+from ....conftest import IMAGE_ASSETS
-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close
-pytestmark = pytest.mark.vlm
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
@@ -56,7 +54,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                dtype: str, max_tokens: int, num_logprobs: int) -> None:
    """Inference result should be the same between hf and vllm.
-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalData objects and corresponding
    MultiModalConfig as input.

--- a/tests/distributed/test_multimodal_broadcast.py
+++ b/tests/distributed/test_multimodal_broadcast.py
-"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
-Run:
-```sh
-pytest -s -v test_multimodal_broadcast.py
-```
-"""
 import pytest
-from vllm.utils import cuda_device_count_stateless
+from ....utils import multi_gpu_test
-from ..utils import fork_new_process_for_each_test
-@pytest.mark.skipif(cuda_device_count_stateless() < 2,
+@multi_gpu_test(num_gpus=2)
-                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
-@pytest.mark.parametrize("model, distributed_executor_backend", [
+@pytest.mark.parametrize("model", [
-    ("llava-hf/llava-1.5-7b-hf", "ray"),
+    "llava-hf/llava-1.5-7b-hf",
-    ("llava-hf/llava-v1.6-mistral-7b-hf", "ray"),
+    "llava-hf/llava-v1.6-mistral-7b-hf",
-    ("facebook/chameleon-7b", "ray"),
+    "facebook/chameleon-7b",
-    ("llava-hf/llava-1.5-7b-hf", "mp"),
-    ("llava-hf/llava-v1.6-mistral-7b-hf", "mp"),
-    ("facebook/chameleon-7b", "mp"),
 ])
-@fork_new_process_for_each_test
+def test_models(hf_runner, vllm_runner, image_assets,
-def test_models(hf_runner, vllm_runner, image_assets, model: str,
+                distributed_executor_backend, model) -> None:
-                distributed_executor_backend: str) -> None:
    dtype = "half"
    max_tokens = 5
@@ -33,13 +19,11 @@ def test_models(hf_runner, vllm_runner, image_assets, model: str,
    tensor_parallel_size = 2
    if model.startswith("llava-hf/llava-1.5"):
-        from ..models.test_llava import models, run_test
+        from .test_llava import models, run_test
    elif model.startswith("llava-hf/llava-v1.6"):
-        from ..models.test_llava_next import run_test  # type: ignore[no-redef]
+        from .test_llava_next import models, run_test  # type: ignore[no-redef]
-        from ..models.test_llava_next import models
    elif model.startswith("facebook/chameleon"):
-        from ..models.test_chameleon import run_test  # type: ignore[no-redef]
+        from .test_chameleon import models, run_test  # type: ignore[no-redef]
-        from ..models.test_chameleon import models
    else:
        raise NotImplementedError(f"Unsupported model: {model}")

--- a/tests/models/test_chameleon.py
+++ b/tests/models/test_chameleon.py
@@ -6,10 +6,8 @@ from transformers import AutoModelForVision2Seq, BatchEncoding
 from vllm.multimodal.utils import rescale_image_size
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
-from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
+from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
-from .utils import check_outputs_equal
+from ...utils import check_outputs_equal
-pytestmark = pytest.mark.vlm
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
@@ -36,7 +34,7 @@ def run_test(
 ):
    """Inference result should be the same between hf and vllm.
-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects 
    and corresponding vision language config as input.

--- a/tests/models/test_fuyu.py
+++ b/tests/models/test_fuyu.py
@@ -6,10 +6,8 @@ from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
 from vllm.utils import is_cpu
-from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
+from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close
-pytestmark = pytest.mark.vlm
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
@@ -46,7 +44,7 @@ def run_test(
 ):
    """Inference result should be the same between hf and vllm.
-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects 
    and corresponding MultiModalConfig as input.

--- a/tests/models/test_intern_vit.py
+++ b/tests/models/test_intern_vit.py
@@ -6,9 +6,7 @@ import torch.nn as nn
 from huggingface_hub import snapshot_download
 from transformers import AutoConfig, AutoModel, CLIPImageProcessor
-from ..conftest import _ImageAssets, cleanup
+from ....conftest import _ImageAssets, cleanup
-pytestmark = pytest.mark.vlm
 # we use snapshot_download to prevent conflicts between
 # dynamic_module and trust_remote_code for hf_runner

--- a/tests/models/test_internvl.py
+++ b/tests/models/test_internvl.py
@@ -9,11 +9,9 @@ from transformers import AutoConfig
 from vllm.multimodal.utils import rescale_image_size
 from vllm.utils import is_cpu
-from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
+from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
                          _ImageAssets)
-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close
-pytestmark = pytest.mark.vlm
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
@@ -78,7 +76,7 @@ def run_test(
 ):
    """Inference result should be the same between hf and vllm.
-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects 
    and corresponding MultiModalConfig as input.
@@ -331,6 +329,41 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
    )
+@pytest.mark.parametrize("model", ["OpenGVLab/InternVL2-2B"])
+@pytest.mark.parametrize("size_factors", [[0.5, 1.0]])
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
+@torch.inference_mode()
+def test_different_num_patches(hf_runner, vllm_runner, image_assets, model,
+                               size_factors, dtype: str, max_tokens: int,
+                               num_logprobs: int) -> None:
+    images = [asset.pil_image.resize((896, 896)) for asset in image_assets]
+    inputs_batching = [(
+        [prompt for _ in size_factors],
+        [rescale_image_size(image, factor) for factor in size_factors],
+    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
+    inputs_multi_images = [
+        ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
+         [[rescale_image_size(image, factor) for image in images]
+          for factor in size_factors])
+    ]
+    for inputs in [inputs_batching, inputs_multi_images]:
+        run_test(
+            hf_runner,
+            vllm_runner,
+            inputs,
+            model,
+            dtype=dtype,
+            max_tokens=max_tokens,
+            num_logprobs=num_logprobs,
+            mm_limit=2,
+            tensor_parallel_size=1,
+        )
 @pytest.mark.parametrize(
    "models", [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")])
 @pytest.mark.parametrize(

--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -8,11 +8,9 @@ from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
-from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
+from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
                          _ImageAssets)
-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close
-pytestmark = pytest.mark.vlm
 _LIMIT_IMAGE_PER_PROMPT = 4
@@ -143,7 +141,7 @@ def _run_test(
 ):
    """Inference result should be the same between hf and vllm.
-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects 
    and corresponding MultiModalConfig as input.
@@ -239,7 +237,7 @@ def _run_test(
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
 def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
-                dtype: str, max_tokens: int, num_logprobs: int) -> None:
+                dtype, max_tokens, num_logprobs) -> None:
    run_test(
        hf_runner,
        vllm_runner,

--- a/tests/models/test_llava_image_embeds.py
+++ b/tests/models/test_llava_image_embeds.py
@@ -5,10 +5,8 @@ from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
 from vllm.sequence import SampleLogprobs
-from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
+from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close
-pytestmark = pytest.mark.vlm
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
@@ -62,7 +60,7 @@ def run_test(
 ):
    """Inference result should be the same between hf and vllm.
-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects 
    and corresponding vision language config as input.