# test_processor.py
1
2
import numpy as np
import pytest
3
from transformers import CLIPImageProcessor, LlavaNextImageProcessor
4
5
6
7
8

from vllm.config import ModelConfig, VisionLanguageConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import ImagePixelData

9
from ..conftest import _STR_DTYPE_TO_TORCH_DTYPE
10

11
12

@pytest.mark.parametrize("dtype", ["half", "float"])
def test_clip_image_processor(image_assets, dtype):
    """Compare vLLM's image preprocessing with HuggingFace's CLIP processor.

    For every asset in ``image_assets``, the PIL image is passed through
    ``CLIPImageProcessor.preprocess`` and through
    ``MULTIMODAL_REGISTRY.process_input``. Both results must expose the same
    keys, and each pair of tensors must agree in shape and, within the
    default ``np.allclose`` tolerances, in value.

    Args:
        image_assets: Iterable of assets exposing a ``pil_image`` attribute
            (presumably a pytest fixture defined elsewhere -- confirm).
        dtype: dtype name given to ``ModelConfig`` and used to cast the
            HuggingFace output through ``_STR_DTYPE_TO_TORCH_DTYPE``.
    """
    MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
    IMAGE_HEIGHT = IMAGE_WIDTH = 560

    hf_processor = CLIPImageProcessor.from_pretrained(MODEL_NAME)
    assert isinstance(hf_processor, CLIPImageProcessor)

    model_config = ModelConfig(
        model=MODEL_NAME,
        tokenizer=MODEL_NAME,
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
        dtype=dtype,
        revision=None,
    )
    vlm_config = VisionLanguageConfig(
        image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
        image_token_id=32000,
        image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
        image_feature_size=576,
        image_processor=MODEL_NAME,
        image_processor_revision=None,
    )

    for asset in image_assets:
        # Cast the HuggingFace output to the dtype under test so both sides
        # are compared at the same precision.
        hf_result = hf_processor.preprocess(
            asset.pil_image,
            return_tensors="pt",
        ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])
        vllm_result = MULTIMODAL_REGISTRY.process_input(
            ImagePixelData(asset.pil_image),
            model_config=model_config,
            vlm_config=vlm_config,
        )

        assert hf_result.keys() == vllm_result.keys()
        for key, hf_tensor in hf_result.items():
            hf_arr: np.ndarray = hf_tensor.numpy()
            vllm_arr: np.ndarray = vllm_result[key].numpy()

            assert hf_arr.shape == vllm_arr.shape, f"Failed for key={key}"
            assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}"


58
59
60
61
@pytest.mark.xfail(
    reason="Inconsistent image processor being used due to lack "
    "of support for dynamic image token replacement")
@pytest.mark.parametrize("dtype", ["half", "float"])
def test_llava_next_image_processor(image_assets, dtype):
    """Compare vLLM's image preprocessing with HuggingFace's LLaVA-NeXT one.

    For every asset in ``image_assets``, the PIL image is passed through
    ``LlavaNextImageProcessor.preprocess`` and through
    ``MULTIMODAL_REGISTRY.process_input``. Both results must expose the same
    keys, and each pair of tensors must agree in shape and, within the
    default ``np.allclose`` tolerances, in value.

    The test is marked ``xfail``; the reason is given in the decorator.

    Args:
        image_assets: Iterable of assets exposing a ``pil_image`` attribute
            (presumably a pytest fixture defined elsewhere -- confirm).
        dtype: dtype name given to ``ModelConfig`` and used to cast the
            HuggingFace output through ``_STR_DTYPE_TO_TORCH_DTYPE``.
    """
    MODEL_NAME = "llava-hf/llava-v1.6-34b-hf"
    IMAGE_HEIGHT = IMAGE_WIDTH = 560

    hf_processor = LlavaNextImageProcessor.from_pretrained(MODEL_NAME)
    assert isinstance(hf_processor, LlavaNextImageProcessor)

    model_config = ModelConfig(
        model=MODEL_NAME,
        tokenizer=MODEL_NAME,
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
        dtype=dtype,
        revision=None,
    )
    vlm_config = VisionLanguageConfig(
        image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
        image_token_id=64000,
        image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
        image_feature_size=2928,
        image_processor=MODEL_NAME,
        image_processor_revision=None,
    )

    for asset in image_assets:
        # Cast the HuggingFace output to the dtype under test so both sides
        # are compared at the same precision.
        hf_result = hf_processor.preprocess(
            asset.pil_image,
            return_tensors="pt",
        ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])
        vllm_result = MULTIMODAL_REGISTRY.process_input(
            ImagePixelData(asset.pil_image),
            model_config=model_config,
            vlm_config=vlm_config,
        )

        assert hf_result.keys() == vllm_result.keys()
        for key, hf_tensor in hf_result.items():
            hf_arr: np.ndarray = hf_tensor.numpy()
            vllm_arr: np.ndarray = vllm_result[key].numpy()

            assert hf_arr.shape == vllm_arr.shape, f"Failed for key={key}"
            assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}"


@pytest.mark.xfail(
    reason="Example image pixels were not processed using HuggingFace")
@pytest.mark.parametrize("dtype", ["float"])
def test_image_pixel_types(image_assets, dtype):
    """Check that PIL and pre-computed pixel inputs are processed alike.

    For every asset in ``image_assets``, ``MULTIMODAL_REGISTRY.process_input``
    is called once with ``ImagePixelData(asset.pil_image)`` and once with
    ``ImagePixelData(asset.pixel_values)``. Both results must expose the same
    keys, and each pair of tensors must agree in shape and, within the
    default ``np.allclose`` tolerances, in value.

    The test is marked ``xfail``; the reason is given in the decorator.

    Args:
        image_assets: Iterable of assets exposing ``pil_image`` and
            ``pixel_values`` attributes (presumably a pytest fixture defined
            elsewhere -- confirm).
        dtype: dtype name given to ``ModelConfig``.
    """
    MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
    IMAGE_HEIGHT = IMAGE_WIDTH = 560

    model_config = ModelConfig(
        model=MODEL_NAME,
        tokenizer=MODEL_NAME,
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
        dtype=dtype,
        revision=None,
    )
    vlm_config = VisionLanguageConfig(
        image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
        image_token_id=32000,
        image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
        image_feature_size=576,
        image_processor=MODEL_NAME,
        image_processor_revision=None,
    )

    for asset in image_assets:
        image_result = MULTIMODAL_REGISTRY.process_input(
            ImagePixelData(asset.pil_image),
            model_config=model_config,
            vlm_config=vlm_config,
        )
        tensor_result = MULTIMODAL_REGISTRY.process_input(
            ImagePixelData(asset.pixel_values),
            model_config=model_config,
            vlm_config=vlm_config,
        )

        assert image_result.keys() == tensor_result.keys()
        for key, image_tensor in image_result.items():
            # Convert both sides to NumPy, as the sibling tests do, so the
            # comparison never mixes a tensor with an ndarray.
            image_arr: np.ndarray = image_tensor.numpy()
            tensor_arr: np.ndarray = tensor_result[key].numpy()

            assert image_arr.shape == tensor_arr.shape, f"Failed for key={key}"
            assert np.allclose(image_arr, tensor_arr), f"Failed for key={key}"