# test_processor.py
1
2
import numpy as np
import pytest
3
from transformers import CLIPImageProcessor, LlavaNextImageProcessor
4
5
6
7
8

from vllm.config import ModelConfig, VisionLanguageConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import ImagePixelData

9
from ..conftest import _STR_DTYPE_TO_TORCH_DTYPE
10

11
12

@pytest.mark.parametrize("dtype", ["half", "float"])
def test_clip_image_processor(hf_images, dtype):
    """Compare vLLM's image preprocessing against ``CLIPImageProcessor``.

    Every image in ``hf_images`` is preprocessed twice: once with the
    HuggingFace ``CLIPImageProcessor`` loaded for ``MODEL_NAME`` and once
    through ``MULTIMODAL_REGISTRY.process_input``. Both results must expose
    the same keys and, for each key, arrays of equal shape whose values are
    close according to ``np.allclose``.

    Args:
        hf_images: Iterable of images supplied by the test setup.
        dtype: Dtype name ("half" or "float"); passed to ``ModelConfig`` and
            used to cast the HuggingFace output before comparison.
    """
    MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
    IMAGE_HEIGHT = IMAGE_WIDTH = 560

    hf_processor = CLIPImageProcessor.from_pretrained(MODEL_NAME)
    assert isinstance(hf_processor, CLIPImageProcessor)

    model_config = ModelConfig(
        model=MODEL_NAME,
        tokenizer=MODEL_NAME,
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
        dtype=dtype,
        revision=None,
    )
    vlm_config = VisionLanguageConfig(
        image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
        image_token_id=32000,
        image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
        image_feature_size=576,
        image_processor=MODEL_NAME,
        image_processor_revision=None,
    )

    # The target dtype does not change between images; look it up once.
    torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]

    for image in hf_images:
        # Reference output: HuggingFace preprocessing cast to the test dtype.
        hf_result = hf_processor.preprocess(
            image,
            return_tensors="pt",
        ).to(dtype=torch_dtype)
        # Output under test: the same image routed through vLLM's registry.
        vllm_result = MULTIMODAL_REGISTRY.process_input(
            ImagePixelData(image),
            model_config=model_config,
            vlm_config=vlm_config,
        )

        assert hf_result.keys() == vllm_result.keys()
        for key, hf_tensor in hf_result.items():
            hf_arr: np.ndarray = hf_tensor.numpy()
            vllm_arr: np.ndarray = vllm_result[key].numpy()

            assert hf_arr.shape == vllm_arr.shape, f"Failed for key={key}"
            assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}"


58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
@pytest.mark.xfail(
    reason="Inconsistent image processor being used due to lack "
    "of support for dynamic image token replacement")
@pytest.mark.parametrize("dtype", ["half", "float"])
def test_llava_next_image_processor(hf_images, dtype):
    """Compare vLLM's preprocessing against ``LlavaNextImageProcessor``.

    For each image in ``hf_images`` the HuggingFace processor output (cast
    to ``dtype``) and the output of ``MULTIMODAL_REGISTRY.process_input``
    must share the same keys, with matching shapes and ``np.allclose``
    values per key. The test is marked ``xfail``; see the decorator's reason.

    Args:
        hf_images: Iterable of images supplied by the test setup.
        dtype: Dtype name ("half" or "float") given to ``ModelConfig`` and
            used to cast the HuggingFace output.
    """
    MODEL_NAME = "llava-hf/llava-v1.6-34b-hf"
    image_side = 560  # used for both height and width of the input shape

    hf_processor = LlavaNextImageProcessor.from_pretrained(MODEL_NAME)
    assert isinstance(hf_processor, LlavaNextImageProcessor)

    model_config = ModelConfig(
        model=MODEL_NAME,
        tokenizer=MODEL_NAME,
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
        dtype=dtype,
        revision=None,
    )
    vlm_config = VisionLanguageConfig(
        image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
        image_token_id=64000,
        image_input_shape=(1, 3, image_side, image_side),
        image_feature_size=2928,
        image_processor=MODEL_NAME,
        image_processor_revision=None,
    )

    for image in hf_images:
        # HuggingFace side first, then vLLM, as two explicit steps.
        hf_batch = hf_processor.preprocess(image, return_tensors="pt")
        hf_result = hf_batch.to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])

        vllm_result = MULTIMODAL_REGISTRY.process_input(
            ImagePixelData(image),
            model_config=model_config,
            vlm_config=vlm_config,
        )

        assert hf_result.keys() == vllm_result.keys()
        for key, hf_tensor in hf_result.items():
            expected: np.ndarray = hf_tensor.numpy()
            actual: np.ndarray = vllm_result[key].numpy()
            failure = f"Failed for key={key}"

            assert expected.shape == actual.shape, failure
            assert np.allclose(expected, actual), failure


@pytest.mark.xfail(
    reason="Example image pixels were not processed using HuggingFace")
@pytest.mark.parametrize("dtype", ["float"])
def test_image_pixel_types(hf_images, vllm_image_tensors, dtype):
    """Check that image and tensor inputs are processed to the same output.

    Each entry of ``hf_images`` is paired (via ``zip``) with the entry of
    ``vllm_image_tensors`` at the same position. Both are wrapped in
    ``ImagePixelData`` and sent through ``MULTIMODAL_REGISTRY.process_input``;
    the two results must have identical keys and, per key, arrays of equal
    shape with ``np.allclose`` values. The test is marked ``xfail``; see the
    decorator's reason.

    Args:
        hf_images: Iterable of images supplied by the test setup.
        vllm_image_tensors: Iterable of tensor inputs, consumed in lockstep
            with ``hf_images``.
        dtype: Dtype name (only "float" is parametrized) for ``ModelConfig``.
    """
    MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
    IMAGE_HEIGHT = IMAGE_WIDTH = 560

    model_config = ModelConfig(
        model=MODEL_NAME,
        tokenizer=MODEL_NAME,
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
        dtype=dtype,
        revision=None,
    )
    vlm_config = VisionLanguageConfig(
        image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
        image_token_id=32000,
        image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
        image_feature_size=576,
        image_processor=MODEL_NAME,
        image_processor_revision=None,
    )

    for image, tensor in zip(hf_images, vllm_image_tensors):
        image_result = MULTIMODAL_REGISTRY.process_input(
            ImagePixelData(image),
            model_config=model_config,
            vlm_config=vlm_config,
        )
        tensor_result = MULTIMODAL_REGISTRY.process_input(
            ImagePixelData(tensor),
            model_config=model_config,
            vlm_config=vlm_config,
        )

        assert image_result.keys() == tensor_result.keys()
        for key, image_tensor in image_result.items():
            # Convert both sides to NumPy so shapes and values are compared
            # like-for-like rather than mixing a tensor with an ndarray.
            image_arr: np.ndarray = image_tensor.numpy()
            tensor_arr: np.ndarray = tensor_result[key].numpy()

            assert image_arr.shape == tensor_arr.shape, f"Failed for key={key}"
            assert np.allclose(image_arr, tensor_arr), f"Failed for key={key}"