# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

4
5
import importlib
import inspect
6
from functools import lru_cache
7
from typing import TYPE_CHECKING, Any, cast, get_args, get_type_hints
8

9
10
11
12
13
14
from transformers import (
    AutoFeatureExtractor,
    AutoImageProcessor,
    AutoProcessor,
    AutoVideoProcessor,
)
15
16
from transformers.feature_extraction_utils import FeatureExtractionMixin
from transformers.image_processing_utils import BaseImageProcessor
17
from transformers.processing_utils import ProcessorMixin
18
from transformers.video_processing_utils import BaseVideoProcessor
19
20
from typing_extensions import TypeVar

21
from vllm.logger import init_logger
22
23
from vllm.transformers_utils.gguf_utils import is_gguf
from vllm.transformers_utils.utils import convert_model_repo_to_path
24
from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
25

26
27
# Module-level logger obtained from vLLM's logging helper.
logger = init_logger(__name__)

if TYPE_CHECKING:
    # Only needed for the string annotations below, so it is not imported
    # at runtime.
    from vllm.config import ModelConfig

# Type variables for the processor classes handled by the loaders in this
# module; each defaults to its bound.
_P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
_V = TypeVar("_V", bound=BaseVideoProcessor, default=BaseVideoProcessor)
33
34
35
36
37
38
39
40
41
42
43
44
45


class HashableDict(dict):
    """
    A dictionary that can be hashed by lru_cache.

    The hash is computed from the dictionary's items and does not depend on
    insertion order. When a value is itself an unhashable container (a plain
    ``dict``, ``list`` or ``set``, possibly nested), the items are frozen
    recursively before hashing instead of raising ``TypeError``.
    """

    @staticmethod
    def _freeze(value: Any) -> Any:
        """Return a hashable stand-in for ``value``, recursing into containers."""
        if isinstance(value, dict):
            return frozenset(
                (key, HashableDict._freeze(item)) for key, item in value.items()
            )
        if isinstance(value, (list, tuple)):
            return tuple(HashableDict._freeze(item) for item in value)
        if isinstance(value, (set, frozenset)):
            return frozenset(HashableDict._freeze(item) for item in value)
        return value

    # NOTE: pythonic dict is not hashable,
    # we override on it directly for simplicity
    def __hash__(self) -> int:  # type: ignore[override]
        try:
            # Fast path: every value is already hashable.
            return hash(frozenset(self.items()))
        except TypeError:
            # At least one value is an unhashable container; freeze it so
            # that equal dictionaries still produce equal hashes.
            return hash(self._freeze(self))


46
47
48
49
50
51
52
53
54
class HashableList(list):
    """
    A list that can be hashed by lru_cache.

    The hash equals the hash of the equivalent tuple. When an element is
    itself an unhashable container (a plain ``list``, ``dict`` or ``set``,
    possibly nested), the elements are frozen recursively before hashing
    instead of raising ``TypeError``.
    """

    @staticmethod
    def _freeze(value: Any) -> Any:
        """Return a hashable stand-in for ``value``, recursing into containers."""
        if isinstance(value, dict):
            return frozenset(
                (key, HashableList._freeze(item)) for key, item in value.items()
            )
        if isinstance(value, (list, tuple)):
            return tuple(HashableList._freeze(item) for item in value)
        if isinstance(value, (set, frozenset)):
            return frozenset(HashableList._freeze(item) for item in value)
        return value

    def __hash__(self) -> int:  # type: ignore[override]
        try:
            # Fast path: every element is already hashable.
            return hash(tuple(self))
        except TypeError:
            # At least one element is an unhashable container; freeze it so
            # that equal lists still produce equal hashes.
            return hash(self._freeze(self))


55
def _get_processor_factory_fn(processor_cls: type | tuple[type, ...]):
    """Return the callable that would be used to build ``processor_cls``.

    Args:
        processor_cls: A processor class, or a tuple of acceptable classes.

    Returns:
        ``AutoProcessor.from_pretrained`` when ``processor_cls`` is a tuple or
        is ``ProcessorMixin`` itself; otherwise the class's own
        ``from_pretrained`` attribute when it has one; otherwise the class
        itself, so that its constructor signature is the one inspected.
    """
    if isinstance(processor_cls, tuple) or processor_cls is ProcessorMixin:
        return AutoProcessor.from_pretrained

    # Fall back to the class itself when it exposes no `from_pretrained`.
    return getattr(processor_cls, "from_pretrained", processor_cls)

63

64
65
66
67
68
69
70
71
72
73
@lru_cache
def _collect_dynamic_keys_from_processing_kwargs(kwargs_cls: type) -> set[str]:
    dynamic_kwargs: set[str] = set()
    if kwargs_cls is None:
        return dynamic_kwargs
    # get kwargs annotations in processor
    # merge text_kwargs / images_kwargs / videos_kwargs / audio_kwargs
    kwargs_type_annotations = get_type_hints(kwargs_cls)
    for kw_type in ("text_kwargs", "images_kwargs", "videos_kwargs", "audio_kwargs"):
        if kw_type in kwargs_type_annotations:
74
75
76
77
78
79
80
            # Use __annotations__ instead of get_type_hints() to avoid
            # NameError from unresolved forward references (e.g.
            # PILImageResampling). We only need key names, not types.
            kw_cls = kwargs_type_annotations[kw_type]
            kw_annotations: dict[str, Any] = {}
            for base in reversed(kw_cls.__mro__):
                kw_annotations.update(getattr(base, "__annotations__", {}))
81
82
83
84
85
86
            for kw_name in kw_annotations:
                dynamic_kwargs.add(kw_name)
    dynamic_kwargs |= {"text_kwargs", "images_kwargs", "videos_kwargs", "audio_kwargs"}
    return dynamic_kwargs


87
88
def _merge_mm_kwargs(
    model_config: "ModelConfig",
    processor_cls: type | tuple[type, ...],
    /,
    **kwargs,
):
    """Merge per-call kwargs with the model's multimodal processor kwargs.

    The merged kwargs are filtered by ``get_allowed_kwarg_only_overrides``
    against the factory chosen by ``_get_processor_factory_fn`` for
    ``processor_cls``, and container values are wrapped so that the result
    can be passed to the ``lru_cache``-backed loaders.

    Args:
        model_config: Provides the multimodal config via
            ``get_multimodal_config()``.
        processor_cls: Class (or tuple of classes) whose factory signature
            decides which kwargs are allowed.
        **kwargs: Per-call overrides. ``model_config`` and ``processor_cls``
            are positional-only, so those names cannot collide with kwargs.

    Returns:
        The dict returned by ``get_allowed_kwarg_only_overrides``, with each
        ``dict`` value replaced by a ``HashableDict`` and each ``list`` value
        replaced by a ``HashableList``.
    """
    mm_config = model_config.get_multimodal_config()
    merged_kwargs = mm_config.merge_mm_processor_kwargs(kwargs)

    factory = _get_processor_factory_fn(processor_cls)
    allowed_kwargs = get_allowed_kwarg_only_overrides(
        factory,
        merged_kwargs,
        requires_kw_only=False,
        allow_var_kwargs=True,
    )

    # NOTE: Pythonic dict is not hashable and will raise unhashable type
    # error when calling `cached_get_processor`, therefore we need to
    # wrap it to a hashable dict.
    # Only existing keys are reassigned, so the dict's size never changes
    # while it is being iterated.
    for key, value in allowed_kwargs.items():
        if isinstance(value, dict):
            allowed_kwargs[key] = HashableDict(value)
        elif isinstance(value, list):
            allowed_kwargs[key] = HashableList(value)

    return allowed_kwargs
113

114
115
116

def get_processor(
    processor_name: str,
    *args: Any,
    revision: str | None = None,
    trust_remote_code: bool = False,
    processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin,
    **kwargs: Any,
) -> _P:
    """Load a processor for the given model name via HuggingFace.

    Args:
        processor_name: Model name or path; it is passed through
            ``convert_model_repo_to_path`` before loading.
        *args: Extra positional arguments forwarded to the loader.
        revision: Revision to load; ``None`` is replaced by ``"main"``.
        trust_remote_code: Forwarded to ``from_pretrained``.
        processor_cls: Expected processor class, or a tuple of acceptable
            classes. A tuple or ``ProcessorMixin`` itself loads through
            ``AutoProcessor``; a ``ProcessorMixin`` subclass loads through its
            own ``from_pretrained``; any other class is instantiated directly
            as ``processor_cls(*args, **kwargs)``.
        **kwargs: Extra keyword arguments forwarded to the loader.

    Returns:
        The loaded processor, checked to be an instance of ``processor_cls``.

    Raises:
        RuntimeError: Loading raised ``ValueError`` while
            ``trust_remote_code`` is false.
        ValueError: Loading raised ``ValueError`` while ``trust_remote_code``
            is true (re-raised unchanged).
        TypeError: The loaded object is not an instance of ``processor_cls``.
    """
    if revision is None:
        revision = "main"

    try:
        processor_name = convert_model_repo_to_path(processor_name)

        use_auto = isinstance(processor_cls, tuple) or processor_cls is ProcessorMixin
        # `use_auto` short-circuits first: issubclass() must not be called
        # with a tuple as its first argument.
        if use_auto or issubclass(processor_cls, ProcessorMixin):
            hf_cls = AutoProcessor if use_auto else processor_cls
            processor = hf_cls.from_pretrained(
                processor_name,
                *args,
                revision=revision,
                trust_remote_code=trust_remote_code,
                **kwargs,
            )
        else:
            # Processors that are standalone classes unrelated to HF
            processor = processor_cls(*args, **kwargs)
    except ValueError as e:
        # If the error pertains to the processor class not existing or not
        # currently being imported, suggest using the --trust-remote-code flag.
        # Unlike AutoTokenizer, AutoProcessor does not separate such errors
        if not trust_remote_code:
            err_msg = (
                "Failed to load the processor. If the processor is "
                "a custom processor not yet available in the HuggingFace "
                "transformers library, consider setting "
                "`trust_remote_code=True` in LLM or using the "
                "`--trust-remote-code` flag in the CLI."
            )
            raise RuntimeError(err_msg) from e
        # Remote code was already trusted: propagate the original error.
        raise

    if not isinstance(processor, processor_cls):
        raise TypeError(
            "Invalid type of HuggingFace processor. "
            f"Expected type: {processor_cls}, but "
            f"found type: {type(processor)}"
        )

    return processor
171
172


173
174
175
# Memoized variant of `get_processor`. `functools.lru_cache` keys on the call
# arguments, so every positional and keyword argument must be hashable.
cached_get_processor = lru_cache(get_processor)


176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
@lru_cache
def get_processor_kwargs_from_processor(processor: _P) -> set[str]:
    """Collect the names of the dynamic kwargs declared for ``processor``.

    The annotation of the ``kwargs`` parameter of ``type(processor).__call__``
    is inspected first. When it is parameterised, its first type argument is
    handed to ``_collect_dynamic_keys_from_processing_kwargs``. Otherwise the
    module that defines the processor class is scanned for attributes whose
    name ends with ``ProcessingKwargs`` and their keys are merged.

    Args:
        processor: A processor instance. It must be hashable because the
            result is cached by ``lru_cache``.

    Returns:
        The collected kwarg names, or an empty set if anything fails (the
        failure is logged with its traceback rather than raised).
    """
    try:
        # get kwargs annotations in processor
        call_kwargs = inspect.signature(type(processor).__call__).parameters.get(
            "kwargs"
        )
        call_kwargs_annotations = call_kwargs.annotation if call_kwargs else None
        # if the processor has explicit kwargs annotation, use it
        if call_kwargs_annotations not in (None, inspect.Parameter.empty):
            # A non-parameterised annotation (e.g. `Any`) has no type
            # arguments; fall through to the module scan in that case
            # instead of failing on an empty tuple.
            annotation_args = get_args(call_kwargs_annotations)
            if annotation_args:
                return _collect_dynamic_keys_from_processing_kwargs(
                    annotation_args[0]
                )

        # otherwise, try to get from ProcessingKwargs
        mod = importlib.import_module(type(processor).__module__)
        # find *ProcessingKwargs in the module
        processor_kwargs: set[str] = set()
        for name, obj in vars(mod).items():
            if name.endswith("ProcessingKwargs"):
                # `processor_kwargs` is a fresh local set, so the cached set
                # returned by the helper is never mutated here.
                processor_kwargs |= _collect_dynamic_keys_from_processing_kwargs(obj)
        return processor_kwargs
    except Exception:
        logger.exception("Failed to collect processor kwargs")
        return set()


def cached_get_processor_without_dynamic_kwargs(
    processor_name: str,
    *args: Any,
    revision: str | None = None,
    trust_remote_code: bool = False,
    processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin,
    **kwargs: Any,
) -> _P:
    """Return a cached processor, ignoring kwargs the processor treats as dynamic.

    A processor is first obtained without any keyword overrides and used to
    discover the dynamic kwarg names via
    ``get_processor_kwargs_from_processor``. Those names are then removed
    from ``kwargs`` before the final cached lookup, so they do not take part
    in the cache key.

    Args:
        processor_name: Model name or path forwarded to ``cached_get_processor``.
        *args: Extra positional arguments forwarded to both cached lookups.
            They become part of the cache key and must be hashable.
        revision: Forwarded to ``cached_get_processor``.
        trust_remote_code: Forwarded to ``cached_get_processor``.
        processor_cls: Forwarded to ``cached_get_processor``.
        **kwargs: Keyword overrides; the dynamic ones are dropped.

    Returns:
        The processor returned by the final ``cached_get_processor`` call.
    """
    # Step 1: use default kwargs to get a temporary processor instance
    processor = cached_get_processor(
        processor_name,
        *args,
        revision=revision,
        trust_remote_code=trust_remote_code,
        processor_cls=processor_cls,  # type: ignore[arg-type]
    )

    # Step 2: use temporary processor collect dynamic keys
    dynamic_keys = get_processor_kwargs_from_processor(processor)

    # Step 3: use dynamic_keys filter kwargs
    filtered_kwargs = {k: v for k, v in kwargs.items() if k not in dynamic_keys}

    # Step 4: use filtered kwargs to get final processor instance
    return cached_get_processor(
        processor_name,
        *args,
        revision=revision,
        trust_remote_code=trust_remote_code,
        processor_cls=processor_cls,  # type: ignore[arg-type]
        **filtered_kwargs,
    )


245
def cached_processor_from_config(
    model_config: "ModelConfig",
    processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin,
    **kwargs: Any,
) -> _P:
    """Return the cached processor described by ``model_config``.

    Args:
        model_config: Supplies the model/tokenizer names, revisions and
            ``trust_remote_code``.
        processor_cls: Expected processor class or tuple of classes.
        **kwargs: Per-call overrides, merged with the model's multimodal
            processor kwargs by ``_merge_mm_kwargs``.

    Returns:
        The processor returned by
        ``cached_get_processor_without_dynamic_kwargs``.

    Raises:
        AssertionError: The model is GGUF and the tokenizer is GGUF as well.
    """
    if is_gguf(model_config.model):
        assert not is_gguf(model_config.tokenizer), (
            "For multimodal GGUF models, the original tokenizer "
            "should be used to correctly load processor."
        )
        # For GGUF models the processor is loaded from the tokenizer
        # location and revision rather than from the model itself.
        model = model_config.tokenizer
        revision = model_config.tokenizer_revision
    else:
        model = model_config.model
        revision = model_config.revision

    return cached_get_processor_without_dynamic_kwargs(
        model,
        revision=revision,
        trust_remote_code=model_config.trust_remote_code,
        processor_cls=processor_cls,  # type: ignore[arg-type]
        **_merge_mm_kwargs(model_config, processor_cls, **kwargs),
    )


270
271
272
def get_feature_extractor(
    processor_name: str,
    *args: Any,
    revision: str | None = None,
    trust_remote_code: bool = False,
    **kwargs: Any,
):
    """Load an audio feature extractor for the given model name
    via HuggingFace.

    Args:
        processor_name: Model name or path; it is passed through
            ``convert_model_repo_to_path`` before loading.
        *args: Extra positional arguments forwarded to
            ``AutoFeatureExtractor.from_pretrained``.
        revision: Forwarded to ``from_pretrained``.
        trust_remote_code: Forwarded to ``from_pretrained``.
        **kwargs: Extra keyword arguments forwarded to ``from_pretrained``.

    Returns:
        The loaded object, typed as ``FeatureExtractionMixin`` via ``cast``
        (no runtime type check is performed).

    Raises:
        RuntimeError: Loading raised ``ValueError`` while
            ``trust_remote_code`` is false.
        ValueError: Loading raised ``ValueError`` while ``trust_remote_code``
            is true (re-raised unchanged).
    """
    try:
        processor_name = convert_model_repo_to_path(processor_name)
        feature_extractor = AutoFeatureExtractor.from_pretrained(
            processor_name,
            *args,
            revision=revision,
            trust_remote_code=trust_remote_code,
            **kwargs,
        )
    except ValueError as e:
        # If the error pertains to the processor class not existing or not
        # currently being imported, suggest using the --trust-remote-code flag.
        # Unlike AutoTokenizer, AutoFeatureExtractor does not separate such
        # errors
        if not trust_remote_code:
            err_msg = (
                "Failed to load the feature extractor. If the feature "
                "extractor is a custom extractor not yet available in the "
                "HuggingFace transformers library, consider setting "
                "`trust_remote_code=True` in LLM or using the "
                "`--trust-remote-code` flag in the CLI."
            )
            raise RuntimeError(err_msg) from e
        # Remote code was already trusted: propagate the original error.
        raise
    return cast(FeatureExtractionMixin, feature_extractor)


# Memoized variant of `get_feature_extractor`. `functools.lru_cache` keys on
# the call arguments, so every argument must be hashable.
cached_get_feature_extractor = lru_cache(get_feature_extractor)


def cached_feature_extractor_from_config(
    model_config: "ModelConfig",
    **kwargs: Any,
):
    """Return the cached feature extractor described by ``model_config``.

    Args:
        model_config: Supplies the model name, revision and
            ``trust_remote_code``.
        **kwargs: Per-call overrides, merged with the model's multimodal
            processor kwargs by ``_merge_mm_kwargs`` (filtered against
            ``AutoFeatureExtractor``).

    Returns:
        The result of ``cached_get_feature_extractor``.
    """
    return cached_get_feature_extractor(
        model_config.model,
        revision=model_config.revision,
        trust_remote_code=model_config.trust_remote_code,
        **_merge_mm_kwargs(model_config, AutoFeatureExtractor, **kwargs),
    )


321
322
323
def get_image_processor(
    processor_name: str,
    *args: Any,
    revision: str | None = None,
    trust_remote_code: bool = False,
    **kwargs: Any,
):
    """Load an image processor for the given model name via HuggingFace.

    Args:
        processor_name: Model name or path; it is passed through
            ``convert_model_repo_to_path`` before loading.
        *args: Extra positional arguments forwarded to
            ``AutoImageProcessor.from_pretrained``.
        revision: Forwarded to ``from_pretrained``.
        trust_remote_code: Forwarded to ``from_pretrained``.
        **kwargs: Extra keyword arguments forwarded to ``from_pretrained``.

    Returns:
        The loaded object, typed as ``BaseImageProcessor`` via ``cast``
        (no runtime type check is performed).

    Raises:
        RuntimeError: Loading raised ``ValueError`` while
            ``trust_remote_code`` is false.
        ValueError: Loading raised ``ValueError`` while ``trust_remote_code``
            is true (re-raised unchanged).
    """
    try:
        processor_name = convert_model_repo_to_path(processor_name)
        processor = AutoImageProcessor.from_pretrained(
            processor_name,
            *args,
            revision=revision,
            trust_remote_code=trust_remote_code,
            **kwargs,
        )
    except ValueError as e:
        # If the error pertains to the processor class not existing or not
        # currently being imported, suggest using the --trust-remote-code flag.
        # Unlike AutoTokenizer, AutoImageProcessor does not separate such errors
        if not trust_remote_code:
            err_msg = (
                "Failed to load the image processor. If the image processor is "
                "a custom processor not yet available in the HuggingFace "
                "transformers library, consider setting "
                "`trust_remote_code=True` in LLM or using the "
                "`--trust-remote-code` flag in the CLI."
            )
            raise RuntimeError(err_msg) from e
        # Remote code was already trusted: propagate the original error.
        raise

    return cast(BaseImageProcessor, processor)


357
358
359
360
# Memoized variant of `get_image_processor`. `functools.lru_cache` keys on
# the call arguments, so every argument must be hashable.
cached_get_image_processor = lru_cache(get_image_processor)


def cached_image_processor_from_config(
    model_config: "ModelConfig",
    **kwargs: Any,
):
    """Return the cached image processor described by ``model_config``.

    Args:
        model_config: Supplies the model/tokenizer names, revisions and
            ``trust_remote_code``.
        **kwargs: Per-call overrides, merged with the model's multimodal
            processor kwargs by ``_merge_mm_kwargs`` (filtered against
            ``AutoImageProcessor``).

    Returns:
        The result of ``cached_get_image_processor``.

    Raises:
        AssertionError: The model is GGUF and the tokenizer is GGUF as well.
    """
    if is_gguf(model_config.model):
        assert not is_gguf(model_config.tokenizer), (
            "For multimodal GGUF models, the original tokenizer "
            "should be used to correctly load image processor."
        )
        # For GGUF models the image processor is loaded from the tokenizer
        # location and revision rather than from the model itself.
        model = model_config.tokenizer
        revision = model_config.tokenizer_revision
    else:
        model = model_config.model
        revision = model_config.revision

    return cached_get_image_processor(
        model,
        revision=revision,
        trust_remote_code=model_config.trust_remote_code,
        **_merge_mm_kwargs(model_config, AutoImageProcessor, **kwargs),
    )
380
381
382
383
384


def get_video_processor(
    processor_name: str,
    *args: Any,
    revision: str | None = None,
    trust_remote_code: bool = False,
    processor_cls_overrides: type[_V] | None = None,
    **kwargs: Any,
):
    """Load a video processor for the given model name via HuggingFace.

    Args:
        processor_name: Model name or path; it is passed through
            ``convert_model_repo_to_path`` before loading.
        *args: Extra positional arguments forwarded to ``from_pretrained``.
        revision: Forwarded to ``from_pretrained``.
        trust_remote_code: Forwarded to ``from_pretrained``.
        processor_cls_overrides: Class whose ``from_pretrained`` is used
            instead of ``AutoVideoProcessor`` when given.
        **kwargs: Extra keyword arguments forwarded to ``from_pretrained``.

    Returns:
        The loaded object, typed as ``BaseVideoProcessor`` via ``cast``
        (no runtime type check is performed).

    Raises:
        RuntimeError: Loading raised ``ValueError`` while
            ``trust_remote_code`` is false.
        ValueError: Loading raised ``ValueError`` while ``trust_remote_code``
            is true (re-raised unchanged).
    """
    try:
        processor_name = convert_model_repo_to_path(processor_name)
        processor_cls = processor_cls_overrides or AutoVideoProcessor
        processor = processor_cls.from_pretrained(
            processor_name,
            *args,
            revision=revision,
            trust_remote_code=trust_remote_code,
            **kwargs,
        )
    except ValueError as e:
        # If the error pertains to the processor class not existing or not
        # currently being imported, suggest using the --trust-remote-code flag.
        # Unlike AutoTokenizer, AutoVideoProcessor does not separate such errors
        if not trust_remote_code:
            err_msg = (
                "Failed to load the video processor. If the video processor is "
                "a custom processor not yet available in the HuggingFace "
                "transformers library, consider setting "
                "`trust_remote_code=True` in LLM or using the "
                "`--trust-remote-code` flag in the CLI."
            )
            raise RuntimeError(err_msg) from e
        # Remote code was already trusted: propagate the original error.
        raise

    return cast(BaseVideoProcessor, processor)


# Memoized variant of `get_video_processor`. `functools.lru_cache` keys on
# the call arguments, so every argument must be hashable.
cached_get_video_processor = lru_cache(get_video_processor)


def cached_video_processor_from_config(
    model_config: "ModelConfig",
    processor_cls: type[_V] | None = None,
    **kwargs: Any,
):
    """Return the cached video processor described by ``model_config``.

    Args:
        model_config: Supplies the model name, revision and
            ``trust_remote_code``.
        processor_cls: Optional class passed to ``cached_get_video_processor``
            as ``processor_cls_overrides``.
        **kwargs: Per-call overrides, merged with the model's multimodal
            processor kwargs by ``_merge_mm_kwargs`` (filtered against
            ``AutoVideoProcessor``).

    Returns:
        The result of ``cached_get_video_processor``.
    """
    return cached_get_video_processor(
        model_config.model,
        revision=model_config.revision,
        trust_remote_code=model_config.trust_remote_code,
        processor_cls_overrides=processor_cls,  # type: ignore[arg-type]
        **_merge_mm_kwargs(model_config, AutoVideoProcessor, **kwargs),
    )