preprocess.py 24.9 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
from collections.abc import Mapping
5
from typing import Any, Optional, Union, cast
6
7
8
9
10

from typing_extensions import assert_never

from vllm.config import ModelConfig
from vllm.logger import init_logger
11
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
12
from vllm.multimodal.cache import BaseMultiModalProcessorCache
13
14
15
16
17
18
from vllm.multimodal.inputs import (
    MultiModalDataDict,
    MultiModalEncDecInputs,
    MultiModalInputs,
    MultiModalUUIDDict,
)
19
from vllm.multimodal.processing import BaseMultiModalProcessor
20
from vllm.transformers_utils.tokenizer import AnyTokenizer
21
from vllm.utils.jsontree import json_iter_leaves
22
from vllm.v1.metrics.stats import MultiModalCacheStats
23

24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
from .data import (
    DecoderOnlyInputs,
    EmbedsInputs,
    EmbedsPrompt,
    EncoderDecoderInputs,
    ExplicitEncoderDecoderPrompt,
    ProcessorInputs,
    PromptType,
    SingletonInputs,
    SingletonPrompt,
    TextPrompt,
    TokenInputs,
    TokensPrompt,
    embeds_inputs,
    token_inputs,
)
40
from .parse import is_explicit_encoder_decoder_prompt, parse_singleton_prompt
41
42
43
44
45
46
47
48

# Module-level logger; the methods below use it to emit one-time warnings.
logger = init_logger(__name__)


class InputPreprocessor:
    """
    Converts user-supplied prompts (text, token ids or prompt embeddings,
    optionally carrying multi-modal data) into the processed input
    structures for decoder-only and encoder/decoder models.
    """
    def __init__(
        self,
        model_config: ModelConfig,
        tokenizer: Optional[AnyTokenizer],
        mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
        mm_processor_cache: Optional[BaseMultiModalProcessorCache] = None,
    ) -> None:
        """
        Store the collaborators needed to preprocess prompts.

        Arguments:

        * model_config: configuration of the model the prompts are for
        * tokenizer: tokenizer applied to text prompts; may be None
        * mm_registry: registry used to create the multi-modal processor
        * mm_processor_cache: optional cache passed to the multi-modal
          processor
        """
        super().__init__()

        self.model_config = model_config
        self.tokenizer = tokenizer
        self.mm_registry = mm_registry
        self.mm_processor_cache = mm_processor_cache

        # Cache statistics are only tracked when a processor cache is in use.
        self.mm_cache_stats: Optional[MultiModalCacheStats] = None
        if mm_processor_cache:
            self.mm_cache_stats = MultiModalCacheStats()

62
    def get_tokenizer(self) -> AnyTokenizer:
63
        if self.tokenizer is None:
64
65
66
            raise ValueError(
                "You cannot pass text prompts when `skip_tokenizer_init` is True"
            )
67
68
69

        return self.tokenizer

70
    def get_bos_token_id(self) -> Optional[int]:
71
        if self.tokenizer is None:
72
            logger.warning_once(
73
74
                "Using None for BOS token id because tokenizer is not initialized"
            )
75
76
            return None

77
        return self.tokenizer.bos_token_id
78

79
    def get_eos_token_id(self) -> Optional[int]:
80
        if self.tokenizer is None:
81
            logger.warning_once(
82
83
                "Using None for EOS token id because tokenizer is not initialized"
            )
84
85
            return None

86
        return self.tokenizer.eos_token_id
87
88

    def get_decoder_start_token_id(self) -> Optional[int]:
89
        """
90
91
92
        Obtain the decoder start token id employed by an encoder/decoder
        model. Returns None for non-encoder/decoder models or if the
        model config is unavailable.
93
        """
94

95
        if not self.model_config.is_encoder_decoder:
96
97
            logger.warning_once(
                "Using None for decoder start token id because "
98
99
                "this is not an encoder/decoder model."
            )
100
101
            return None

102
        if self.model_config is None or self.model_config.hf_config is None:
103
104
            logger.warning_once(
                "Using None for decoder start token id because "
105
106
                "model config is not available."
            )
107
108
            return None

109
110
111
        dec_start_token_id = getattr(
            self.model_config.hf_config, "decoder_start_token_id", None
        )
112
        if dec_start_token_id is None:
113
114
115
            logger.warning_once(
                "Falling back on <BOS> for decoder start token "
                "id because decoder start token id is not "
116
117
                "available."
            )
118
119
120
121
            dec_start_token_id = self.get_bos_token_id()

        return dec_start_token_id

122
    def _get_default_enc_dec_decoder_prompt(self) -> list[int]:
123
        """
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
        Specifically for encoder/decoder models:
        generate a default decoder prompt for when
        the user specifies only the encoder prompt.

        Encoder/decoder models utilize the decoder
        prompt in different ways; as new models are
        added, it is intended that this function
        will be extended to produce differing
        default decoder prompts, depending on the
        model variety.

        Absent a special case, the default behavior
        of this method is to mirror the behavior of
        the HuggingFace (HF) GenerationMixin for a None
        decoder prompt, which is to employ a logit processor
        setting to force the first decoded token to be <BOS>.
        Here, this behavior is approximated by having the
        "default" decoder prompt be <BOS>.

        However, it is possible that in the future
144
        other models may have different or more
145
146
147
148
149
150
151
        complex logic for the default decoder prompt.
        This motivates having a special helper method
        for default decoder prompts.

        Returns:

        * prompt_token_ids
152
        """
153
154
155
156
157
158
159

        bos_token_id = self.get_bos_token_id()
        assert bos_token_id is not None
        return [bos_token_id]

    def _prepare_decoder_input_ids_for_generation(
        self,
160
161
        decoder_input_ids: Optional[list[int]],
    ) -> list[int]:
162
163
164
        """
        Prepares `decoder_input_ids` for generation with encoder-decoder models.

165
166
167
168
        Based on:
        https://github.com/huggingface/transformers/blob/4037a2b5b1278736e566aec12e169100275545ea/src/transformers/generation/utils.py
        specifically,
        `GenerationMixin._prepare_decoder_input_ids_for_generation()`.
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186

        Arguments:

        * decoder_input_ids: input token ids to preprocess

        Returns:

        * Processed token list
        """

        decoder_start_token_id = self.get_decoder_start_token_id()
        assert decoder_start_token_id is not None

        if decoder_input_ids is None:
            # no decoder prompt input ->
            # use decoder_start_token_id as decoder_input_ids
            decoder_input_ids = self._get_default_enc_dec_decoder_prompt()

187
188
189
190
        if (
            len(decoder_input_ids) == 0
            or decoder_input_ids[0] != decoder_start_token_id
        ):
191
192
193
194
            decoder_input_ids = [decoder_start_token_id] + decoder_input_ids

        return decoder_input_ids

195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
    def _get_tokenization_kw(
        self,
        overrides: Optional[dict[str, Any]] = None,
    ) -> dict[str, Any]:
        kwargs = dict[str, Any]()

        if self.model_config.hf_config.model_type == "whisper":
            # For Whisper, special tokens should be provided by the user based
            # on the task and language of their request. Also needed to avoid
            # appending an EOS token to the prompt which disrupts generation.
            kwargs["add_special_tokens"] = False

        if overrides:
            kwargs.update(overrides)

        return kwargs

212
213
214
    def _tokenize_prompt(
        self,
        prompt: str,
215
        tokenization_kwargs: Optional[dict[str, Any]] = None,
216
    ) -> list[int]:
217
218
219
220
        """
        Apply the model's tokenizer to a text prompt, returning the
        corresponding token IDs.
        """
221
        tokenizer = self.get_tokenizer()
222
        tokenization_kwargs = self._get_tokenization_kw(tokenization_kwargs)
223

224
        encoder_config = self.model_config.encoder_config
225

226
        if encoder_config and encoder_config.get("do_lower_case", False):
227
228
            prompt = prompt.lower()

229
        return tokenizer.encode(prompt, **tokenization_kwargs)
230

231
    def _get_mm_tokenizer(self) -> AnyTokenizer:
232
233
234
235
236
        # PrithviGeoSpatialMAE needs to be initialized without a tokenizer
        # while using also multi-modal input
        if not self.tokenizer:
            return cast(AnyTokenizer, object())  # Dummy

237
238
        tokenizer = self.get_tokenizer()
        return tokenizer
239

240
241
242
    def _get_mm_processor(self) -> BaseMultiModalProcessor:
        if not hasattr(self, "_mm_processor"):
            tokenizer = self._get_mm_tokenizer()
243

244
245
246
247
248
249
250
            self._mm_processor = self.mm_registry.create_processor(
                self.model_config,
                tokenizer=tokenizer,
                cache=self.mm_processor_cache,
            )

        return self._mm_processor
251

252
253
    def _process_multimodal(
        self,
        prompt: Union[str, list[int]],
        mm_data: MultiModalDataDict,
        mm_processor_kwargs: Optional[Mapping[str, object]],
        tokenization_kwargs: Optional[dict[str, Any]] = None,
        *,
        mm_uuids: Optional[MultiModalUUIDDict] = None,
    ) -> MultiModalInputs:
        """
        Run the model's multi-modal processor on a multi-modal prompt.

        Arguments:

        * prompt: text or token ids of the prompt
        * mm_data: multi-modal items attached to the prompt
        * mm_processor_kwargs: kwargs for the HF processor; None is
          treated as an empty mapping
        * tokenization_kwargs: kwargs forwarded to the processor
        * mm_uuids: optional identifiers forwarded to the processor

        Returns:

        * The processor output

        Raises:

        * ValueError: if any leaf of the returned `mm_hashes` is not a
          string
        """
        processor = self._get_mm_processor()
        hf_kwargs = {} if mm_processor_kwargs is None else mm_processor_kwargs

        processed = processor.apply(
            prompt,
            mm_data,
            hf_processor_mm_kwargs=hf_kwargs,
            tokenization_kwargs=tokenization_kwargs,
            mm_uuids=mm_uuids,
        )

        # Validate that all mm items have a string as their hash
        mm_hashes = processed["mm_hashes"]
        for leaf in json_iter_leaves(mm_hashes):
            if not isinstance(leaf, str):
                raise ValueError(
                    f"mm_hashes must contain only strings, got: {mm_hashes}. "
                    "This is likely due to an incorrect custom implementation of "
                    "MultiModalProcessor.apply method."
                )

        return processed
291

292
293
294
295
    def _process_embeds(
        self,
        parsed_content: EmbedsPrompt,
    ) -> EmbedsInputs:
        """
        Validate and normalize a prompt given as embeddings.

        Arguments:

        * parsed_content: prompt carrying `prompt_embeds` and optionally
          `cache_salt`

        Returns:

        * Embedding inputs holding the 2-D, CPU-resident embeddings

        Raises:

        * ValueError: if prompt embeds are not enabled in the model config
          or the embeddings do not end up two-dimensional
        """
        if not self.model_config.enable_prompt_embeds:
            raise ValueError(
                "You must set `--enable-prompt-embeds` to input `prompt_embeds`."
            )

        embeds = parsed_content["prompt_embeds"]

        # prompt_embeds must be (seq_len, hidden_size), but if the user
        # passes in a batch of size 1, i.e. (1, seq_len, hidden_size),
        # we can unambiguously process the intent by squeezing the batch
        # dimension.
        if embeds.ndim == 3:
            embeds = embeds.squeeze(dim=0)

        if embeds.ndim != 2:
            raise ValueError("prompt_embeds must be of shape (seq_len, hidden_size).")

        # Tensors must be on CPU for serialization between processes
        # in the MsgpackEncoder. Casting to CPU here ensures that there is no
        # hidden device transfer in the critical path of generation.
        return embeds_inputs(
            prompt_embeds=embeds.cpu(),
            cache_salt=parsed_content.get("cache_salt"),
        )
321

322
    def _truncate_inputs(
323
324
325
326
327
328
329
        self, inputs: list[int], tokenization_kwargs: Optional[dict[str, Any]] = None
    ) -> list[int]:
        if (
            not tokenization_kwargs
            or "truncation" not in tokenization_kwargs
            or self.tokenizer is None
        ):
330
331
332
333
334
335
336
337
338
            return inputs

        max_length = tokenization_kwargs["max_length"]

        if self.tokenizer.truncation_side == "left":
            return inputs[-max_length:]
        else:
            return inputs[:max_length]

339
340
341
    def _process_tokens(
        self,
        parsed_content: TokensPrompt,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
        *,
        mm_uuids: Optional[MultiModalUUIDDict] = None,
    ) -> Union[TokenInputs, MultiModalInputs]:
        """
        Process a prompt that is already given as token ids.

        Arguments:

        * parsed_content: prompt holding `prompt_token_ids` and optional
          multi-modal data, processor kwargs and cache salt
        * tokenization_kwargs: used for truncation and forwarded to the
          multi-modal processor
        * mm_uuids: optional identifiers forwarded to the multi-modal
          processor

        Returns:

        * Token inputs, or multi-modal inputs for multi-modal models

        Raises:

        * ValueError: if multi-modal data is passed to a model that is not
          multi-modal
        """
        token_ids = self._truncate_inputs(
            parsed_content["prompt_token_ids"], tokenization_kwargs
        )

        processed: Union[TokenInputs, MultiModalInputs]
        if not self.model_config.is_multimodal_model:
            if parsed_content.get("multi_modal_data"):
                raise ValueError("This model does not support multimodal inputs")

            processed = token_inputs(token_ids)
        else:
            processed = self._process_multimodal(
                token_ids,
                parsed_content.get("multi_modal_data") or {},
                parsed_content.get("mm_processor_kwargs") or {},
                tokenization_kwargs=tokenization_kwargs,
                mm_uuids=mm_uuids,
            )

        # Only a truthy cache salt is carried over.
        salt = parsed_content.get("cache_salt")
        if salt:
            processed["cache_salt"] = salt

        return processed

    def _process_text(
        self,
        parsed_content: TextPrompt,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
        *,
        mm_uuids: Optional[MultiModalUUIDDict] = None,
    ) -> Union[TokenInputs, MultiModalInputs]:
        """
        Process a prompt given as text.

        Arguments:

        * parsed_content: prompt holding `prompt` and optional multi-modal
          data, processor kwargs and cache salt
        * tokenization_kwargs: kwargs forwarded to tokenization
        * mm_uuids: optional identifiers forwarded to the multi-modal
          processor

        Returns:

        * Token inputs, or multi-modal inputs for multi-modal models

        Raises:

        * ValueError: if multi-modal data is passed to a model that is not
          multi-modal
        """
        text = parsed_content["prompt"]

        processed: Union[TokenInputs, MultiModalInputs]
        if not self.model_config.is_multimodal_model:
            if parsed_content.get("multi_modal_data"):
                raise ValueError("This model does not support multimodal inputs")

            token_ids = self._tokenize_prompt(
                text,
                tokenization_kwargs=tokenization_kwargs,
            )
            processed = token_inputs(token_ids)
        else:
            processed = self._process_multimodal(
                text,
                parsed_content.get("multi_modal_data") or {},
                parsed_content.get("mm_processor_kwargs") or {},
                tokenization_kwargs=tokenization_kwargs,
                mm_uuids=mm_uuids,
            )

        # Only a truthy cache salt is carried over.
        salt = parsed_content.get("cache_salt")
        if salt:
            processed["cache_salt"] = salt

        return processed
402

403
    def _prompt_to_llm_inputs(
        self,
        prompt: SingletonPrompt,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
        *,
        mm_uuids: Optional[MultiModalUUIDDict] = None,
    ) -> SingletonInputs:
        """
        Parse a singleton prompt and dispatch it to the handler for its
        kind (plain string, text, tokens or embeddings).

        Arguments:

        * prompt: single encoder or decoder input prompt
        * tokenization_kwargs: kwargs forwarded to the text handlers
        * mm_uuids: optional identifiers forwarded to the text and token
          handlers

        Returns:

        * [`SingletonInputs`][vllm.inputs.data.SingletonInputs] instance
        """
        parsed = parse_singleton_prompt(prompt)

        if parsed["type"] == "str":
            # A bare string is handled as a text prompt without extras.
            return self._process_text(
                TextPrompt(prompt=parsed["content"]),
                tokenization_kwargs=tokenization_kwargs,
                mm_uuids=mm_uuids,
            )
        elif parsed["type"] == "text":
            return self._process_text(
                parsed["content"],
                tokenization_kwargs=tokenization_kwargs,
                mm_uuids=mm_uuids,
            )
        elif parsed["type"] == "tokens":
            # NOTE(review): tokenization_kwargs is not forwarded here, so
            # `_process_tokens` never truncates on this path -- confirm
            # this is intentional.
            return self._process_tokens(
                parsed["content"],
                mm_uuids=mm_uuids,
            )
        elif parsed["type"] == "embeds":
            return self._process_embeds(parsed["content"])
        else:
            assert_never(parsed)

445
446
    def _build_enc_dec_llm_inputs(
        self,
        encoder_inputs: SingletonInputs,
        decoder_inputs: Optional[SingletonInputs],
    ) -> EncoderDecoderInputs:
        """
        Combine processed encoder and decoder inputs into one
        encoder/decoder input structure.

        When no decoder inputs are given, a decoder prompt is derived:
        Whisper models reuse a copy of the encoder token ids, all other
        models get the default decoder prompt. Explicit decoder inputs
        have their token ids updated in place so that they begin with the
        decoder start token.

        Arguments:

        * encoder_inputs: processed encoder prompt
        * decoder_inputs: processed decoder prompt, or None

        Returns:

        * The combined encoder/decoder inputs

        Raises:

        * ValueError: if either side is an embedding input, or the decoder
          inputs carry multi-modal data
        """
        if encoder_inputs["type"] == "embeds" or (
            decoder_inputs and decoder_inputs["type"] == "embeds"
        ):
            raise ValueError(
                "Embedding inputs are not supported for encoder-decoder models"
            )

        # Needed for mypy
        encoder_inputs = cast(Union[TokenInputs, MultiModalInputs], encoder_inputs)
        decoder_inputs = cast(
            Optional[Union[TokenInputs, MultiModalInputs]], decoder_inputs
        )

        if decoder_inputs is not None:
            if "multi_modal_data" in decoder_inputs:
                raise ValueError(
                    "Multi-modal decoder inputs of encoder-"
                    "decoder models are not supported yet"
                )

            decoder_inputs["prompt_token_ids"] = (
                self._prepare_decoder_input_ids_for_generation(
                    decoder_inputs["prompt_token_ids"]
                )
            )
        else:
            if self.model_config.hf_config.model_type == "whisper":
                # For Whisper models, the text prompt should go to the decoder.
                # If no explicit encoder/decoder inputs, then copy the prompt
                # from the encoder to the decoder. The encoder tokens are later
                # overridden by the audio features.
                default_ids = encoder_inputs["prompt_token_ids"].copy()
            else:
                default_ids = self._prepare_decoder_input_ids_for_generation(None)
            decoder_inputs = token_inputs(default_ids)

        return EncoderDecoderInputs(
            encoder=encoder_inputs,
            decoder=decoder_inputs,
        )

492
    def _split_enc_dec_mm_inputs(
        self,
        inputs: Union[SingletonInputs, MultiModalEncDecInputs],
        decoder_inputs_to_override: Optional[SingletonInputs] = None,
    ) -> tuple[SingletonInputs, SingletonInputs]:
        """
        For encoder/decoder models only:
        split processed inputs into separate encoder and decoder inputs.

        Multi-modal inputs put their `encoder_prompt_token_ids` on the
        encoder side and keep the multi-modal fields on the decoder side.
        Text-only inputs get an empty encoder prompt and go to the decoder.
        In both cases a truthy `decoder_inputs_to_override` supplies the
        decoder prompt instead.

        Arguments:

        * inputs: processed inputs to split
        * decoder_inputs_to_override: explicit decoder inputs, if any

        Returns:

        * Tuple of (encoder inputs, decoder inputs)

        Raises:

        * ValueError: if either argument is an embedding input
        * RuntimeError: if multi-modal inputs lack
          `encoder_prompt_token_ids`
        """
        if inputs["type"] == "embeds" or (
            decoder_inputs_to_override
            and decoder_inputs_to_override["type"] == "embeds"
        ):
            raise ValueError(
                "Embedding inputs are not supported for encoder-decoder models"
            )

        # Needed for mypy
        inputs = cast(
            Union[TokenInputs, MultiModalInputs, MultiModalEncDecInputs],
            inputs,
        )
        decoder_inputs_to_override = cast(
            Optional[Union[TokenInputs, MultiModalInputs]],
            decoder_inputs_to_override,
        )

        encoder_inputs: SingletonInputs
        decoder_inputs: SingletonInputs

        if inputs["type"] == "token":  # Text-only inputs
            encoder_inputs = token_inputs(prompt_token_ids=[])
            decoder_inputs = decoder_inputs_to_override or inputs
        elif inputs["type"] == "multimodal":  # Multimodal data inputs
            if "encoder_prompt_token_ids" not in inputs:
                raise RuntimeError(
                    "You should register an encoder-decoder "
                    "multi-modal processor for encoder-decoder "
                    "models."
                )
            inputs = cast(MultiModalEncDecInputs, inputs)

            encoder_inputs = token_inputs(inputs["encoder_prompt_token_ids"])

            # The decoder prompt may come from the override, but the
            # multi-modal fields always come from the processed inputs.
            prompt_source = decoder_inputs_to_override or inputs
            decoder_inputs = MultiModalInputs(
                type="multimodal",
                prompt_token_ids=prompt_source["prompt_token_ids"],
                mm_kwargs=inputs["mm_kwargs"],
                mm_hashes=inputs["mm_hashes"],
                mm_placeholders=inputs["mm_placeholders"],
            )
            salt = inputs.get("cache_salt")
            if salt:
                decoder_inputs["cache_salt"] = salt
        else:
            assert_never(inputs)  # type: ignore[arg-type]

        return encoder_inputs, decoder_inputs

553
554
    def _process_encoder_decoder_prompt(
        self,
        prompt: PromptType,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
        *,
        mm_uuids: Optional[MultiModalUUIDDict] = None,
    ) -> EncoderDecoderInputs:
        """
        For encoder/decoder models only:
        Process an input prompt into an
        [`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs]
        instance.

        Two kinds of prompt are accepted:

        * Singleton prompt: carries only the encoder prompt; the decoder
          prompt is inferred later
        * Explicit encoder/decoder prompt: carries both an encoder and a
          (possibly None) decoder prompt

        Each sub-prompt of an explicit prompt may be any singleton type,
        so the sub-prompts are handed to the singleton helper. The decoder
        sub-prompt is processed without `tokenization_kwargs` or
        `mm_uuids`.

        Arguments:

        * prompt: an input prompt
        * tokenization_kwargs: kwargs forwarded when processing the
          encoder (or singleton) prompt
        * mm_uuids: optional identifiers forwarded alongside

        Returns:

        * [`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs]
          instance
        """
        encoder_inputs: SingletonInputs
        decoder_inputs: Optional[SingletonInputs]

        is_multimodal = self.model_config.is_multimodal_model

        if not is_explicit_encoder_decoder_prompt(prompt):
            # `cast` is needed for mypy, but not pyright
            inputs = self._prompt_to_llm_inputs(
                cast(SingletonPrompt, prompt),
                tokenization_kwargs=tokenization_kwargs,
                mm_uuids=mm_uuids,
            )
            if is_multimodal:
                # Encoder-Decoder Multimodal model
                encoder_inputs, decoder_inputs = self._split_enc_dec_mm_inputs(inputs)
            else:
                encoder_inputs, decoder_inputs = inputs, None
        else:
            # `cast` is needed for mypy, but not pyright
            explicit = cast(ExplicitEncoderDecoderPrompt, prompt)
            encoder_inputs = self._prompt_to_llm_inputs(
                explicit["encoder_prompt"],
                tokenization_kwargs=tokenization_kwargs,
                mm_uuids=mm_uuids,
            )

            decoder_prompt = explicit["decoder_prompt"]
            if decoder_prompt is not None:
                decoder_inputs = self._prompt_to_llm_inputs(decoder_prompt)
            else:
                decoder_inputs = None

            # For multimodal model, override decoder prompt from processor
            # with explicit decoder prompt.
            if is_multimodal:
                encoder_inputs, decoder_inputs = self._split_enc_dec_mm_inputs(
                    encoder_inputs, decoder_inputs
                )

        return self._build_enc_dec_llm_inputs(encoder_inputs, decoder_inputs)
629
630
631

    def _build_decoder_only_llm_inputs(
        self,
632
        prompt_inputs: DecoderOnlyInputs,
633
    ) -> DecoderOnlyInputs:
634
        if "prompt_token_ids" in prompt_inputs:
635
636
637
            prompt_inputs = cast(
                Union[TokenInputs, MultiModalInputs], prompt_inputs
            )  # Needed for mypy
638

639
        return prompt_inputs
640
641
642

    def _process_decoder_only_prompt(
        self,
        prompt: SingletonPrompt,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
        *,
        mm_uuids: Optional[MultiModalUUIDDict] = None,
    ) -> DecoderOnlyInputs:
        """
        For decoder-only models:
        Process an input prompt into a
        [`DecoderOnlyInputs`][vllm.inputs.data.DecoderOnlyInputs] instance.

        Arguments:

        * prompt: input prompt
        * tokenization_kwargs: kwargs forwarded to prompt processing
        * mm_uuids: optional identifiers forwarded to prompt processing

        Returns:

        * [`DecoderOnlyInputs`][vllm.inputs.data.DecoderOnlyInputs] instance
        """
        return self._build_decoder_only_llm_inputs(
            self._prompt_to_llm_inputs(
                prompt,
                tokenization_kwargs=tokenization_kwargs,
                mm_uuids=mm_uuids,
            )
        )
669

670
    def _preprocess(
        self,
        prompt: PromptType,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
        *,
        mm_uuids: Optional[MultiModalUUIDDict] = None,
    ) -> ProcessorInputs:
        """
        Route the prompt to the encoder/decoder or the decoder-only
        pipeline, depending on the model config.

        Raises:

        * ValueError: if an explicit encoder/decoder prompt is passed to a
          decoder-only model
        """
        if not self.model_config.is_encoder_decoder:
            if is_explicit_encoder_decoder_prompt(prompt):
                raise ValueError(
                    "Cannot pass encoder-decoder prompt to decoder-only models"
                )

            # Decoder-only operation
            # `cast` is needed for mypy, but not pyright
            return self._process_decoder_only_prompt(
                cast(SingletonPrompt, prompt),
                tokenization_kwargs=tokenization_kwargs,
                mm_uuids=mm_uuids,
            )

        # Encoder-decoder model requires special mapping of
        # input prompts to encoder & decoder.
        return self._process_encoder_decoder_prompt(
            prompt,
            tokenization_kwargs,
            mm_uuids=mm_uuids,
        )

699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
    def preprocess(
        self,
        prompt: PromptType,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
        *,
        mm_uuids: Optional[MultiModalUUIDDict] = None,
    ) -> ProcessorInputs:
        """
        Preprocess the input prompt and, when a multi-modal processor
        cache is in use, add this request to the cache statistics.
        """
        processed = self._preprocess(
            prompt,
            tokenization_kwargs,
            mm_uuids=mm_uuids,
        )

        # Nothing to record without both a cache and a stats object.
        if not self.mm_processor_cache or self.mm_cache_stats is None:
            return processed

        delta = self.mm_processor_cache.make_stats(delta=True)
        self.mm_cache_stats.requests += 1
        self.mm_cache_stats.queries += delta.total
        self.mm_cache_stats.hits += delta.hits

        return processed

    def stat_mm_cache(self) -> Optional[MultiModalCacheStats]:
        mm_cache_stats = self.mm_cache_stats
        if mm_cache_stats is None:
            return None

        self.mm_cache_stats = MultiModalCacheStats()

        return mm_cache_stats

    def clear_mm_cache(self) -> None:
731
732
        if self.mm_processor_cache is not None:
            self.mm_processor_cache.clear_cache()
733
734
735

        if self.mm_cache_stats is not None:
            self.mm_cache_stats.reset = True