preprocess.py 29 KB
Newer Older
1
2
# SPDX-License-Identifier: Apache-2.0

3
import asyncio
4
from collections.abc import Mapping
5
from typing import Any, Optional, Union, cast
6
7
8
9
10
11

from typing_extensions import assert_never

from vllm.config import ModelConfig
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
12
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
13
14
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalEncDecInputs,
                                    MultiModalInputs)
15
from vllm.prompt_adapter.request import PromptAdapterRequest
16
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
17

18
19
from .data import (DecoderOnlyInputs, EncoderDecoderInputs, ProcessorInputs,
                   PromptType, SingletonInputs, SingletonPrompt, token_inputs)
20
21
22
23
24
25
26
27
28
29
from .parse import is_explicit_encoder_decoder_prompt, parse_singleton_prompt

logger = init_logger(__name__)


class InputPreprocessor:

    def __init__(
        self,
        model_config: ModelConfig,
        tokenizer: Optional[TokenizerGroup],
        mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
    ) -> None:
        """
        Remember the collaborators used to turn prompts into model inputs.

        Arguments:

        * model_config: configuration of the model the prompts are for
        * tokenizer: tokenizer group, or None when none is available
        * mm_registry: registry used to create multi-modal processors
        """
        super().__init__()

        # Plain references only; nothing is copied or validated here.
        self.mm_registry = mm_registry
        self.tokenizer = tokenizer
        self.model_config = model_config
38

39
    def get_tokenizer_group(self) -> TokenizerGroup:
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
        if self.tokenizer is None:
            raise ValueError("You cannot pass text prompts when "
                             "`skip_tokenizer_init` is True")

        return self.tokenizer

    def get_bos_token_id(self,
                         lora_request: Optional[LoRARequest] = None
                         ) -> Optional[int]:
        if self.tokenizer is None:
            logger.warning("Using None for BOS token id because tokenizer "
                           "is not initialized")
            return None

        return self.tokenizer.get_lora_tokenizer(lora_request).bos_token_id

    def get_eos_token_id(self,
                         lora_request: Optional[LoRARequest] = None
                         ) -> Optional[int]:
        if self.tokenizer is None:
            logger.warning("Using None for EOS token id because tokenizer "
                           "is not initialized")
            return None

        return self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id

    def get_decoder_start_token_id(self) -> Optional[int]:
        '''
        Obtain the decoder start token id employed by an encoder/decoder
        model. Returns None for non-encoder/decoder models or if the
        model config is unavailable.
        '''

73
        if not self.model_config.is_encoder_decoder:
74
75
76
            logger.warning_once(
                "Using None for decoder start token id because "
                "this is not an encoder/decoder model.")
77
78
79
            return None

        if (self.model_config is None or self.model_config.hf_config is None):
80
81
82
            logger.warning_once(
                "Using None for decoder start token id because "
                "model config is not available.")
83
84
85
86
87
            return None

        dec_start_token_id = getattr(self.model_config.hf_config,
                                     'decoder_start_token_id', None)
        if dec_start_token_id is None:
88
89
90
91
            logger.warning_once(
                "Falling back on <BOS> for decoder start token "
                "id because decoder start token id is not "
                "available.")
92
93
94
95
            dec_start_token_id = self.get_bos_token_id()

        return dec_start_token_id

96
    def _get_default_enc_dec_decoder_prompt(self) -> list[int]:
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
        '''
        Specifically for encoder/decoder models:
        generate a default decoder prompt for when
        the user specifies only the encoder prompt.

        Encoder/decoder models utilize the decoder
        prompt in different ways; as new models are
        added, it is intended that this function
        will be extended to produce differing
        default decoder prompts, depending on the
        model variety.

        Absent a special case, the default behavior
        of this method is to mirror the behavior of
        the HuggingFace (HF) GenerationMixin for a None
        decoder prompt, which is to employ a logit processor
        setting to force the first decoded token to be <BOS>.
        Here, this behavior is approximated by having the
        "default" decoder prompt be <BOS>.

        However, it is possible that in the future
118
        other models may have different or more
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
        complex logic for the default decoder prompt.
        This motivates having a special helper method
        for default decoder prompts.

        Returns:

        * prompt_token_ids
        '''

        bos_token_id = self.get_bos_token_id()
        assert bos_token_id is not None
        return [bos_token_id]

    def _prepare_decoder_input_ids_for_generation(
        self,
134
135
        decoder_input_ids: Optional[list[int]],
    ) -> list[int]:
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
        """
        Prepares `decoder_input_ids` for generation with encoder-decoder models.

        Based on

        https://github.com/huggingface/transformers/blob/
        4037a2b5b1278736e566aec12e169100275545ea/
        src/transformers/generation/utils.py

        specifically GenerationMixin._prepare_decoder_input_ids_for_generation()

        Arguments:

        * decoder_input_ids: input token ids to preprocess

        Returns:

        * Processed token list
        """

        decoder_start_token_id = self.get_decoder_start_token_id()
        assert decoder_start_token_id is not None

        if decoder_input_ids is None:
            # no decoder prompt input ->
            # use decoder_start_token_id as decoder_input_ids
            decoder_input_ids = self._get_default_enc_dec_decoder_prompt()

164
165
        if (len(decoder_input_ids) == 0
                or decoder_input_ids[0] != decoder_start_token_id):
166
167
168
169
170
171
            decoder_input_ids = [decoder_start_token_id] + decoder_input_ids

        return decoder_input_ids

    def _apply_prompt_adapter(
        self,
172
        prompt_token_ids: list[int],
173
        prompt_adapter_request: Optional[PromptAdapterRequest],
174
    ) -> list[int]:
175
176
177
178
179
180
181
182
183
184
185
        if prompt_adapter_request:
            prompt_token_ids = (
                [0] * prompt_adapter_request.prompt_adapter_num_virtual_tokens
                + prompt_token_ids)

        return prompt_token_ids

    def _tokenize_prompt(
        self,
        prompt: str,
        lora_request: Optional[LoRARequest],
186
        tokenization_kwargs: Optional[dict[str, Any]] = None,
187
    ) -> list[int]:
188
189
190
191
192
        """
        Apply the model's tokenizer to a text prompt, returning the
        corresponding token IDs.
        """
        tokenizer = self.get_tokenizer_group()
193
194
195
        if tokenization_kwargs is None:
            tokenization_kwargs = {}

196
197
198
199
        if self.model_config.hf_config.model_type == "whisper":
            # For Whisper, special tokens should be provided by the user based
            # on the task and language of their request. Also needed to avoid
            # appending an EOS token to the prompt which disrupts generation.
200
            tokenization_kwargs["add_special_tokens"] = False
201
202
203
204
205
206

        if (self.model_config.encoder_config is not None
                and self.model_config.encoder_config.get(
                    "do_lower_case", False)):
            prompt = prompt.lower()

207
        return tokenizer.encode(prompt=prompt,
208
                                lora_request=lora_request,
209
                                **tokenization_kwargs)
210
211
212
213
214

    async def _tokenize_prompt_async(
        self,
        prompt: str,
        lora_request: Optional[LoRARequest],
215
        tokenization_kwargs: Optional[dict[str, Any]] = None,
216
    ) -> list[int]:
217
218
        """Async version of :meth:`_tokenize_prompt`."""
        tokenizer = self.get_tokenizer_group()
219
220
221
        if tokenization_kwargs is None:
            tokenization_kwargs = {}

222
223
224
225
        if self.model_config.hf_config.model_type == "whisper":
            # For Whisper, special tokens should be provided by the user based
            # on the task and language of their request. Also needed to avoid
            # appending an EOS token to the prompt which disrupts generation.
226
227
228
229
            tokenization_kwargs["add_special_tokens"] = False
        return await tokenizer.encode_async(prompt=prompt,
                                            lora_request=lora_request,
                                            **tokenization_kwargs)
230

231
232
    def _process_multimodal(
        self,
233
        prompt: Union[str, list[int]],
234
235
236
        mm_data: MultiModalDataDict,
        mm_processor_kwargs: Optional[Mapping[str, object]],
        lora_request: Optional[LoRARequest],
237
        return_mm_hashes: bool = False,
238
    ) -> MultiModalInputs:
239
240
241
242
        """
        Apply the model's multi-modal processor to a multi-modal prompt,
        returning the corresponding token IDs and metadata.
        """
243
        # At the moment on model (PrithviGeoSpatialMAE) requires to be
244
        # initialized without a tokenizer while using also multi-modal input
245
        if not self.tokenizer:
246
            tokenizer = object()  # Dummy
247
248
249
        else:
            tokenizer_group = self.get_tokenizer_group()
            tokenizer = tokenizer_group.get_lora_tokenizer(lora_request)
250

251
252
        mm_processor = self.mm_registry.create_processor(self.model_config,
                                                         tokenizer=tokenizer)
253
254
255
256

        if mm_processor_kwargs is None:
            mm_processor_kwargs = {}

257
258
        return mm_processor.apply(prompt, mm_data, mm_processor_kwargs,
                                  return_mm_hashes)
259
260
261

    async def _process_multimodal_async(
        self,
262
        prompt: Union[str, list[int]],
263
264
265
        mm_data: MultiModalDataDict,
        mm_processor_kwargs: Optional[Mapping[str, object]],
        lora_request: Optional[LoRARequest],
266
        return_mm_hashes: bool = False,
267
    ) -> MultiModalInputs:
268
        """Async version of :meth:`_process_multimodal`."""
269
        # At the moment on model (PrithviGeoSpatialMAE) requires to be
270
        # initialized without a tokenizer while using also multi-modal input
271
        if not self.tokenizer:
272
            tokenizer = object()  # Dummy
273
274
275
276
        else:
            tokenizer_group = self.get_tokenizer_group()
            tokenizer = await tokenizer_group.get_lora_tokenizer_async(
                lora_request)
277

278
279
        mm_processor = self.mm_registry.create_processor(self.model_config,
                                                         tokenizer=tokenizer)
280
281
282
        if mm_processor_kwargs is None:
            mm_processor_kwargs = {}

283
284
        return mm_processor.apply(prompt, mm_data, mm_processor_kwargs,
                                  return_mm_hashes)
285

286
    def _prompt_to_llm_inputs(
        self,
        prompt: SingletonPrompt,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
        lora_request: Optional[LoRARequest] = None,
        return_mm_hashes: bool = False,
    ) -> SingletonInputs:
        """
        Convert one singleton prompt into its processed inputs.

        Plain strings and text prompts without multi-modal data are
        tokenized; token prompts without multi-modal data are passed
        through; any prompt carrying multi-modal data is handed to the
        multi-modal processor.

        Arguments:

        * prompt: single encoder or decoder input prompt
        * tokenization_kwargs: extra options for the tokenizer, used
          only when text has to be tokenized
        * lora_request: this is only valid for decoder prompts
        * return_mm_hashes: whether to return multimodal hashes

        Returns:

        * :class:`SingletonInputs` instance
        """
        parsed = parse_singleton_prompt(prompt)

        if parsed["type"] == "str":
            text = parsed["content"]
            token_ids = self._tokenize_prompt(
                text,
                lora_request=lora_request,
                tokenization_kwargs=tokenization_kwargs,
            )
            return token_inputs(prompt=text, prompt_token_ids=token_ids)

        if parsed["type"] == "tokens":
            content = parsed["content"]

            token_ids = content["prompt_token_ids"]
            token_type_ids = content.get("token_type_ids")
            mm_data = content.get("multi_modal_data")
            mm_kwargs = content.get("mm_processor_kwargs")

            # Without multi-modal data the tokens pass straight through.
            if mm_data is None:
                return token_inputs(
                    prompt_token_ids=token_ids,
                    token_type_ids=token_type_ids,
                )

            return self._process_multimodal(
                token_ids,
                mm_data,
                mm_kwargs,
                lora_request=lora_request,
                return_mm_hashes=return_mm_hashes,
            )

        if parsed["type"] == "text":
            content = parsed["content"]

            text = content["prompt"]
            mm_data = content.get("multi_modal_data")
            mm_kwargs = content.get("mm_processor_kwargs")

            if mm_data is None:
                token_ids = self._tokenize_prompt(
                    text,
                    lora_request=lora_request,
                    tokenization_kwargs=tokenization_kwargs,
                )
                return token_inputs(prompt=text, prompt_token_ids=token_ids)

            return self._process_multimodal(
                text,
                mm_data,
                mm_kwargs,
                lora_request=lora_request,
                return_mm_hashes=return_mm_hashes,
            )

        assert_never(parsed)
371

372
    async def _prompt_to_llm_inputs_async(
        self,
        prompt: SingletonPrompt,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
        lora_request: Optional[LoRARequest] = None,
        return_mm_hashes: bool = False,
    ) -> SingletonInputs:
        """
        Async version of :meth:`_prompt_to_llm_inputs`.

        Arguments:

        * prompt: single encoder or decoder input prompt
        * tokenization_kwargs: extra options for the tokenizer, used
          only when text has to be tokenized
        * lora_request: this is only valid for decoder prompts
        * return_mm_hashes: whether to return multimodal hashes

        Returns:

        * :class:`SingletonInputs` instance
        """
        parsed = parse_singleton_prompt(prompt)

        if parsed["type"] == "str":
            prompt_text = parsed["content"]
            prompt_token_ids = await self._tokenize_prompt_async(
                prompt_text,
                lora_request=lora_request,
                tokenization_kwargs=tokenization_kwargs,
            )

            return token_inputs(
                prompt=prompt_text,
                prompt_token_ids=prompt_token_ids,
            )

        if parsed["type"] == "tokens":
            tokens_content = parsed["content"]

            prompt_token_ids = tokens_content["prompt_token_ids"]
            token_type_ids = tokens_content.get("token_type_ids")
            multi_modal_data = tokens_content.get("multi_modal_data")
            mm_processor_kwargs = tokens_content.get("mm_processor_kwargs")

            if multi_modal_data is not None:
                return await self._process_multimodal_async(
                    prompt_token_ids,
                    multi_modal_data,
                    mm_processor_kwargs,
                    lora_request=lora_request,
                    return_mm_hashes=return_mm_hashes,
                )

            # Carry token_type_ids through, as the synchronous path does;
            # dropping them here would make sync and async results differ.
            return token_inputs(
                prompt_token_ids=prompt_token_ids,
                token_type_ids=token_type_ids,
            )

        if parsed["type"] == "text":
            text_content = parsed["content"]

            prompt_text = text_content["prompt"]
            multi_modal_data = text_content.get("multi_modal_data")
            mm_processor_kwargs = text_content.get("mm_processor_kwargs")

            if multi_modal_data is not None:
                return await self._process_multimodal_async(
                    prompt_text,
                    multi_modal_data,
                    mm_processor_kwargs,
                    lora_request=lora_request,
                    return_mm_hashes=return_mm_hashes,
                )

            # Forward tokenization_kwargs here too, matching both the
            # "str" branch above and the synchronous implementation.
            prompt_token_ids = await self._tokenize_prompt_async(
                prompt_text,
                lora_request=lora_request,
                tokenization_kwargs=tokenization_kwargs,
            )

            return token_inputs(
                prompt=prompt_text,
                prompt_token_ids=prompt_token_ids,
            )

        assert_never(parsed)
440
441
442

    def _build_enc_dec_llm_inputs(
        self,
        encoder_inputs: SingletonInputs,
        decoder_inputs: Optional[SingletonInputs],
    ) -> EncoderDecoderInputs:
        """
        Combine processed encoder and decoder inputs into one
        :class:`EncoderDecoderInputs`.

        Arguments:

        * encoder_inputs: processed encoder prompt
        * decoder_inputs: processed decoder prompt, or None to derive a
          default one; when given, its `prompt_token_ids` entry is
          replaced in place with the prepared token ids

        Returns:

        * :class:`EncoderDecoderInputs` instance

        Raises:

        * ValueError: if the decoder inputs carry `multi_modal_data`
        """
        if (encoder_inputs["type"] != "token"
                and encoder_inputs["type"] != "multimodal"):
            assert_never(encoder_inputs)  # type: ignore[arg-type]

        if decoder_inputs is None:
            if self.model_config.hf_config.model_type == "whisper":
                # For Whisper models, the text prompt should go to the decoder.
                # If no explicit encoder/decoder inputs, then copy the prompt
                # from the encoder to the decoder. The encoder tokens are later
                # overridden by the audio features.
                dec_token_ids = encoder_inputs["prompt_token_ids"].copy()
            else:
                dec_token_ids = self._prepare_decoder_input_ids_for_generation(
                    None)
            decoder_inputs = token_inputs(dec_token_ids)
        elif (decoder_inputs["type"] == "token"
              or decoder_inputs["type"] == "multimodal"):
            dec_token_ids = self._prepare_decoder_input_ids_for_generation(
                decoder_inputs["prompt_token_ids"])
            decoder_inputs["prompt_token_ids"] = dec_token_ids

            if "multi_modal_data" in decoder_inputs:
                raise ValueError("Multi-modal decoder inputs of encoder-"
                                 "decoder models are not supported yet")
        else:
            # It is the decoder inputs that have an unexpected type here,
            # so they are what must be reported.
            assert_never(decoder_inputs)  # type: ignore[arg-type]

        return EncoderDecoderInputs(
            encoder=encoder_inputs,
            decoder=decoder_inputs,
        )

480
481
482
483
    def _separate_enc_dec_inputs_from_mm_processor_outputs(
        self,
        inputs: SingletonInputs,
        decoder_inputs_to_override: Optional[SingletonInputs] = None,
    ) -> tuple[SingletonInputs, SingletonInputs]:
        """
        For encoder/decoder models only:
        Split processed inputs into an (encoder, decoder) pair.

        Arguments:

        * inputs: processed inputs, either multi-modal encoder/decoder
          inputs or text-only token inputs
        * decoder_inputs_to_override: explicit decoder inputs whose
          prompt and token ids take the place of those found in `inputs`

        Returns:

        * tuple of (encoder inputs, decoder inputs)
        """
        encoder_inputs: SingletonInputs
        decoder_inputs: SingletonInputs

        if inputs["type"] == "multimodal":
            # Multimodal data inputs
            assert ("encoder_prompt" in inputs
                    and "encoder_prompt_token_ids" in inputs)
            inputs = cast(MultiModalEncDecInputs, inputs)
            encoder_inputs = token_inputs(
                prompt=inputs["encoder_prompt"],
                prompt_token_ids=inputs["encoder_prompt_token_ids"],
            )

            # Only the prompt text and token ids depend on the override;
            # the multi-modal fields always come from `inputs`.
            if decoder_inputs_to_override is None:
                dec_prompt = inputs["prompt"]
                dec_token_ids = inputs["prompt_token_ids"]
            else:
                dec_prompt = decoder_inputs_to_override.get("prompt", "")
                dec_token_ids = decoder_inputs_to_override["prompt_token_ids"]

            decoder_inputs = MultiModalInputs(
                type="multimodal",
                prompt=dec_prompt,
                prompt_token_ids=dec_token_ids,
                mm_kwargs=inputs["mm_kwargs"],
                mm_hashes=inputs["mm_hashes"],
                mm_placeholders=inputs["mm_placeholders"],
            )
            return encoder_inputs, decoder_inputs

        if inputs["type"] == "token":
            # Text-only inputs
            encoder_inputs = token_inputs(prompt="", prompt_token_ids=[])
            decoder_inputs = (decoder_inputs_to_override
                              if decoder_inputs_to_override else inputs)
            return encoder_inputs, decoder_inputs

        assert_never(inputs)  # type: ignore[arg-type]

527
528
    def _process_encoder_decoder_prompt(
        self,
        prompt: PromptType,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
    ) -> EncoderDecoderInputs:
        """
        For encoder/decoder models only:
        Process an input prompt into an :class:`EncoderDecoderInputs` instance.

        Two kinds of input prompt are accepted:

        * Singleton prompt: carries only the encoder prompt; the
          encoder token ids are extracted and default decoder token
          ids are inferred
        * Explicit encoder/decoder prompt: carries both sub-prompts;
          token ids are extracted for each

        In an explicit encoder/decoder prompt either sub-prompt may be
        of any singleton type, so the actual extraction is delegated to
        helper methods.

        Arguments:

        * prompt: an input prompt
        * tokenization_kwargs: extra options forwarded to the tokenizer

        Returns:

        * :class:`EncoderDecoderInputs` instance
        """
        encoder_inputs: SingletonInputs
        decoder_inputs: Optional[SingletonInputs]

        if is_explicit_encoder_decoder_prompt(prompt):
            encoder_inputs = self._prompt_to_llm_inputs(
                prompt["encoder_prompt"],
                tokenization_kwargs=tokenization_kwargs,
            )

            decoder_prompt = prompt["decoder_prompt"]
            # NOTE(review): the decoder prompt is processed without
            # tokenization_kwargs here, whereas
            # _process_encoder_decoder_prompt_async forwards them for the
            # decoder prompt as well -- confirm which is intended.
            decoder_inputs = (None if decoder_prompt is None else
                              self._prompt_to_llm_inputs(decoder_prompt))

            # For multimodal model, override decoder prompt from processor
            # with explicit decoder prompt.
            if self.model_config.is_multimodal_model:
                split = self._separate_enc_dec_inputs_from_mm_processor_outputs
                encoder_inputs, decoder_inputs = split(encoder_inputs,
                                                       decoder_inputs)
        else:
            inputs = self._prompt_to_llm_inputs(
                prompt,
                tokenization_kwargs=tokenization_kwargs,
            )
            if self.model_config.is_multimodal_model:
                # Encoder-Decoder Multimodal model
                split = self._separate_enc_dec_inputs_from_mm_processor_outputs
                encoder_inputs, decoder_inputs = split(inputs)
            else:
                encoder_inputs, decoder_inputs = inputs, None

        return self._build_enc_dec_llm_inputs(encoder_inputs, decoder_inputs)
596
597
598

    async def _process_encoder_decoder_prompt_async(
        self,
        prompt: PromptType,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
    ) -> EncoderDecoderInputs:
        """
        Async version of :meth:`_process_encoder_decoder_prompt`.

        When an explicit prompt supplies both sub-prompts, the two are
        processed together via `asyncio.gather`.
        """
        encoder_inputs: SingletonInputs
        decoder_inputs: Optional[SingletonInputs]

        if is_explicit_encoder_decoder_prompt(prompt):
            encoder_coro = self._prompt_to_llm_inputs_async(
                prompt["encoder_prompt"],
                tokenization_kwargs=tokenization_kwargs,
            )

            decoder_prompt = prompt["decoder_prompt"]
            if decoder_prompt is not None:
                decoder_coro = self._prompt_to_llm_inputs_async(
                    decoder_prompt,
                    tokenization_kwargs=tokenization_kwargs,
                )
                encoder_inputs, decoder_inputs = await asyncio.gather(
                    encoder_coro, decoder_coro)
            else:
                encoder_inputs = await encoder_coro
                decoder_inputs = None

            # For multimodal model, override decoder prompt from processor
            # with explicit decoder prompt.
            if self.model_config.is_multimodal_model:
                split = self._separate_enc_dec_inputs_from_mm_processor_outputs
                encoder_inputs, decoder_inputs = split(encoder_inputs,
                                                       decoder_inputs)
        else:
            inputs = await self._prompt_to_llm_inputs_async(
                prompt,
                tokenization_kwargs=tokenization_kwargs,
            )
            if self.model_config.is_multimodal_model:
                # Encoder-Decoder Multimodal model
                split = self._separate_enc_dec_inputs_from_mm_processor_outputs
                encoder_inputs, decoder_inputs = split(inputs)
            else:
                encoder_inputs, decoder_inputs = inputs, None

        return self._build_enc_dec_llm_inputs(encoder_inputs, decoder_inputs)
646
647
648

    def _build_decoder_only_llm_inputs(
        self,
649
        prompt_inputs: DecoderOnlyInputs,
650
        prompt_adapter_request: Optional[PromptAdapterRequest],
651
    ) -> DecoderOnlyInputs:
652
653
        if (prompt_inputs["type"] == "token"
                or prompt_inputs["type"] == "multimodal"):
654
655
656
657
658
            prompt_inputs["prompt_token_ids"] = self._apply_prompt_adapter(
                prompt_inputs["prompt_token_ids"],
                prompt_adapter_request=prompt_adapter_request,
            )
        else:
659
            assert_never(prompt_inputs)  # type: ignore[arg-type]
660

661
        return prompt_inputs
662
663
664

    def _process_decoder_only_prompt(
        self,
665
        prompt: SingletonPrompt,
666
        tokenization_kwargs: Optional[dict[str, Any]] = None,
667
668
        lora_request: Optional[LoRARequest] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
669
        return_mm_hashes: bool = False,
670
    ) -> DecoderOnlyInputs:
671
        """
672
        For decoder-only models:
673
        Process an input prompt into an :class:`DecoderOnlyInputs` instance.
674
675
676

        Arguments:

677
        * prompt: input prompt
678
679
        * lora_request
        * prompt_adapter_request
680
        * return_mm_hashes
681
682
683

        Returns:

684
        * :class:`DecoderOnlyInputs` instance
685
        """
686

687
        prompt_comps = self._prompt_to_llm_inputs(
688
            prompt,
689
            tokenization_kwargs=tokenization_kwargs,
690
            lora_request=lora_request,
691
            return_mm_hashes=return_mm_hashes,
692
693
694
695
696
697
698
699
700
        )

        return self._build_decoder_only_llm_inputs(
            prompt_comps,
            prompt_adapter_request=prompt_adapter_request,
        )

    async def _process_decoder_only_prompt_async(
        self,
701
        prompt: SingletonPrompt,
702
        tokenization_kwargs: Optional[dict[str, Any]] = None,
703
704
        lora_request: Optional[LoRARequest] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
705
        return_mm_hashes: bool = False,
706
    ) -> DecoderOnlyInputs:
707
        """Async version of :meth:`_process_decoder_only_prompt`."""
708
        prompt_comps = await self._prompt_to_llm_inputs_async(
709
            prompt,
710
            tokenization_kwargs=tokenization_kwargs,
711
            lora_request=lora_request,
712
            return_mm_hashes=return_mm_hashes,
713
714
715
716
717
718
719
720
721
        )

        return self._build_decoder_only_llm_inputs(
            prompt_comps,
            prompt_adapter_request=prompt_adapter_request,
        )

    def preprocess(
        self,
722
        prompt: PromptType,
723
        tokenization_kwargs: Optional[dict[str, Any]] = None,
724
725
        lora_request: Optional[LoRARequest] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
726
        return_mm_hashes: bool = False,
727
    ) -> ProcessorInputs:
728
        """Preprocess the input prompt."""
729
        if self.model_config.is_encoder_decoder:
730
731
732
            assert not return_mm_hashes, (
                "Multimodal hashes for encoder-decoder models should not be ",
                "returned until they are supported on vLLM V1.")
733
734
            # Encoder-decoder model requires special mapping of
            # input prompts to encoder & decoder
735
            return self._process_encoder_decoder_prompt(prompt)
736

737
        if is_explicit_encoder_decoder_prompt(prompt):
738
739
740
741
742
            raise ValueError("Cannot pass encoder-decoder prompt "
                             "to decoder-only models")

        # Decoder-only operation
        return self._process_decoder_only_prompt(
743
            prompt,
744
            tokenization_kwargs=tokenization_kwargs,
745
746
            lora_request=lora_request,
            prompt_adapter_request=prompt_adapter_request,
747
            return_mm_hashes=return_mm_hashes,
748
749
750
751
        )

    async def preprocess_async(
        self,
752
        prompt: PromptType,
753
        tokenization_kwargs: Optional[dict[str, Any]] = None,
754
755
        lora_request: Optional[LoRARequest] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
756
        return_mm_hashes: bool = False,
757
    ) -> ProcessorInputs:
758
        """Async version of :meth:`preprocess`."""
759
        if self.model_config.is_encoder_decoder:
760
761
762
            assert not return_mm_hashes, (
                "Multimodal hashes for encoder-decoder models should not be ",
                "returned until they are supported on vLLM V1.")
763
764
            # Encoder-decoder model requires special mapping of
            # input prompts to encoder & decoder
765
            return await self._process_encoder_decoder_prompt_async(prompt)
766

767
        if is_explicit_encoder_decoder_prompt(prompt):
768
769
770
771
772
            raise ValueError("Cannot pass encoder-decoder prompt "
                             "to decoder-only models")

        # Decoder-only operation
        return await self._process_decoder_only_prompt_async(
773
            prompt,
774
            tokenization_kwargs=tokenization_kwargs,
775
776
            lora_request=lora_request,
            prompt_adapter_request=prompt_adapter_request,
777
            return_mm_hashes=return_mm_hashes,
778
        )