preprocess.py 33 KB
Newer Older
1
2
# SPDX-License-Identifier: Apache-2.0

3
import asyncio
4
from collections.abc import Mapping
5
from typing import Any, Optional, Union, cast
6
7
8

from typing_extensions import assert_never

9
from vllm import envs
10
11
12
from vllm.config import ModelConfig
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
13
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
14
15
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalEncDecInputs,
                                    MultiModalInputs)
16
from vllm.prompt_adapter.request import PromptAdapterRequest
17
from vllm.transformers_utils.tokenizer import AnyTokenizer
18
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
19

20
21
22
23
24
from .data import (DecoderOnlyInputs, EmbedsInputs, EmbedsPrompt,
                   EncoderDecoderInputs, ProcessorInputs, PromptType,
                   SingletonInputs, SingletonPrompt, TextPrompt, TokenInputs,
                   TokensPrompt, embeds_inputs, token_inputs)
from .parse import is_explicit_encoder_decoder_prompt, parse_singleton_prompt
25
26
27
28
29
30
31
32
33

logger = init_logger(__name__)


class InputPreprocessor:

    def __init__(
        self,
        model_config: ModelConfig,
34
        tokenizer: Optional[TokenizerGroup],
35
        mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
36
37
38
39
40
    ) -> None:
        super().__init__()

        self.model_config = model_config
        self.tokenizer = tokenizer
41
        self.mm_registry = mm_registry
42

43
    def get_tokenizer_group(self) -> TokenizerGroup:
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
        if self.tokenizer is None:
            raise ValueError("You cannot pass text prompts when "
                             "`skip_tokenizer_init` is True")

        return self.tokenizer

    def get_bos_token_id(self,
                         lora_request: Optional[LoRARequest] = None
                         ) -> Optional[int]:
        if self.tokenizer is None:
            logger.warning("Using None for BOS token id because tokenizer "
                           "is not initialized")
            return None

        return self.tokenizer.get_lora_tokenizer(lora_request).bos_token_id

    def get_eos_token_id(self,
                         lora_request: Optional[LoRARequest] = None
                         ) -> Optional[int]:
        if self.tokenizer is None:
            logger.warning("Using None for EOS token id because tokenizer "
                           "is not initialized")
            return None

        return self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id

    def get_decoder_start_token_id(self) -> Optional[int]:
        '''
        Obtain the decoder start token id employed by an encoder/decoder
        model. Returns None for non-encoder/decoder models or if the
        model config is unavailable.
        '''

77
        if not self.model_config.is_encoder_decoder:
78
79
80
            logger.warning_once(
                "Using None for decoder start token id because "
                "this is not an encoder/decoder model.")
81
82
83
            return None

        if (self.model_config is None or self.model_config.hf_config is None):
84
85
86
            logger.warning_once(
                "Using None for decoder start token id because "
                "model config is not available.")
87
88
89
90
91
            return None

        dec_start_token_id = getattr(self.model_config.hf_config,
                                     'decoder_start_token_id', None)
        if dec_start_token_id is None:
92
93
94
95
            logger.warning_once(
                "Falling back on <BOS> for decoder start token "
                "id because decoder start token id is not "
                "available.")
96
97
98
99
            dec_start_token_id = self.get_bos_token_id()

        return dec_start_token_id

100
    def _get_default_enc_dec_decoder_prompt(self) -> list[int]:
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
        '''
        Specifically for encoder/decoder models:
        generate a default decoder prompt for when
        the user specifies only the encoder prompt.

        Encoder/decoder models utilize the decoder
        prompt in different ways; as new models are
        added, it is intended that this function
        will be extended to produce differing
        default decoder prompts, depending on the
        model variety.

        Absent a special case, the default behavior
        of this method is to mirror the behavior of
        the HuggingFace (HF) GenerationMixin for a None
        decoder prompt, which is to employ a logit processor
        setting to force the first decoded token to be <BOS>.
        Here, this behavior is approximated by having the
        "default" decoder prompt be <BOS>.

        However, it is possible that in the future
122
        other models may have different or more
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
        complex logic for the default decoder prompt.
        This motivates having a special helper method
        for default decoder prompts.

        Returns:

        * prompt_token_ids
        '''

        bos_token_id = self.get_bos_token_id()
        assert bos_token_id is not None
        return [bos_token_id]

    def _prepare_decoder_input_ids_for_generation(
        self,
138
139
        decoder_input_ids: Optional[list[int]],
    ) -> list[int]:
140
141
142
        """
        Prepares `decoder_input_ids` for generation with encoder-decoder models.

143
144
145
146
        Based on:
        https://github.com/huggingface/transformers/blob/4037a2b5b1278736e566aec12e169100275545ea/src/transformers/generation/utils.py
        specifically,
        `GenerationMixin._prepare_decoder_input_ids_for_generation()`.
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164

        Arguments:

        * decoder_input_ids: input token ids to preprocess

        Returns:

        * Processed token list
        """

        decoder_start_token_id = self.get_decoder_start_token_id()
        assert decoder_start_token_id is not None

        if decoder_input_ids is None:
            # no decoder prompt input ->
            # use decoder_start_token_id as decoder_input_ids
            decoder_input_ids = self._get_default_enc_dec_decoder_prompt()

165
166
        if (len(decoder_input_ids) == 0
                or decoder_input_ids[0] != decoder_start_token_id):
167
168
169
170
171
172
            decoder_input_ids = [decoder_start_token_id] + decoder_input_ids

        return decoder_input_ids

    def _apply_prompt_adapter(
        self,
173
        prompt_token_ids: list[int],
174
        prompt_adapter_request: Optional[PromptAdapterRequest],
175
    ) -> list[int]:
176
177
178
179
180
181
182
        if prompt_adapter_request:
            prompt_token_ids = (
                [0] * prompt_adapter_request.prompt_adapter_num_virtual_tokens
                + prompt_token_ids)

        return prompt_token_ids

183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
    def _get_tokenization_kw(
        self,
        overrides: Optional[dict[str, Any]] = None,
    ) -> dict[str, Any]:
        kwargs = dict[str, Any]()

        if self.model_config.hf_config.model_type == "whisper":
            # For Whisper, special tokens should be provided by the user based
            # on the task and language of their request. Also needed to avoid
            # appending an EOS token to the prompt which disrupts generation.
            kwargs["add_special_tokens"] = False

        if overrides:
            kwargs.update(overrides)

        return kwargs

200
201
202
203
    def _tokenize_prompt(
        self,
        prompt: str,
        lora_request: Optional[LoRARequest],
204
        tokenization_kwargs: Optional[dict[str, Any]] = None,
205
    ) -> list[int]:
206
207
208
209
210
        """
        Apply the model's tokenizer to a text prompt, returning the
        corresponding token IDs.
        """
        tokenizer = self.get_tokenizer_group()
211
        tokenization_kwargs = self._get_tokenization_kw(tokenization_kwargs)
212

213
        encoder_config = self.model_config.encoder_config
214

215
        if encoder_config and encoder_config.get("do_lower_case", False):
216
217
            prompt = prompt.lower()

218
        return tokenizer.encode(prompt=prompt,
219
                                lora_request=lora_request,
220
                                **tokenization_kwargs)
221
222
223
224
225

    async def _tokenize_prompt_async(
        self,
        prompt: str,
        lora_request: Optional[LoRARequest],
226
        tokenization_kwargs: Optional[dict[str, Any]] = None,
227
    ) -> list[int]:
228
229
        """Async version of :meth:`_tokenize_prompt`."""
        tokenizer = self.get_tokenizer_group()
230
        tokenization_kwargs = self._get_tokenization_kw(tokenization_kwargs)
231
232
233
234

        return await tokenizer.encode_async(prompt=prompt,
                                            lora_request=lora_request,
                                            **tokenization_kwargs)
235

236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
    def _get_mm_tokenizer(
        self,
        lora_request: Optional[LoRARequest],
    ) -> AnyTokenizer:
        # PrithviGeoSpatialMAE needs to be initialized without a tokenizer
        # while using also multi-modal input
        if not self.tokenizer:
            return cast(AnyTokenizer, object())  # Dummy

        tokenizer_group = self.get_tokenizer_group()
        return tokenizer_group.get_lora_tokenizer(lora_request)

    async def _get_mm_tokenizer_async(
        self,
        lora_request: Optional[LoRARequest],
    ) -> AnyTokenizer:
        # PrithviGeoSpatialMAE needs to be initialized without a tokenizer
        # while using also multi-modal input
        if not self.tokenizer:
            return cast(AnyTokenizer, object())  # Dummy

        tokenizer_group = self.get_tokenizer_group()
        return await tokenizer_group.get_lora_tokenizer_async(lora_request)

260
261
    def _process_multimodal(
        self,
262
        prompt: Union[str, list[int]],
263
264
265
        mm_data: MultiModalDataDict,
        mm_processor_kwargs: Optional[Mapping[str, object]],
        lora_request: Optional[LoRARequest],
266
        return_mm_hashes: bool = False,
267
    ) -> MultiModalInputs:
268
269
270
271
        """
        Apply the model's multi-modal processor to a multi-modal prompt,
        returning the corresponding token IDs and metadata.
        """
272
        tokenizer = self._get_mm_tokenizer(lora_request)
273

274
275
        mm_processor = self.mm_registry.create_processor(self.model_config,
                                                         tokenizer=tokenizer)
276
277
278
279

        if mm_processor_kwargs is None:
            mm_processor_kwargs = {}

280
281
        return mm_processor.apply(prompt, mm_data, mm_processor_kwargs,
                                  return_mm_hashes)
282
283
284

    async def _process_multimodal_async(
        self,
285
        prompt: Union[str, list[int]],
286
287
288
        mm_data: MultiModalDataDict,
        mm_processor_kwargs: Optional[Mapping[str, object]],
        lora_request: Optional[LoRARequest],
289
        return_mm_hashes: bool = False,
290
    ) -> MultiModalInputs:
291
        """Async version of :meth:`_process_multimodal`."""
292
        tokenizer = await self._get_mm_tokenizer_async(lora_request)
293

294
295
        mm_processor = self.mm_registry.create_processor(self.model_config,
                                                         tokenizer=tokenizer)
296
297
298
        if mm_processor_kwargs is None:
            mm_processor_kwargs = {}

299
300
        return mm_processor.apply(prompt, mm_data, mm_processor_kwargs,
                                  return_mm_hashes)
301

302
303
304
305
    def _process_embeds(
        self,
        parsed_content: EmbedsPrompt,
    ) -> EmbedsInputs:
306
307
308
        if not self.model_config.enable_prompt_embeds:
            raise ValueError("You must set `--enable-prompt-embeds` to input "
                             "`prompt_embeds`.")
309
        if envs.VLLM_USE_V1:
310
            raise ValueError("`prompt_embeds` is only available in V0.")
311
312

        prompt_embeds = parsed_content["prompt_embeds"]
313

314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
        # prompt_embeds must be (seq_len, hidden_size), but if the user
        # passes in a batch of size 1, i.e. (1, seq_len, hidden_size),
        # we can unambiguously process the intent by squeezing the batch
        # dimension.
        if prompt_embeds.ndim == 3:
            prompt_embeds = prompt_embeds.squeeze(dim=0)

        if prompt_embeds.ndim != 2:
            raise ValueError(
                "prompt_embeds must be of shape (seq_len, hidden_size).")

        return embeds_inputs(prompt_embeds=prompt_embeds,
                             cache_salt=parsed_content.get("cache_salt"))

    async def _process_embeds_async(
        self,
        parsed_content: EmbedsPrompt,
    ) -> EmbedsInputs:
        return self._process_embeds(parsed_content)

    def _process_tokens(
        self,
        parsed_content: TokensPrompt,
        lora_request: Optional[LoRARequest] = None,
        return_mm_hashes: bool = False,
    ) -> Union[TokenInputs, MultiModalInputs]:
        prompt_token_ids = parsed_content["prompt_token_ids"]
        token_type_ids = parsed_content.get("token_type_ids")

        inputs: Union[TokenInputs, MultiModalInputs]
        if multi_modal_data := parsed_content.get("multi_modal_data"):
            inputs = self._process_multimodal(
                prompt_token_ids,
                multi_modal_data,
                parsed_content.get("mm_processor_kwargs"),
                lora_request=lora_request,
                return_mm_hashes=return_mm_hashes,
            )
352
        else:
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
            inputs = token_inputs(
                prompt_token_ids=prompt_token_ids,
                token_type_ids=token_type_ids,
            )

        if cache_salt := parsed_content.get("cache_salt"):
            inputs["cache_salt"] = cache_salt

        return inputs

    async def _process_tokens_async(
        self,
        parsed_content: TokensPrompt,
        lora_request: Optional[LoRARequest] = None,
        return_mm_hashes: bool = False,
    ) -> Union[TokenInputs, MultiModalInputs]:
        prompt_token_ids = parsed_content["prompt_token_ids"]
        token_type_ids = parsed_content.get("token_type_ids")

        inputs: Union[TokenInputs, MultiModalInputs]
        if multi_modal_data := parsed_content.get("multi_modal_data"):
            inputs = await self._process_multimodal_async(
                prompt_token_ids,
                multi_modal_data,
                parsed_content.get("mm_processor_kwargs"),
                lora_request=lora_request,
                return_mm_hashes=return_mm_hashes,
            )
        else:
            inputs = token_inputs(
                prompt_token_ids=prompt_token_ids,
                token_type_ids=token_type_ids,
            )

        if cache_salt := parsed_content.get("cache_salt"):
            inputs["cache_salt"] = cache_salt

        return inputs

    def _process_text(
        self,
        parsed_content: TextPrompt,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
        lora_request: Optional[LoRARequest] = None,
        return_mm_hashes: bool = False,
    ) -> Union[TokenInputs, MultiModalInputs]:
        prompt_text = parsed_content["prompt"]

        inputs: Union[TokenInputs, MultiModalInputs]
        if multi_modal_data := parsed_content.get("multi_modal_data"):
            inputs = self._process_multimodal(
                prompt_text,
                multi_modal_data,
                parsed_content.get("mm_processor_kwargs"),
                lora_request=lora_request,
                return_mm_hashes=return_mm_hashes,
            )
        else:
            prompt_token_ids = self._tokenize_prompt(
                prompt_text,
                lora_request=lora_request,
                tokenization_kwargs=tokenization_kwargs,
            )
            inputs = token_inputs(
                prompt=prompt_text,
                prompt_token_ids=prompt_token_ids,
            )

        if cache_salt := parsed_content.get("cache_salt"):
            inputs["cache_salt"] = cache_salt

        return inputs
425

426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
    async def _process_text_async(
        self,
        parsed_content: TextPrompt,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
        lora_request: Optional[LoRARequest] = None,
        return_mm_hashes: bool = False,
    ) -> Union[TokenInputs, MultiModalInputs]:
        prompt_text = parsed_content["prompt"]

        inputs: Union[TokenInputs, MultiModalInputs]
        if multi_modal_data := parsed_content.get("multi_modal_data"):
            inputs = await self._process_multimodal_async(
                prompt_text,
                multi_modal_data,
                parsed_content.get("mm_processor_kwargs"),
                lora_request=lora_request,
                return_mm_hashes=return_mm_hashes,
            )
        else:
            prompt_token_ids = await self._tokenize_prompt_async(
                prompt_text,
                lora_request=lora_request,
                tokenization_kwargs=tokenization_kwargs,
            )
            inputs = token_inputs(
                prompt=prompt_text,
                prompt_token_ids=prompt_token_ids,
            )

        if cache_salt := parsed_content.get("cache_salt"):
            inputs["cache_salt"] = cache_salt

        return inputs
459

460
    def _prompt_to_llm_inputs(
        self,
        prompt: SingletonPrompt,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
        lora_request: Optional[LoRARequest] = None,
        return_mm_hashes: bool = False,
    ) -> SingletonInputs:
        """
        Turn a singleton prompt into singleton inputs by dispatching on
        the parsed prompt type.

        Arguments:

        * prompt: single encoder or decoder input prompt
        * tokenization_kwargs: tokenizer kwargs, used for text prompts only
        * lora_request: this is only valid for decoder prompts
        * return_mm_hashes: whether to return multimodal hashes

        Returns:

        * :class:`SingletonInputs` instance
        """
        parsed = parse_singleton_prompt(prompt)

        if parsed["type"] == "embeds":
            return self._process_embeds(parsed["content"])

        if parsed["type"] == "tokens":
            return self._process_tokens(
                parsed["content"],
                lora_request=lora_request,
                return_mm_hashes=return_mm_hashes,
            )

        # Both remaining variants are text; a bare string is wrapped in a
        # TextPrompt so the two share a single call below.
        text_prompt: TextPrompt
        if parsed["type"] == "text":
            text_prompt = parsed["content"]
        elif parsed["type"] == "str":
            text_prompt = TextPrompt(prompt=parsed["content"])
        else:
            assert_never(parsed)

        return self._process_text(
            text_prompt,
            tokenization_kwargs=tokenization_kwargs,
            lora_request=lora_request,
            return_mm_hashes=return_mm_hashes,
        )

507
    async def _prompt_to_llm_inputs_async(
        self,
        prompt: SingletonPrompt,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
        lora_request: Optional[LoRARequest] = None,
        return_mm_hashes: bool = False,
    ) -> SingletonInputs:
        """Async version of :meth:`_prompt_to_llm_inputs`."""
        parsed = parse_singleton_prompt(prompt)

        if parsed["type"] == "embeds":
            return await self._process_embeds_async(parsed["content"])

        if parsed["type"] == "tokens":
            return await self._process_tokens_async(
                parsed["content"],
                lora_request=lora_request,
                return_mm_hashes=return_mm_hashes,
            )

        # Both remaining variants are text; a bare string is wrapped in a
        # TextPrompt so the two share a single call below.
        text_prompt: TextPrompt
        if parsed["type"] == "text":
            text_prompt = parsed["content"]
        elif parsed["type"] == "str":
            text_prompt = TextPrompt(prompt=parsed["content"])
        else:
            assert_never(parsed)

        return await self._process_text_async(
            text_prompt,
            tokenization_kwargs=tokenization_kwargs,
            lora_request=lora_request,
            return_mm_hashes=return_mm_hashes,
        )

542
543
    def _build_enc_dec_llm_inputs(
        self,
        encoder_inputs: SingletonInputs,
        decoder_inputs: Optional[SingletonInputs],
    ) -> EncoderDecoderInputs:
        """
        Combine encoder and decoder inputs into encoder/decoder inputs.

        When no decoder inputs are given, a decoder prompt is derived:
        for Whisper it is a copy of the encoder token ids, otherwise the
        default decoder prompt. Given decoder inputs have their token
        ids updated in place so they start with the decoder start token.

        Raises:

        * ValueError: for embedding inputs, or for decoder inputs that
          carry `multi_modal_data`
        """
        uses_embeds = (encoder_inputs["type"] == "embeds"
                       or decoder_inputs
                       and decoder_inputs["type"] == "embeds")
        if uses_embeds:
            raise ValueError("Embedding inputs are not supported for encoder-"
                             "decoder models")

        # Needed for mypy
        encoder_inputs = cast(Union[TokenInputs, MultiModalInputs],
                              encoder_inputs)
        decoder_inputs = cast(Optional[Union[TokenInputs, MultiModalInputs]],
                              decoder_inputs)

        if decoder_inputs is not None:
            if "multi_modal_data" in decoder_inputs:
                raise ValueError("Multi-modal decoder inputs of encoder-"
                                 "decoder models are not supported yet")

            decoder_inputs["prompt_token_ids"] = (
                self._prepare_decoder_input_ids_for_generation(
                    decoder_inputs["prompt_token_ids"]))
        else:
            if self.model_config.hf_config.model_type == "whisper":
                # For Whisper models, the text prompt should go to the decoder.
                # If no explicit encoder/decoder inputs, then copy the prompt
                # from the encoder to the decoder. The encoder tokens are later
                # overridden by the audio features.
                derived_ids = encoder_inputs["prompt_token_ids"].copy()
            else:
                derived_ids = self._prepare_decoder_input_ids_for_generation(
                    None)
            decoder_inputs = token_inputs(derived_ids)

        return EncoderDecoderInputs(
            encoder=encoder_inputs,
            decoder=decoder_inputs,
        )

583
    def _split_enc_dec_mm_inputs(
        self,
        inputs: Union[SingletonInputs, MultiModalEncDecInputs],
        decoder_inputs_to_override: Optional[SingletonInputs] = None,
    ) -> tuple[SingletonInputs, SingletonInputs]:
        """
        For encoder/decoder models only:
        split processed inputs into an (encoder, decoder) pair.

        Text-only inputs yield an empty encoder prompt, with the decoder
        taken from the override if one is given, else from `inputs`.
        Multi-modal inputs yield the encoder prompt found in `inputs`
        and a multi-modal decoder whose prompt comes from the override
        if one is given.

        Raises:

        * ValueError: for embedding inputs
        * RuntimeError: if multi-modal inputs lack the encoder prompt
          fields
        """
        uses_embeds = (inputs["type"] == "embeds"
                       or decoder_inputs_to_override
                       and decoder_inputs_to_override["type"] == "embeds")
        if uses_embeds:
            raise ValueError("Embedding inputs are not supported for encoder-"
                             "decoder models")

        # Needed for mypy
        inputs = cast(
            Union[TokenInputs, MultiModalInputs, MultiModalEncDecInputs],
            inputs,
        )
        decoder_inputs_to_override = cast(
            Optional[Union[TokenInputs, MultiModalInputs]],
            decoder_inputs_to_override,
        )

        encoder_inputs: SingletonInputs
        decoder_inputs: SingletonInputs

        if inputs["type"] == "token":  # Text-only inputs
            encoder_inputs = token_inputs(prompt="", prompt_token_ids=[])
            decoder_inputs = decoder_inputs_to_override or inputs
            return encoder_inputs, decoder_inputs

        if inputs["type"] != "multimodal":
            assert_never(inputs)  # type: ignore[arg-type]

        # Multimodal data inputs
        if ("encoder_prompt" not in inputs
                or "encoder_prompt_token_ids" not in inputs):
            raise RuntimeError("You should register an encoder-decoder "
                               "multi-modal processor for encoder-decoder "
                               "models.")
        inputs = cast(MultiModalEncDecInputs, inputs)

        encoder_inputs = token_inputs(
            prompt=inputs["encoder_prompt"],
            prompt_token_ids=inputs["encoder_prompt_token_ids"],
        )

        # The decoder prompt comes from the override when one is given;
        # the multi-modal fields always come from the processed inputs.
        prompt_source = decoder_inputs_to_override or inputs
        decoder_inputs = MultiModalInputs(
            type="multimodal",
            prompt=prompt_source.get("prompt", ""),
            prompt_token_ids=prompt_source["prompt_token_ids"],
            mm_kwargs=inputs["mm_kwargs"],
            mm_hashes=inputs["mm_hashes"],
            mm_placeholders=inputs["mm_placeholders"],
        )
        salt = inputs.get("cache_salt")
        if salt:
            decoder_inputs["cache_salt"] = salt

        return encoder_inputs, decoder_inputs

643
644
    def _process_encoder_decoder_prompt(
        self,
        prompt: PromptType,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
    ) -> EncoderDecoderInputs:
        """
        For encoder/decoder models only:
        Process an input prompt into an :class:`EncoderDecoderInputs` instance.

        There are two types of input prompts:
        singleton prompts which carry only the
        encoder prompt, and explicit encoder/decoder
        prompts which carry both the encoder and the
        decoder prompts as member variables.

        This function handles the following scenarios:
        * Singleton encoder prompt: extract encoder prompt
          token ids & infer default decoder prompt token ids
        * Explicit encoder/decoder prompt: extract encoder
          and decoder prompt token ids

        Note that for Explicit encoder/decoder prompts,
        each sub-prompt (encoder or decoder prompt) can
        have any possible singleton type; thus this
        method relies on helper functions to obtain
        token ids for the sub-prompts.

        Arguments:

        * prompt: an input prompt
        * tokenization_kwargs: tokenizer kwargs applied to every
          sub-prompt that gets tokenized

        Returns:

        * :class:`EncoderDecoderInputs` instance
        """
        encoder_inputs: SingletonInputs
        decoder_inputs: Optional[SingletonInputs]

        if is_explicit_encoder_decoder_prompt(prompt):
            encoder_inputs = self._prompt_to_llm_inputs(
                prompt["encoder_prompt"],
                tokenization_kwargs=tokenization_kwargs,
            )
            if (decoder_input := prompt["decoder_prompt"]) is None:
                decoder_inputs = None
            else:
                # Forward the tokenization kwargs to the decoder prompt too,
                # matching the encoder prompt above and the behavior of
                # _process_encoder_decoder_prompt_async.
                decoder_inputs = self._prompt_to_llm_inputs(
                    decoder_input,
                    tokenization_kwargs=tokenization_kwargs,
                )
            # For multimodal model, override decoder prompt from processor
            # with explicit decoder prompt.
            if self.model_config.is_multimodal_model:
                encoder_inputs, decoder_inputs = (
                    self._split_enc_dec_mm_inputs(encoder_inputs,
                                                  decoder_inputs))
        else:
            inputs = self._prompt_to_llm_inputs(
                prompt,
                tokenization_kwargs=tokenization_kwargs,
            )
            if self.model_config.is_multimodal_model:
                # Encoder-Decoder Multimodal model
                encoder_inputs, decoder_inputs = (
                    self._split_enc_dec_mm_inputs(inputs))
            else:
                encoder_inputs = inputs
                decoder_inputs = None

        return self._build_enc_dec_llm_inputs(encoder_inputs, decoder_inputs)
710
711
712

    async def _process_encoder_decoder_prompt_async(
        self,
        prompt: PromptType,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
    ) -> EncoderDecoderInputs:
        """
        Async version of :meth:`_process_encoder_decoder_prompt`.

        For an explicit prompt that has a decoder prompt, the encoder
        and decoder sub-prompts are awaited together via
        `asyncio.gather`.
        """
        encoder_inputs: SingletonInputs
        decoder_inputs: Optional[SingletonInputs]

        if is_explicit_encoder_decoder_prompt(prompt):
            encoder_coro = self._prompt_to_llm_inputs_async(
                prompt["encoder_prompt"],
                tokenization_kwargs=tokenization_kwargs,
            )

            decoder_prompt = prompt["decoder_prompt"]
            if decoder_prompt is not None:
                decoder_coro = self._prompt_to_llm_inputs_async(
                    decoder_prompt,
                    tokenization_kwargs=tokenization_kwargs,
                )

                encoder_inputs, decoder_inputs = await asyncio.gather(
                    encoder_coro, decoder_coro)
            else:
                encoder_inputs = await encoder_coro
                decoder_inputs = None

            # For multimodal model, override decoder prompt from processor
            # with explicit decoder prompt.
            if self.model_config.is_multimodal_model:
                split = self._split_enc_dec_mm_inputs(encoder_inputs,
                                                      decoder_inputs)
                encoder_inputs, decoder_inputs = split
        else:
            inputs = await self._prompt_to_llm_inputs_async(
                prompt,
                tokenization_kwargs=tokenization_kwargs,
            )
            if self.model_config.is_multimodal_model:
                # Encoder-Decoder Multimodal model
                split = self._split_enc_dec_mm_inputs(inputs)
                encoder_inputs, decoder_inputs = split
            else:
                encoder_inputs, decoder_inputs = inputs, None

        return self._build_enc_dec_llm_inputs(encoder_inputs, decoder_inputs)
758
759
760

    def _build_decoder_only_llm_inputs(
        self,
761
        prompt_inputs: DecoderOnlyInputs,
762
        prompt_adapter_request: Optional[PromptAdapterRequest],
763
    ) -> DecoderOnlyInputs:
764
765
766
        if "prompt_token_ids" in prompt_inputs:
            prompt_inputs = cast(Union[TokenInputs, MultiModalInputs],
                                 prompt_inputs)  # Needed for mypy
767
768
769
770
            prompt_inputs["prompt_token_ids"] = self._apply_prompt_adapter(
                prompt_inputs["prompt_token_ids"],
                prompt_adapter_request=prompt_adapter_request,
            )
771

772
        return prompt_inputs
773
774
775

    def _process_decoder_only_prompt(
        self,
776
        prompt: SingletonPrompt,
777
        tokenization_kwargs: Optional[dict[str, Any]] = None,
778
779
        lora_request: Optional[LoRARequest] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
780
        return_mm_hashes: bool = False,
781
    ) -> DecoderOnlyInputs:
782
        """
783
        For decoder-only models:
784
        Process an input prompt into an :class:`DecoderOnlyInputs` instance.
785
786
787

        Arguments:

788
        * prompt: input prompt
789
790
        * lora_request
        * prompt_adapter_request
791
        * return_mm_hashes
792
793
794

        Returns:

795
        * :class:`DecoderOnlyInputs` instance
796
        """
797

798
        prompt_comps = self._prompt_to_llm_inputs(
799
            prompt,
800
            tokenization_kwargs=tokenization_kwargs,
801
            lora_request=lora_request,
802
            return_mm_hashes=return_mm_hashes,
803
804
805
806
807
808
809
810
811
        )

        return self._build_decoder_only_llm_inputs(
            prompt_comps,
            prompt_adapter_request=prompt_adapter_request,
        )

    async def _process_decoder_only_prompt_async(
        self,
812
        prompt: SingletonPrompt,
813
        tokenization_kwargs: Optional[dict[str, Any]] = None,
814
815
        lora_request: Optional[LoRARequest] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
816
        return_mm_hashes: bool = False,
817
    ) -> DecoderOnlyInputs:
818
        """Async version of :meth:`_process_decoder_only_prompt`."""
819
        prompt_comps = await self._prompt_to_llm_inputs_async(
820
            prompt,
821
            tokenization_kwargs=tokenization_kwargs,
822
            lora_request=lora_request,
823
            return_mm_hashes=return_mm_hashes,
824
825
826
827
828
829
830
831
832
        )

        return self._build_decoder_only_llm_inputs(
            prompt_comps,
            prompt_adapter_request=prompt_adapter_request,
        )

    def preprocess(
        self,
        prompt: PromptType,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
        lora_request: Optional[LoRARequest] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
        return_mm_hashes: bool = False,
    ) -> ProcessorInputs:
        """
        Preprocess the input prompt.

        Dispatches to the encoder-decoder path when
        ``model_config.is_encoder_decoder`` is set, and to the decoder-only
        path otherwise.

        Arguments:

        * prompt: input prompt
        * tokenization_kwargs: forwarded to whichever path handles the prompt
        * lora_request: forwarded to the decoder-only path only
        * prompt_adapter_request: forwarded to the decoder-only path only
        * return_mm_hashes: forwarded to the decoder-only path; must be
          falsy for encoder-decoder models

        Returns:

        * :class:`ProcessorInputs` instance

        Raises:

        * AssertionError: if ``return_mm_hashes`` is set for an
          encoder-decoder model
        * ValueError: if an explicit encoder-decoder prompt is passed to a
          decoder-only model
        """
        if self.model_config.is_encoder_decoder:
            # NOTE: the two literals are concatenated into ONE message
            # string; a comma between them would turn the message into a
            # tuple.
            assert not return_mm_hashes, (
                "Multimodal hashes for encoder-decoder models should not be "
                "returned until they are supported on vLLM V1.")
            # Encoder-decoder model requires special mapping of
            # input prompts to encoder & decoder
            return self._process_encoder_decoder_prompt(
                prompt,
                tokenization_kwargs=tokenization_kwargs,
            )

        if is_explicit_encoder_decoder_prompt(prompt):
            raise ValueError("Cannot pass encoder-decoder prompt "
                             "to decoder-only models")

        # Decoder-only operation
        return self._process_decoder_only_prompt(
            prompt,
            tokenization_kwargs=tokenization_kwargs,
            lora_request=lora_request,
            prompt_adapter_request=prompt_adapter_request,
            return_mm_hashes=return_mm_hashes,
        )

    async def preprocess_async(
        self,
        prompt: PromptType,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
        lora_request: Optional[LoRARequest] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
        return_mm_hashes: bool = False,
    ) -> ProcessorInputs:
        """
        Async version of :meth:`preprocess`.

        Dispatches to the encoder-decoder path when
        ``model_config.is_encoder_decoder`` is set, and to the decoder-only
        path otherwise.

        Arguments:

        * prompt: input prompt
        * tokenization_kwargs: forwarded to whichever path handles the prompt
        * lora_request: forwarded to the decoder-only path only
        * prompt_adapter_request: forwarded to the decoder-only path only
        * return_mm_hashes: forwarded to the decoder-only path; must be
          falsy for encoder-decoder models

        Returns:

        * :class:`ProcessorInputs` instance

        Raises:

        * AssertionError: if ``return_mm_hashes`` is set for an
          encoder-decoder model
        * ValueError: if an explicit encoder-decoder prompt is passed to a
          decoder-only model
        """
        if self.model_config.is_encoder_decoder:
            # NOTE: the two literals are concatenated into ONE message
            # string; a comma between them would turn the message into a
            # tuple.
            assert not return_mm_hashes, (
                "Multimodal hashes for encoder-decoder models should not be "
                "returned until they are supported on vLLM V1.")
            # Encoder-decoder model requires special mapping of
            # input prompts to encoder & decoder
            return await self._process_encoder_decoder_prompt_async(
                prompt,
                tokenization_kwargs=tokenization_kwargs,
            )

        if is_explicit_encoder_decoder_prompt(prompt):
            raise ValueError("Cannot pass encoder-decoder prompt "
                             "to decoder-only models")

        # Decoder-only operation
        return await self._process_decoder_only_prompt_async(
            prompt,
            tokenization_kwargs=tokenization_kwargs,
            lora_request=lora_request,
            prompt_adapter_request=prompt_adapter_request,
            return_mm_hashes=return_mm_hashes,
        )