preprocess.py 27.6 KB
Newer Older
1
2
# SPDX-License-Identifier: Apache-2.0

3
import asyncio
4
5
from collections.abc import Mapping
from typing import Optional, Union, cast
6
7
8
9
10
11

from typing_extensions import assert_never

from vllm.config import ModelConfig
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
12
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
13
14
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalEncDecInputs,
                                    MultiModalInputs)
15
from vllm.prompt_adapter.request import PromptAdapterRequest
16
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
17

18
19
from .data import (DecoderOnlyInputs, EncoderDecoderInputs, ProcessorInputs,
                   PromptType, SingletonInputs, SingletonPrompt, token_inputs)
20
21
22
23
24
25
26
27
28
29
from .parse import is_explicit_encoder_decoder_prompt, parse_singleton_prompt

logger = init_logger(__name__)


class InputPreprocessor:

    def __init__(
        self,
        model_config: ModelConfig,
        tokenizer: Optional[TokenizerGroup],
        mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
    ) -> None:
        """
        Store the collaborators used to turn prompts into model inputs.

        Arguments:

        * model_config: configuration of the model
        * tokenizer: tokenizer group, or None when no tokenizer
          has been initialized
        * mm_registry: registry used to create multi-modal processors
        """
        super().__init__()

        self.mm_registry = mm_registry
        self.tokenizer = tokenizer
        self.model_config = model_config
38

39
    def get_tokenizer_group(self) -> TokenizerGroup:
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
        if self.tokenizer is None:
            raise ValueError("You cannot pass text prompts when "
                             "`skip_tokenizer_init` is True")

        return self.tokenizer

    def get_bos_token_id(self,
                         lora_request: Optional[LoRARequest] = None
                         ) -> Optional[int]:
        if self.tokenizer is None:
            logger.warning("Using None for BOS token id because tokenizer "
                           "is not initialized")
            return None

        return self.tokenizer.get_lora_tokenizer(lora_request).bos_token_id

    def get_eos_token_id(self,
                         lora_request: Optional[LoRARequest] = None
                         ) -> Optional[int]:
        if self.tokenizer is None:
            logger.warning("Using None for EOS token id because tokenizer "
                           "is not initialized")
            return None

        return self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id

    def get_decoder_start_token_id(self) -> Optional[int]:
        '''
        Obtain the decoder start token id employed by an encoder/decoder
        model. Returns None for non-encoder/decoder models or if the
        model config is unavailable.
        '''

73
        if not self.model_config.is_encoder_decoder:
74
75
76
            logger.warning_once(
                "Using None for decoder start token id because "
                "this is not an encoder/decoder model.")
77
78
79
            return None

        if (self.model_config is None or self.model_config.hf_config is None):
80
81
82
            logger.warning_once(
                "Using None for decoder start token id because "
                "model config is not available.")
83
84
85
86
87
            return None

        dec_start_token_id = getattr(self.model_config.hf_config,
                                     'decoder_start_token_id', None)
        if dec_start_token_id is None:
88
89
90
91
            logger.warning_once(
                "Falling back on <BOS> for decoder start token "
                "id because decoder start token id is not "
                "available.")
92
93
94
95
            dec_start_token_id = self.get_bos_token_id()

        return dec_start_token_id

96
    def _get_default_enc_dec_decoder_prompt(self) -> list[int]:
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
        '''
        Specifically for encoder/decoder models:
        generate a default decoder prompt for when
        the user specifies only the encoder prompt.

        Encoder/decoder models utilize the decoder
        prompt in different ways; as new models are
        added, it is intended that this function
        will be extended to produce differing
        default decoder prompts, depending on the
        model variety.

        Absent a special case, the default behavior
        of this method is to mirror the behavior of
        the HuggingFace (HF) GenerationMixin for a None
        decoder prompt, which is to employ a logit processor
        setting to force the first decoded token to be <BOS>.
        Here, this behavior is approximated by having the
        "default" decoder prompt be <BOS>.

        However, it is possible that in the future
118
        other models may have different or more
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
        complex logic for the default decoder prompt.
        This motivates having a special helper method
        for default decoder prompts.

        Returns:

        * prompt_token_ids
        '''

        bos_token_id = self.get_bos_token_id()
        assert bos_token_id is not None
        return [bos_token_id]

    def _prepare_decoder_input_ids_for_generation(
        self,
134
135
        decoder_input_ids: Optional[list[int]],
    ) -> list[int]:
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
        """
        Prepares `decoder_input_ids` for generation with encoder-decoder models.

        Based on

        https://github.com/huggingface/transformers/blob/
        4037a2b5b1278736e566aec12e169100275545ea/
        src/transformers/generation/utils.py

        specifically GenerationMixin._prepare_decoder_input_ids_for_generation()

        Arguments:

        * decoder_input_ids: input token ids to preprocess

        Returns:

        * Processed token list
        """

        decoder_start_token_id = self.get_decoder_start_token_id()
        assert decoder_start_token_id is not None

        if decoder_input_ids is None:
            # no decoder prompt input ->
            # use decoder_start_token_id as decoder_input_ids
            decoder_input_ids = self._get_default_enc_dec_decoder_prompt()

164
165
        if (len(decoder_input_ids) == 0
                or decoder_input_ids[0] != decoder_start_token_id):
166
167
168
169
170
171
            decoder_input_ids = [decoder_start_token_id] + decoder_input_ids

        return decoder_input_ids

    def _apply_prompt_adapter(
        self,
172
        prompt_token_ids: list[int],
173
        prompt_adapter_request: Optional[PromptAdapterRequest],
174
    ) -> list[int]:
175
176
177
178
179
180
181
182
183
184
185
        if prompt_adapter_request:
            prompt_token_ids = (
                [0] * prompt_adapter_request.prompt_adapter_num_virtual_tokens
                + prompt_token_ids)

        return prompt_token_ids

    def _tokenize_prompt(
        self,
        prompt: str,
        lora_request: Optional[LoRARequest],
186
    ) -> list[int]:
187
188
189
190
191
        """
        Apply the model's tokenizer to a text prompt, returning the
        corresponding token IDs.
        """
        tokenizer = self.get_tokenizer_group()
192
193
194
195
196
197
        add_special_tokens = None
        if self.model_config.hf_config.model_type == "whisper":
            # For Whisper, special tokens should be provided by the user based
            # on the task and language of their request. Also needed to avoid
            # appending an EOS token to the prompt which disrupts generation.
            add_special_tokens = False
198
199
200
201
202
203

        if (self.model_config.encoder_config is not None
                and self.model_config.encoder_config.get(
                    "do_lower_case", False)):
            prompt = prompt.lower()

204
205
206
207
208
209
        if self.model_config.tokenizer_mode == "cpm":
                return [tokenizer.bos_id] + tokenizer.encode(prompt)
        else:
            return tokenizer.encode(prompt=prompt,
                                    lora_request=lora_request,
                                    add_special_tokens=add_special_tokens)
210
211
212
213
214

    async def _tokenize_prompt_async(
        self,
        prompt: str,
        lora_request: Optional[LoRARequest],
215
    ) -> list[int]:
216
217
        """Async version of :meth:`_tokenize_prompt`."""
        tokenizer = self.get_tokenizer_group()
218
219
220
221
222
223
224
225
226
227
        add_special_tokens = None
        if self.model_config.hf_config.model_type == "whisper":
            # For Whisper, special tokens should be provided by the user based
            # on the task and language of their request. Also needed to avoid
            # appending an EOS token to the prompt which disrupts generation.
            add_special_tokens = False
        return await tokenizer.encode_async(
            prompt=prompt,
            lora_request=lora_request,
            add_special_tokens=add_special_tokens)
228

229
230
    def _process_multimodal(
        self,
231
        prompt: Union[str, list[int]],
232
233
234
        mm_data: MultiModalDataDict,
        mm_processor_kwargs: Optional[Mapping[str, object]],
        lora_request: Optional[LoRARequest],
235
        return_mm_hashes: bool = False,
236
    ) -> MultiModalInputs:
237
238
239
240
        """
        Apply the model's multi-modal processor to a multi-modal prompt,
        returning the corresponding token IDs and metadata.
        """
241
        # At the moment on model (PrithviGeoSpatialMAE) requires to be
242
        # initialized without a tokenizer while using also multi-modal input
243
        if not self.tokenizer:
244
            tokenizer = object()  # Dummy
245
246
247
        else:
            tokenizer_group = self.get_tokenizer_group()
            tokenizer = tokenizer_group.get_lora_tokenizer(lora_request)
248

249
250
        mm_processor = self.mm_registry.create_processor(self.model_config,
                                                         tokenizer=tokenizer)
251
252
253
254

        if mm_processor_kwargs is None:
            mm_processor_kwargs = {}

255
256
        return mm_processor.apply(prompt, mm_data, mm_processor_kwargs,
                                  return_mm_hashes)
257
258
259

    async def _process_multimodal_async(
        self,
260
        prompt: Union[str, list[int]],
261
262
263
        mm_data: MultiModalDataDict,
        mm_processor_kwargs: Optional[Mapping[str, object]],
        lora_request: Optional[LoRARequest],
264
        return_mm_hashes: bool = False,
265
    ) -> MultiModalInputs:
266
        """Async version of :meth:`_process_multimodal`."""
267
        # At the moment on model (PrithviGeoSpatialMAE) requires to be
268
        # initialized without a tokenizer while using also multi-modal input
269
        if not self.tokenizer:
270
            tokenizer = object()  # Dummy
271
272
273
274
        else:
            tokenizer_group = self.get_tokenizer_group()
            tokenizer = await tokenizer_group.get_lora_tokenizer_async(
                lora_request)
275

276
277
        mm_processor = self.mm_registry.create_processor(self.model_config,
                                                         tokenizer=tokenizer)
278
279
280
        if mm_processor_kwargs is None:
            mm_processor_kwargs = {}

281
282
        return mm_processor.apply(prompt, mm_data, mm_processor_kwargs,
                                  return_mm_hashes)
283

284
    def _prompt_to_llm_inputs(
        self,
        prompt: SingletonPrompt,
        lora_request: Optional[LoRARequest] = None,
        return_mm_hashes: bool = False,
    ) -> SingletonInputs:
        """
        Turn a singleton prompt into its model inputs.

        Plain strings and text prompts without multi-modal data are
        tokenized; token prompts without multi-modal data are passed
        through; any prompt carrying multi-modal data is handed to the
        multi-modal processor.

        Arguments:

        * prompt: single encoder or decoder input prompt
        * lora_request: this is only valid for decoder prompts
        * return_mm_hashes: whether to return multimodal hashes

        Returns:

        * :class:`SingletonInputs` instance
        """
        parsed = parse_singleton_prompt(prompt)

        if parsed["type"] == "str":
            text = parsed["content"]
            token_ids = self._tokenize_prompt(text, lora_request=lora_request)
            return token_inputs(prompt=text, prompt_token_ids=token_ids)

        if parsed["type"] == "tokens":
            content = parsed["content"]

            token_ids = content["prompt_token_ids"]
            token_type_ids = content.get("token_type_ids")
            mm_data = content.get("multi_modal_data")
            mm_kwargs = content.get("mm_processor_kwargs")

            if mm_data is None:
                return token_inputs(
                    prompt_token_ids=token_ids,
                    token_type_ids=token_type_ids,
                )

            return self._process_multimodal(
                token_ids,
                mm_data,
                mm_kwargs,
                lora_request=lora_request,
                return_mm_hashes=return_mm_hashes,
            )

        if parsed["type"] == "text":
            content = parsed["content"]

            text = content["prompt"]
            mm_data = content.get("multi_modal_data")
            mm_kwargs = content.get("mm_processor_kwargs")

            if mm_data is None:
                token_ids = self._tokenize_prompt(text,
                                                  lora_request=lora_request)
                return token_inputs(prompt=text, prompt_token_ids=token_ids)

            return self._process_multimodal(
                text,
                mm_data,
                mm_kwargs,
                lora_request=lora_request,
                return_mm_hashes=return_mm_hashes,
            )

        assert_never(parsed)
366

367
    async def _prompt_to_llm_inputs_async(
        self,
        prompt: SingletonPrompt,
        lora_request: Optional[LoRARequest] = None,
        return_mm_hashes: bool = False,
    ) -> SingletonInputs:
        """
        Async version of :meth:`_prompt_to_llm_inputs`.

        Arguments:

        * prompt: single encoder or decoder input prompt
        * lora_request: this is only valid for decoder prompts
        * return_mm_hashes: whether to return multimodal hashes

        Returns:

        * :class:`SingletonInputs` instance
        """
        parsed = parse_singleton_prompt(prompt)

        if parsed["type"] == "str":
            prompt_text = parsed["content"]
            prompt_token_ids = await self._tokenize_prompt_async(
                prompt_text,
                lora_request=lora_request,
            )

            return token_inputs(
                prompt=prompt_text,
                prompt_token_ids=prompt_token_ids,
            )

        if parsed["type"] == "tokens":
            tokens_content = parsed["content"]

            prompt_token_ids = tokens_content["prompt_token_ids"]
            # Forward token type ids exactly as the sync version does, so
            # both paths produce the same inputs for the same prompt.
            token_type_ids = tokens_content.get("token_type_ids")
            multi_modal_data = tokens_content.get("multi_modal_data")
            mm_processor_kwargs = tokens_content.get("mm_processor_kwargs")

            if multi_modal_data is not None:
                return await self._process_multimodal_async(
                    prompt_token_ids,
                    multi_modal_data,
                    mm_processor_kwargs,
                    lora_request=lora_request,
                    return_mm_hashes=return_mm_hashes,
                )

            return token_inputs(
                prompt_token_ids=prompt_token_ids,
                token_type_ids=token_type_ids,
            )

        if parsed["type"] == "text":
            text_content = parsed["content"]

            prompt_text = text_content["prompt"]
            multi_modal_data = text_content.get("multi_modal_data")
            mm_processor_kwargs = text_content.get("mm_processor_kwargs")

            if multi_modal_data is not None:
                return await self._process_multimodal_async(
                    prompt_text,
                    multi_modal_data,
                    mm_processor_kwargs,
                    lora_request=lora_request,
                    return_mm_hashes=return_mm_hashes,
                )

            prompt_token_ids = await self._tokenize_prompt_async(
                prompt_text,
                lora_request=lora_request,
            )

            return token_inputs(
                prompt=prompt_text,
                prompt_token_ids=prompt_token_ids,
            )

        assert_never(parsed)
433
434
435

    def _build_enc_dec_llm_inputs(
        self,
        encoder_inputs: SingletonInputs,
        decoder_inputs: Optional[SingletonInputs],
    ) -> EncoderDecoderInputs:
        """
        For encoder/decoder models only:
        Combine encoder and decoder inputs into an
        :class:`EncoderDecoderInputs` instance.

        Arguments:

        * encoder_inputs: processed encoder inputs
        * decoder_inputs: processed decoder inputs, or None to derive a
          default decoder prompt

        Returns:

        * :class:`EncoderDecoderInputs` instance. When `decoder_inputs`
          is given, its `prompt_token_ids` entry is updated in place.

        Raises:

        * ValueError: if the decoder inputs carry `multi_modal_data`
        """
        if (encoder_inputs["type"] == "token"
                or encoder_inputs["type"] == "multimodal"):
            pass
        else:
            assert_never(encoder_inputs)  # type: ignore[arg-type]

        if decoder_inputs is None:
            if self.model_config.hf_config.model_type == "whisper":
                # For Whisper models, the text prompt should go to the decoder.
                # If no explicit encoder/decoder inputs, then copy the prompt
                # from the encoder to the decoder. The encoder tokens are later
                # overridden by the audio features.
                dec_token_ids = encoder_inputs["prompt_token_ids"].copy()
            else:
                dec_token_ids = self._prepare_decoder_input_ids_for_generation(
                    None)
            decoder_inputs = token_inputs(dec_token_ids)
        elif (decoder_inputs["type"] == "token"
              or decoder_inputs["type"] == "multimodal"):
            dec_token_ids = self._prepare_decoder_input_ids_for_generation(
                decoder_inputs["prompt_token_ids"])
            decoder_inputs["prompt_token_ids"] = dec_token_ids

            if "multi_modal_data" in decoder_inputs:
                raise ValueError("Multi-modal decoder inputs of encoder-"
                                 "decoder models are not supported yet")
        else:
            # This branch is about the decoder inputs, so report those
            # (not the already-validated encoder inputs).
            assert_never(decoder_inputs)  # type: ignore[arg-type]

        return EncoderDecoderInputs(
            encoder=encoder_inputs,
            decoder=decoder_inputs,
        )

473
474
475
476
    def _separate_enc_dec_inputs_from_mm_processor_outputs(
        self,
        inputs: SingletonInputs,
        decoder_inputs_to_override: Optional[SingletonInputs] = None,
    ) -> tuple[SingletonInputs, SingletonInputs]:
        """
        For encoder/decoder models only:
        Split processed inputs into an (encoder, decoder) pair.

        Multimodal inputs must carry `encoder_prompt` and
        `encoder_prompt_token_ids`, which become the encoder inputs;
        the decoder inputs keep the multi-modal fields and take their
        prompt from `decoder_inputs_to_override` when it is given.
        Token inputs yield an empty encoder prompt, with the decoder
        being `decoder_inputs_to_override` if truthy, else `inputs`.
        """
        if inputs["type"] == "token":
            # Text-only inputs
            return (token_inputs(prompt="", prompt_token_ids=[]),
                    decoder_inputs_to_override or inputs)

        if inputs["type"] != "multimodal":
            assert_never(inputs)  # type: ignore[arg-type]

        # Multimodal data inputs
        assert "encoder_prompt" in inputs
        assert "encoder_prompt_token_ids" in inputs
        mm_inputs = cast(MultiModalEncDecInputs, inputs)

        encoder_inputs: SingletonInputs = token_inputs(
            prompt=mm_inputs["encoder_prompt"],
            prompt_token_ids=mm_inputs["encoder_prompt_token_ids"],
        )

        # An explicit decoder prompt replaces the processor's prompt.
        if decoder_inputs_to_override is None:
            decoder_prompt = mm_inputs["prompt"]
            decoder_token_ids = mm_inputs["prompt_token_ids"]
        else:
            decoder_prompt = decoder_inputs_to_override.get("prompt", "")
            decoder_token_ids = decoder_inputs_to_override["prompt_token_ids"]

        decoder_inputs: SingletonInputs = MultiModalInputs(
            type="multimodal",
            prompt=decoder_prompt,
            prompt_token_ids=decoder_token_ids,
            mm_kwargs=mm_inputs["mm_kwargs"],
            mm_hashes=mm_inputs["mm_hashes"],
            mm_placeholders=mm_inputs["mm_placeholders"],
        )
        return encoder_inputs, decoder_inputs

520
521
    def _process_encoder_decoder_prompt(
        self,
        prompt: PromptType,
    ) -> EncoderDecoderInputs:
        """
        For encoder/decoder models only:
        Process an input prompt into an :class:`EncoderDecoderInputs` instance.

        Two kinds of input prompt are accepted:

        * Singleton prompt: carries only the encoder prompt; its
          token ids are extracted and the decoder prompt is left
          to be inferred by default.
        * Explicit encoder/decoder prompt: carries both the encoder
          and the decoder prompt; token ids are extracted for each.

        Each sub-prompt of an explicit encoder/decoder prompt may
        be any singleton type, so extraction is delegated to
        helper methods.

        For multimodal models the processed inputs are additionally
        split into encoder and decoder inputs, with an explicit
        decoder prompt overriding the one from the processor.

        Arguments:

        * prompt: an input prompt

        Returns:

        * :class:`EncoderDecoderInputs` instance
        """
        encoder_inputs: SingletonInputs
        decoder_inputs: Optional[SingletonInputs]

        if not is_explicit_encoder_decoder_prompt(prompt):
            singleton_inputs = self._prompt_to_llm_inputs(prompt)
            if self.model_config.is_multimodal_model:
                # Encoder-Decoder Multimodal model
                encoder_inputs, decoder_inputs = (
                    self._separate_enc_dec_inputs_from_mm_processor_outputs(
                        singleton_inputs))
            else:
                encoder_inputs, decoder_inputs = singleton_inputs, None

            return self._build_enc_dec_llm_inputs(encoder_inputs,
                                                  decoder_inputs)

        encoder_inputs = self._prompt_to_llm_inputs(prompt["encoder_prompt"])

        decoder_prompt = prompt["decoder_prompt"]
        decoder_inputs = (None if decoder_prompt is None else
                          self._prompt_to_llm_inputs(decoder_prompt))

        # For multimodal model, override decoder prompt from processor
        # with explicit decoder prompt.
        if self.model_config.is_multimodal_model:
            encoder_inputs, decoder_inputs = (
                self._separate_enc_dec_inputs_from_mm_processor_outputs(
                    encoder_inputs, decoder_inputs))

        return self._build_enc_dec_llm_inputs(encoder_inputs, decoder_inputs)
583
584
585

    async def _process_encoder_decoder_prompt_async(
        self,
        prompt: PromptType,
    ) -> EncoderDecoderInputs:
        """
        Async version of :meth:`_process_encoder_decoder_prompt`.

        When an explicit prompt carries both sub-prompts, they are
        processed together via `asyncio.gather`.
        """
        encoder_inputs: SingletonInputs
        decoder_inputs: Optional[SingletonInputs]

        if not is_explicit_encoder_decoder_prompt(prompt):
            singleton_inputs = await self._prompt_to_llm_inputs_async(prompt)
            if self.model_config.is_multimodal_model:
                # Encoder-Decoder Multimodal model
                encoder_inputs, decoder_inputs = (
                    self._separate_enc_dec_inputs_from_mm_processor_outputs(
                        singleton_inputs))
            else:
                encoder_inputs, decoder_inputs = singleton_inputs, None

            return self._build_enc_dec_llm_inputs(encoder_inputs,
                                                  decoder_inputs)

        encoder_coro = self._prompt_to_llm_inputs_async(
            prompt["encoder_prompt"])

        decoder_prompt = prompt["decoder_prompt"]
        if decoder_prompt is None:
            encoder_inputs = await encoder_coro
            decoder_inputs = None
        else:
            decoder_coro = self._prompt_to_llm_inputs_async(decoder_prompt)
            encoder_inputs, decoder_inputs = await asyncio.gather(
                encoder_coro, decoder_coro)

        # For multimodal model, override decoder prompt from processor
        # with explicit decoder prompt.
        if self.model_config.is_multimodal_model:
            encoder_inputs, decoder_inputs = (
                self._separate_enc_dec_inputs_from_mm_processor_outputs(
                    encoder_inputs, decoder_inputs))

        return self._build_enc_dec_llm_inputs(encoder_inputs, decoder_inputs)
624
625
626

    def _build_decoder_only_llm_inputs(
        self,
627
        prompt_inputs: DecoderOnlyInputs,
628
        prompt_adapter_request: Optional[PromptAdapterRequest],
629
    ) -> DecoderOnlyInputs:
630
631
        if (prompt_inputs["type"] == "token"
                or prompt_inputs["type"] == "multimodal"):
632
633
634
635
636
            prompt_inputs["prompt_token_ids"] = self._apply_prompt_adapter(
                prompt_inputs["prompt_token_ids"],
                prompt_adapter_request=prompt_adapter_request,
            )
        else:
637
            assert_never(prompt_inputs)  # type: ignore[arg-type]
638

639
        return prompt_inputs
640
641
642

    def _process_decoder_only_prompt(
        self,
        prompt: SingletonPrompt,
        lora_request: Optional[LoRARequest] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
        return_mm_hashes: bool = False,
    ) -> DecoderOnlyInputs:
        """
        For decoder-only models:
        Convert an input prompt into a :class:`DecoderOnlyInputs`
        instance, applying the prompt adapter to the resulting inputs.

        Arguments:

        * prompt: input prompt
        * lora_request
        * prompt_adapter_request
        * return_mm_hashes

        Returns:

        * :class:`DecoderOnlyInputs` instance
        """
        return self._build_decoder_only_llm_inputs(
            self._prompt_to_llm_inputs(
                prompt,
                lora_request=lora_request,
                return_mm_hashes=return_mm_hashes,
            ),
            prompt_adapter_request=prompt_adapter_request,
        )

    async def _process_decoder_only_prompt_async(
        self,
        prompt: SingletonPrompt,
        lora_request: Optional[LoRARequest] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
        return_mm_hashes: bool = False,
    ) -> DecoderOnlyInputs:
        """
        Async version of :meth:`_process_decoder_only_prompt`.

        Only the prompt-to-inputs step is awaited; the prompt adapter is
        applied synchronously afterwards.
        """
        singleton_inputs = await self._prompt_to_llm_inputs_async(
            prompt,
            lora_request=lora_request,
            return_mm_hashes=return_mm_hashes,
        )
        return self._build_decoder_only_llm_inputs(
            singleton_inputs,
            prompt_adapter_request=prompt_adapter_request,
        )

    def preprocess(
        self,
696
        prompt: PromptType,
697
698
        lora_request: Optional[LoRARequest] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
699
        return_mm_hashes: bool = False,
700
    ) -> ProcessorInputs:
701
        """Preprocess the input prompt."""
702
        if self.model_config.is_encoder_decoder:
703
704
705
            assert not return_mm_hashes, (
                "Multimodal hashes for encoder-decoder models should not be ",
                "returned until they are supported on vLLM V1.")
706
707
            # Encoder-decoder model requires special mapping of
            # input prompts to encoder & decoder
708
            return self._process_encoder_decoder_prompt(prompt)
709

710
        if is_explicit_encoder_decoder_prompt(prompt):
711
712
713
714
715
            raise ValueError("Cannot pass encoder-decoder prompt "
                             "to decoder-only models")

        # Decoder-only operation
        return self._process_decoder_only_prompt(
716
            prompt,
717
718
            lora_request=lora_request,
            prompt_adapter_request=prompt_adapter_request,
719
            return_mm_hashes=return_mm_hashes,
720
721
722
723
        )

    async def preprocess_async(
        self,
724
        prompt: PromptType,
725
726
        lora_request: Optional[LoRARequest] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
727
        return_mm_hashes: bool = False,
728
    ) -> ProcessorInputs:
729
        """Async version of :meth:`preprocess`."""
730
        if self.model_config.is_encoder_decoder:
731
732
733
            assert not return_mm_hashes, (
                "Multimodal hashes for encoder-decoder models should not be ",
                "returned until they are supported on vLLM V1.")
734
735
            # Encoder-decoder model requires special mapping of
            # input prompts to encoder & decoder
736
            return await self._process_encoder_decoder_prompt_async(prompt)
737

738
        if is_explicit_encoder_decoder_prompt(prompt):
739
740
741
742
743
            raise ValueError("Cannot pass encoder-decoder prompt "
                             "to decoder-only models")

        # Decoder-only operation
        return await self._process_decoder_only_prompt_async(
744
            prompt,
745
746
            lora_request=lora_request,
            prompt_adapter_request=prompt_adapter_request,
747
            return_mm_hashes=return_mm_hashes,
748
        )