preprocess.py 28.3 KB
Newer Older
1
2
# SPDX-License-Identifier: Apache-2.0

3
import asyncio
4
from collections.abc import Mapping
5
from typing import Any, Optional, Union, cast
6
7
8
9
10
11

from typing_extensions import assert_never

from vllm.config import ModelConfig
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
12
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
13
14
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalEncDecInputs,
                                    MultiModalInputs)
15
from vllm.prompt_adapter.request import PromptAdapterRequest
16
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
17

18
19
from .data import (DecoderOnlyInputs, EncoderDecoderInputs, ProcessorInputs,
                   PromptType, SingletonInputs, SingletonPrompt, token_inputs)
20
21
from .parse import (ParsedStrPrompt, ParsedTextPrompt, ParsedTokensPrompt,
                    is_explicit_encoder_decoder_prompt, parse_singleton_prompt)
22
23
24
25
26
27
28
29
30

# Module-level logger, created via init_logger for this module's name;
# used by the InputPreprocessor methods below to emit warnings.
logger = init_logger(__name__)


class InputPreprocessor:

    def __init__(
        self,
        model_config: ModelConfig,
31
        tokenizer: Optional[TokenizerGroup],
32
        mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
33
34
35
36
37
    ) -> None:
        super().__init__()

        self.model_config = model_config
        self.tokenizer = tokenizer
38
        self.mm_registry = mm_registry
39

40
    def get_tokenizer_group(self) -> TokenizerGroup:
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
        if self.tokenizer is None:
            raise ValueError("You cannot pass text prompts when "
                             "`skip_tokenizer_init` is True")

        return self.tokenizer

    def get_bos_token_id(self,
                         lora_request: Optional[LoRARequest] = None
                         ) -> Optional[int]:
        if self.tokenizer is None:
            logger.warning("Using None for BOS token id because tokenizer "
                           "is not initialized")
            return None

        return self.tokenizer.get_lora_tokenizer(lora_request).bos_token_id

    def get_eos_token_id(self,
                         lora_request: Optional[LoRARequest] = None
                         ) -> Optional[int]:
        if self.tokenizer is None:
            logger.warning("Using None for EOS token id because tokenizer "
                           "is not initialized")
            return None

        return self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id

    def get_decoder_start_token_id(self) -> Optional[int]:
        '''
        Obtain the decoder start token id employed by an encoder/decoder
        model. Returns None for non-encoder/decoder models or if the
        model config is unavailable.
        '''

74
        if not self.model_config.is_encoder_decoder:
75
76
77
            logger.warning_once(
                "Using None for decoder start token id because "
                "this is not an encoder/decoder model.")
78
79
80
            return None

        if (self.model_config is None or self.model_config.hf_config is None):
81
82
83
            logger.warning_once(
                "Using None for decoder start token id because "
                "model config is not available.")
84
85
86
87
88
            return None

        dec_start_token_id = getattr(self.model_config.hf_config,
                                     'decoder_start_token_id', None)
        if dec_start_token_id is None:
89
90
91
92
            logger.warning_once(
                "Falling back on <BOS> for decoder start token "
                "id because decoder start token id is not "
                "available.")
93
94
95
96
            dec_start_token_id = self.get_bos_token_id()

        return dec_start_token_id

97
    def _get_default_enc_dec_decoder_prompt(self) -> list[int]:
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
        '''
        Specifically for encoder/decoder models:
        generate a default decoder prompt for when
        the user specifies only the encoder prompt.

        Encoder/decoder models utilize the decoder
        prompt in different ways; as new models are
        added, it is intended that this function
        will be extended to produce differing
        default decoder prompts, depending on the
        model variety.

        Absent a special case, the default behavior
        of this method is to mirror the behavior of
        the HuggingFace (HF) GenerationMixin for a None
        decoder prompt, which is to employ a logit processor
        setting to force the first decoded token to be <BOS>.
        Here, this behavior is approximated by having the
        "default" decoder prompt be <BOS>.

        However, it is possible that in the future
119
        other models may have different or more
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
        complex logic for the default decoder prompt.
        This motivates having a special helper method
        for default decoder prompts.

        Returns:

        * prompt_token_ids
        '''

        bos_token_id = self.get_bos_token_id()
        assert bos_token_id is not None
        return [bos_token_id]

    def _prepare_decoder_input_ids_for_generation(
        self,
135
136
        decoder_input_ids: Optional[list[int]],
    ) -> list[int]:
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
        """
        Prepares `decoder_input_ids` for generation with encoder-decoder models.

        Based on

        https://github.com/huggingface/transformers/blob/
        4037a2b5b1278736e566aec12e169100275545ea/
        src/transformers/generation/utils.py

        specifically GenerationMixin._prepare_decoder_input_ids_for_generation()

        Arguments:

        * decoder_input_ids: input token ids to preprocess

        Returns:

        * Processed token list
        """

        decoder_start_token_id = self.get_decoder_start_token_id()
        assert decoder_start_token_id is not None

        if decoder_input_ids is None:
            # no decoder prompt input ->
            # use decoder_start_token_id as decoder_input_ids
            decoder_input_ids = self._get_default_enc_dec_decoder_prompt()

165
166
        if (len(decoder_input_ids) == 0
                or decoder_input_ids[0] != decoder_start_token_id):
167
168
169
170
171
172
            decoder_input_ids = [decoder_start_token_id] + decoder_input_ids

        return decoder_input_ids

    def _apply_prompt_adapter(
        self,
173
        prompt_token_ids: list[int],
174
        prompt_adapter_request: Optional[PromptAdapterRequest],
175
    ) -> list[int]:
176
177
178
179
180
181
182
183
184
185
186
        if prompt_adapter_request:
            prompt_token_ids = (
                [0] * prompt_adapter_request.prompt_adapter_num_virtual_tokens
                + prompt_token_ids)

        return prompt_token_ids

    def _tokenize_prompt(
        self,
        prompt: str,
        lora_request: Optional[LoRARequest],
187
        tokenization_kwargs: Optional[dict[str, Any]] = None,
188
    ) -> list[int]:
189
190
191
192
193
        """
        Apply the model's tokenizer to a text prompt, returning the
        corresponding token IDs.
        """
        tokenizer = self.get_tokenizer_group()
194
195
196
        if tokenization_kwargs is None:
            tokenization_kwargs = {}

197
198
199
200
        if self.model_config.hf_config.model_type == "whisper":
            # For Whisper, special tokens should be provided by the user based
            # on the task and language of their request. Also needed to avoid
            # appending an EOS token to the prompt which disrupts generation.
201
            tokenization_kwargs["add_special_tokens"] = False
202
203
204
205
206
207

        if (self.model_config.encoder_config is not None
                and self.model_config.encoder_config.get(
                    "do_lower_case", False)):
            prompt = prompt.lower()

208
        return tokenizer.encode(prompt=prompt,
209
                                lora_request=lora_request,
210
                                **tokenization_kwargs)
211
212
213
214
215

    async def _tokenize_prompt_async(
        self,
        prompt: str,
        lora_request: Optional[LoRARequest],
216
        tokenization_kwargs: Optional[dict[str, Any]] = None,
217
    ) -> list[int]:
218
219
        """Async version of :meth:`_tokenize_prompt`."""
        tokenizer = self.get_tokenizer_group()
220
221
222
        if tokenization_kwargs is None:
            tokenization_kwargs = {}

223
224
225
226
        if self.model_config.hf_config.model_type == "whisper":
            # For Whisper, special tokens should be provided by the user based
            # on the task and language of their request. Also needed to avoid
            # appending an EOS token to the prompt which disrupts generation.
227
228
229
230
            tokenization_kwargs["add_special_tokens"] = False
        return await tokenizer.encode_async(prompt=prompt,
                                            lora_request=lora_request,
                                            **tokenization_kwargs)
231

232
233
    def _process_multimodal(
        self,
234
        prompt: Union[str, list[int]],
235
236
237
        mm_data: MultiModalDataDict,
        mm_processor_kwargs: Optional[Mapping[str, object]],
        lora_request: Optional[LoRARequest],
238
        return_mm_hashes: bool = False,
239
    ) -> MultiModalInputs:
240
241
242
243
        """
        Apply the model's multi-modal processor to a multi-modal prompt,
        returning the corresponding token IDs and metadata.
        """
244
        # At the moment on model (PrithviGeoSpatialMAE) requires to be
245
        # initialized without a tokenizer while using also multi-modal input
246
        if not self.tokenizer:
247
            tokenizer = object()  # Dummy
248
249
250
        else:
            tokenizer_group = self.get_tokenizer_group()
            tokenizer = tokenizer_group.get_lora_tokenizer(lora_request)
251

252
253
        mm_processor = self.mm_registry.create_processor(self.model_config,
                                                         tokenizer=tokenizer)
254
255
256
257

        if mm_processor_kwargs is None:
            mm_processor_kwargs = {}

258
259
        return mm_processor.apply(prompt, mm_data, mm_processor_kwargs,
                                  return_mm_hashes)
260
261
262

    async def _process_multimodal_async(
        self,
263
        prompt: Union[str, list[int]],
264
265
266
        mm_data: MultiModalDataDict,
        mm_processor_kwargs: Optional[Mapping[str, object]],
        lora_request: Optional[LoRARequest],
267
        return_mm_hashes: bool = False,
268
    ) -> MultiModalInputs:
269
        """Async version of :meth:`_process_multimodal`."""
270
        # At the moment on model (PrithviGeoSpatialMAE) requires to be
271
        # initialized without a tokenizer while using also multi-modal input
272
        if not self.tokenizer:
273
            tokenizer = object()  # Dummy
274
275
276
277
        else:
            tokenizer_group = self.get_tokenizer_group()
            tokenizer = await tokenizer_group.get_lora_tokenizer_async(
                lora_request)
278

279
280
        mm_processor = self.mm_registry.create_processor(self.model_config,
                                                         tokenizer=tokenizer)
281
282
283
        if mm_processor_kwargs is None:
            mm_processor_kwargs = {}

284
285
        return mm_processor.apply(prompt, mm_data, mm_processor_kwargs,
                                  return_mm_hashes)
286

287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
    def _get_prompt_data(self, parsed_prompt: Union[ParsedStrPrompt,
                                                    ParsedTextPrompt,
                                                    ParsedTokensPrompt]):
        prompt_text = None
        prompt_token_ids = None
        token_type_ids = None
        cache_salt = None

        if parsed_prompt["type"] == "str":
            prompt_text = parsed_prompt["content"]
        else:
            cache_salt = parsed_prompt["content"].get("cache_salt")
            if parsed_prompt["type"] == "text":
                prompt_text = parsed_prompt["content"]["prompt"]
            elif parsed_prompt["type"] == "tokens":
                prompt_token_ids = parsed_prompt["content"].get(
                    "prompt_token_ids")
                token_type_ids = parsed_prompt["content"].get("token_type_ids")
            else:
                assert_never(parsed_prompt)

        return prompt_text, prompt_token_ids, token_type_ids, cache_salt

310
    def _prompt_to_llm_inputs(
        self,
        prompt: SingletonPrompt,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
        lora_request: Optional[LoRARequest] = None,
        return_mm_hashes: bool = False,
    ) -> SingletonInputs:
        """
        Convert a singleton prompt into its processed inputs.

        Prompts carrying multi-modal data go through the multi-modal
        processor; all others are tokenized (if they are not already
        token ids) and wrapped with `token_inputs`.

        Arguments:

        * prompt: single encoder or decoder input prompt
        * tokenization_kwargs: extra tokenizer options, used only on the
          text-tokenization path
        * lora_request: this is only valid for decoder prompts
        * return_mm_hashes: whether to return multimodal hashes

        Returns:

        * :class:`SingletonInputs` instance
        """
        parsed = parse_singleton_prompt(prompt)
        (prompt_text, prompt_token_ids, token_type_ids,
         cache_salt) = self._get_prompt_data(parsed)

        # Bare string prompts cannot carry multi-modal data.
        mm_data = None
        if parsed["type"] != "str":
            mm_data = parsed["content"].get("multi_modal_data")

        if mm_data is None:
            if prompt_token_ids is None:
                prompt_token_ids = self._tokenize_prompt(
                    prompt_text,
                    lora_request=lora_request,
                    tokenization_kwargs=tokenization_kwargs,
                )

            return token_inputs(
                prompt=prompt_text,
                prompt_token_ids=prompt_token_ids,
                token_type_ids=token_type_ids,
                cache_salt=cache_salt,
            )

        # Multi-modal path: prefer the text prompt, else the token ids.
        mm_prompt = prompt_token_ids if prompt_text is None else prompt_text
        inputs = self._process_multimodal(
            mm_prompt,
            mm_data,
            parsed["content"].get("mm_processor_kwargs"),
            lora_request=lora_request,
            return_mm_hashes=return_mm_hashes,
        )
        if cache_salt is not None:
            inputs["cache_salt"] = cache_salt
        return inputs
361

362
    async def _prompt_to_llm_inputs_async(
        self,
        prompt: SingletonPrompt,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
        lora_request: Optional[LoRARequest] = None,
        return_mm_hashes: bool = False,
    ) -> SingletonInputs:
        """
        Async version of :meth:`_prompt_to_llm_inputs`.

        Awaits the async tokenization / multi-modal helpers instead of
        calling their sync counterparts.
        """
        parsed = parse_singleton_prompt(prompt)
        (prompt_text, prompt_token_ids, token_type_ids,
         cache_salt) = self._get_prompt_data(parsed)

        # Bare string prompts cannot carry multi-modal data.
        mm_data = None
        if parsed["type"] != "str":
            mm_data = parsed["content"].get("multi_modal_data")

        if mm_data is None:
            if prompt_token_ids is None:
                prompt_token_ids = await self._tokenize_prompt_async(
                    prompt_text,
                    lora_request=lora_request,
                    tokenization_kwargs=tokenization_kwargs,
                )

            return token_inputs(
                prompt=prompt_text,
                prompt_token_ids=prompt_token_ids,
                token_type_ids=token_type_ids,
                cache_salt=cache_salt,
            )

        # Multi-modal path: prefer the text prompt, else the token ids.
        mm_prompt = prompt_token_ids if prompt_text is None else prompt_text
        inputs = await self._process_multimodal_async(
            mm_prompt,
            mm_data,
            parsed["content"].get("mm_processor_kwargs"),
            lora_request=lora_request,
            return_mm_hashes=return_mm_hashes,
        )
        if cache_salt is not None:
            inputs["cache_salt"] = cache_salt
        return inputs
401
402
403

    def _build_enc_dec_llm_inputs(
        self,
404
405
        encoder_inputs: SingletonInputs,
        decoder_inputs: Optional[SingletonInputs],
406
    ) -> EncoderDecoderInputs:
407
408
        if (encoder_inputs["type"] == "token"
                or encoder_inputs["type"] == "multimodal"):
409
410
            pass
        else:
411
            assert_never(encoder_inputs)  # type: ignore[arg-type]
412
413

        if decoder_inputs is None:
414
415
416
417
418
419
420
421
422
            if self.model_config.hf_config.model_type == "whisper":
                # For Whisper models, the text prompt should go to the decoder.
                # If no explicit encoder/decoder inputs, then copy the prompt
                # from the encoder to the decoder. The encoder tokens are later
                # overridden by the audio features.
                dec_token_ids = encoder_inputs["prompt_token_ids"].copy()
            else:
                dec_token_ids = self._prepare_decoder_input_ids_for_generation(
                    None)
423
            decoder_inputs = token_inputs(dec_token_ids)
424
425
        elif (decoder_inputs["type"] == "token"
              or decoder_inputs["type"] == "multimodal"):
426
427
428
429
430
431
432
433
            dec_token_ids = self._prepare_decoder_input_ids_for_generation(
                decoder_inputs["prompt_token_ids"])
            decoder_inputs["prompt_token_ids"] = dec_token_ids

            if "multi_modal_data" in decoder_inputs:
                raise ValueError("Multi-modal decoder inputs of encoder-"
                                 "decoder models are not supported yet")
        else:
434
            assert_never(encoder_inputs)  # type: ignore[arg-type]
435

436
        return EncoderDecoderInputs(
437
438
            encoder=encoder_inputs,
            decoder=decoder_inputs,
439
440
        )

441
442
443
444
    def _separate_enc_dec_inputs_from_mm_processor_outputs(
        self,
        inputs: SingletonInputs,
        decoder_inputs_to_override: Optional[SingletonInputs] = None,
    ) -> tuple[SingletonInputs, SingletonInputs]:
        """
        For encoder/decoder models only:
        split processed inputs into an (encoder, decoder) pair.

        Arguments:

        * inputs: processed singleton inputs ("token" or "multimodal")
        * decoder_inputs_to_override: when given, supplies the decoder
          prompt and token ids in place of those found in `inputs`

        Returns:

        * tuple ``(encoder_inputs, decoder_inputs)``
        """
        input_kind = inputs["type"]

        if input_kind == "token":
            # Text-only inputs: the encoder side is left empty.
            empty_encoder = token_inputs(prompt="", prompt_token_ids=[])
            return empty_encoder, (decoder_inputs_to_override or inputs)

        if input_kind != "multimodal":
            assert_never(inputs)  # type: ignore[arg-type]

        # Multimodal data inputs
        assert ("encoder_prompt" in inputs
                and "encoder_prompt_token_ids" in inputs)
        mm_inputs = cast(MultiModalEncDecInputs, inputs)

        encoder_inputs = token_inputs(
            prompt=mm_inputs["encoder_prompt"],
            prompt_token_ids=mm_inputs["encoder_prompt_token_ids"],
        )

        # The decoder keeps the multi-modal payload; only its prompt text
        # and token ids depend on whether an override was supplied.
        if decoder_inputs_to_override is None:
            decoder_prompt = mm_inputs["prompt"]
            decoder_token_ids = mm_inputs["prompt_token_ids"]
        else:
            decoder_prompt = decoder_inputs_to_override.get("prompt", "")
            decoder_token_ids = decoder_inputs_to_override["prompt_token_ids"]

        decoder_inputs = MultiModalInputs(
            type="multimodal",
            prompt=decoder_prompt,
            prompt_token_ids=decoder_token_ids,
            mm_kwargs=mm_inputs["mm_kwargs"],
            mm_hashes=mm_inputs["mm_hashes"],
            mm_placeholders=mm_inputs["mm_placeholders"],
        )

        salt = mm_inputs.get("cache_salt")
        if salt is not None:
            decoder_inputs["cache_salt"] = salt

        return encoder_inputs, decoder_inputs

493
494
    def _process_encoder_decoder_prompt(
        self,
495
        prompt: PromptType,
496
        tokenization_kwargs: Optional[dict[str, Any]] = None,
497
    ) -> EncoderDecoderInputs:
498
        """
499
        For encoder/decoder models only:
500
        Process an input prompt into an :class:`EncoderDecoderInputs` instance.
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518

        There are two types of input prompts:
        singleton prompts which carry only the
        encoder prompt, and explicit encoder/decoder
        prompts which carry both the encoder and the
        decoder prompts as member variables.

        This function handles the following scenarios:
        * Singleton encoder prompt: extract encoder prompt
          token ids & infer default decoder prompt token ids
        * Explicit encoder/decoder prompt: extract encoder
          and decoder prompt token ids

        Note that for Explicit encoder/decoder prompts,
        each sub-prompt (encoder or decoder prompt) can
        have any possible singleton type; thus this
        method relies on helper functions to obtain
        token ids for the sub-prompts.
519

520
521
        Arguments:

522
        * prompt: an input prompt
523
524
525

        Returns:

526
        * :class:`EncoderDecoderInputs` instance
527
        """
528
529
        encoder_inputs: SingletonInputs
        decoder_inputs: Optional[SingletonInputs]
530

531
        if is_explicit_encoder_decoder_prompt(prompt):
532
            encoder_inputs = self._prompt_to_llm_inputs(
533
534
535
                prompt["encoder_prompt"],
                tokenization_kwargs=tokenization_kwargs,
            )
536
            if (decoder_input := prompt["decoder_prompt"]) is None:
537
                decoder_inputs = None
538
            else:
539
                decoder_inputs = self._prompt_to_llm_inputs(decoder_input)
540
541
            # For multimodal model, override decoder prompt from processor
            # with explicit decoder prompt.
542
            if self.model_config.is_multimodal_model:
543
544
545
                encoder_inputs, decoder_inputs = (
                    self._separate_enc_dec_inputs_from_mm_processor_outputs(
                        encoder_inputs, decoder_inputs))
546
        else:
547
548
549
550
            inputs = self._prompt_to_llm_inputs(
                prompt,
                tokenization_kwargs=tokenization_kwargs,
            )
551
            if self.model_config.is_multimodal_model:
552
553
554
555
556
557
                # Encoder-Decoder Multimodal model
                encoder_inputs, decoder_inputs = (
                    self._separate_enc_dec_inputs_from_mm_processor_outputs(
                        inputs))
            else:
                encoder_inputs = inputs
558

559
                decoder_inputs = None
560
561

        return self._build_enc_dec_llm_inputs(encoder_inputs, decoder_inputs)
562
563
564

    async def _process_encoder_decoder_prompt_async(
        self,
        prompt: PromptType,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
    ) -> EncoderDecoderInputs:
        """
        Async version of :meth:`_process_encoder_decoder_prompt`.

        For an explicit encoder/decoder prompt with a decoder prompt, both
        sub-prompts are awaited together via `asyncio.gather`.
        """
        encoder_inputs: SingletonInputs
        decoder_inputs: Optional[SingletonInputs]

        if not is_explicit_encoder_decoder_prompt(prompt):
            # Singleton prompt: it is the encoder prompt, unless the model
            # is multimodal, in which case the processor output is split.
            singleton_inputs = await self._prompt_to_llm_inputs_async(
                prompt,
                tokenization_kwargs=tokenization_kwargs,
            )
            if self.model_config.is_multimodal_model:
                # Encoder-Decoder Multimodal model
                encoder_inputs, decoder_inputs = (
                    self._separate_enc_dec_inputs_from_mm_processor_outputs(
                        singleton_inputs))
            else:
                encoder_inputs, decoder_inputs = singleton_inputs, None

            return self._build_enc_dec_llm_inputs(encoder_inputs,
                                                  decoder_inputs)

        encoder_coro = self._prompt_to_llm_inputs_async(
            prompt["encoder_prompt"],
            tokenization_kwargs=tokenization_kwargs,
        )

        explicit_decoder_prompt = prompt["decoder_prompt"]
        if explicit_decoder_prompt is None:
            encoder_inputs = await encoder_coro
            decoder_inputs = None
        else:
            decoder_coro = self._prompt_to_llm_inputs_async(
                explicit_decoder_prompt,
                tokenization_kwargs=tokenization_kwargs,
            )
            encoder_inputs, decoder_inputs = await asyncio.gather(
                encoder_coro, decoder_coro)

        # For multimodal model, override decoder prompt from processor
        # with explicit decoder prompt.
        if self.model_config.is_multimodal_model:
            encoder_inputs, decoder_inputs = (
                self._separate_enc_dec_inputs_from_mm_processor_outputs(
                    encoder_inputs, decoder_inputs))

        return self._build_enc_dec_llm_inputs(encoder_inputs, decoder_inputs)
612
613
614

    def _build_decoder_only_llm_inputs(
        self,
615
        prompt_inputs: DecoderOnlyInputs,
616
        prompt_adapter_request: Optional[PromptAdapterRequest],
617
    ) -> DecoderOnlyInputs:
618
619
        if (prompt_inputs["type"] == "token"
                or prompt_inputs["type"] == "multimodal"):
620
621
622
623
624
            prompt_inputs["prompt_token_ids"] = self._apply_prompt_adapter(
                prompt_inputs["prompt_token_ids"],
                prompt_adapter_request=prompt_adapter_request,
            )
        else:
625
            assert_never(prompt_inputs)  # type: ignore[arg-type]
626

627
        return prompt_inputs
628
629
630

    def _process_decoder_only_prompt(
        self,
631
        prompt: SingletonPrompt,
632
        tokenization_kwargs: Optional[dict[str, Any]] = None,
633
634
        lora_request: Optional[LoRARequest] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
635
        return_mm_hashes: bool = False,
636
    ) -> DecoderOnlyInputs:
637
        """
638
        For decoder-only models:
639
        Process an input prompt into an :class:`DecoderOnlyInputs` instance.
640
641
642

        Arguments:

643
        * prompt: input prompt
644
645
        * lora_request
        * prompt_adapter_request
646
        * return_mm_hashes
647
648
649

        Returns:

650
        * :class:`DecoderOnlyInputs` instance
651
        """
652

653
        prompt_comps = self._prompt_to_llm_inputs(
654
            prompt,
655
            tokenization_kwargs=tokenization_kwargs,
656
            lora_request=lora_request,
657
            return_mm_hashes=return_mm_hashes,
658
659
660
661
662
663
664
665
666
        )

        return self._build_decoder_only_llm_inputs(
            prompt_comps,
            prompt_adapter_request=prompt_adapter_request,
        )

    async def _process_decoder_only_prompt_async(
        self,
667
        prompt: SingletonPrompt,
668
        tokenization_kwargs: Optional[dict[str, Any]] = None,
669
670
        lora_request: Optional[LoRARequest] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
671
        return_mm_hashes: bool = False,
672
    ) -> DecoderOnlyInputs:
673
        """Async version of :meth:`_process_decoder_only_prompt`."""
674
        prompt_comps = await self._prompt_to_llm_inputs_async(
675
            prompt,
676
            tokenization_kwargs=tokenization_kwargs,
677
            lora_request=lora_request,
678
            return_mm_hashes=return_mm_hashes,
679
680
681
682
683
684
685
686
687
        )

        return self._build_decoder_only_llm_inputs(
            prompt_comps,
            prompt_adapter_request=prompt_adapter_request,
        )

    def preprocess(
        self,
688
        prompt: PromptType,
689
        tokenization_kwargs: Optional[dict[str, Any]] = None,
690
691
        lora_request: Optional[LoRARequest] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
692
        return_mm_hashes: bool = False,
693
    ) -> ProcessorInputs:
694
        """Preprocess the input prompt."""
695
        if self.model_config.is_encoder_decoder:
696
697
698
            assert not return_mm_hashes, (
                "Multimodal hashes for encoder-decoder models should not be ",
                "returned until they are supported on vLLM V1.")
699
700
            # Encoder-decoder model requires special mapping of
            # input prompts to encoder & decoder
701
            return self._process_encoder_decoder_prompt(prompt)
702

703
        if is_explicit_encoder_decoder_prompt(prompt):
704
705
706
707
708
            raise ValueError("Cannot pass encoder-decoder prompt "
                             "to decoder-only models")

        # Decoder-only operation
        return self._process_decoder_only_prompt(
709
            prompt,
710
            tokenization_kwargs=tokenization_kwargs,
711
712
            lora_request=lora_request,
            prompt_adapter_request=prompt_adapter_request,
713
            return_mm_hashes=return_mm_hashes,
714
715
716
717
        )

    async def preprocess_async(
        self,
718
        prompt: PromptType,
719
        tokenization_kwargs: Optional[dict[str, Any]] = None,
720
721
        lora_request: Optional[LoRARequest] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
722
        return_mm_hashes: bool = False,
723
    ) -> ProcessorInputs:
724
        """Async version of :meth:`preprocess`."""
725
        if self.model_config.is_encoder_decoder:
726
727
728
            assert not return_mm_hashes, (
                "Multimodal hashes for encoder-decoder models should not be ",
                "returned until they are supported on vLLM V1.")
729
730
            # Encoder-decoder model requires special mapping of
            # input prompts to encoder & decoder
731
            return await self._process_encoder_decoder_prompt_async(prompt)
732

733
        if is_explicit_encoder_decoder_prompt(prompt):
734
735
736
737
738
            raise ValueError("Cannot pass encoder-decoder prompt "
                             "to decoder-only models")

        # Decoder-only operation
        return await self._process_decoder_only_prompt_async(
739
            prompt,
740
            tokenization_kwargs=tokenization_kwargs,
741
742
            lora_request=lora_request,
            prompt_adapter_request=prompt_adapter_request,
743
            return_mm_hashes=return_mm_hashes,
744
        )