# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import time
from collections.abc import Mapping
from typing import Any, Literal

import vllm.envs as envs
from vllm.config import VllmConfig
from vllm.inputs import (
    EngineInput,
    PromptType,
    SingletonInput,
    split_enc_dec_input,
)
from vllm.inputs.preprocess import InputPreprocessor
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.multimodal.encoder_budget import MultiModalBudget
from vllm.multimodal.inputs import MultiModalFeatureSpec
from vllm.multimodal.utils import argsort_mm_positions
from vllm.platforms import current_platform
from vllm.pooling_params import PoolingParams
from vllm.renderers import BaseRenderer, renderer_from_config
from vllm.sampling_params import SamplingParams
from vllm.tasks import GENERATION_TASKS, POOLING_TASKS, SupportedTask
from vllm.tokenizers import TokenizerLike
from vllm.utils import length_from_prompt_token_ids_or_embeds, random_uuid
from vllm.utils.jsontree import json_iter_leaves
from vllm.v1.engine import EngineCoreRequest

logger = init_logger(__name__)


class InputProcessor:
    """Validate request inputs and package them as `EngineCoreRequest`s.

    The processor checks sampling/pooling parameters, LoRA requests and
    prompt sizes against the engine configuration, and flattens multimodal
    inputs into a position-sorted list of `MultiModalFeatureSpec`.
    """

    def __init__(
        self,
        vllm_config: VllmConfig,
        renderer: BaseRenderer | None = None,
        *,
        mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
    ) -> None:
        """Cache the config sections and helpers used during validation.

        Args:
            vllm_config: Engine configuration; its sub-configs are stored as
                attributes on the instance.
            renderer: Renderer to use. When omitted, one is created with
                `renderer_from_config(vllm_config)`.
            mm_registry: Registry queried for multimodal support and used to
                size the encoder cache budget.
        """
        self.vllm_config = vllm_config
        self.model_config = model_config = vllm_config.model_config
        self.cache_config = vllm_config.cache_config
        self.lora_config = vllm_config.lora_config
        self.scheduler_config = vllm_config.scheduler_config
        self.speculative_config = vllm_config.speculative_config
        self.structured_outputs_config = vllm_config.structured_outputs_config
        self.observability_config = vllm_config.observability_config

        self.generation_config_fields = model_config.try_get_generation_config()

        self.renderer = renderer or renderer_from_config(vllm_config)

        self.supports_mm_inputs = mm_registry.supports_multimodal_inputs(model_config)
        self.mm_encoder_cache_size = 0
        self.skip_prompt_length_check = False
        if self.supports_mm_inputs:
            # The budget object is only needed to read these two values.
            mm_budget = MultiModalBudget(vllm_config, mm_registry)
            self.mm_encoder_cache_size = mm_budget.encoder_cache_size
            self.skip_prompt_length_check = (
                mm_budget.processor.info.skip_prompt_length_check
            )
            mm_budget.reset_cache()  # Not used anymore

        # Hand over the resolved renderer (not the possibly-None argument) so
        # that the preprocessor works with the same renderer instance as
        # `self.renderer`.
        # NOTE(review): assumes InputPreprocessor otherwise builds its own
        # renderer when given None -- confirm against its constructor.
        self.input_preprocessor = InputPreprocessor(
            vllm_config,
            renderer=self.renderer,
            mm_registry=mm_registry,
        )

    @property
    def tokenizer(self) -> TokenizerLike | None:
        """The renderer's tokenizer, or `None` if it has none."""
        return self.renderer.tokenizer

    def get_tokenizer(self) -> TokenizerLike:
        """Return the tokenizer via `renderer.get_tokenizer()`."""
        return self.renderer.get_tokenizer()

    def _validate_params(
        self,
        params: SamplingParams | PoolingParams,
        supported_tasks: tuple[SupportedTask, ...],
    ) -> None:
        """Raise `ValueError` if SamplingParams or PoolingParams is not valid.

        For `PoolingParams` with no task set, `params.task` is filled in
        (in place) with the first supported default among `token_embed`,
        `token_classify` and `plugin`.

        Raises:
            ValueError: If the model supports no task of the requested kind,
                the pooling task is unsupported, or `thinking_token_budget`
                is set without an enabled reasoning config.
            TypeError: If `params` is neither `SamplingParams` nor
                `PoolingParams`.
        """
        if isinstance(params, SamplingParams):
            supported_generation_tasks = [
                task for task in supported_tasks if task in GENERATION_TASKS
            ]
            if not supported_generation_tasks:
                raise ValueError("This model does not support generation")

            params.verify(
                self.model_config,
                self.speculative_config,
                self.structured_outputs_config,
                self.tokenizer,
            )

            reasoning_config = self.vllm_config.reasoning_config
            if params.thinking_token_budget is not None and (
                reasoning_config is None or not reasoning_config.enabled
            ):
                raise ValueError(
                    "thinking_token_budget is set but reasoning_config is "
                    "not configured. Please set --reasoning-config to use "
                    "thinking_token_budget."
                )
        elif isinstance(params, PoolingParams):
            supported_pooling_tasks = [
                task for task in supported_tasks if task in POOLING_TASKS
            ]
            if not supported_pooling_tasks:
                raise ValueError("This model does not support pooling")

            if params.task is None:
                # Pick a default task; the order below is the priority order.
                if "token_embed" in supported_pooling_tasks:
                    params.task = "token_embed"
                elif "token_classify" in supported_pooling_tasks:
                    params.task = "token_classify"
                elif "plugin" in supported_pooling_tasks:
                    params.task = "plugin"

            if params.task not in supported_pooling_tasks:
                raise ValueError(
                    f"Unsupported task: {params.task!r} "
                    f"Supported tasks: {supported_pooling_tasks}"
                )

            params.verify(self.model_config)
        else:
            raise TypeError(
                "params must be either SamplingParams or PoolingParams, "
                f"but got {type(params).__name__}"
            )

    def _validate_lora(self, lora_request: LoRARequest | None) -> None:
        """Reject LoRA requests when LoRA is disabled.

        Also emits a one-time deprecation warning about per-LoRA tokenizers
        whenever a tokenizer is available.

        Raises:
            ValueError: If `lora_request` is given but no LoRA config is set.
        """
        if lora_request is None:
            return

        # LoRA request passed in while LoRA is not enabled
        if not self.lora_config:
            raise ValueError(
                f"Got lora_request {lora_request} but LoRA is not enabled!"
            )

        if self.tokenizer is not None:
            logger.warning_once(
                "vLLM has deprecated support for supporting different "
                "tokenizers for different LoRAs. By default, vLLM uses base "
                "model's tokenizer. If you are using a LoRA "
                "with its own tokenizer, consider specifying `--tokenizer "
                "[lora_path]` to use the LoRA tokenizer."
            )

    def _get_mm_identifier(
        self,
        mm_hash: str,
        lora_request: LoRARequest | None,
    ) -> str:
        """
        When enable_tower_connector_lora is True, multi-modal embeddings
        vary depending on the LoRA request. Therefore, the mm_hash must be
        generated based on the LoRA request to prevent incorrect cache hits.

        Returns `mm_hash` unchanged when there is no LoRA request, no LoRA
        config, or tower/connector LoRA is disabled; otherwise returns
        `"<lora_name>:<mm_hash>"`.
        """
        if (
            lora_request is None
            or self.lora_config is None
            or not self.lora_config.enable_tower_connector_lora
        ):
            return mm_hash
        return f"{lora_request.lora_name}:{mm_hash}"

    def inject_into_mm_cache(
        self,
        mm_hashes: dict[str, list[str]],
        mm_kwargs: dict[str, list],
    ) -> None:
        """Inject pre-processed mm_kwargs into the processor cache.

        Call this when mm_kwargs have already been through the HF processor
        externally (e.g. by a frontend that transfers pre-processed tensors
        to the backend).  This ensures MM cache hit rate metrics are reported
        accurately and avoids redundant processing on subsequent requests
        with the same images.

        Uses ``get_and_update_item()`` with an empty prompt_updates list,
        since token expansion has already been handled externally.

        Args:
            mm_hashes: Per-modality list of item hashes.
            mm_kwargs: Per-modality list of items, index-aligned with
                `mm_hashes`. Entries are replaced in place with whatever the
                cache returns; `None` entries and hashes without a
                corresponding item are skipped.

        Any exception is logged as a warning and swallowed; injection is
        best-effort.
        """
        cache = self.renderer.mm_processor_cache
        if cache is None:
            return
        try:
            for modality, hashes in mm_hashes.items():
                items = mm_kwargs.get(modality, [])
                # zip() stops at the shorter list, so hashes lacking a
                # matching item are ignored.
                for i, (mm_hash, item) in enumerate(zip(hashes, items)):
                    if item is None:
                        continue
                    # Insert into cache via get_and_update_item.
                    # Use the returned item (may be an address for SHM
                    # cache or the original item for LRU cache).
                    items[i], _ = cache.get_and_update_item(
                        (item, []),
                        mm_hash,
                    )
            # Update cache stats to reflect the externally processed items
            self.renderer.update_mm_cache_stats()
        except Exception:
            logger.warning(
                "Failed to inject mm_kwargs into processor cache",
                exc_info=True,
            )

    @staticmethod
    def assign_request_id(request: EngineCoreRequest) -> None:
        """Replace the externally supplied request ID with an internal request ID
        that adds 8 random characters in order to ensure uniqueness.

        The original ID is preserved in `request.external_req_id`. When
        `VLLM_DISABLE_REQUEST_ID_RANDOMIZATION` is set, `request_id` is left
        unchanged and a one-time warning is logged instead.

        Raises:
            ValueError: If `request.external_req_id` is already set.
        """
        if request.external_req_id is not None:
            raise ValueError(
                "The external_req_id field should not be set on EngineCoreRequests"
                " passed to vLLM; use the request_id field."
            )
        request.external_req_id = request.request_id
        if envs.VLLM_DISABLE_REQUEST_ID_RANDOMIZATION:
            logger.warning_once(
                "VLLM_DISABLE_REQUEST_ID_RANDOMIZATION is set and will be "
                "removed in a future release. Duplicate externally-provided "
                "request IDs may cause failures and/or subtle correctness errors."
            )
        else:
            # `:.8` truncates the random UUID string to its first 8 characters.
            request.request_id = f"{request.external_req_id}-{random_uuid():.8}"

    def process_inputs(
        self,
        request_id: str,
        prompt: PromptType | EngineInput,
        params: SamplingParams | PoolingParams,
        supported_tasks: tuple[SupportedTask, ...],
        arrival_time: float | None = None,
        lora_request: LoRARequest | None = None,
        tokenization_kwargs: dict[str, Any] | None = None,
        trace_headers: Mapping[str, str] | None = None,
        priority: int = 0,
        data_parallel_rank: int | None = None,
        resumable: bool = False,
    ) -> EngineCoreRequest:
        """Validate a request and build the corresponding `EngineCoreRequest`.

        Args:
            request_id: ID to store on the resulting request.
            prompt: Either an already-rendered engine input (a dict carrying
                a ``"type"`` key) or a raw prompt (deprecated), which is run
                through the input preprocessor.
            params: Sampling or pooling parameters; a clone is attached to
                the request, the caller's object is not reused.
            supported_tasks: Tasks the model supports, used to validate
                `params`.
            arrival_time: Request arrival timestamp. Defaults to the engine
                input's ``"arrival_time"`` entry if available, else
                `time.time()`.
            lora_request: Optional LoRA adapter request.
            tokenization_kwargs: Deprecated; only forwarded on the raw-prompt
                path.
            trace_headers: Forwarded unchanged to the request.
            priority: Forwarded unchanged to the request.
            data_parallel_rank: Optional target rank; must lie in
                ``[0, num_ranks)``.
            resumable: Forwarded unchanged to the request.

        Raises:
            ValueError: On invalid params, LoRA request, data-parallel rank,
                prompt length, token IDs or multimodal hashes.
            TypeError: If `params` has an unsupported type.
        """
        self._validate_params(params, supported_tasks)
        self._validate_lora(lora_request)

        parallel_config = self.vllm_config.parallel_config
        dp_size = parallel_config.data_parallel_size
        dp_local_size = parallel_config.data_parallel_size_local
        num_ranks = dp_local_size if parallel_config.local_engines_only else dp_size
        if data_parallel_rank is not None and not (0 <= data_parallel_rank < num_ranks):
            raise ValueError(
                f"data_parallel_rank {data_parallel_rank} "
                f"is out of range [0, {num_ranks})."
            )

        if isinstance(prompt, dict) and "type" in prompt:
            # Already-rendered engine input: use it as is.
            if tokenization_kwargs:
                logger.warning_once(
                    "Passing tokenization_kwargs to InputProcessor is deprecated "
                    "and will be removed in v0.18. You should instead pass "
                    "them to Renderer.render_cmpl() or Renderer.render_chat()."
                )

            if arrival_time is None:
                # Use the timestamp carried by the engine input; a missing or
                # explicit ``None`` entry falls back to the current time so
                # the request never ends up without an arrival time.
                arrival_time = prompt.get("arrival_time")  # type: ignore[assignment]
                if arrival_time is None:
                    arrival_time = time.time()

            processed_inputs: EngineInput = prompt  # type: ignore[assignment]
        else:
            logger.warning_once(
                "Passing raw prompts to InputProcessor is deprecated "
                "and will be removed in v0.18. You should instead pass "
                "the outputs of Renderer.render_cmpl() or Renderer.render_chat()."
            )

            if arrival_time is None:
                arrival_time = time.time()

            processed_inputs = self.input_preprocessor.preprocess(
                prompt,
                tokenization_kwargs=tokenization_kwargs,
            )

        current_platform.validate_request(processed_inputs, params)

        encoder_inputs, decoder_inputs = split_enc_dec_input(processed_inputs)
        self._validate_model_inputs(encoder_inputs, decoder_inputs)

        # Mypy can be conservative for TypedDict unions; normalize access.
        if decoder_inputs["type"] == "embeds":
            prompt_token_ids = None
            prompt_embeds = decoder_inputs["prompt_embeds"]
        else:
            prompt_token_ids = decoder_inputs["prompt_token_ids"]
            prompt_embeds = None

        sampling_params = None
        pooling_params = None
        if isinstance(params, SamplingParams):
            # TODO: can we avoid cloning here in multiproc case?
            sampling_params = params.clone()
            # If unset max tokens, then generate up to the max_model_len.
            if sampling_params.max_tokens is None:
                seq_len = length_from_prompt_token_ids_or_embeds(
                    prompt_token_ids, prompt_embeds
                )
                sampling_params.max_tokens = self.model_config.max_model_len - seq_len

            sampling_params.update_from_generation_config(
                self.generation_config_fields,
                self.renderer.get_eos_token_id(),
            )
            if self.tokenizer is not None:
                sampling_params.update_from_tokenizer(self.tokenizer)
        else:
            pooling_params = params.clone()

        # Multimodal related.
        mm_features: list[MultiModalFeatureSpec] | None = None

        if decoder_inputs["type"] == "multimodal":
            decoder_mm_inputs = decoder_inputs["mm_kwargs"]
            decoder_mm_positions = decoder_inputs["mm_placeholders"]
            decoder_mm_hashes = decoder_inputs["mm_hashes"]

            if not all(
                isinstance(leaf, str) for leaf in json_iter_leaves(decoder_mm_hashes)
            ):
                raise ValueError(
                    f"mm_hashes must contain only strings, got: {decoder_mm_hashes}. "
                    "This is likely due to an incorrect custom implementation of "
                    "MultiModalProcessor.apply method."
                )

            # Merge and flatten multimodal placeholders, hashes and inputs
            # from dictionaries to lists, and sort them by each item's position
            # in the input sequence.
            sorted_mm_idxs = argsort_mm_positions(decoder_mm_positions)

            mm_features = []
            for modality, idx in sorted_mm_idxs:
                base_mm_hash = decoder_mm_hashes[modality][idx]
                mm_features.append(
                    MultiModalFeatureSpec(
                        data=decoder_mm_inputs[modality][idx],
                        modality=modality,
                        # The identifier may be LoRA-qualified, while mm_hash
                        # always keeps the plain content hash.
                        identifier=self._get_mm_identifier(
                            base_mm_hash,
                            lora_request,
                        ),
                        mm_position=decoder_mm_positions[modality][idx],
                        mm_hash=base_mm_hash,
                    )
                )

        return EngineCoreRequest(
            request_id=request_id,
            prompt_token_ids=prompt_token_ids,
            prompt_embeds=prompt_embeds,
            mm_features=mm_features,
            sampling_params=sampling_params,
            pooling_params=pooling_params,
            arrival_time=arrival_time,
            lora_request=lora_request,
            cache_salt=decoder_inputs.get("cache_salt"),
            priority=priority,
            data_parallel_rank=data_parallel_rank,
            trace_headers=trace_headers,
            resumable=resumable,
        )

    def _validate_prompt_len(
        self,
        prompt_len: int,
        prompt_type: Literal["encoder", "decoder"],
    ) -> None:
        """Check a prompt's length against the applicable limit.

        The limit is `max_model_len` for decoder prompts and the multimodal
        encoder cache size for encoder prompts. The whole check is skipped
        when `skip_prompt_length_check` is set.

        Raises:
            ValueError: If a decoder prompt is empty, if the prompt exceeds
                the limit, or if it exactly fills the limit on a "generate"
                runner (leaving no room for an output token).
        """
        if self.skip_prompt_length_check:
            return

        if prompt_len == 0 and prompt_type == "decoder":
            raise ValueError(f"The {prompt_type} prompt cannot be empty")

        model_config = self.model_config
        max_prompt_len = (
            model_config.max_model_len
            if prompt_type == "decoder"
            else self.mm_encoder_cache_size
        )
        if prompt_len > max_prompt_len:
            if self.supports_mm_inputs:
                suggestion = (
                    "Make sure that `max_model_len` is no smaller than the "
                    "number of text tokens plus multimodal tokens. For image "
                    "inputs, the number of image tokens depends on the number "
                    "of images, and possibly their aspect ratios as well."
                )
            else:
                suggestion = (
                    "Make sure that `max_model_len` is no smaller than the "
                    "number of text tokens."
                )

            raise ValueError(
                f"The {prompt_type} prompt (length {prompt_len}) is "
                f"longer than the maximum model length of {max_prompt_len}. "
                f"{suggestion}"
            )
        elif prompt_len == max_prompt_len and model_config.runner_type == "generate":
            # NOTE(review): this branch also applies to encoder prompts, whose
            # limit is the encoder cache size rather than max_model_len --
            # confirm that rejecting an exactly-full encoder prompt is intended.
            suggestion = (
                "Make sure that `max_model_len` is no smaller than the "
                "number of text tokens (prompt + requested output tokens)."
            )
            raise ValueError(
                f"The {prompt_type} prompt (length {prompt_len}) plus the number of "
                f"requested output tokens (at least 1) is longer than the maximum "
                f"model length of {max_prompt_len}. {suggestion}"
            )

    def _validate_model_input(
        self,
        prompt_input: SingletonInput,
        prompt_type: Literal["encoder", "decoder"],
    ) -> None:
        """Validate one encoder or decoder input.

        Checks, in order: the prompt length, that no multimodal item needs
        more embedding tokens than the encoder cache holds, and (when a
        tokenizer is available) that no token ID is out of vocabulary.

        Raises:
            ValueError: If any of the checks fails.
        """
        model_config = self.model_config
        tokenizer = self.tokenizer

        prompt_ids = (
            None
            if prompt_input["type"] == "embeds"
            else prompt_input["prompt_token_ids"]
        )
        prompt_embeds = (
            prompt_input["prompt_embeds"] if prompt_input["type"] == "embeds" else None
        )

        prompt_len = length_from_prompt_token_ids_or_embeds(prompt_ids, prompt_embeds)
        self._validate_prompt_len(prompt_len, prompt_type)

        if prompt_input["type"] == "multimodal":
            decoder_mm_positions = prompt_input["mm_placeholders"]
            for modality, mm_positions in decoder_mm_positions.items():
                for mm_position in mm_positions:
                    num_embeds = mm_position.get_num_embeds()
                    if num_embeds > self.mm_encoder_cache_size:
                        raise ValueError(
                            f"The {prompt_type} prompt contains a(n) {modality} item "
                            f"with {num_embeds} embedding tokens, which exceeds the "
                            f"pre-allocated encoder cache size "
                            f"{self.mm_encoder_cache_size}. Please reduce the input "
                            f"size or increase the encoder cache size "
                            f"by setting --limit-mm-per-prompt at startup."
                        )

        if prompt_ids and tokenizer is not None:
            max_input_id = max(prompt_ids, default=0)

            # NOTE: tokenizer.max_token_id is the tokenizer's vocab size while
            # self.model_config.get_vocab_size() is the model's vocab size.
            # For Qwen3 models, the language model has extra tokens that do
            # not exist in the tokenizer, and vice versa for multimodal
            # placeholder tokens in some multimodal models.
            # See https://github.com/QwenLM/Qwen3/issues/29#issuecomment-1933720399 # noqa: E501
            # and https://github.com/vllm-project/vllm/pull/22471#discussion_r2312251421 # noqa: E501

            # Here we take the max of the two to determine if a token id is
            # truly out-of-vocabulary.
            model_vocab_size = model_config.get_vocab_size()
            if max_input_id > max(tokenizer.max_token_id, model_vocab_size - 1):
                raise ValueError(f"Token id {max_input_id} is out of vocabulary")

    def _validate_model_inputs(
        self,
        encoder_input: SingletonInput | None,
        decoder_input: SingletonInput,
    ) -> None:
        """Validate the encoder input (if present) and then the decoder input."""
        if encoder_input is not None:
            self._validate_model_input(encoder_input, prompt_type="encoder")

        self._validate_model_input(decoder_input, prompt_type="decoder")