processor.py 9.53 KB
Newer Older
1
import time
2
from typing import Mapping, Optional, Union
3

4
from vllm.config import CacheConfig, LoRAConfig, ModelConfig
5
6
7
from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs,
                         PromptType, SingletonInputsAdapter)
from vllm.inputs.parse import is_encoder_decoder_inputs
8
9
from vllm.inputs.preprocess import InputPreprocessor
from vllm.lora.request import LoRARequest
10
11
12
from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalHasher,
                             MultiModalKwargs, MultiModalRegistry)
from vllm.multimodal.utils import merge_and_sort_multimodal_metadata
13
14
15
from vllm.pooling_params import PoolingParams
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import SamplingParams
16
from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
17
from vllm.v1.engine import EngineCoreRequest
18
from vllm.v1.engine.mm_input_mapper import MMInputMapperClient
19
20
21
22
23
24
25


class Processor:

    def __init__(
        self,
        model_config: ModelConfig,
26
        cache_config: CacheConfig,
27
        lora_config: Optional[LoRAConfig],
28
        tokenizer: BaseTokenizerGroup,
29
        input_registry: InputRegistry = INPUT_REGISTRY,
30
        mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
31
32
33
34
35
36
    ):

        self.model_config = model_config
        self.lora_config = lora_config
        self.tokenizer = tokenizer

37
38
        self.generation_config_fields = model_config.try_get_generation_config(
        )
39
        self.input_preprocessor = InputPreprocessor(model_config,
40
41
                                                    self.tokenizer,
                                                    mm_registry)
42
43
44
        self.input_processor = input_registry.create_input_processor(
            model_config)

45
        # Multi-modal (huggingface) input mapper
46
47
48
        self.mm_input_mapper_client = MMInputMapperClient(model_config)

        # Multi-modal hasher (for images)
49
        self.use_hash = (not model_config.disable_mm_preprocessor_cache) or \
50
            cache_config.enable_prefix_caching
51

52
53
54
55
56
    def process_inputs(
        self,
        request_id: str,
        prompt: PromptType,
        params: Union[SamplingParams, PoolingParams],
57
        arrival_time: Optional[float] = None,
58
59
60
61
        lora_request: Optional[LoRARequest] = None,
        trace_headers: Optional[Mapping[str, str]] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
        priority: int = 0,
62
    ) -> EngineCoreRequest:
63

64
        # TODO(woosuk): Support pooling models.
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
        # TODO(woosuk): Check max_logprobs
        # TODO(woosuk): Support encoder-decoder models.

        if lora_request is not None and not self.lora_config:
            raise ValueError(f"Got lora_request {lora_request} but LoRA is "
                             "not enabled!")
        if arrival_time is None:
            arrival_time = time.time()
        assert priority == 0, "vLLM V1 does not support priority at the moment."
        assert trace_headers is None, "vLLM V1 does not support tracing yet."

        # Process inputs.
        preprocessed_inputs = self.input_preprocessor.preprocess(
            prompt,
            request_id=request_id,
            lora_request=lora_request,
            prompt_adapter_request=prompt_adapter_request,
        )
        processed_inputs = self.input_processor(preprocessed_inputs)
        self._validate_model_inputs(processed_inputs)
        eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)

87
88
89
90
91
92
93
94
95
96
97
98
99
        if is_encoder_decoder_inputs(processed_inputs):
            decoder_inputs = SingletonInputsAdapter(
                processed_inputs["decoder"])
            encoder_inputs = SingletonInputsAdapter(
                processed_inputs["encoder"])
        else:
            decoder_inputs = SingletonInputsAdapter(processed_inputs)
            encoder_inputs = None

        # TODO: Impl encoder-decoder
        if encoder_inputs is not None:
            raise NotImplementedError

100
101
102
103
104
105
        assert isinstance(params, SamplingParams)
        # TODO: can we avoid cloning here in multiproc case
        sampling_params = params.clone()
        sampling_params.update_from_generation_config(
            self.generation_config_fields, eos_token_id)

106
107
108
109
110
111
112
113
114
115
116
117
        # Multimodal related.
        # Compute MM hashes (if enabled)
        mm_hashes = None
        if self.use_hash:
            # Use mm_hashes from processed inputs if the model has merged
            # input processor.
            if decoder_inputs.multi_modal_hashes:
                mm_hashes = decoder_inputs.multi_modal_hashes
            # Fallback to using MultiModalHasher directly.
            else:
                mm_hashes = MultiModalHasher.hash_prompt_mm_data(prompt)

118
        # For merged preprocessor, mm_data is already mm_inputs
119
        precomputed_mm_inputs: Optional[list[MultiModalKwargs]] = None
120
121
122
123
124
125
126
127
128
129
130
        decoder_mm_data = decoder_inputs.multi_modal_data
        if isinstance(decoder_mm_data, MultiModalKwargs):
            # The output of merged multi-modal processor (`decoder_mm_data`)
            # contains the kwargs for all items from all modalities.
            # This code separates them so that there is one set of kwargs
            # per item per modality.
            precomputed_mm_inputs = [
                MultiModalKwargs.from_items([item])
                for modality in decoder_mm_data.modalities
                for item in decoder_mm_data.get_items(modality)
            ]
131

132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
        mm_positions = decoder_inputs.multi_modal_placeholders

        # Last-mile processing of multimodal metadata and inputs.
        if mm_positions:

            # Merge and flatten multimodal placeholders, hashes and inputs
            # from dictionaries to lists, and sort them by each item's position
            # in the input sequence.
            # NOTE: interleaved modalities are not supported.
            (
                sorted_modalities,
                sorted_mm_positions,
                sorted_mm_hashes,
            ) = merge_and_sort_multimodal_metadata(
                mm_positions,
147
148
                mm_hashes,
            )
149

150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
            # NOTE: Sort multimodal inputs/kwargs ONLY IF there are multiple
            # modalities involved AND the model supports merged input processor.
            if len(sorted_modalities) > 1 and precomputed_mm_inputs:

                modality_order_dict = {
                    modality: order
                    for order, modality in enumerate(sorted_modalities)
                }

                # Sanity check to make sure each multimodal input has only one
                # modality key.
                for mm_input in precomputed_mm_inputs:
                    assert len(mm_input.modalities) == 1

                # Sort MultiModalKwags to match sorted_mm_positions
                precomputed_mm_inputs = sorted(
                    precomputed_mm_inputs,
                    key=lambda mm_input: modality_order_dict[list(
                        mm_input.modalities)[0]])

            # Apply mm input cache update (and input mapper if necessary).
            sorted_mm_inputs = self.mm_input_mapper_client.process_inputs(
                mm_data=decoder_mm_data,
                mm_hashes=sorted_mm_hashes,
                mm_processor_kwargs=decoder_inputs.mm_processor_kwargs,
                precomputed_mm_inputs=precomputed_mm_inputs,
            )
        else:
            sorted_mm_inputs = None
            sorted_mm_hashes = None
            sorted_mm_positions = None

182
        return EngineCoreRequest(
183
184
185
186
187
188
189
190
191
192
            request_id=request_id,
            prompt=decoder_inputs.prompt,
            prompt_token_ids=decoder_inputs.prompt_token_ids,
            mm_inputs=sorted_mm_inputs,
            mm_hashes=sorted_mm_hashes,
            mm_placeholders=sorted_mm_positions,
            sampling_params=sampling_params,
            eos_token_id=eos_token_id,
            arrival_time=arrival_time,
            lora_request=lora_request,
193
        )
194

195
196
197
198
199
200
201
202
203
204
205
    def _validate_model_inputs(self, inputs: ProcessorInputs):
        if is_encoder_decoder_inputs(inputs):
            # For encoder-decoder multimodal models, the max_prompt_len
            # restricts the decoder prompt length
            prompt_inputs = inputs["decoder" if self.model_config.
                                   is_multimodal_model else "encoder"]
        else:
            prompt_inputs = inputs

        prompt_ids = SingletonInputsAdapter(prompt_inputs).prompt_token_ids

206
207
208
        if prompt_ids is None or len(prompt_ids) == 0:
            raise ValueError("Prompt cannot be empty")

209
210
211
212
213
        if len(prompt_ids) >= self.model_config.max_model_len:
            raise ValueError(
                f"Prompt length of {len(prompt_ids)} is longer than the "
                f"maximum model length of {self.model_config.max_model_len}.")

214
215
216
217
218
219
220
221
222
223
224
225
        if self.model_config.is_multimodal_model:
            max_prompt_len = self.model_config.max_model_len

            if len(prompt_ids) > max_prompt_len:
                raise ValueError(
                    f"The prompt (total length {len(prompt_ids)}) is too long "
                    f"to fit into the model (context length {max_prompt_len}). "
                    "Make sure that `max_model_len` is no smaller than the "
                    "number of text tokens plus multimodal tokens. For image "
                    "inputs, the number of image tokens depends on the number "
                    "of images, and possibly their aspect ratios as well.")

226
227
228
            # TODO: Find out how many placeholder tokens are there so we can
            # check that chunked prefill does not truncate them
            # max_batch_len = self.scheduler_config.max_num_batched_tokens