registry.py 11 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
from collections.abc import Mapping
4
from dataclasses import dataclass
5
from typing import TYPE_CHECKING, Generic, Optional, Protocol, TypeVar
6

7
import torch.nn as nn
8
from typing_extensions import deprecated
9

10
from vllm.envs import VLLM_MM_INPUT_CACHE_GIB
11
from vllm.inputs import InputProcessingContext
12
from vllm.logger import init_logger
13
14
from vllm.transformers_utils.tokenizer import (AnyTokenizer,
                                               cached_tokenizer_from_config)
15
from vllm.utils import ClassRegistry
16

17
18
from .processing import (BaseMultiModalProcessor, BaseProcessingInfo,
                         ProcessingCache)
19
20
from .profiling import (BaseDummyInputsBuilder, DummyDecoderData,
                        DummyEncoderData, MultiModalProfiler)
21

22
23
24
if TYPE_CHECKING:
    from vllm.config import ModelConfig

25
26
# Module-level logger created through vLLM's `init_logger` helper.
logger = init_logger(__name__)

# Type variable over model classes (subclasses of `nn.Module`); it lets the
# decorator returned by `register_processor` hand back the exact class type
# it was given.
N = TypeVar("N", bound=type[nn.Module])

# Type variables over `BaseProcessingInfo` subclasses. `_I` is invariant and
# `_I_co` is covariant; they parameterize the factory protocols in this module.
_I = TypeVar("_I", bound=BaseProcessingInfo)
_I_co = TypeVar("_I_co", bound=BaseProcessingInfo, covariant=True)
30
31


32
class ProcessingInfoFactory(Protocol[_I_co]):
    """
    Constructs a
    [`BaseProcessingInfo`][vllm.multimodal.processing.BaseProcessingInfo]
    instance from the context.
    """

    def __call__(
        self,
        ctx: InputProcessingContext,
    ) -> _I_co:
        ...


class DummyInputsBuilderFactory(Protocol[_I]):
    """
    Constructs a
    [`BaseDummyInputsBuilder`][vllm.multimodal.profiling.BaseDummyInputsBuilder]
    instance from the processing info.
    """

    def __call__(self, info: _I) -> BaseDummyInputsBuilder[_I]:
        ...


class MultiModalProcessorFactory(Protocol[_I]):
    """
    Constructs a
    [`BaseMultiModalProcessor`][vllm.multimodal.processing.BaseMultiModalProcessor]
    instance from the processing info and the dummy inputs builder,
    optionally attaching a processing cache.
    """

    def __call__(
        self,
        info: _I,
        dummy_inputs: BaseDummyInputsBuilder[_I],
        *,
        cache: Optional[ProcessingCache] = None,
    ) -> BaseMultiModalProcessor[_I]:
        ...
72

73

74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
@dataclass(frozen=True)
class _ProcessorFactories(Generic[_I]):
    """Immutable bundle of the three factories registered for a model class."""

    # Builds the processing info from an `InputProcessingContext`.
    info: ProcessingInfoFactory[_I]
    # Builds the multi-modal processor from the info and dummy inputs builder.
    processor: MultiModalProcessorFactory[_I]
    # Builds the dummy inputs builder from the processing info.
    dummy_inputs: DummyInputsBuilderFactory[_I]

    def build_processor(
        self,
        ctx: InputProcessingContext,
        *,
        cache: Optional[ProcessingCache] = None,
    ):
        """
        Chain the three factories to produce a processor.

        The processing info is created from `ctx` first and then shared by
        both the dummy inputs builder and the processor itself. `cache` is
        forwarded unchanged to the processor factory.
        """
        processing_info = self.info(ctx)
        return self.processor(
            processing_info,
            self.dummy_inputs(processing_info),
            cache=cache,
        )


91
92
class MultiModalRegistry:
    """
    A registry that dispatches data processing according to the model.
    """

96
    def __init__(self) -> None:
        # Maps each registered model class to its bundle of factories.
        self._processor_factories = ClassRegistry[nn.Module,
                                                  _ProcessorFactories]()

        # Cache handed to the processors built by `create_processor` unless
        # caching is disabled. It is constructed from the
        # `VLLM_MM_INPUT_CACHE_GIB` setting (presumably a capacity in GiB —
        # confirm against `ProcessingCache`).
        self._processing_cache = ProcessingCache(VLLM_MM_INPUT_CACHE_GIB)
101

102
103
104
105
106
107
    def reset_processor_cache(self) -> bool:
        """
        Reset the multi-modal processing cache owned by this registry.

        Returns:
            Always `True`, to signal that the reset was carried out.
        """
        cache = self._processing_cache
        cache.reset()
        return True

108
109
110
111
    @deprecated("Legacy input processor/mapper pipeline has been removed. "
                "Please update your model runner to use "
                "`seq_group_metadata.multi_modal_data` directly without "
                "further processing.")
    def create_input_mapper(self, model_config: "ModelConfig"):
        """
        Return a no-op input mapper, kept only so legacy callers keep working.

        The returned callable accepts `(data, mm_processor_kwargs)` and gives
        back `data` unchanged. `model_config` is not used.
        """
        return lambda data, mm_processor_kwargs: data
114

115
116
117
118
119
    def get_max_tokens_per_item_by_modality(
        self,
        model_config: "ModelConfig",
    ) -> Mapping[str, int]:
        """
        Get the maximum number of tokens per data item from each modality based
        on underlying model configuration.

        Returns an empty mapping when the model is not multi-modal.
        """
        if not model_config.is_multimodal_model:
            return {}

        processor = self.create_processor(model_config, disable_cache=False)
        profiler = MultiModalProfiler(processor)

        seq_len = model_config.max_model_len
        mm_limits = self.get_mm_limits_per_prompt(model_config)

        # Ask the profiler about a single item per modality, leaving out
        # modalities whose per-prompt limit is zero.
        return profiler.get_mm_max_tokens(
            seq_len,
            {
                modality: 1
                for modality, limit in mm_limits.items() if limit > 0
            },
        )
139

140
141
142
143
144
145
    def get_max_tokens_per_item_by_nonzero_modality(
        self,
        model_config: "ModelConfig",
    ) -> Mapping[str, int]:
        """
        Get the maximum number of tokens per data item from each modality based
        on underlying model configuration, excluding modalities that user
        explicitly disabled via `limit_mm_per_prompt`.

        Note:
            This is currently directly used only in V1 for profiling the memory
            usage of a model.
        """
        mm_limits = self.get_mm_limits_per_prompt(model_config)

        # Keep only the modalities whose per-prompt limit is positive.
        return {
            key: max_tokens_per_mm_item
            for key, max_tokens_per_mm_item in
            self.get_max_tokens_per_item_by_modality(model_config).items()
            if mm_limits[key] > 0
        }

162
163
164
165
    def get_max_tokens_by_modality(
        self,
        model_config: "ModelConfig",
    ) -> Mapping[str, int]:
        """
        Get the maximum number of tokens from each modality
        for profiling the memory usage of a model.

        Each value is the per-item maximum multiplied by the number of items
        of that modality allowed per prompt.
        """
        mm_limits = self.get_mm_limits_per_prompt(model_config)

        return {
            key: mm_limits[key] * max_tokens_per_mm_item
            for key, max_tokens_per_mm_item in
            self.get_max_tokens_per_item_by_modality(model_config).items()
        }

    def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int:
        """
        Total the per-modality maximum token counts reported by
        `get_max_tokens_by_modality`, for profiling the memory usage
        of a model.
        """
        per_modality = self.get_max_tokens_by_modality(model_config)
        return sum(per_modality.values())
184

185
186
187
188
    @deprecated("Legacy input processor/mapper pipeline has been removed. "
                "Please update your model runner to use "
                "`seq_group_metadata.multi_modal_data` directly without "
                "further processing.")
    def init_mm_limits_per_prompt(
        self,
        model_config: "ModelConfig",
    ) -> None:
        """
        Do nothing; kept only so legacy callers keep working.

        `model_config` is not used.
        """
        pass
194
195
196

    def get_mm_limits_per_prompt(
        self,
        model_config: "ModelConfig",
    ) -> Mapping[str, int]:
        """
        Get the maximum number of multi-modal input instances for each modality
        that are allowed per prompt for a model class.

        Returns an empty mapping when the model is not multi-modal.
        """
        if not model_config.is_multimodal_model:
            return {}

        processor = self.create_processor(model_config, disable_cache=False)
        profiler = MultiModalProfiler(processor)
        return profiler.get_mm_limits()
209
210
211

    def register_processor(
        self,
        processor: MultiModalProcessorFactory[_I],
        *,
        info: ProcessingInfoFactory[_I],
        dummy_inputs: DummyInputsBuilderFactory[_I],
    ):
        """
        Register a multi-modal processor to a model class. The processor
        is constructed lazily, hence a factory method should be passed.

        When the model receives multi-modal data, the provided function is
        invoked to transform the data into a dictionary of model inputs.

        Args:
            processor: Factory that builds the multi-modal processor.
            info: Factory that builds the processing info from the context.
            dummy_inputs: Factory that builds the dummy inputs builder.

        Returns:
            A class decorator that stores the factories for the decorated
            model class and returns that class unchanged.
        """

        def wrapper(model_cls: N) -> N:
            # Re-registration is allowed, but warn because the previous
            # factories for this exact class are replaced.
            if self._processor_factories.contains(model_cls, strict=True):
                logger.warning(
                    "Model class %s already has a multi-modal processor "
                    "registered to %s. It is overwritten by the new one.",
                    model_cls, self)

            self._processor_factories[model_cls] = _ProcessorFactories(
                info=info,
                dummy_inputs=dummy_inputs,
                processor=processor,
            )

            return model_cls

        return wrapper

242
    def _get_model_cls(self, model_config: "ModelConfig"):
        """
        Resolve the model class for `model_config` via
        `get_model_architecture`, discarding the second element of the
        returned pair.
        """
        # Avoid circular import
        from vllm.model_executor.model_loader import get_model_architecture

        model_cls, _ = get_model_architecture(model_config)
        return model_cls

249
250
251
252
    @deprecated("Legacy input processor/mapper pipeline has been removed. "
                "Please update your model runner to use "
                "`seq_group_metadata.multi_modal_data` directly without "
                "further processing.")
    def has_processor(self, model_config: "ModelConfig") -> bool:
        """
        Always return `True`; kept only so legacy callers keep working.

        `model_config` is not used.
        """
        return True
255
256
257
258

    def create_processor(
        self,
        model_config: "ModelConfig",
        *,
        tokenizer: Optional[AnyTokenizer] = None,
        disable_cache: Optional[bool] = None,
    ) -> BaseMultiModalProcessor[BaseProcessingInfo]:
        """
        Create a multi-modal processor for a specific model and tokenizer.

        Args:
            model_config: Configuration identifying the model.
            tokenizer: Tokenizer to use. When `None`, it is obtained from
                `cached_tokenizer_from_config(model_config)`.
            disable_cache: Whether to build the processor without the
                registry's processing cache. When `None`, the value of
                `disable_mm_preprocessor_cache` from the model's multi-modal
                config is used.

        Raises:
            ValueError: If the model is not a multi-modal model.
        """
        if not model_config.is_multimodal_model:
            raise ValueError(f"{model_config.model} is not a multimodal model")

        if tokenizer is None:
            tokenizer = cached_tokenizer_from_config(model_config)
        if disable_cache is None:
            mm_config = model_config.get_multimodal_config()
            disable_cache = mm_config.disable_mm_preprocessor_cache

        # Look up the factories registered for this model's class.
        model_cls = self._get_model_cls(model_config)
        factories = self._processor_factories[model_cls]

        ctx = InputProcessingContext(model_config, tokenizer)
        cache = None if disable_cache else self._processing_cache

        return factories.build_processor(ctx, cache=cache)
282
283
284
285
286

    def get_decoder_dummy_data(
        self,
        model_config: "ModelConfig",
        seq_len: int,
        mm_counts: Optional[Mapping[str, int]] = None,
    ) -> DummyDecoderData:
        """
        Create dummy data for profiling the memory usage of a model.

        The model is identified by ``model_config``.

        Args:
            model_config: Configuration identifying the model.
            seq_len: Minimum number of prompt tokens the dummy data must have.
            mm_counts: Optional per-modality counts forwarded to the
                profiler.

        Raises:
            AssertionError: If the profiler produced fewer than ``seq_len``
                prompt tokens.
        """
        processor = self.create_processor(model_config, disable_cache=False)
        profiler = MultiModalProfiler(processor)
        dummy_data = profiler.get_decoder_dummy_data(seq_len, mm_counts)

        # Having more tokens is over-conservative but otherwise fine
        token_ids = dummy_data.prompt_token_ids
        if len(token_ids) < seq_len:
            raise AssertionError(
                f"Expected at least {seq_len} dummy tokens for profiling, "
                f"but found {len(token_ids)} tokens instead.")

        return dummy_data

    def get_encoder_dummy_data(
        self,
        model_config: "ModelConfig",
        seq_len: int,
        mm_counts: Optional[Mapping[str, int]] = None,
    ) -> DummyEncoderData:
        """
        Create dummy data for profiling the memory usage of a model.

        The model is identified by ``model_config``.

        Args:
            model_config: Configuration identifying the model.
            seq_len: Number of prompt tokens the dummy data is expected to
                reach.
            mm_counts: Optional per-modality counts forwarded to the
                profiler.

        A shortfall below ``seq_len`` is only logged as a warning here;
        the dummy data is still returned.
        """
        processor = self.create_processor(model_config, disable_cache=False)
        profiler = MultiModalProfiler(processor)
        dummy_data = profiler.get_encoder_dummy_data(seq_len, mm_counts)

        # Having more tokens is over-conservative but otherwise fine
        token_ids = dummy_data.prompt_token_ids
        if len(token_ids) < seq_len:
            logger.warning_once(
                "Expected at least %d dummy encoder tokens for profiling, but found %d tokens instead.",  # noqa: E501
                seq_len,
                len(token_ids),
            )

        return dummy_data