base.py 16.9 KB
Newer Older
1
2
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
import asyncio
4
from abc import ABC, abstractmethod
5
from collections.abc import Sequence
6
7
8
9
from functools import cached_property
from typing import TYPE_CHECKING, Any, Generic, overload

from typing_extensions import TypeVar
10

11
from vllm.inputs import EmbedsPrompt, TextPrompt, TokensPrompt
12
from vllm.logger import init_logger
13
from vllm.tokenizers import TokenizerLike
14
from vllm.utils.async_utils import AsyncMicrobatchTokenizer
15
16
from vllm.utils.torch_utils import set_default_torch_num_threads
from vllm.v1.metrics.stats import MultiModalCacheStats
17
18

from .embed_utils import safe_load_prompt_embeds
19
20
21
22
23
24
from .inputs import (
    DictPrompt,
    EncoderDecoderDictPrompt,
    EncoderDecoderTokPrompt,
    TokPrompt,
)
25
from .inputs.preprocess import extract_target_prompt
26
from .params import ChatParams, TokenizeParams
27
28

if TYPE_CHECKING:
29
    from vllm.config import VllmConfig
30
31
32
33
    from vllm.entrypoints.chat_utils import (
        ChatCompletionMessageParam,
        ConversationMessage,
    )
34
35
    from vllm.multimodal.cache import BaseMultiModalProcessorCache
    from vllm.multimodal.processing import BaseMultiModalProcessor
36

37
38
logger = init_logger(__name__)

39

40
41
42
43
_T = TypeVar("_T", bound=TokenizerLike, default=TokenizerLike)


class BaseRenderer(ABC, Generic[_T]):
44
    @classmethod
45
    @abstractmethod
46
47
    def from_config(
        cls,
48
        config: "VllmConfig",
49
        tokenizer_kwargs: dict[str, Any],
50
    ) -> "BaseRenderer":
51
52
        raise NotImplementedError

53
    def __init__(self, config: "VllmConfig", tokenizer: _T | None) -> None:
54
55
        super().__init__()

56
        self.config = config
57
        self.model_config = config.model_config
58

59
60
        self.tokenizer = tokenizer

61
62
63
        # Lazy initialization since offline LLM doesn't use async
        self._async_tokenizer: AsyncMicrobatchTokenizer | None = None

64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
        self.mm_processor: BaseMultiModalProcessor | None = None
        self._mm_cache_stats: MultiModalCacheStats | None = None
        if config.model_config.is_multimodal_model:
            from vllm.multimodal import MULTIMODAL_REGISTRY as mm_registry

            mm_processor_cache = mm_registry.processor_cache_from_config(config)

            with set_default_torch_num_threads():
                self.mm_processor = mm_registry.create_processor(
                    config.model_config,
                    config.observability_config,
                    tokenizer=tokenizer,
                    cache=mm_processor_cache,
                )

            if mm_processor_cache:
                self._mm_cache_stats = MultiModalCacheStats()
81

82
    def get_tokenizer(self) -> _T:
83
84
85
86
87
88
        tokenizer = self.tokenizer
        if tokenizer is None:
            raise ValueError("Tokenizer not available when `skip_tokenizer_init=True`")

        return tokenizer

89
    def get_async_tokenizer(self) -> AsyncMicrobatchTokenizer:
90
        if self._async_tokenizer is None:
91
92
93
94
            self._async_tokenizer = AsyncMicrobatchTokenizer(self.get_tokenizer())

        return self._async_tokenizer

95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
    def get_mm_processor(self) -> "BaseMultiModalProcessor":
        if self.mm_processor is None:
            raise ValueError("Multi-modal processor not available for text-only models")

        return self.mm_processor

    @property
    def mm_processor_cache(self) -> "BaseMultiModalProcessorCache | None":
        if self.mm_processor is None:
            return None

        return self.mm_processor.cache

    def stat_mm_cache(self) -> MultiModalCacheStats | None:
        mm_cache_stats = self._mm_cache_stats
        if mm_cache_stats is None:
            return None

        self._mm_cache_stats = MultiModalCacheStats()

        return mm_cache_stats

    def update_mm_cache_stats(self) -> None:
        mm_processor_cache = self.mm_processor_cache
        mm_cache_stats = self._mm_cache_stats

        if mm_processor_cache and mm_cache_stats:
            delta = mm_processor_cache.make_stats(delta=True)
            mm_cache_stats.record(delta.total, delta.hits)

    def clear_mm_cache(self) -> None:
        mm_processor_cache = self.mm_processor_cache
        if mm_processor_cache is not None:
            mm_processor_cache.clear_cache()

        if self._mm_cache_stats is not None:
            self._mm_cache_stats.reset = True

    def shutdown(self) -> None:
        mm_processor_cache = self.mm_processor_cache
        if mm_processor_cache is not None:
            mm_processor_cache.close()

138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
    def get_bos_token_id(self) -> int | None:
        if self.tokenizer is None:
            logger.warning_once(
                "Using None for BOS token id because tokenizer is not initialized"
            )
            return None

        return self.tokenizer.bos_token_id

    def get_eos_token_id(self) -> int | None:
        if self.tokenizer is None:
            logger.warning_once(
                "Using None for EOS token id because tokenizer is not initialized"
            )
            return None

        return self.tokenizer.eos_token_id

156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
    def get_dec_start_token_id(self) -> int:
        """
        Obtain the decoder start token id employed by an encoder/decoder model,
        raising an error if it is not available.
        """
        dec_start_token_id = getattr(
            self.model_config.hf_config, "decoder_start_token_id", None
        )

        if dec_start_token_id is None:
            logger.warning_once(
                "Falling back on <BOS> for decoder start token id "
                "because decoder start token id is not available."
            )
            dec_start_token_id = self.get_bos_token_id()

        if dec_start_token_id is None:
            raise RuntimeError("Cannot find decoder start token id or <BOS>")

        return dec_start_token_id

177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
    @cached_property
    def default_cmpl_tok_params(self) -> TokenizeParams:
        mm_processor = self.mm_processor
        if mm_processor is not None:
            return mm_processor.info.default_tok_params

        model_config = self.model_config
        encoder_config = model_config.encoder_config or {}

        return TokenizeParams(
            max_total_tokens=model_config.max_model_len,
            do_lower_case=encoder_config.get("do_lower_case", False),
            add_special_tokens=True,
        )

    @cached_property
    def default_chat_tok_params(self) -> TokenizeParams:
        mm_processor = self.mm_processor
        if mm_processor is not None:
            return mm_processor.info.default_tok_params

        model_config = self.model_config
        encoder_config = model_config.encoder_config or {}

        return TokenizeParams(
            max_total_tokens=model_config.max_model_len,
            do_lower_case=encoder_config.get("do_lower_case", False),
            add_special_tokens=False,
        )

207
    # Step 1: Convert raw inputs to prompts
208
    def render_prompt(
209
        self,
210
211
212
        prompt: DictPrompt | bytes,
    ) -> DictPrompt:
        if isinstance(prompt, bytes):
213
            embeds = safe_load_prompt_embeds(self.model_config, prompt)
214
            prompt = EmbedsPrompt(prompt_embeds=embeds)
215

216
        return prompt
217

218
    def render_prompts(
219
        self,
220
221
222
        prompts: Sequence[DictPrompt | bytes],
    ) -> list[DictPrompt]:
        if len(prompts) == 0:
223
224
            raise ValueError("You must pass at least one prompt")

225
        return [self.render_prompt(prompt) for prompt in prompts]
226

227
    async def render_prompts_async(
228
        self,
229
230
231
        prompts: Sequence[DictPrompt | bytes],
    ) -> list[DictPrompt]:
        return self.render_prompts(prompts)
232

233
    @abstractmethod
234
235
236
    def render_messages(
        self,
        messages: list["ChatCompletionMessageParam"],
237
        params: ChatParams,
238
    ) -> tuple[list["ConversationMessage"], DictPrompt]:
239
240
241
242
243
        raise NotImplementedError

    async def render_messages_async(
        self,
        messages: list["ChatCompletionMessageParam"],
244
        params: ChatParams,
245
    ) -> tuple[list["ConversationMessage"], DictPrompt]:
246
247
248
        return self.render_messages(messages, params)

    # Step 2: Tokenize prompts if necessary
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
    def _tokenize_prompt(
        self,
        prompt: TextPrompt,
        params: TokenizeParams,
    ) -> TokensPrompt:
        tokenizer = self.get_tokenizer()
        prompt_token_ids = tokenizer.encode(
            prompt["prompt"],
            **params.get_encode_kwargs(),
        )

        return TokensPrompt(prompt_token_ids=prompt_token_ids, **prompt)

    async def _tokenize_prompt_async(
        self,
        prompt: TextPrompt,
        params: TokenizeParams,
    ) -> TokensPrompt:
        tokenizer = self.get_async_tokenizer()
        prompt_token_ids = await tokenizer.encode(
            prompt["prompt"],
            **params.get_encode_kwargs(),
        )

        return TokensPrompt(prompt_token_ids=prompt_token_ids, **prompt)

    def _detokenize_prompt(self, prompt: TokensPrompt) -> TokensPrompt:
        tokenizer = self.get_tokenizer()
        prompt["prompt"] = tokenizer.decode(prompt["prompt_token_ids"])

        return prompt

    async def _detokenize_prompt_async(self, prompt: TokensPrompt) -> TokensPrompt:
        tokenizer = self.get_async_tokenizer()
        prompt["prompt"] = await tokenizer.decode(prompt["prompt_token_ids"])

        return prompt

    def _tokenize_enc_dec_prompt(
        self,
        prompt: EncoderDecoderDictPrompt,
        params: TokenizeParams,
    ) -> EncoderDecoderTokPrompt:
        enc_prompt, dec_prompt = (
            self.tokenize_prompt(prompt["encoder_prompt"], params),
            (
                None
                if prompt["decoder_prompt"] is None
                else self.tokenize_prompt(prompt["decoder_prompt"], params)
            ),
        )

        return EncoderDecoderTokPrompt(
            encoder_prompt=enc_prompt,
            decoder_prompt=dec_prompt,
        )

    async def _tokenize_enc_dec_prompt_async(
        self,
        prompt: EncoderDecoderDictPrompt,
        params: TokenizeParams,
    ) -> EncoderDecoderTokPrompt:
        enc_prompt, dec_prompt = await asyncio.gather(
            self.tokenize_prompt_async(prompt["encoder_prompt"], params),
            (
                asyncio.sleep(0)
                if prompt["decoder_prompt"] is None
                else self.tokenize_prompt_async(prompt["decoder_prompt"], params)
            ),
        )

        return EncoderDecoderTokPrompt(
            encoder_prompt=enc_prompt,
            decoder_prompt=dec_prompt,
        )

    @overload
326
327
    def tokenize_prompt(
        self,
328
        prompt: TextPrompt | TokensPrompt,
329
        params: TokenizeParams,
330
    ) -> TokensPrompt: ...
331

332
333
334
335
336
337
    @overload
    def tokenize_prompt(  # type: ignore[misc]
        self,
        prompt: EmbedsPrompt,
        params: TokenizeParams,
    ) -> EmbedsPrompt: ...
338

339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
    @overload
    def tokenize_prompt(  # type: ignore[misc]
        self,
        prompt: EncoderDecoderDictPrompt,
        params: TokenizeParams,
    ) -> EncoderDecoderTokPrompt: ...

    def tokenize_prompt(
        self,
        prompt: DictPrompt,
        params: TokenizeParams,
    ) -> TokPrompt:
        if "encoder_prompt" in prompt:
            return self._tokenize_enc_dec_prompt(prompt, params)  # type: ignore[arg-type]

        if "prompt_token_ids" not in prompt and "prompt_embeds" not in prompt:
            prompt = params.apply_pre_tokenization(self.tokenizer, prompt)
            prompt = self._tokenize_prompt(prompt, params)
357
358
359
360
361

        if params.needs_detokenization and "prompt" not in prompt:
            if "prompt_token_ids" not in prompt:
                raise RuntimeError("Cannot run detokenization on embeddings")

362
            prompt = self._detokenize_prompt(prompt)  # type: ignore[arg-type]
363
364
365
366
367

        return params.apply_post_tokenization(self.tokenizer, prompt)  # type: ignore[arg-type]

    def tokenize_prompts(
        self,
368
        prompts: Sequence[DictPrompt],
369
        params: TokenizeParams,
370
    ) -> list[TokPrompt]:
371
372
        return [self.tokenize_prompt(prompt, params) for prompt in prompts]

373
    @overload
374
375
    async def tokenize_prompt_async(
        self,
376
        prompt: TextPrompt | TokensPrompt,
377
        params: TokenizeParams,
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
    ) -> TokensPrompt: ...

    @overload
    async def tokenize_prompt_async(  # type: ignore[misc]
        self,
        prompt: EmbedsPrompt,
        params: TokenizeParams,
    ) -> EmbedsPrompt: ...

    @overload
    async def tokenize_prompt_async(  # type: ignore[misc]
        self,
        prompt: EncoderDecoderDictPrompt,
        params: TokenizeParams,
    ) -> EncoderDecoderTokPrompt: ...
393

394
395
396
397
398
399
400
    async def tokenize_prompt_async(
        self,
        prompt: DictPrompt,
        params: TokenizeParams,
    ) -> TokPrompt:
        if "encoder_prompt" in prompt:
            return await self._tokenize_enc_dec_prompt_async(prompt, params)  # type: ignore[arg-type]
401

402
403
404
        if "prompt_token_ids" not in prompt and "prompt_embeds" not in prompt:
            prompt = params.apply_pre_tokenization(self.tokenizer, prompt)
            prompt = await self._tokenize_prompt_async(prompt, params)
405
406
407
408
409

        if params.needs_detokenization and "prompt" not in prompt:
            if "prompt_token_ids" not in prompt:
                raise RuntimeError("Cannot run detokenization on embeddings")

410
            prompt = await self._detokenize_prompt_async(prompt)  # type: ignore[arg-type]
411
412
413
414
415

        return params.apply_post_tokenization(self.tokenizer, prompt)  # type: ignore[arg-type]

    async def tokenize_prompts_async(
        self,
416
        prompts: Sequence[DictPrompt],
417
        params: TokenizeParams,
418
    ) -> list[TokPrompt]:
419
420
421
        return await asyncio.gather(
            *(self.tokenize_prompt_async(prompt, params) for prompt in prompts)
        )
422
423
424
425
426
427
428
429
430
431
432

    # Step 3: Add extra keys to the prompts
    def _apply_prompt_extras(
        self,
        prompts: Sequence[DictPrompt | TokPrompt],
        prompt_extras: dict[str, Any] | None,
    ):
        if not prompt_extras:
            return

        for prompt in prompts:
433
            target_prompt = extract_target_prompt(self.model_config, prompt)
434
435
436
437
438
439
            target_prompt.update(prompt_extras)  # type: ignore[arg-type]

    # Top-level methods
    def render_cmpl(
        self,
        prompts: Sequence[DictPrompt | bytes],
440
        tok_params: TokenizeParams | None = None,
441
442
443
        *,
        prompt_extras: dict[str, Any] | None = None,
    ):
444
445
        if tok_params is None:
            tok_params = self.default_cmpl_tok_params
446

447
        dict_prompts = self.render_prompts(prompts)
448
449
450
451
452
453
454
455
456
457
        tok_prompts = self.tokenize_prompts(dict_prompts, tok_params)

        self._apply_prompt_extras(tok_prompts, prompt_extras)

        # TODO: Apply multi-modal processor
        return tok_prompts

    async def render_cmpl_async(
        self,
        prompts: Sequence[DictPrompt | bytes],
458
        tok_params: TokenizeParams | None = None,
459
460
461
        *,
        prompt_extras: dict[str, Any] | None = None,
    ):
462
463
        if tok_params is None:
            tok_params = self.default_cmpl_tok_params
464

465
        dict_prompts = await self.render_prompts_async(prompts)
466
467
468
469
470
471
472
473
474
475
476
        tok_prompts = await self.tokenize_prompts_async(dict_prompts, tok_params)

        self._apply_prompt_extras(tok_prompts, prompt_extras)

        # TODO: Apply multi-modal processor
        return tok_prompts

    def render_chat(
        self,
        conversations: Sequence[list["ChatCompletionMessageParam"]],
        chat_params: ChatParams,
477
        tok_params: TokenizeParams | None = None,
478
479
480
        *,
        prompt_extras: dict[str, Any] | None = None,
    ):
481
482
483
        if tok_params is None:
            tok_params = self.default_chat_tok_params

484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
        rendered = [
            self.render_messages(conversation, chat_params)
            for conversation in conversations
        ]

        out_conversations = list[list["ConversationMessage"]]()
        dict_prompts = list[DictPrompt]()
        for conv, prompt in rendered:
            out_conversations.append(conv)
            dict_prompts.append(prompt)

        tok_prompts = self.tokenize_prompts(dict_prompts, tok_params)

        self._apply_prompt_extras(tok_prompts, prompt_extras)

        # TODO: Apply multi-modal processor
        return out_conversations, tok_prompts

    async def render_chat_async(
        self,
        conversations: Sequence[list["ChatCompletionMessageParam"]],
        chat_params: ChatParams,
506
        tok_params: TokenizeParams | None = None,
507
508
509
        *,
        prompt_extras: dict[str, Any] | None = None,
    ):
510
511
512
        if tok_params is None:
            tok_params = self.default_chat_tok_params

513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
        rendered = [
            self.render_messages_async(conversation, chat_params)
            for conversation in conversations
        ]

        out_conversations = list[list["ConversationMessage"]]()
        dict_prompts = list[DictPrompt]()
        for conv, prompt in await asyncio.gather(*rendered):
            out_conversations.append(conv)
            dict_prompts.append(prompt)

        tok_prompts = await self.tokenize_prompts_async(dict_prompts, tok_params)

        self._apply_prompt_extras(tok_prompts, prompt_extras)

        # TODO: Apply multi-modal processor
        return out_conversations, tok_prompts