"vscode:/vscode.git/clone" did not exist on "3112271f6e5d50b3d94a2efa88a5a8e77826b897"
base.py 16.2 KB
Newer Older
1
2
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
import asyncio
4
from abc import ABC, abstractmethod
5
from collections.abc import Sequence
6
7
8
9
from functools import cached_property
from typing import TYPE_CHECKING, Any, Generic, overload

from typing_extensions import TypeVar
10

11
from vllm.inputs import EmbedsPrompt, TextPrompt, TokensPrompt
12
from vllm.logger import init_logger
13
from vllm.tokenizers import TokenizerLike
14
from vllm.utils.async_utils import AsyncMicrobatchTokenizer
15
16
from vllm.utils.torch_utils import set_default_torch_num_threads
from vllm.v1.metrics.stats import MultiModalCacheStats
17
18

from .embed_utils import safe_load_prompt_embeds
19
20
21
22
23
24
from .inputs import (
    DictPrompt,
    EncoderDecoderDictPrompt,
    EncoderDecoderTokPrompt,
    TokPrompt,
)
25
from .inputs.preprocess import extract_target_prompt
26
from .params import ChatParams, TokenizeParams
27
28

if TYPE_CHECKING:
29
    from vllm.config import VllmConfig
30
31
32
33
    from vllm.entrypoints.chat_utils import (
        ChatCompletionMessageParam,
        ConversationMessage,
    )
34
35
    from vllm.multimodal.cache import BaseMultiModalProcessorCache
    from vllm.multimodal.processing import BaseMultiModalProcessor
36

37
38
logger = init_logger(__name__)

39

40
41
42
43
_T = TypeVar("_T", bound=TokenizerLike, default=TokenizerLike)


class BaseRenderer(ABC, Generic[_T]):
44
    @classmethod
45
    @abstractmethod
46
47
    def from_config(
        cls,
48
        config: "VllmConfig",
49
        tokenizer_kwargs: dict[str, Any],
50
    ) -> "BaseRenderer":
51
52
        raise NotImplementedError

53
    def __init__(self, config: "VllmConfig", tokenizer: _T | None) -> None:
54
55
        super().__init__()

56
        self.config = config
57
        self.model_config = config.model_config
58

59
60
        self.tokenizer = tokenizer

61
62
63
        # Lazy initialization since offline LLM doesn't use async
        self._async_tokenizer: AsyncMicrobatchTokenizer | None = None

64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
        self.mm_processor: BaseMultiModalProcessor | None = None
        self._mm_cache_stats: MultiModalCacheStats | None = None
        if config.model_config.is_multimodal_model:
            from vllm.multimodal import MULTIMODAL_REGISTRY as mm_registry

            mm_processor_cache = mm_registry.processor_cache_from_config(config)

            with set_default_torch_num_threads():
                self.mm_processor = mm_registry.create_processor(
                    config.model_config,
                    config.observability_config,
                    tokenizer=tokenizer,
                    cache=mm_processor_cache,
                )

            if mm_processor_cache:
                self._mm_cache_stats = MultiModalCacheStats()
81

82
    def get_tokenizer(self) -> _T:
83
84
85
86
87
88
        tokenizer = self.tokenizer
        if tokenizer is None:
            raise ValueError("Tokenizer not available when `skip_tokenizer_init=True`")

        return tokenizer

89
    def get_async_tokenizer(self) -> AsyncMicrobatchTokenizer:
90
        if self._async_tokenizer is None:
91
92
93
94
            self._async_tokenizer = AsyncMicrobatchTokenizer(self.get_tokenizer())

        return self._async_tokenizer

95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
    def get_mm_processor(self) -> "BaseMultiModalProcessor":
        if self.mm_processor is None:
            raise ValueError("Multi-modal processor not available for text-only models")

        return self.mm_processor

    @property
    def mm_processor_cache(self) -> "BaseMultiModalProcessorCache | None":
        if self.mm_processor is None:
            return None

        return self.mm_processor.cache

    def stat_mm_cache(self) -> MultiModalCacheStats | None:
        mm_cache_stats = self._mm_cache_stats
        if mm_cache_stats is None:
            return None

        self._mm_cache_stats = MultiModalCacheStats()

        return mm_cache_stats

    def update_mm_cache_stats(self) -> None:
        mm_processor_cache = self.mm_processor_cache
        mm_cache_stats = self._mm_cache_stats

        if mm_processor_cache and mm_cache_stats:
            delta = mm_processor_cache.make_stats(delta=True)
            mm_cache_stats.record(delta.total, delta.hits)

    def clear_mm_cache(self) -> None:
        mm_processor_cache = self.mm_processor_cache
        if mm_processor_cache is not None:
            mm_processor_cache.clear_cache()

        if self._mm_cache_stats is not None:
            self._mm_cache_stats.reset = True

    def shutdown(self) -> None:
        mm_processor_cache = self.mm_processor_cache
        if mm_processor_cache is not None:
            mm_processor_cache.close()

138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
    def get_bos_token_id(self) -> int | None:
        if self.tokenizer is None:
            logger.warning_once(
                "Using None for BOS token id because tokenizer is not initialized"
            )
            return None

        return self.tokenizer.bos_token_id

    def get_eos_token_id(self) -> int | None:
        if self.tokenizer is None:
            logger.warning_once(
                "Using None for EOS token id because tokenizer is not initialized"
            )
            return None

        return self.tokenizer.eos_token_id

156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
    @cached_property
    def default_cmpl_tok_params(self) -> TokenizeParams:
        mm_processor = self.mm_processor
        if mm_processor is not None:
            return mm_processor.info.default_tok_params

        model_config = self.model_config
        encoder_config = model_config.encoder_config or {}

        return TokenizeParams(
            max_total_tokens=model_config.max_model_len,
            do_lower_case=encoder_config.get("do_lower_case", False),
            add_special_tokens=True,
        )

    @cached_property
    def default_chat_tok_params(self) -> TokenizeParams:
        mm_processor = self.mm_processor
        if mm_processor is not None:
            return mm_processor.info.default_tok_params

        model_config = self.model_config
        encoder_config = model_config.encoder_config or {}

        return TokenizeParams(
            max_total_tokens=model_config.max_model_len,
            do_lower_case=encoder_config.get("do_lower_case", False),
            add_special_tokens=False,
        )

186
    # Step 1: Convert raw inputs to prompts
187
    def render_prompt(
188
        self,
189
190
191
        prompt: DictPrompt | bytes,
    ) -> DictPrompt:
        if isinstance(prompt, bytes):
192
            embeds = safe_load_prompt_embeds(self.model_config, prompt)
193
            prompt = EmbedsPrompt(prompt_embeds=embeds)
194

195
        return prompt
196

197
    def render_prompts(
198
        self,
199
200
201
        prompts: Sequence[DictPrompt | bytes],
    ) -> list[DictPrompt]:
        if len(prompts) == 0:
202
203
            raise ValueError("You must pass at least one prompt")

204
        return [self.render_prompt(prompt) for prompt in prompts]
205

206
    async def render_prompts_async(
207
        self,
208
209
210
        prompts: Sequence[DictPrompt | bytes],
    ) -> list[DictPrompt]:
        return self.render_prompts(prompts)
211

212
    @abstractmethod
213
214
215
    def render_messages(
        self,
        messages: list["ChatCompletionMessageParam"],
216
        params: ChatParams,
217
    ) -> tuple[list["ConversationMessage"], DictPrompt]:
218
219
220
221
222
        raise NotImplementedError

    async def render_messages_async(
        self,
        messages: list["ChatCompletionMessageParam"],
223
        params: ChatParams,
224
    ) -> tuple[list["ConversationMessage"], DictPrompt]:
225
226
227
        return self.render_messages(messages, params)

    # Step 2: Tokenize prompts if necessary
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
    def _tokenize_prompt(
        self,
        prompt: TextPrompt,
        params: TokenizeParams,
    ) -> TokensPrompt:
        tokenizer = self.get_tokenizer()
        prompt_token_ids = tokenizer.encode(
            prompt["prompt"],
            **params.get_encode_kwargs(),
        )

        return TokensPrompt(prompt_token_ids=prompt_token_ids, **prompt)

    async def _tokenize_prompt_async(
        self,
        prompt: TextPrompt,
        params: TokenizeParams,
    ) -> TokensPrompt:
        tokenizer = self.get_async_tokenizer()
        prompt_token_ids = await tokenizer.encode(
            prompt["prompt"],
            **params.get_encode_kwargs(),
        )

        return TokensPrompt(prompt_token_ids=prompt_token_ids, **prompt)

    def _detokenize_prompt(self, prompt: TokensPrompt) -> TokensPrompt:
        tokenizer = self.get_tokenizer()
        prompt["prompt"] = tokenizer.decode(prompt["prompt_token_ids"])

        return prompt

    async def _detokenize_prompt_async(self, prompt: TokensPrompt) -> TokensPrompt:
        tokenizer = self.get_async_tokenizer()
        prompt["prompt"] = await tokenizer.decode(prompt["prompt_token_ids"])

        return prompt

    def _tokenize_enc_dec_prompt(
        self,
        prompt: EncoderDecoderDictPrompt,
        params: TokenizeParams,
    ) -> EncoderDecoderTokPrompt:
        enc_prompt, dec_prompt = (
            self.tokenize_prompt(prompt["encoder_prompt"], params),
            (
                None
                if prompt["decoder_prompt"] is None
                else self.tokenize_prompt(prompt["decoder_prompt"], params)
            ),
        )

        return EncoderDecoderTokPrompt(
            encoder_prompt=enc_prompt,
            decoder_prompt=dec_prompt,
        )

    async def _tokenize_enc_dec_prompt_async(
        self,
        prompt: EncoderDecoderDictPrompt,
        params: TokenizeParams,
    ) -> EncoderDecoderTokPrompt:
        enc_prompt, dec_prompt = await asyncio.gather(
            self.tokenize_prompt_async(prompt["encoder_prompt"], params),
            (
                asyncio.sleep(0)
                if prompt["decoder_prompt"] is None
                else self.tokenize_prompt_async(prompt["decoder_prompt"], params)
            ),
        )

        return EncoderDecoderTokPrompt(
            encoder_prompt=enc_prompt,
            decoder_prompt=dec_prompt,
        )

    @overload
305
306
    def tokenize_prompt(
        self,
307
        prompt: TextPrompt | TokensPrompt,
308
        params: TokenizeParams,
309
    ) -> TokensPrompt: ...
310

311
312
313
314
315
316
    @overload
    def tokenize_prompt(  # type: ignore[misc]
        self,
        prompt: EmbedsPrompt,
        params: TokenizeParams,
    ) -> EmbedsPrompt: ...
317

318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
    @overload
    def tokenize_prompt(  # type: ignore[misc]
        self,
        prompt: EncoderDecoderDictPrompt,
        params: TokenizeParams,
    ) -> EncoderDecoderTokPrompt: ...

    def tokenize_prompt(
        self,
        prompt: DictPrompt,
        params: TokenizeParams,
    ) -> TokPrompt:
        if "encoder_prompt" in prompt:
            return self._tokenize_enc_dec_prompt(prompt, params)  # type: ignore[arg-type]

        if "prompt_token_ids" not in prompt and "prompt_embeds" not in prompt:
            prompt = params.apply_pre_tokenization(self.tokenizer, prompt)
            prompt = self._tokenize_prompt(prompt, params)
336
337
338
339
340

        if params.needs_detokenization and "prompt" not in prompt:
            if "prompt_token_ids" not in prompt:
                raise RuntimeError("Cannot run detokenization on embeddings")

341
            prompt = self._detokenize_prompt(prompt)  # type: ignore[arg-type]
342
343
344
345
346

        return params.apply_post_tokenization(self.tokenizer, prompt)  # type: ignore[arg-type]

    def tokenize_prompts(
        self,
347
        prompts: Sequence[DictPrompt],
348
        params: TokenizeParams,
349
    ) -> list[TokPrompt]:
350
351
        return [self.tokenize_prompt(prompt, params) for prompt in prompts]

352
    @overload
353
354
    async def tokenize_prompt_async(
        self,
355
        prompt: TextPrompt | TokensPrompt,
356
        params: TokenizeParams,
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
    ) -> TokensPrompt: ...

    @overload
    async def tokenize_prompt_async(  # type: ignore[misc]
        self,
        prompt: EmbedsPrompt,
        params: TokenizeParams,
    ) -> EmbedsPrompt: ...

    @overload
    async def tokenize_prompt_async(  # type: ignore[misc]
        self,
        prompt: EncoderDecoderDictPrompt,
        params: TokenizeParams,
    ) -> EncoderDecoderTokPrompt: ...
372

373
374
375
376
377
378
379
    async def tokenize_prompt_async(
        self,
        prompt: DictPrompt,
        params: TokenizeParams,
    ) -> TokPrompt:
        if "encoder_prompt" in prompt:
            return await self._tokenize_enc_dec_prompt_async(prompt, params)  # type: ignore[arg-type]
380

381
382
383
        if "prompt_token_ids" not in prompt and "prompt_embeds" not in prompt:
            prompt = params.apply_pre_tokenization(self.tokenizer, prompt)
            prompt = await self._tokenize_prompt_async(prompt, params)
384
385
386
387
388

        if params.needs_detokenization and "prompt" not in prompt:
            if "prompt_token_ids" not in prompt:
                raise RuntimeError("Cannot run detokenization on embeddings")

389
            prompt = await self._detokenize_prompt_async(prompt)  # type: ignore[arg-type]
390
391
392
393
394

        return params.apply_post_tokenization(self.tokenizer, prompt)  # type: ignore[arg-type]

    async def tokenize_prompts_async(
        self,
395
        prompts: Sequence[DictPrompt],
396
        params: TokenizeParams,
397
    ) -> list[TokPrompt]:
398
399
400
        return await asyncio.gather(
            *(self.tokenize_prompt_async(prompt, params) for prompt in prompts)
        )
401
402
403
404
405
406
407
408
409
410
411

    # Step 3: Add extra keys to the prompts
    def _apply_prompt_extras(
        self,
        prompts: Sequence[DictPrompt | TokPrompt],
        prompt_extras: dict[str, Any] | None,
    ):
        if not prompt_extras:
            return

        for prompt in prompts:
412
            target_prompt = extract_target_prompt(self.model_config, prompt)
413
414
415
416
417
418
            target_prompt.update(prompt_extras)  # type: ignore[arg-type]

    # Top-level methods
    def render_cmpl(
        self,
        prompts: Sequence[DictPrompt | bytes],
419
        tok_params: TokenizeParams | None = None,
420
421
422
        *,
        prompt_extras: dict[str, Any] | None = None,
    ):
423
424
        if tok_params is None:
            tok_params = self.default_cmpl_tok_params
425

426
        dict_prompts = self.render_prompts(prompts)
427
428
429
430
431
432
433
434
435
436
        tok_prompts = self.tokenize_prompts(dict_prompts, tok_params)

        self._apply_prompt_extras(tok_prompts, prompt_extras)

        # TODO: Apply multi-modal processor
        return tok_prompts

    async def render_cmpl_async(
        self,
        prompts: Sequence[DictPrompt | bytes],
437
        tok_params: TokenizeParams | None = None,
438
439
440
        *,
        prompt_extras: dict[str, Any] | None = None,
    ):
441
442
        if tok_params is None:
            tok_params = self.default_cmpl_tok_params
443

444
        dict_prompts = await self.render_prompts_async(prompts)
445
446
447
448
449
450
451
452
453
454
455
        tok_prompts = await self.tokenize_prompts_async(dict_prompts, tok_params)

        self._apply_prompt_extras(tok_prompts, prompt_extras)

        # TODO: Apply multi-modal processor
        return tok_prompts

    def render_chat(
        self,
        conversations: Sequence[list["ChatCompletionMessageParam"]],
        chat_params: ChatParams,
456
        tok_params: TokenizeParams | None = None,
457
458
459
        *,
        prompt_extras: dict[str, Any] | None = None,
    ):
460
461
462
        if tok_params is None:
            tok_params = self.default_chat_tok_params

463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
        rendered = [
            self.render_messages(conversation, chat_params)
            for conversation in conversations
        ]

        out_conversations = list[list["ConversationMessage"]]()
        dict_prompts = list[DictPrompt]()
        for conv, prompt in rendered:
            out_conversations.append(conv)
            dict_prompts.append(prompt)

        tok_prompts = self.tokenize_prompts(dict_prompts, tok_params)

        self._apply_prompt_extras(tok_prompts, prompt_extras)

        # TODO: Apply multi-modal processor
        return out_conversations, tok_prompts

    async def render_chat_async(
        self,
        conversations: Sequence[list["ChatCompletionMessageParam"]],
        chat_params: ChatParams,
485
        tok_params: TokenizeParams | None = None,
486
487
488
        *,
        prompt_extras: dict[str, Any] | None = None,
    ):
489
490
491
        if tok_params is None:
            tok_params = self.default_chat_tok_params

492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
        rendered = [
            self.render_messages_async(conversation, chat_params)
            for conversation in conversations
        ]

        out_conversations = list[list["ConversationMessage"]]()
        dict_prompts = list[DictPrompt]()
        for conv, prompt in await asyncio.gather(*rendered):
            out_conversations.append(conv)
            dict_prompts.append(prompt)

        tok_prompts = await self.tokenize_prompts_async(dict_prompts, tok_params)

        self._apply_prompt_extras(tok_prompts, prompt_extras)

        # TODO: Apply multi-modal processor
        return out_conversations, tok_prompts