serving.py 6.54 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
from dataclasses import dataclass
4
from typing import Any, Final
5

6
7
from fastapi import Request

8
from vllm.engine.protocol import EngineClient
9
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
10
from vllm.entrypoints.logger import RequestLogger
11
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
12
from vllm.entrypoints.openai.engine.serving import OpenAIServing
13
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
14
from vllm.entrypoints.serve.tokenize.protocol import (
15
16
17
18
19
20
21
    DetokenizeRequest,
    DetokenizeResponse,
    TokenizeChatRequest,
    TokenizeRequest,
    TokenizeResponse,
    TokenizerInfoResponse,
)
22
from vllm.inputs import TokensPrompt, token_inputs
23
from vllm.logger import init_logger
24
from vllm.tokenizers import TokenizerLike
25

26
27
logger = init_logger(__name__)

28
29

class OpenAIServingTokenization(OpenAIServing):
    """Serving handler for the tokenize, detokenize and tokenizer-info calls.

    Builds on ``OpenAIServing`` for model checks, adapter lookup, prompt
    preprocessing and input logging, and stores the chat-template defaults
    that are applied to chat-style tokenize requests.
    """

    def __init__(
        self,
        engine_client: EngineClient,
        models: OpenAIServingModels,
        *,
        request_logger: RequestLogger | None,
        chat_template: str | None,
        chat_template_content_format: ChatTemplateContentFormatOption,
        default_chat_template_kwargs: dict[str, Any] | None = None,
        trust_request_chat_template: bool = False,
    ) -> None:
        """Store the chat-template configuration used by this handler.

        Args:
            engine_client: Forwarded unchanged to ``OpenAIServing``.
            models: Forwarded unchanged to ``OpenAIServing``.
            request_logger: Forwarded unchanged to ``OpenAIServing``.
            chat_template: Default template passed to ``_preprocess_chat``
                and reported by ``get_tokenizer_info``.
            chat_template_content_format: Default content format passed to
                ``_preprocess_chat``.
            default_chat_template_kwargs: Default template kwargs; ``None``
                (or any falsy value) is stored as an empty dict.
            trust_request_chat_template: Passed through to
                ``_validate_chat_template`` for chat requests.
        """
        super().__init__(
            engine_client=engine_client,
            models=models,
            request_logger=request_logger,
        )

        self.chat_template = chat_template
        self.chat_template_content_format: Final = chat_template_content_format
        self.default_chat_template_kwargs = default_chat_template_kwargs or {}
        self.trust_request_chat_template = trust_request_chat_template

    async def create_tokenize(
        self,
        request: TokenizeRequest,
        raw_request: Request,
    ) -> TokenizeResponse | ErrorResponse:
        """Tokenize a chat or completion-style request.

        Args:
            request: The tokenize request; ``TokenizeChatRequest`` instances
                take the chat preprocessing path, everything else is treated
                as a plain prompt.
            raw_request: Incoming HTTP request, used only to derive the
                request id.

        Returns:
            A ``TokenizeResponse`` holding the concatenated token ids of all
            preprocessed prompts, or the error object produced by the model
            check / chat-template validation.
        """
        error_check_ret = await self._check_model(request)
        if error_check_ret is not None:
            return error_check_ret

        request_id = f"tokenize-{self._base_request_id(raw_request)}"

        lora_request = self._maybe_get_adapters(request)

        if isinstance(request, TokenizeChatRequest):
            # Tools are handed to the chat preprocessor as plain dicts.
            tool_dicts = (
                None
                if request.tools is None
                else [tool.model_dump() for tool in request.tools]
            )
            error_check_ret = self._validate_chat_template(
                request_chat_template=request.chat_template,
                chat_template_kwargs=request.chat_template_kwargs,
                trust_request_chat_template=self.trust_request_chat_template,
            )
            if error_check_ret is not None:
                return error_check_ret

            # Only the engine prompts are needed; the first element of the
            # returned pair is discarded.
            _, engine_prompts = await self._preprocess_chat(
                request,
                request.messages,
                default_template=self.chat_template,
                default_template_content_format=self.chat_template_content_format,
                default_template_kwargs=self.default_chat_template_kwargs,
                tool_dicts=tool_dicts,
            )
        else:
            engine_prompts = await self._preprocess_completion(
                request,
                prompt_input=request.prompt,
                prompt_embeds=None,
            )

        # Token ids of every engine prompt are flattened into a single list.
        input_ids: list[int] = []
        for engine_prompt in engine_prompts:
            self._log_inputs(
                request_id,
                engine_prompt,
                params=None,
                lora_request=lora_request,
            )

            prompt_components = self._extract_prompt_components(engine_prompt)
            if prompt_components.token_ids is not None:
                input_ids.extend(prompt_components.token_ids)

        # Token strings are only computed when the request asks for them.
        token_strs = None
        if request.return_token_strs:
            tokenizer = self.renderer.get_tokenizer()
            token_strs = tokenizer.convert_ids_to_tokens(input_ids)

        return TokenizeResponse(
            tokens=input_ids,
            token_strs=token_strs,
            count=len(input_ids),
            max_model_len=self.model_config.max_model_len,
        )

    async def create_detokenize(
        self,
        request: DetokenizeRequest,
        raw_request: Request,
    ) -> DetokenizeResponse | ErrorResponse:
        """Turn the token ids of ``request`` back into text.

        Args:
            request: The detokenize request carrying ``tokens``.
            raw_request: Incoming HTTP request, used only to derive the
                request id.

        Returns:
            A ``DetokenizeResponse`` whose ``prompt`` is the ``"prompt"``
            entry produced by the renderer, or the error object produced by
            the model check.
        """
        error_check_ret = await self._check_model(request)
        if error_check_ret is not None:
            return error_check_ret

        # NOTE(review): the id prefix is "tokenize-" here as well, same as in
        # create_tokenize -- confirm whether that is intentional.
        request_id = f"tokenize-{self._base_request_id(raw_request)}"

        lora_request = self._maybe_get_adapters(request)

        self._log_inputs(
            request_id,
            token_inputs(request.tokens),
            params=None,
            lora_request=lora_request,
        )

        engine_prompt = await self.renderer.tokenize_prompt_async(
            TokensPrompt(prompt_token_ids=request.tokens),
            request.build_tok_params(self.model_config),
        )
        prompt_text = engine_prompt["prompt"]  # type: ignore[typeddict-item]

        return DetokenizeResponse(prompt=prompt_text)

    async def get_tokenizer_info(
        self,
    ) -> TokenizerInfoResponse | ErrorResponse:
        """Get comprehensive tokenizer information.

        Returns:
            A ``TokenizerInfoResponse`` built from the renderer's tokenizer
            configuration and this handler's default chat template.
        """
        tokenizer = self.renderer.get_tokenizer()
        info = TokenizerInfo(tokenizer, self.chat_template).to_dict()
        return TokenizerInfoResponse(**info)
154
155
156
157


@dataclass
class TokenizerInfo:
    """Build a JSON-friendly description of a tokenizer's configuration."""

    # Tokenizer whose ``init_kwargs`` (when present) and class name are reported.
    tokenizer: TokenizerLike
    # Chat template added under "chat_template"; left out when falsy.
    chat_template: str | None

    def to_dict(self) -> dict[str, Any]:
        """Return the tokenizer configuration."""
        return self._get_tokenizer_config()

    def _get_tokenizer_config(self) -> dict[str, Any]:
        """Get tokenizer configuration directly from the tokenizer object.

        Returns:
            A copy of the tokenizer's ``init_kwargs`` (empty when the
            attribute is missing or falsy) without the file-path entries,
            converted by ``_make_json_serializable`` and extended with
            ``"tokenizer_class"`` and, when set, ``"chat_template"``.
        """
        # Work on a shallow copy so the tokenizer's own mapping is not
        # modified by the pops below.
        config = dict(getattr(self.tokenizer, "init_kwargs", None) or {})

        # Remove file path fields
        config.pop("vocab_file", None)
        config.pop("merges_file", None)

        config = self._make_json_serializable(config)
        config["tokenizer_class"] = type(self.tokenizer).__name__
        if self.chat_template:
            config["chat_template"] = self.chat_template
        return config

    def _make_json_serializable(self, obj: Any) -> Any:
        """Convert any non-JSON-serializable objects to serializable format.

        Objects exposing a ``content`` attribute (presumably added-token
        wrappers -- verify against the tokenizer in use) are replaced by that
        attribute. Dicts, lists and tuples are converted recursively; tuples
        come back as lists, which is how JSON represents them anyway. Any
        other value is returned unchanged.
        """
        if hasattr(obj, "content"):
            return obj.content
        if isinstance(obj, dict):
            return {k: self._make_json_serializable(v) for k, v in obj.items()}
        if isinstance(obj, (list, tuple)):
            # Tuples are walked too so token-like objects nested inside them
            # do not slip through unconverted.
            return [self._make_json_serializable(item) for item in obj]
        return obj