# serving_engine.py 9.17 KB
# Newer Older
1
import asyncio
2
import json
3
from dataclasses import dataclass
4
from http import HTTPStatus
5
from typing import Any, Awaitable, Dict, List, Optional, Tuple, Union
6

7
8
9
from pydantic import Field
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
from typing_extensions import Annotated
10

11
from vllm.engine.async_llm_engine import AsyncLLMEngine
12
13
14
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              CompletionRequest, ErrorResponse,
                                              LogProbs, ModelCard, ModelList,
15
                                              ModelPermission)
16
from vllm.logger import init_logger
17
from vllm.lora.request import LoRARequest
18
from vllm.sequence import Logprob
19
from vllm.transformers_utils.tokenizer import get_tokenizer
20
21
22
23

# Module-level logger, created with vLLM's ``init_logger`` under this
# module's ``__name__``.
logger = init_logger(__name__)


24
@dataclass
class LoRAModulePath:
    """Name and local path of one LoRA module to be served."""

    # Name of the LoRA module.
    name: str
    # Local path of the LoRA module.
    local_path: str


30
31
class OpenAIServing:
    """Common helpers for the OpenAI-compatible serving endpoints.

    Holds the async engine, the names under which the base model is served
    and one ``LoRARequest`` per configured LoRA module.  ``max_model_len``
    and ``tokenizer`` are filled in by ``_post_init``, which needs to await
    the engine.
    """

    def __init__(self,
                 engine: AsyncLLMEngine,
                 served_model_names: List[str],
                 lora_modules: Optional[List[LoRAModulePath]],
                 await_post_init: Optional[Awaitable[Any]] = None):
        """Store the configuration and run or schedule ``_post_init``.

        Args:
            engine: Engine that is queried for the model config.
            served_model_names: Names accepted as the base model; the first
                one is reported as ``root`` of every model card.
            lora_modules: LoRA modules to expose, or ``None`` for none.
                They get integer ids starting at 1, in list order.
            await_post_init: Optional awaitable that is awaited at the end
                of ``_post_init``.
        """
        self.engine = engine
        self.served_model_names = served_model_names

        if lora_modules is None:
            self.lora_requests = []
        else:
            self.lora_requests = [
                LoRARequest(
                    lora_name=lora.name,
                    lora_int_id=i,
                    lora_local_path=lora.local_path,
                ) for i, lora in enumerate(lora_modules, start=1)
            ]

        self.max_model_len = 0
        # Lazy initialized
        self.tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]

        # Strong reference to the scheduled ``_post_init`` task.  The event
        # loop only keeps weak references to tasks, so a task whose result
        # is discarded may be garbage-collected before it has finished.
        # Stays ``None`` when ``_post_init`` is run synchronously below.
        self._post_init_task: Optional[asyncio.Task] = None

        try:
            event_loop = asyncio.get_running_loop()
        except RuntimeError:
            event_loop = None

        if event_loop is not None and event_loop.is_running():
            # If the current is instanced by Ray Serve,
            # there is already a running event loop
            self._post_init_task = event_loop.create_task(
                self._post_init(await_post_init))
        else:
            # When using single vLLM without engine_use_ray
            asyncio.run(self._post_init(await_post_init))

    async def _post_init(self, await_post_init):
        """Finish the initialization steps that must await the engine.

        Reads the model config from the engine, records its
        ``max_model_len``, builds the tokenizer and finally awaits
        ``await_post_init`` when one was given.
        """
        engine_model_config = await self.engine.get_model_config()
        self.max_model_len = engine_model_config.max_model_len

        # A separate tokenizer to map token IDs to strings.
        self.tokenizer = get_tokenizer(
            engine_model_config.tokenizer,
            tokenizer_mode=engine_model_config.tokenizer_mode,
            tokenizer_revision=engine_model_config.tokenizer_revision,
            trust_remote_code=engine_model_config.trust_remote_code,
            truncation_side="left")

        if await_post_init is not None:
            await await_post_init

    async def show_available_models(self) -> ModelList:
        """Return one model card per served model name and per LoRA module.

        Every card uses the first served model name as its ``root``.
        """
        root = self.served_model_names[0]
        model_cards = [
            ModelCard(id=served_model_name,
                      root=root,
                      permission=[ModelPermission()])
            for served_model_name in self.served_model_names
        ]
        model_cards.extend(
            ModelCard(id=lora.lora_name,
                      root=root,
                      permission=[ModelPermission()])
            for lora in self.lora_requests)
        return ModelList(data=model_cards)

    def _create_logprobs(
        self,
        token_ids: List[int],
        top_logprobs: List[Optional[Dict[int, Logprob]]],
        num_output_top_logprobs: Optional[int] = None,
        initial_text_offset: int = 0,
    ) -> LogProbs:
        """Create OpenAI-style logprobs.

        Args:
            token_ids: Token ids, one per step.
            top_logprobs: Per-step mapping from token id to ``Logprob``, or
                ``None`` for a step without logprobs; indexed in parallel
                with ``token_ids``.
            num_output_top_logprobs: When truthy, per-step top-logprob
                dictionaries are collected as well.
            initial_text_offset: Text offset recorded for the first token.

        Returns:
            The populated ``LogProbs`` object.
        """
        logprobs = LogProbs()
        last_token_len = 0
        if num_output_top_logprobs:
            logprobs.top_logprobs = []

        for i, token_id in enumerate(token_ids):
            step_top_logprobs = top_logprobs[i]
            if step_top_logprobs is None:
                token = self.tokenizer.decode(token_id)
                logprobs.tokens.append(token)
                logprobs.token_logprobs.append(None)
                # ``top_logprobs`` is only replaced by a list above when top
                # logprobs were requested.  Record the placeholder only when
                # there is a list to append to, instead of asserting (which
                # aborts the request, or is stripped entirely under ``-O``).
                if logprobs.top_logprobs is not None:
                    logprobs.top_logprobs.append(None)
            else:
                step_token = step_top_logprobs[token_id]
                token = step_token.decoded_token
                logprobs.tokens.append(token)
                logprobs.token_logprobs.append(step_token.logprob)

                if num_output_top_logprobs:
                    assert logprobs.top_logprobs is not None
                    # The mapping is non-empty here: ``token_id`` was just
                    # looked up in it.
                    logprobs.top_logprobs.append({
                        # Convert float("-inf") to the
                        # JSON-serializable float that OpenAI uses
                        p.decoded_token: max(p.logprob, -9999.0)
                        for p in step_top_logprobs.values()
                    })

            # Each offset is the previous offset plus the length of the
            # previous token's text.
            if len(logprobs.text_offset) == 0:
                logprobs.text_offset.append(initial_text_offset)
            else:
                logprobs.text_offset.append(logprobs.text_offset[-1] +
                                            last_token_len)
            last_token_len = len(token)
        return logprobs

    def create_error_response(
            self,
            message: str,
            err_type: str = "BadRequestError",
            status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> ErrorResponse:
        """Build an ``ErrorResponse`` carrying the numeric HTTP status."""
        return ErrorResponse(message=message,
                             type=err_type,
                             code=status_code.value)

    def create_streaming_error_response(
            self,
            message: str,
            err_type: str = "BadRequestError",
            status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> str:
        """Return the error as a JSON string of the form ``{"error": ...}``."""
        error = self.create_error_response(message=message,
                                           err_type=err_type,
                                           status_code=status_code)
        return json.dumps({"error": error.model_dump()})

    async def _check_model(
        self, request: Union[CompletionRequest, ChatCompletionRequest]
    ) -> Optional[ErrorResponse]:
        """Return ``None`` if ``request.model`` is known, else a 404 error.

        A model is known when it is one of the served model names or the
        name of one of the configured LoRA modules.
        """
        if request.model in self.served_model_names:
            return None
        if any(request.model == lora.lora_name
               for lora in self.lora_requests):
            return None
        return self.create_error_response(
            message=f"The model `{request.model}` does not exist.",
            err_type="NotFoundError",
            status_code=HTTPStatus.NOT_FOUND)

    def _maybe_get_lora(
        self, request: Union[CompletionRequest, ChatCompletionRequest]
    ) -> Optional[LoRARequest]:
        """Return the LoRA request matching ``request.model``, if any.

        Returns:
            ``None`` when the request names a served base model, otherwise
            the ``LoRARequest`` whose name equals ``request.model``.

        Raises:
            ValueError: If the name matches neither.
        """
        if request.model in self.served_model_names:
            return None
        for lora in self.lora_requests:
            if request.model == lora.lora_name:
                return lora
        # if _check_model has been called earlier, this will be unreachable
        raise ValueError(f"The model `{request.model}` does not exist.")

    def _validate_prompt_and_tokenize(
        self,
        request: Union[ChatCompletionRequest, CompletionRequest],
        prompt: Optional[str] = None,
        prompt_ids: Optional[List[int]] = None,
        truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
    ) -> Tuple[List[int], str]:
        """Tokenize or decode the prompt and check it fits the context.

        Exactly one of ``prompt`` and ``prompt_ids`` must be non-empty.
        When ``truncate_prompt_tokens`` is given, a text prompt is tokenized
        with ``truncation=True, max_length=truncate_prompt_tokens`` and a
        token prompt keeps only its last ``truncate_prompt_tokens`` ids.

        Side effect: if ``request.max_tokens`` is ``None`` it is set to the
        number of tokens left in the context window after the prompt.

        Returns:
            ``(input_ids, input_text)``.  For a text prompt the text is the
            prompt itself; for a token prompt it is decoded from the
            returned (possibly truncated) ids.

        Raises:
            ValueError: If neither or both prompt forms are provided, or if
                the prompt plus ``request.max_tokens`` exceeds
                ``self.max_model_len``.
        """
        if not (prompt or prompt_ids):
            raise ValueError("Either prompt or prompt_ids should be provided.")
        if prompt and prompt_ids:
            raise ValueError(
                "Only one of prompt or prompt_ids should be provided.")

        # Branch on the same truthiness test the validation above used, so
        # an empty counterpart (``prompt_ids=[]`` or ``prompt=""``) can never
        # shadow the input that was actually provided.
        if prompt:
            tokenizer_kwargs = {} if truncate_prompt_tokens is None else {
                "truncation": True,
                "max_length": truncate_prompt_tokens,
            }
            input_ids = self.tokenizer(prompt, **tokenizer_kwargs).input_ids
            input_text = prompt
        else:
            if truncate_prompt_tokens is None:
                input_ids = prompt_ids
            else:
                input_ids = prompt_ids[-truncate_prompt_tokens:]
            # Decode the ids that are actually returned so that the text
            # and the token ids describe the same prompt.
            input_text = self.tokenizer.decode(input_ids)

        token_num = len(input_ids)

        if request.max_tokens is None:
            if token_num >= self.max_model_len:
                raise ValueError(
                    f"This model's maximum context length is "
                    f"{self.max_model_len} tokens. However, you requested "
                    f"{token_num} tokens in the messages, "
                    f"Please reduce the length of the messages.", )
            request.max_tokens = self.max_model_len - token_num

        if token_num + request.max_tokens > self.max_model_len:
            raise ValueError(
                f"This model's maximum context length is "
                f"{self.max_model_len} tokens. However, you requested "
                f"{request.max_tokens + token_num} tokens "
                f"({token_num} in the messages, "
                f"{request.max_tokens} in the completion). "
                f"Please reduce the length of the messages or completion.", )
        return input_ids, input_text