api_server.py 11 KB
Newer Older
1
import asyncio
2
3
import importlib
import inspect
4
import re
5
6
from contextlib import asynccontextmanager
from http import HTTPStatus
7
from typing import Optional, Set
8

Zhuohan Li's avatar
Zhuohan Li committed
9
import fastapi
10
import uvicorn
Ethan Xu's avatar
Ethan Xu committed
11
from fastapi import APIRouter, Request
Zhuohan Li's avatar
Zhuohan Li committed
12
13
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
14
15
from fastapi.responses import JSONResponse, Response, StreamingResponse
from prometheus_client import make_asgi_app
16
from starlette.routing import Mount
Zhuohan Li's avatar
Zhuohan Li committed
17

18
import vllm.envs as envs
Woosuk Kwon's avatar
Woosuk Kwon committed
19
20
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
21
from vllm.entrypoints.logger import RequestLogger
22
from vllm.entrypoints.openai.cli_args import make_arg_parser
23
24
# yapf conflicts with isort for this block
# yapf: disable
25
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
26
                                              ChatCompletionResponse,
27
                                              CompletionRequest,
28
29
30
31
32
33
                                              DetokenizeRequest,
                                              DetokenizeResponse,
                                              EmbeddingRequest, ErrorResponse,
                                              TokenizeRequest,
                                              TokenizeResponse)
# yapf: enable
34
35
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
36
from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
37
38
from vllm.entrypoints.openai.serving_tokenization import (
    OpenAIServingTokenization)
39
from vllm.logger import init_logger
yhu422's avatar
yhu422 committed
40
from vllm.usage.usage_lib import UsageContext
Ethan Xu's avatar
Ethan Xu committed
41
from vllm.utils import FlexibleArgumentParser
42
from vllm.version import __version__ as VLLM_VERSION
Zhuohan Li's avatar
Zhuohan Li committed
43

44
# Keep-alive timeout (in seconds) passed to uvicorn in run_server().
TIMEOUT_KEEP_ALIVE = 5

# Module-level handles. They are only declared here; run_server() assigns
# them via `global` before the HTTP server starts handling requests.
engine: AsyncLLMEngine
engine_args: AsyncEngineArgs
openai_serving_chat: OpenAIServingChat
openai_serving_completion: OpenAIServingCompletion
openai_serving_embedding: OpenAIServingEmbedding
openai_serving_tokenization: OpenAIServingTokenization

logger = init_logger("vllm.entrypoints.openai.api_server")

# Strong references to background tasks created in lifespan(); each task
# removes itself from this set when it finishes.
_running_tasks: Set[asyncio.Task] = set()
56

57

58
59
60
61
62
63
64
65
66
@asynccontextmanager
async def lifespan(app: fastapi.FastAPI):
    """FastAPI lifespan hook that starts periodic engine stats logging.

    Unless ``engine_args.disable_log_stats`` is set, a background task is
    scheduled that calls ``engine.do_log_stats()`` every 10 seconds. Control
    is then yielded to the application; nothing is done after the yield.
    """

    async def _force_log():
        # Runs until the task is cancelled or do_log_stats() raises.
        while True:
            await asyncio.sleep(10)
            await engine.do_log_stats()

    stats_logging_enabled = not engine_args.disable_log_stats
    if stats_logging_enabled:
        stats_task = asyncio.create_task(_force_log())
        # Keep a strong reference while the task is alive; it is dropped
        # again by the done-callback.
        _running_tasks.add(stats_task)
        stats_task.add_done_callback(_running_tasks.remove)

    yield


Ethan Xu's avatar
Ethan Xu committed
74
# Router that collects the HTTP endpoints declared below in this module.
router = APIRouter()
Zhuohan Li's avatar
Zhuohan Li committed
75

76
77
78
79
80
81
82

def mount_metrics(app: fastapi.FastAPI):
    """Expose the Prometheus ASGI app under ``/metrics`` on ``app``.

    The mount is appended directly to ``app.routes``.
    """
    prometheus_mount = Mount("/metrics", make_asgi_app())
    # Workaround for the 307 redirect on /metrics: override the mount's
    # path regex with one that matches "/metrics" followed by anything.
    prometheus_mount.path_regex = re.compile('^/metrics(?P<path>.*)$')
    app.routes.append(prometheus_mount)
83
84


Ethan Xu's avatar
Ethan Xu committed
85
@router.get("/health")
async def health() -> Response:
    """Health check: await the engine's check_health() and reply 200."""
    serving_engine = openai_serving_chat.engine
    await serving_engine.check_health()
    return Response(status_code=200)


Ethan Xu's avatar
Ethan Xu committed
92
@router.post("/tokenize")
async def tokenize(request: TokenizeRequest):
    """Handle POST /tokenize.

    Delegates to ``openai_serving_tokenization.create_tokenize`` and returns
    the result as JSON. An ``ErrorResponse`` is returned with its own
    ``code`` as the HTTP status.
    """
    result = await openai_serving_tokenization.create_tokenize(request)
    if not isinstance(result, ErrorResponse):
        assert isinstance(result, TokenizeResponse)
        return JSONResponse(content=result.model_dump())
    return JSONResponse(content=result.model_dump(),
                        status_code=result.code)


Ethan Xu's avatar
Ethan Xu committed
103
@router.post("/detokenize")
async def detokenize(request: DetokenizeRequest):
    """Handle POST /detokenize.

    Delegates to ``openai_serving_tokenization.create_detokenize`` and
    returns the result as JSON. An ``ErrorResponse`` is returned with its
    own ``code`` as the HTTP status.
    """
    result = await openai_serving_tokenization.create_detokenize(request)
    if not isinstance(result, ErrorResponse):
        assert isinstance(result, DetokenizeResponse)
        return JSONResponse(content=result.model_dump())
    return JSONResponse(content=result.model_dump(),
                        status_code=result.code)


Ethan Xu's avatar
Ethan Xu committed
114
@router.get("/v1/models")
async def show_available_models():
    """Handle GET /v1/models: return the serving layer's model list as JSON."""
    model_list = await openai_serving_completion.show_available_models()
    payload = model_list.model_dump()
    return JSONResponse(content=payload)
Zhuohan Li's avatar
Zhuohan Li committed
118
119


Ethan Xu's avatar
Ethan Xu committed
120
@router.get("/version")
async def show_version():
    """Handle GET /version: return ``{"version": VLLM_VERSION}`` as JSON."""
    return JSONResponse(content={"version": VLLM_VERSION})


Ethan Xu's avatar
Ethan Xu committed
126
@router.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest,
                                 raw_request: Request):
    """Handle POST /v1/chat/completions.

    Delegates to ``openai_serving_chat.create_chat_completion``. The reply
    is one of:
      * a JSON error carrying the ``ErrorResponse``'s own status code,
      * a ``text/event-stream`` streaming response when ``request.stream``
        is truthy,
      * a JSON dump of the ``ChatCompletionResponse`` otherwise.
    """
    result = await openai_serving_chat.create_chat_completion(
        request, raw_request)

    if isinstance(result, ErrorResponse):
        return JSONResponse(content=result.model_dump(),
                            status_code=result.code)

    if not request.stream:
        assert isinstance(result, ChatCompletionResponse)
        return JSONResponse(content=result.model_dump())

    return StreamingResponse(content=result, media_type="text/event-stream")
140
141


Ethan Xu's avatar
Ethan Xu committed
142
@router.post("/v1/completions")
async def create_completion(request: CompletionRequest, raw_request: Request):
    """Handle POST /v1/completions.

    Delegates to ``openai_serving_completion.create_completion``. The reply
    is one of:
      * a JSON error carrying the ``ErrorResponse``'s own status code,
      * a ``text/event-stream`` streaming response when ``request.stream``
        is truthy,
      * a JSON dump of the returned object otherwise.
    """
    result = await openai_serving_completion.create_completion(
        request, raw_request)

    if isinstance(result, ErrorResponse):
        return JSONResponse(content=result.model_dump(),
                            status_code=result.code)

    if not request.stream:
        return JSONResponse(content=result.model_dump())

    return StreamingResponse(content=result, media_type="text/event-stream")
Zhuohan Li's avatar
Zhuohan Li committed
154
155


Ethan Xu's avatar
Ethan Xu committed
156
@router.post("/v1/embeddings")
async def create_embedding(request: EmbeddingRequest, raw_request: Request):
    """Handle POST /v1/embeddings.

    Delegates to ``openai_serving_embedding.create_embedding`` and returns
    the result as JSON. An ``ErrorResponse`` is returned with its own
    ``code`` as the HTTP status.
    """
    result = await openai_serving_embedding.create_embedding(
        request, raw_request)

    if not isinstance(result, ErrorResponse):
        return JSONResponse(content=result.model_dump())

    return JSONResponse(content=result.model_dump(),
                        status_code=result.code)


Ethan Xu's avatar
Ethan Xu committed
167
168
169
170
def build_app(args):
    """Assemble the FastAPI application for the OpenAI-compatible server.

    Steps, in order:
      1. Create the app with the module's ``lifespan`` hook, include
         ``router`` and set ``root_path`` from ``args.root_path``.
      2. Mount the Prometheus metrics endpoint.
      3. Install CORS middleware from the ``allowed_*`` / ``allow_credentials``
         arguments.
      4. Register a handler that turns ``RequestValidationError`` into a
         400 response built by ``openai_serving_chat.create_error_response``.
      5. If an API key is configured (``envs.VLLM_API_KEY`` or
         ``args.api_key``), install bearer-token authentication for paths
         under ``{root_path}/v1``. OPTIONS requests are let through.
      6. Install every extra middleware named in ``args.middleware`` (dotted
         import paths resolving to a class or a coroutine function).

    Args:
        args: Parsed CLI arguments (namespace-like object).

    Returns:
        The configured ``fastapi.FastAPI`` instance.

    Raises:
        ValueError: If an entry of ``args.middleware`` resolves to something
            that is neither a class nor a coroutine function.
    """
    # Function-local import so this block does not rely on a file-level one.
    import secrets

    app = fastapi.FastAPI(lifespan=lifespan)
    app.include_router(router)
    app.root_path = args.root_path

    mount_metrics(app)

    app.add_middleware(
        CORSMiddleware,
        allow_origins=args.allowed_origins,
        allow_credentials=args.allow_credentials,
        allow_methods=args.allowed_methods,
        allow_headers=args.allowed_headers,
    )

    @app.exception_handler(RequestValidationError)
    async def validation_exception_handler(_, exc):
        err = openai_serving_chat.create_error_response(message=str(exc))
        return JSONResponse(err.model_dump(),
                            status_code=HTTPStatus.BAD_REQUEST)

    if token := envs.VLLM_API_KEY or args.api_key:
        # These values do not depend on the request, so compute them once.
        root_path = "" if args.root_path is None else args.root_path
        protected_prefix = f"{root_path}/v1"
        expected_header = ("Bearer " + token).encode("utf-8")

        @app.middleware("http")
        async def authentication(request: Request, call_next):
            if request.method == "OPTIONS":
                return await call_next(request)
            if not request.url.path.startswith(protected_prefix):
                return await call_next(request)
            # Compare as bytes with a constant-time primitive so the check
            # does not leak how much of the key matched through timing.
            # A missing header is treated as an empty value and rejected.
            supplied_header = request.headers.get("Authorization", "")
            if not secrets.compare_digest(supplied_header.encode("utf-8"),
                                          expected_header):
                return JSONResponse(content={"error": "Unauthorized"},
                                    status_code=401)
            return await call_next(request)

    for middleware in args.middleware:
        module_path, object_name = middleware.rsplit(".", 1)
        imported = getattr(importlib.import_module(module_path), object_name)
        if inspect.isclass(imported):
            app.add_middleware(imported)
        elif inspect.iscoroutinefunction(imported):
            app.middleware("http")(imported)
        else:
            raise ValueError(f"Invalid middleware {middleware}. "
                             f"Must be a function or a class.")

    return app


def run_server(args, llm_engine=None):
    """Build the app, initialise the engine and serving layers, run uvicorn.

    Populates the module-level ``engine``, ``engine_args`` and
    ``openai_serving_*`` globals, logs the available routes, and then blocks
    in ``uvicorn.run``.

    Args:
        args: Parsed CLI arguments (namespace-like object).
        llm_engine: Optional pre-built engine. When ``None``, one is created
            with ``AsyncLLMEngine.from_engine_args``.
    """
    global engine, engine_args
    global openai_serving_chat, openai_serving_completion
    global openai_serving_embedding, openai_serving_tokenization

    app = build_app(args)

    logger.info("vLLM API server version %s", VLLM_VERSION)
    logger.info("args: %s", args)

    # Fall back to the model path when no explicit served names were given.
    served_model_names = (args.served_model_name
                          if args.served_model_name is not None else
                          [args.model])

    engine_args = AsyncEngineArgs.from_cli_args(args)
    if llm_engine is None:
        engine = AsyncLLMEngine.from_engine_args(
            engine_args, usage_context=UsageContext.OPENAI_API_SERVER)
    else:
        engine = llm_engine

    event_loop: Optional[asyncio.AbstractEventLoop]
    try:
        event_loop = asyncio.get_running_loop()
    except RuntimeError:
        event_loop = None

    if event_loop is None or not event_loop.is_running():
        # When using single vLLM without engine_use_ray
        model_config = asyncio.run(engine.get_model_config())
    else:
        # If the current process is instanced by Ray Serve, there is
        # already a running event loop.
        # NOTE(review): asyncio documents run_until_complete() as raising
        # RuntimeError on a loop that is already running -- confirm this
        # branch behaves as intended.
        model_config = event_loop.run_until_complete(engine.get_model_config())

    request_logger = (None if args.disable_log_requests else RequestLogger(
        max_log_len=args.max_log_len))

    openai_serving_chat = OpenAIServingChat(
        engine,
        model_config,
        served_model_names,
        args.response_role,
        lora_modules=args.lora_modules,
        prompt_adapters=args.prompt_adapters,
        request_logger=request_logger,
        chat_template=args.chat_template,
    )
    openai_serving_completion = OpenAIServingCompletion(
        engine,
        model_config,
        served_model_names,
        lora_modules=args.lora_modules,
        prompt_adapters=args.prompt_adapters,
        request_logger=request_logger,
    )
    openai_serving_embedding = OpenAIServingEmbedding(
        engine,
        model_config,
        served_model_names,
        request_logger=request_logger,
    )
    openai_serving_tokenization = OpenAIServingTokenization(
        engine,
        model_config,
        served_model_names,
        lora_modules=args.lora_modules,
        request_logger=request_logger,
        chat_template=args.chat_template,
    )
    app.root_path = args.root_path

    logger.info("Available routes are:")
    for route in app.routes:
        # Entries without a `methods` attribute are skipped.
        if hasattr(route, 'methods'):
            logger.info("Route: %s, Methods: %s", route.path,
                        ', '.join(route.methods))

    uvicorn.run(
        app,
        host=args.host,
        port=args.port,
        log_level=args.uvicorn_log_level,
        timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
        ssl_keyfile=args.ssl_keyfile,
        ssl_certfile=args.ssl_certfile,
        ssl_ca_certs=args.ssl_ca_certs,
        ssl_cert_reqs=args.ssl_cert_reqs,
    )
Ethan Xu's avatar
Ethan Xu committed
308
309
310
311
312
313
314
315
316
317


if __name__ == "__main__":
    # NOTE(simon):
    # This section should be in sync with vllm/scripts.py for CLI entrypoints.
    parser = make_arg_parser(
        FlexibleArgumentParser(
            description="vLLM OpenAI-Compatible RESTful API server."))
    args = parser.parse_args()
    run_server(args)