api_server.py 11.4 KB
Newer Older
1
import asyncio
2
3
import importlib
import inspect
4
import re
5
import signal
6
7
from contextlib import asynccontextmanager
from http import HTTPStatus
8
from typing import Optional, Set
9

10
11
12
import fastapi
import uvicorn
from fastapi import APIRouter, Request
Zhuohan Li's avatar
Zhuohan Li committed
13
14
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
15
16
from fastapi.responses import JSONResponse, Response, StreamingResponse
from prometheus_client import make_asgi_app
17
from starlette.routing import Mount
Zhuohan Li's avatar
Zhuohan Li committed
18

19
import vllm.envs as envs
Woosuk Kwon's avatar
Woosuk Kwon committed
20
21
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
22
from vllm.entrypoints.logger import RequestLogger
23
from vllm.entrypoints.openai.cli_args import make_arg_parser
24
25
# yapf conflicts with isort for this block
# yapf: disable
26
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
27
                                              ChatCompletionResponse,
28
                                              CompletionRequest,
29
30
31
32
33
34
                                              DetokenizeRequest,
                                              DetokenizeResponse,
                                              EmbeddingRequest, ErrorResponse,
                                              TokenizeRequest,
                                              TokenizeResponse)
# yapf: enable
35
36
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
37
from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
38
39
from vllm.entrypoints.openai.serving_tokenization import (
    OpenAIServingTokenization)
40
from vllm.logger import init_logger
yhu422's avatar
yhu422 committed
41
from vllm.usage.usage_lib import UsageContext
Ethan Xu's avatar
Ethan Xu committed
42
from vllm.utils import FlexibleArgumentParser
43
from vllm.version import __version__ as VLLM_VERSION
Zhuohan Li's avatar
Zhuohan Li committed
44

45
# Passed to uvicorn.Config as ``timeout_keep_alive`` in build_server().
TIMEOUT_KEEP_ALIVE = 5  # seconds
Zhuohan Li's avatar
Zhuohan Li committed
46

Ethan Xu's avatar
Ethan Xu committed
47
48
# Module-level engine handle and the arguments it was built from.
# Only declared here; both are assigned in build_server().
engine: AsyncLLMEngine
engine_args: AsyncEngineArgs
49
50
# Handlers backing the chat-completion and completion endpoints.
# Only declared here; both are assigned in build_server().
openai_serving_chat: OpenAIServingChat
openai_serving_completion: OpenAIServingCompletion
51
# Handler backing the embeddings endpoint; assigned in build_server().
openai_serving_embedding: OpenAIServingEmbedding
52
# Handler backing /tokenize and /detokenize; assigned in build_server().
openai_serving_tokenization: OpenAIServingTokenization
53

54
# Module logger; the logger name is spelled out explicitly.
logger = init_logger('vllm.entrypoints.openai.api_server')
55

56
# Background tasks started by lifespan(). Each task is added on creation and
# removes itself from the set through a done-callback when it finishes.
_running_tasks: Set[asyncio.Task] = set()
57

58

59
@asynccontextmanager
async def lifespan(app: fastapi.FastAPI):
    """Lifespan hook that starts periodic engine stats logging.

    Unless ``engine_args.disable_log_stats`` is set, a background task is
    scheduled before control is handed back to the application. Nothing is
    done after the ``yield``.
    """

    async def _log_stats_forever():
        # Sleep first, then log: one stats report every 10 seconds.
        while True:
            await asyncio.sleep(10)
            await engine.do_log_stats()

    if not engine_args.disable_log_stats:
        stats_task = asyncio.create_task(_log_stats_forever())
        # Keep a reference in the module-level set for as long as the task
        # is alive; the done-callback drops it again.
        _running_tasks.add(stats_task)
        stats_task.add_done_callback(_running_tasks.remove)

    yield


Ethan Xu's avatar
Ethan Xu committed
75
# Collects the HTTP endpoints defined in this module; attached to the
# application in build_app() via ``app.include_router``.
router = APIRouter()
Zhuohan Li's avatar
Zhuohan Li committed
76

77

78
def mount_metrics(app: fastapi.FastAPI):
    """Expose the Prometheus ASGI app under ``/metrics`` on *app*.

    The mount's ``path_regex`` is overridden as a workaround for the
    307 redirect otherwise seen on ``/metrics``.
    """
    prometheus_mount = Mount("/metrics", make_asgi_app())
    # Workaround for 307 Redirect for /metrics
    prometheus_mount.path_regex = re.compile('^/metrics(?P<path>.*)$')
    app.routes.append(prometheus_mount)
84
85


Ethan Xu's avatar
Ethan Xu committed
86
@router.get("/health")
async def health() -> Response:
    """Health check.

    Awaits the engine's ``check_health()`` and then replies with an empty
    200 response; anything raised by the check propagates to the caller.
    """
    serving_engine = openai_serving_chat.engine
    await serving_engine.check_health()
    return Response(status_code=200)


Ethan Xu's avatar
Ethan Xu committed
93
@router.post("/tokenize")
async def tokenize(request: TokenizeRequest):
    """Handle ``POST /tokenize``.

    Delegates to the tokenization handler. An ``ErrorResponse`` is returned
    as JSON with its own status code; otherwise the ``TokenizeResponse`` is
    dumped to JSON.
    """
    result = await openai_serving_tokenization.create_tokenize(request)
    if isinstance(result, ErrorResponse):
        return JSONResponse(content=result.model_dump(),
                            status_code=result.code)

    assert isinstance(result, TokenizeResponse)
    return JSONResponse(content=result.model_dump())


Ethan Xu's avatar
Ethan Xu committed
104
@router.post("/detokenize")
async def detokenize(request: DetokenizeRequest):
    """Handle ``POST /detokenize``.

    Delegates to the tokenization handler. An ``ErrorResponse`` is returned
    as JSON with its own status code; otherwise the ``DetokenizeResponse``
    is dumped to JSON.
    """
    result = await openai_serving_tokenization.create_detokenize(request)
    if isinstance(result, ErrorResponse):
        return JSONResponse(content=result.model_dump(),
                            status_code=result.code)

    assert isinstance(result, DetokenizeResponse)
    return JSONResponse(content=result.model_dump())


Ethan Xu's avatar
Ethan Xu committed
115
@router.get("/v1/models")
async def show_available_models():
    """Handle ``GET /v1/models`` by returning the handler's model list."""
    model_list = await openai_serving_completion.show_available_models()
    payload = model_list.model_dump()
    return JSONResponse(content=payload)
Zhuohan Li's avatar
Zhuohan Li committed
119
120


Ethan Xu's avatar
Ethan Xu committed
121
@router.get("/version")
async def show_version():
    """Handle ``GET /version`` by returning ``{"version": VLLM_VERSION}``."""
    return JSONResponse(content={"version": VLLM_VERSION})


Ethan Xu's avatar
Ethan Xu committed
127
@router.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest,
                                 raw_request: Request):
    """Handle ``POST /v1/chat/completions``.

    An ``ErrorResponse`` from the handler is returned as JSON with its own
    status code. For streaming requests the handler's result is passed
    straight to a ``text/event-stream`` response; otherwise the
    ``ChatCompletionResponse`` is dumped to JSON.
    """
    result = await openai_serving_chat.create_chat_completion(
        request, raw_request)

    if isinstance(result, ErrorResponse):
        return JSONResponse(content=result.model_dump(),
                            status_code=result.code)

    if not request.stream:
        assert isinstance(result, ChatCompletionResponse)
        return JSONResponse(content=result.model_dump())

    return StreamingResponse(content=result, media_type="text/event-stream")
141
142


Ethan Xu's avatar
Ethan Xu committed
143
@router.post("/v1/completions")
async def create_completion(request: CompletionRequest, raw_request: Request):
    """Handle ``POST /v1/completions``.

    An ``ErrorResponse`` from the handler is returned as JSON with its own
    status code. For streaming requests the handler's result is passed
    straight to a ``text/event-stream`` response; otherwise the result is
    dumped to JSON.
    """
    result = await openai_serving_completion.create_completion(
        request, raw_request)

    if isinstance(result, ErrorResponse):
        return JSONResponse(content=result.model_dump(),
                            status_code=result.code)

    if not request.stream:
        return JSONResponse(content=result.model_dump())

    return StreamingResponse(content=result, media_type="text/event-stream")
Zhuohan Li's avatar
Zhuohan Li committed
155
156


Ethan Xu's avatar
Ethan Xu committed
157
@router.post("/v1/embeddings")
async def create_embedding(request: EmbeddingRequest, raw_request: Request):
    """Handle ``POST /v1/embeddings``.

    An ``ErrorResponse`` from the handler is returned as JSON with its own
    status code; any other result is dumped to JSON with the default status.
    """
    result = await openai_serving_embedding.create_embedding(
        request, raw_request)

    if isinstance(result, ErrorResponse):
        return JSONResponse(content=result.model_dump(),
                            status_code=result.code)

    return JSONResponse(content=result.model_dump())


168
169
def build_app(args):
    """Create and configure the FastAPI application.

    Registers the module-level router, mounts the Prometheus ``/metrics``
    endpoint, installs the CORS middleware and a request-validation error
    handler, optionally installs bearer-token authentication, and finally
    adds every extra middleware named in ``args.middleware``.

    Args:
        args: Parsed CLI arguments. Attributes read here: ``root_path``,
            ``allowed_origins``, ``allow_credentials``, ``allowed_methods``,
            ``allowed_headers``, ``api_key`` and ``middleware``.

    Returns:
        The configured ``fastapi.FastAPI`` instance.

    Raises:
        ValueError: If an entry of ``args.middleware`` resolves to an object
            that is neither a class nor a coroutine function.
    """
    # Imported locally so this function stays self-contained.
    import secrets

    app = fastapi.FastAPI(lifespan=lifespan)
    app.include_router(router)
    app.root_path = args.root_path

    mount_metrics(app)

    app.add_middleware(
        CORSMiddleware,
        allow_origins=args.allowed_origins,
        allow_credentials=args.allow_credentials,
        allow_methods=args.allowed_methods,
        allow_headers=args.allowed_headers,
    )

    @app.exception_handler(RequestValidationError)
    async def validation_exception_handler(_, exc):
        # Report request validation failures in the server's error format.
        err = openai_serving_chat.create_error_response(message=str(exc))
        return JSONResponse(err.model_dump(),
                            status_code=HTTPStatus.BAD_REQUEST)

    # The environment variable takes precedence over the CLI argument.
    if token := envs.VLLM_API_KEY or args.api_key:
        # Both values are fixed for the lifetime of the app, so they are
        # computed once instead of on every request.
        root_path = "" if args.root_path is None else args.root_path
        protected_prefix = f"{root_path}/v1"
        expected_auth = ("Bearer " + token).encode()

        @app.middleware("http")
        async def authentication(request: Request, call_next):
            # OPTIONS requests are passed through unauthenticated.
            if request.method == "OPTIONS":
                return await call_next(request)
            # Only paths under <root_path>/v1 require the token.
            if not request.url.path.startswith(protected_prefix):
                return await call_next(request)
            supplied_auth = request.headers.get("Authorization", "")
            # Constant-time comparison: the header is untrusted input and a
            # plain ``!=`` can leak how much of the token matched via timing.
            # Bytes are compared because compare_digest rejects non-ASCII str.
            if not secrets.compare_digest(supplied_auth.encode(),
                                          expected_auth):
                return JSONResponse(content={"error": "Unauthorized"},
                                    status_code=401)
            return await call_next(request)

    for middleware in args.middleware:
        # Each entry is a dotted path "package.module.object".
        module_path, object_name = middleware.rsplit(".", 1)
        imported = getattr(importlib.import_module(module_path), object_name)
        if inspect.isclass(imported):
            app.add_middleware(imported)
        elif inspect.iscoroutinefunction(imported):
            app.middleware("http")(imported)
        else:
            raise ValueError(f"Invalid middleware {middleware}. "
                             f"Must be a function or a class.")

    return app


217
218
219
220
221
async def build_server(
    args,
    llm_engine: Optional[AsyncLLMEngine] = None,
    **uvicorn_kwargs,
) -> uvicorn.Server:
    """Build the app, the engine and the serving handlers; return a server.

    Populates the module-level ``engine``, ``engine_args`` and the four
    ``openai_serving_*`` globals that the route handlers read, logs the
    registered routes, and wraps the app in a ``uvicorn.Server``.

    Args:
        args: Parsed CLI arguments.
        llm_engine: Engine to use; when ``None`` one is created from
            ``args`` via ``AsyncLLMEngine.from_engine_args``.
        **uvicorn_kwargs: Extra keyword arguments forwarded to
            ``uvicorn.Config``.

    Returns:
        A ``uvicorn.Server`` that has not been started.
    """
    global engine, engine_args
    global openai_serving_chat, openai_serving_completion
    global openai_serving_embedding, openai_serving_tokenization

    app = build_app(args)

    # Fall back to the model path/name when no served name was given.
    served_model_names = args.served_model_name
    if served_model_names is None:
        served_model_names = [args.model]

    engine_args = AsyncEngineArgs.from_cli_args(args)
    if llm_engine is None:
        engine = AsyncLLMEngine.from_engine_args(
            engine_args, usage_context=UsageContext.OPENAI_API_SERVER)
    else:
        engine = llm_engine

    model_config = await engine.get_model_config()

    request_logger = (None if args.disable_log_requests else RequestLogger(
        max_log_len=args.max_log_len))

    openai_serving_chat = OpenAIServingChat(
        engine,
        model_config,
        served_model_names,
        args.response_role,
        lora_modules=args.lora_modules,
        prompt_adapters=args.prompt_adapters,
        request_logger=request_logger,
        chat_template=args.chat_template,
        return_tokens_as_token_ids=args.return_tokens_as_token_ids,
    )
    openai_serving_completion = OpenAIServingCompletion(
        engine,
        model_config,
        served_model_names,
        lora_modules=args.lora_modules,
        prompt_adapters=args.prompt_adapters,
        request_logger=request_logger,
        return_tokens_as_token_ids=args.return_tokens_as_token_ids,
    )
    openai_serving_embedding = OpenAIServingEmbedding(
        engine,
        model_config,
        served_model_names,
        request_logger=request_logger,
    )
    openai_serving_tokenization = OpenAIServingTokenization(
        engine,
        model_config,
        served_model_names,
        lora_modules=args.lora_modules,
        request_logger=request_logger,
        chat_template=args.chat_template,
    )
    app.root_path = args.root_path

    logger.info("Available routes are:")
    for route in app.routes:
        # Routes without a ``methods`` attribute are not listed.
        if hasattr(route, 'methods'):
            methods = ', '.join(route.methods)
            logger.info("Route: %s, Methods: %s", route.path, methods)

    server_config = uvicorn.Config(
        app,
        host=args.host,
        port=args.port,
        log_level=args.uvicorn_log_level,
        timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
        ssl_keyfile=args.ssl_keyfile,
        ssl_certfile=args.ssl_certfile,
        ssl_ca_certs=args.ssl_ca_certs,
        ssl_cert_reqs=args.ssl_cert_reqs,
        **uvicorn_kwargs,
    )
    return uvicorn.Server(server_config)


async def run_server(args, llm_engine=None, **uvicorn_kwargs) -> None:
    """Build the server and serve until SIGINT/SIGTERM or cancellation.

    Args:
        args: Parsed CLI arguments, forwarded to ``build_server``.
        llm_engine: Optional pre-built engine, forwarded to
            ``build_server``.
        **uvicorn_kwargs: Extra keyword arguments forwarded to
            ``build_server``.
    """
    logger.info("vLLM API server version %s", VLLM_VERSION)
    logger.info("args: %s", args)

    server = await build_server(
        args,
        llm_engine,
        **uvicorn_kwargs,
    )

    loop = asyncio.get_running_loop()

    server_task = loop.create_task(server.serve())

    def signal_handler() -> None:
        # Prevents the uvicorn signal handler from exiting early: the serve
        # task is cancelled so shutdown is driven from the except branch.
        server_task.cancel()

    loop.add_signal_handler(signal.SIGINT, signal_handler)
    loop.add_signal_handler(signal.SIGTERM, signal_handler)

    try:
        await server_task
    except asyncio.CancelledError:
        # Use the module logger, like every other message in this file,
        # rather than writing to stdout with print().
        logger.info("Gracefully stopping http server")
        await server.shutdown()

Ethan Xu's avatar
Ethan Xu committed
334
335
336
337
338
339
340
341

if __name__ == "__main__":
    # NOTE(simon):
    # This section should be in sync with vllm/scripts.py for CLI entrypoints.
    parser = make_arg_parser(
        FlexibleArgumentParser(
            description="vLLM OpenAI-Compatible RESTful API server."))
    args = parser.parse_args()
    asyncio.run(run_server(args))