api_server.py 10 KB
Newer Older
Zhuohan Li's avatar
Zhuohan Li committed
1
import argparse
2
import asyncio
Zhuohan Li's avatar
Zhuohan Li committed
3
import json
4
from contextlib import asynccontextmanager
5
6
7
import os
import importlib
import inspect
8
import ssl
9

10
from prometheus_client import make_asgi_app
Zhuohan Li's avatar
Zhuohan Li committed
11
import fastapi
12
import uvicorn
13
from http import HTTPStatus
14
from fastapi import Request
Zhuohan Li's avatar
Zhuohan Li committed
15
16
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
17
from fastapi.responses import JSONResponse, StreamingResponse, Response
Zhuohan Li's avatar
Zhuohan Li committed
18

19
import vllm
Woosuk Kwon's avatar
Woosuk Kwon committed
20
21
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
22
23
24
from vllm.entrypoints.openai.protocol import (CompletionRequest,
                                              ChatCompletionRequest,
                                              ErrorResponse)
Woosuk Kwon's avatar
Woosuk Kwon committed
25
from vllm.logger import init_logger
26
27
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
28
from vllm.entrypoints.openai.serving_engine import LoRA
Zhuohan Li's avatar
Zhuohan Li committed
29

30
TIMEOUT_KEEP_ALIVE = 5  # seconds
Zhuohan Li's avatar
Zhuohan Li committed
31

32
33
openai_serving_chat: OpenAIServingChat = None
openai_serving_completion: OpenAIServingCompletion = None
Zhuohan Li's avatar
Zhuohan Li committed
34
logger = init_logger(__name__)
35
36


37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
@asynccontextmanager
async def lifespan(app: fastapi.FastAPI):
    """Lifespan handler that runs periodic engine stats logging.

    Unless ``engine_args.disable_log_stats`` is set, a background task is
    started that calls ``engine.do_log_stats()`` every 10 seconds. The task
    is cancelled again when the application shuts down.

    ``engine`` and ``engine_args`` are module-level names that are assigned
    in the ``__main__`` section before the server starts.
    """

    async def _force_log():
        while True:
            await asyncio.sleep(10)
            await engine.do_log_stats()

    stats_task = None
    if not engine_args.disable_log_stats:
        # Keep a strong reference: the event loop only holds weak references
        # to tasks, so an unreferenced task may be garbage-collected while it
        # is still supposed to be running.
        stats_task = asyncio.create_task(_force_log())

    try:
        yield
    finally:
        # Stop the otherwise endless logging loop on shutdown.
        if stats_task is not None:
            stats_task.cancel()


# ASGI application object; the `lifespan` context manager defined above is
# passed in as its lifespan handler.
app = fastapi.FastAPI(lifespan=lifespan)


54
55
56
57
58
59
60
61
62
63
class LoRAParserAction(argparse.Action):
    """argparse action that converts ``name=path`` items into ``LoRA`` objects.

    Each value is split on its first ``=`` so that the path part may itself
    contain ``=`` characters. The resulting list of ``LoRA(name, path)``
    objects is stored on the namespace under ``self.dest``.

    Raises:
        argparse.ArgumentError: if an item has no ``=`` separator, or if its
            name or path part is empty.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        lora_list = []
        for item in values:
            name, separator, path = item.partition('=')
            if not (separator and name and path):
                # Raised as ArgumentError so the failure names the offending
                # option instead of surfacing as a bare unpacking ValueError.
                raise argparse.ArgumentError(
                    self,
                    f"invalid LoRA module {item!r}: expected name=path")
            lora_list.append(LoRA(name, path))
        setattr(namespace, self.dest, lora_list)


64
65
66
67
68
def parse_args():
    """Build the server's command-line parser and parse the process arguments.

    The server-specific options are registered first; the engine options are
    then added through ``AsyncEngineArgs.add_cli_args`` before parsing.

    Returns:
        The namespace produced by ``parse_args()`` on the combined parser.
    """
    cli = argparse.ArgumentParser(
        description="vLLM OpenAI-Compatible RESTful API server.")
    add = cli.add_argument

    # Network binding and uvicorn logging.
    add("--host", type=str, default=None, help="host name")
    add("--port", type=int, default=8000, help="port number")
    add("--uvicorn-log-level",
        type=str,
        default="info",
        choices=['debug', 'info', 'warning', 'error', 'critical', 'trace'],
        help="log level for uvicorn")

    # CORS settings; the list-valued options are parsed as JSON.
    add("--allow-credentials", action="store_true", help="allow credentials")
    add("--allowed-origins",
        type=json.loads,
        default=["*"],
        help="allowed origins")
    add("--allowed-methods",
        type=json.loads,
        default=["*"],
        help="allowed methods")
    add("--allowed-headers",
        type=json.loads,
        default=["*"],
        help="allowed headers")

    # Authentication and model naming.
    add("--api-key",
        type=str,
        default=None,
        help="If provided, the server will require this key "
        "to be presented in the header.")
    add("--served-model-name",
        type=str,
        default=None,
        help="The model name used in the API. If not "
        "specified, the model name will be the same as "
        "the huggingface name.")

    # LoRA adapters, converted by LoRAParserAction.
    add("--lora-modules",
        type=str,
        default=None,
        nargs='+',
        action=LoRAParserAction,
        help="LoRA module configurations in the format name=path. "
        "Multiple modules can be specified.")

    # Chat endpoint behaviour.
    add("--chat-template",
        type=str,
        default=None,
        help="The file path to the chat template, "
        "or the template in single-line form "
        "for the specified model")
    add("--response-role",
        type=str,
        default="assistant",
        help="The role name to return if "
        "`request.add_generation_prompt=true`.")

    # TLS options handed to uvicorn.
    add("--ssl-keyfile",
        type=str,
        default=None,
        help="The file path to the SSL key file")
    add("--ssl-certfile",
        type=str,
        default=None,
        help="The file path to the SSL cert file")
    add("--ssl-ca-certs",
        type=str,
        default=None,
        help="The CA certificates file")
    add("--ssl-cert-reqs",
        type=int,
        default=int(ssl.CERT_NONE),
        help="Whether client certificate is required (see stdlib ssl module's)"
        )

    # Deployment behind a proxy and user-supplied middleware.
    add("--root-path",
        type=str,
        default=None,
        help="FastAPI root_path when app is behind a path based routing proxy")
    add("--middleware",
        type=str,
        action="append",
        default=[],
        help="Additional ASGI middleware to apply to the app. "
        "We accept multiple --middleware arguments. "
        "The value should be an import path. "
        "If a function is provided, vLLM will add it to the server "
        "using @app.middleware('http'). "
        "If a class is provided, vLLM will add it to the server "
        "using app.add_middleware(). ")

    # Engine options are appended last, then everything is parsed together.
    return AsyncEngineArgs.add_cli_args(cli).parse_args()
Zhuohan Li's avatar
Zhuohan Li committed
158
159


160
161
162
# Add prometheus asgi middleware to route /metrics requests:
# the ASGI app returned by prometheus_client's make_asgi_app() is mounted
# on the main application under the /metrics path.
metrics_app = make_asgi_app()
app.mount("/metrics", metrics_app)
163
164


Zhuohan Li's avatar
Zhuohan Li committed
165
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(_, exc):
    """Answer a request validation failure with an HTTP 400 JSON error.

    The error body is built by ``openai_serving_chat.create_error_response``
    from the string form of the exception.
    """
    error_response = openai_serving_chat.create_error_response(
        message=str(exc))
    payload = error_response.model_dump()
    return JSONResponse(payload, status_code=HTTPStatus.BAD_REQUEST)
169
170


171
172
173
@app.get("/health")
async def health() -> Response:
    """Health check.

    Awaits the engine's ``check_health()`` and, once it returns, replies
    with an empty 200 response.
    """
    serving_engine = openai_serving_chat.engine
    await serving_engine.check_health()
    return Response(status_code=200)


Zhuohan Li's avatar
Zhuohan Li committed
178
179
@app.get("/v1/models")
async def show_available_models():
    """Return the model list reported by the chat serving layer as JSON."""
    model_list = await openai_serving_chat.show_available_models()
    body = model_list.model_dump()
    return JSONResponse(content=body)
Zhuohan Li's avatar
Zhuohan Li committed
182
183


184
185
186
187
188
189
@app.get("/version")
async def show_version():
    """Return ``vllm.__version__`` as a JSON object under the key "version"."""
    return JSONResponse(content={"version": vllm.__version__})


190
@app.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest,
                                 raw_request: Request):
    """Handle a chat completion request.

    Delegates to ``openai_serving_chat.create_chat_completion`` and wraps the
    outcome: an ``ErrorResponse`` becomes a JSON response carrying its own
    status code, a streaming request becomes a ``text/event-stream``
    response, and anything else is dumped as a JSON body.
    """
    outcome = await openai_serving_chat.create_chat_completion(
        request, raw_request)

    # Errors are reported with the status code stored on the error object.
    if isinstance(outcome, ErrorResponse):
        return JSONResponse(content=outcome.model_dump(),
                            status_code=outcome.code)

    if not request.stream:
        return JSONResponse(content=outcome.model_dump())

    return StreamingResponse(content=outcome, media_type="text/event-stream")
203
204


Zhuohan Li's avatar
Zhuohan Li committed
205
@app.post("/v1/completions")
async def create_completion(request: CompletionRequest, raw_request: Request):
    """Handle a text completion request.

    Delegates to ``openai_serving_completion.create_completion`` and wraps
    the outcome: an ``ErrorResponse`` becomes a JSON response carrying its
    own status code, a streaming request becomes a ``text/event-stream``
    response, and anything else is dumped as a JSON body.
    """
    outcome = await openai_serving_completion.create_completion(
        request, raw_request)

    # Errors are reported with the status code stored on the error object.
    if isinstance(outcome, ErrorResponse):
        return JSONResponse(content=outcome.model_dump(),
                            status_code=outcome.code)

    if not request.stream:
        return JSONResponse(content=outcome.model_dump())

    return StreamingResponse(content=outcome, media_type="text/event-stream")
Zhuohan Li's avatar
Zhuohan Li committed
217
218
219


if __name__ == "__main__":
    # Script entry point: parse the CLI, install middleware, build the engine
    # and the two serving objects, then hand the app to uvicorn.
    args = parse_args()

    app.add_middleware(
        CORSMiddleware,
        allow_origins=args.allowed_origins,
        allow_credentials=args.allow_credentials,
        allow_methods=args.allowed_methods,
        allow_headers=args.allowed_headers,
    )

    # The VLLM_API_KEY environment variable takes precedence over --api-key.
    if token := os.environ.get("VLLM_API_KEY") or args.api_key:
        # Standard-library module, imported here because it is only needed
        # when an API key is configured.
        import secrets

        expected_auth = ("Bearer " + token).encode("utf-8")

        @app.middleware("http")
        async def authentication(request: Request, call_next):
            """Require the bearer token on every request under /v1."""
            # This middleware is registered after CORSMiddleware and so runs
            # before it. CORS preflight requests do not carry the
            # Authorization header, so let them through to the CORS layer
            # instead of answering 401.
            if request.method == "OPTIONS":
                return await call_next(request)
            if not request.url.path.startswith("/v1"):
                return await call_next(request)
            supplied_auth = request.headers.get("Authorization", "")
            # Constant-time comparison; bytes are used because
            # compare_digest rejects non-ASCII str arguments.
            if not secrets.compare_digest(supplied_auth.encode("utf-8"),
                                          expected_auth):
                return JSONResponse(content={"error": "Unauthorized"},
                                    status_code=401)
            return await call_next(request)

    # Each --middleware value is an import path "package.module.object".
    for middleware in args.middleware:
        module_path, object_name = middleware.rsplit(".", 1)
        imported = getattr(importlib.import_module(module_path), object_name)
        if inspect.isclass(imported):
            app.add_middleware(imported)
        elif inspect.iscoroutinefunction(imported):
            app.middleware("http")(imported)
        else:
            raise ValueError(f"Invalid middleware {middleware}. "
                             f"Must be a function or a class.")

    logger.info(f"vLLM API server version {vllm.__version__}")
    logger.info(f"args: {args}")

    # Fall back to the engine's model name when no served name was given.
    if args.served_model_name is not None:
        served_model = args.served_model_name
    else:
        served_model = args.model

    engine_args = AsyncEngineArgs.from_cli_args(args)
    engine = AsyncLLMEngine.from_engine_args(engine_args)
    openai_serving_chat = OpenAIServingChat(engine, served_model,
                                            args.response_role,
                                            args.lora_modules,
                                            args.chat_template)
    openai_serving_completion = OpenAIServingCompletion(
        engine, served_model, args.lora_modules)

    app.root_path = args.root_path
    uvicorn.run(app,
                host=args.host,
                port=args.port,
                log_level=args.uvicorn_log_level,
                timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
                ssl_keyfile=args.ssl_keyfile,
                ssl_certfile=args.ssl_certfile,
                ssl_ca_certs=args.ssl_ca_certs,
                ssl_cert_reqs=args.ssl_cert_reqs)