api_server.py 9.38 KB
Newer Older
Zhuohan Li's avatar
Zhuohan Li committed
1
import argparse
2
import asyncio
Zhuohan Li's avatar
Zhuohan Li committed
3
import json
4
from contextlib import asynccontextmanager
5
6
7
8
import os
import importlib
import inspect

9
from prometheus_client import make_asgi_app
Zhuohan Li's avatar
Zhuohan Li committed
10
import fastapi
11
import uvicorn
12
from http import HTTPStatus
13
from fastapi import Request
Zhuohan Li's avatar
Zhuohan Li committed
14
15
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
16
from fastapi.responses import JSONResponse, StreamingResponse, Response
Zhuohan Li's avatar
Zhuohan Li committed
17

18
import vllm
Woosuk Kwon's avatar
Woosuk Kwon committed
19
20
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
21
from vllm.entrypoints.openai.protocol import CompletionRequest, ChatCompletionRequest, ErrorResponse
Woosuk Kwon's avatar
Woosuk Kwon committed
22
from vllm.logger import init_logger
23
24
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
25
from vllm.entrypoints.openai.serving_engine import LoRA
Zhuohan Li's avatar
Zhuohan Li committed
26

27
# Passed to uvicorn.run() as ``timeout_keep_alive`` when the server starts.
TIMEOUT_KEEP_ALIVE = 5  # seconds
Zhuohan Li's avatar
Zhuohan Li committed
28

29
30
# Module-level serving handlers used by the route functions in this file.
# They start as None and are assigned in the ``__main__`` block once the
# engine has been built from the CLI arguments.
openai_serving_chat: OpenAIServingChat = None
openai_serving_completion: OpenAIServingCompletion = None
Zhuohan Li's avatar
Zhuohan Li committed
31
logger = init_logger(__name__)
32
33


34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
@asynccontextmanager
async def lifespan(app: fastapi.FastAPI):
    """Run the periodic engine stats logger for the lifetime of the app.

    Unless ``engine_args.disable_log_stats`` is set, a background task is
    started that calls ``engine.do_log_stats()`` every 10 seconds. The task
    is cancelled when the application shuts down.

    ``engine`` and ``engine_args`` are module globals that the ``__main__``
    block assigns before the server is started.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).
    """

    async def _force_log():
        while True:
            await asyncio.sleep(10)
            await engine.do_log_stats()

    # Keep a strong reference to the task: the event loop only holds weak
    # references, so an unreferenced task may be garbage collected while it
    # is still running.
    log_task = None
    if not engine_args.disable_log_stats:
        log_task = asyncio.create_task(_force_log())

    try:
        yield
    finally:
        # Stop the otherwise endless logging loop on shutdown.
        if log_task is not None:
            log_task.cancel()


# The ASGI application; ``lifespan`` runs around server startup/shutdown.
app = fastapi.FastAPI(lifespan=lifespan)


51
52
53
54
55
56
57
58
59
60
class LoRAParserAction(argparse.Action):
    """argparse action that turns ``name=path`` strings into ``LoRA`` objects.

    Each value is split on its first ``=`` so that paths which themselves
    contain ``=`` are accepted. The resulting list of ``LoRA(name, path)``
    objects is stored on the namespace under ``self.dest``.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        """Parse ``values`` and store the list of LoRA modules.

        Raises:
            argparse.ArgumentError: If a value is not of the form
                ``name=path`` with a non-empty name and path. argparse
                reports this as a normal usage error.
        """
        lora_list = []
        for item in values:
            name, separator, path = item.partition('=')
            if not separator or not name or not path:
                raise argparse.ArgumentError(
                    self,
                    f"invalid LoRA module {item!r}: "
                    "expected the format name=path")
            lora_list.append(LoRA(name, path))
        setattr(namespace, self.dest, lora_list)


61
62
63
64
65
def parse_args():
    """Build the server's command-line parser and parse ``sys.argv``.

    The server-specific options (networking, CORS, API key, LoRA modules,
    chat template, SSL, root path and extra middleware) are registered
    first; ``AsyncEngineArgs.add_cli_args`` then adds the engine options.

    Returns:
        The ``argparse.Namespace`` produced by parsing the command line.
    """
    cli = argparse.ArgumentParser(
        description="vLLM OpenAI-Compatible RESTful API server.")

    # --- Network / uvicorn -------------------------------------------------
    cli.add_argument("--host", type=str, default=None, help="host name")
    cli.add_argument("--port", type=int, default=8000, help="port number")
    cli.add_argument(
        "--uvicorn-log-level",
        type=str,
        default="info",
        choices=['debug', 'info', 'warning', 'error', 'critical', 'trace'],
        help="log level for uvicorn",
    )

    # --- CORS (list-valued options are given as JSON) ------------------------
    cli.add_argument(
        "--allow-credentials",
        action="store_true",
        help="allow credentials",
    )
    cli.add_argument(
        "--allowed-origins",
        type=json.loads,
        default=["*"],
        help="allowed origins",
    )
    cli.add_argument(
        "--allowed-methods",
        type=json.loads,
        default=["*"],
        help="allowed methods",
    )
    cli.add_argument(
        "--allowed-headers",
        type=json.loads,
        default=["*"],
        help="allowed headers",
    )

    # --- Authentication and model naming -----------------------------------
    cli.add_argument(
        "--api-key",
        type=str,
        default=None,
        help="If provided, the server will require this key to be presented in the header.",
    )
    cli.add_argument(
        "--served-model-name",
        type=str,
        default=None,
        help="The model name used in the API. If not specified, the model name will be the same as the huggingface name.",
    )
    cli.add_argument(
        "--lora-modules",
        type=str,
        default=None,
        nargs='+',
        action=LoRAParserAction,
        help="LoRA module configurations in the format name=path. Multiple modules can be specified.",
    )

    # --- Chat behaviour ----------------------------------------------------
    cli.add_argument(
        "--chat-template",
        type=str,
        default=None,
        help="The file path to the chat template, or the template in single-line form for the specified model",
    )
    cli.add_argument(
        "--response-role",
        type=str,
        default="assistant",
        help="The role name to return if `request.add_generation_prompt=true`.",
    )

    # --- TLS and deployment ------------------------------------------------
    cli.add_argument(
        "--ssl-keyfile",
        type=str,
        default=None,
        help="The file path to the SSL key file",
    )
    cli.add_argument(
        "--ssl-certfile",
        type=str,
        default=None,
        help="The file path to the SSL cert file",
    )
    cli.add_argument(
        "--root-path",
        type=str,
        default=None,
        help="FastAPI root_path when app is behind a path based routing proxy",
    )
    cli.add_argument(
        "--middleware",
        type=str,
        action="append",
        default=[],
        help=("Additional ASGI middleware to apply to the app. "
              "We accept multiple --middleware arguments. "
              "The value should be an import path. "
              "If a function is provided, vLLM will add it to the server using @app.middleware('http'). "
              "If a class is provided, vLLM will add it to the server using app.add_middleware(). "),
    )

    # Engine options are appended last, then everything is parsed at once.
    return AsyncEngineArgs.add_cli_args(cli).parse_args()
Zhuohan Li's avatar
Zhuohan Li committed
147
148


149
150
151
# Add prometheus asgi middleware to route /metrics requests.
# make_asgi_app() returns an ASGI app, which is mounted as a sub-application
# so that every request under /metrics is handled by it.
metrics_app = make_asgi_app()
app.mount("/metrics", metrics_app)
152
153


Zhuohan Li's avatar
Zhuohan Li committed
154
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(_, exc):
    """Convert a request validation failure into a 400 JSON response.

    The error body is built by ``openai_serving_chat.create_error_response``
    from the stringified exception.
    """
    error_body = openai_serving_chat.create_error_response(message=str(exc))
    payload = error_body.model_dump()
    return JSONResponse(payload, status_code=HTTPStatus.BAD_REQUEST)
158
159


160
161
162
@app.get("/health")
async def health() -> Response:
    """Health check."""
    # Ask the engine behind the chat handler to check its health; if that
    # call returns normally, answer with an empty 200 response.
    serving_engine = openai_serving_chat.engine
    await serving_engine.check_health()
    return Response(status_code=200)


Zhuohan Li's avatar
Zhuohan Li committed
167
168
@app.get("/v1/models")
async def show_available_models():
    # Return the model list reported by the chat serving handler as JSON.
    model_list = await openai_serving_chat.show_available_models()
    payload = model_list.model_dump()
    return JSONResponse(content=payload)
Zhuohan Li's avatar
Zhuohan Li committed
171
172


173
174
175
176
177
178
@app.get("/version")
async def show_version():
    # Report the installed vLLM version as {"version": "<version>"}.
    return JSONResponse(content={"version": vllm.__version__})


179
@app.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest,
                                 raw_request: Request):
    # Delegate to the chat serving handler. It returns an ErrorResponse on
    # failure; otherwise the result is streamed or dumped to JSON depending
    # on ``request.stream``.
    result = await openai_serving_chat.create_chat_completion(
        request, raw_request)

    if isinstance(result, ErrorResponse):
        # Errors carry their own HTTP status code.
        return JSONResponse(content=result.model_dump(),
                            status_code=result.code)

    if not request.stream:
        return JSONResponse(content=result.model_dump())

    return StreamingResponse(content=result,
                             media_type="text/event-stream")
192
193


Zhuohan Li's avatar
Zhuohan Li committed
194
@app.post("/v1/completions")
async def create_completion(request: CompletionRequest, raw_request: Request):
    # Delegate to the completion serving handler. It returns an ErrorResponse
    # on failure; otherwise the result is streamed or dumped to JSON
    # depending on ``request.stream``.
    result = await openai_serving_completion.create_completion(
        request, raw_request)

    if isinstance(result, ErrorResponse):
        # Errors carry their own HTTP status code.
        return JSONResponse(content=result.model_dump(),
                            status_code=result.code)

    if not request.stream:
        return JSONResponse(content=result.model_dump())

    return StreamingResponse(content=result,
                             media_type="text/event-stream")
Zhuohan Li's avatar
Zhuohan Li committed
206
207
208


if __name__ == "__main__":
    # Script entry point: parse the CLI, install middleware, build the
    # engine and serving handlers, then run the app under uvicorn.

    # Imported locally; only the API-key check below needs it.
    import secrets

    args = parse_args()

    app.add_middleware(
        CORSMiddleware,
        allow_origins=args.allowed_origins,
        allow_credentials=args.allow_credentials,
        allow_methods=args.allowed_methods,
        allow_headers=args.allowed_headers,
    )

    # The VLLM_API_KEY environment variable takes precedence over --api-key.
    if token := os.environ.get("VLLM_API_KEY") or args.api_key:
        # Compare as bytes: compare_digest() rejects non-ASCII str values,
        # and header contents are supplied by the client.
        expected_auth = ("Bearer " + token).encode("utf-8")

        @app.middleware("http")
        async def authentication(request: Request, call_next):
            # CORS preflight requests are sent without credentials; let
            # them through so CORSMiddleware can answer them instead of
            # this middleware rejecting them with 401.
            if request.method == "OPTIONS":
                return await call_next(request)
            # Only the OpenAI-compatible routes under /v1 are protected.
            if not request.url.path.startswith("/v1"):
                return await call_next(request)
            provided_auth = request.headers.get("Authorization", "")
            # Constant-time comparison so response timing does not reveal
            # how much of the key matched.
            if not secrets.compare_digest(provided_auth.encode("utf-8"),
                                          expected_auth):
                return JSONResponse(content={"error": "Unauthorized"},
                                    status_code=401)
            return await call_next(request)

    # Each --middleware value is an import path "package.module.object".
    for middleware in args.middleware:
        module_path, separator, object_name = middleware.rpartition(".")
        if not separator or not module_path or not object_name:
            raise ValueError(
                f"Invalid middleware {middleware}. "
                "Must be an import path of the form 'module.object'.")
        imported = getattr(importlib.import_module(module_path), object_name)
        if inspect.isclass(imported):
            app.add_middleware(imported)
        elif inspect.iscoroutinefunction(imported):
            app.middleware("http")(imported)
        else:
            raise ValueError(
                f"Invalid middleware {middleware}. Must be a function or a class."
            )

    logger.info(f"vLLM API server version {vllm.__version__}")
    logger.info(f"args: {args}")

    # Fall back to the model argument when no served name was given.
    if args.served_model_name is not None:
        served_model = args.served_model_name
    else:
        served_model = args.model

    # ``engine`` and ``engine_args`` are module globals read by ``lifespan``.
    engine_args = AsyncEngineArgs.from_cli_args(args)
    engine = AsyncLLMEngine.from_engine_args(engine_args)
    openai_serving_chat = OpenAIServingChat(engine, served_model,
                                            args.response_role,
                                            args.lora_modules,
                                            args.chat_template)
    openai_serving_completion = OpenAIServingCompletion(
        engine, served_model, args.lora_modules)

    app.root_path = args.root_path
    uvicorn.run(app,
                host=args.host,
                port=args.port,
                log_level=args.uvicorn_log_level,
                timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
                ssl_keyfile=args.ssl_keyfile,
                ssl_certfile=args.ssl_certfile)