api_server.py 9.11 KB
Newer Older
Zhuohan Li's avatar
Zhuohan Li committed
1
import argparse
2
import asyncio
Zhuohan Li's avatar
Zhuohan Li committed
3
import json
4
from contextlib import asynccontextmanager
5
6
7
8
import os
import importlib
import inspect

9
10
from aioprometheus import MetricsMiddleware
from aioprometheus.asgi.starlette import metrics
Zhuohan Li's avatar
Zhuohan Li committed
11
import fastapi
12
import uvicorn
13
from http import HTTPStatus
14
from fastapi import Request
Zhuohan Li's avatar
Zhuohan Li committed
15
16
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
17
from fastapi.responses import JSONResponse, StreamingResponse, Response
Zhuohan Li's avatar
Zhuohan Li committed
18

Woosuk Kwon's avatar
Woosuk Kwon committed
19
20
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
21
from vllm.engine.metrics import add_global_metrics_labels
22
from vllm.entrypoints.openai.protocol import CompletionRequest, ChatCompletionRequest, ErrorResponse
Woosuk Kwon's avatar
Woosuk Kwon committed
23
from vllm.logger import init_logger
24
25
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
26
from vllm.entrypoints.openai.serving_engine import LoRA
Zhuohan Li's avatar
Zhuohan Li committed
27

28
# Passed to uvicorn.run() as `timeout_keep_alive` when the server is started.
TIMEOUT_KEEP_ALIVE = 5  # seconds
Zhuohan Li's avatar
Zhuohan Li committed
29

30
31
# Serving handlers used by the route functions in this module. They stay
# None until the `__main__` block constructs them from the parsed arguments.
openai_serving_chat: OpenAIServingChat = None
openai_serving_completion: OpenAIServingCompletion = None
Zhuohan Li's avatar
Zhuohan Li committed
32
logger = init_logger(__name__)
33
34


35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
@asynccontextmanager
async def lifespan(app: fastapi.FastAPI):
    """Run the periodic engine stats logger for the lifetime of the app.

    Unless ``engine_args.disable_log_stats`` is set, a background task calls
    ``engine.do_log_stats()`` every 10 seconds.  ``engine`` and
    ``engine_args`` are module-level names assigned by the ``__main__`` block
    before the server is started.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).
    """

    async def _force_log():
        while True:
            await asyncio.sleep(10)
            await engine.do_log_stats()

    # The event loop keeps only a weak reference to tasks, so hold a strong
    # reference here; otherwise the logger may be garbage-collected mid-run.
    log_task = None
    if not engine_args.disable_log_stats:
        log_task = asyncio.create_task(_force_log())

    try:
        yield
    finally:
        # Stop the logger on shutdown instead of leaving it pending forever.
        if log_task is not None:
            log_task.cancel()


# FastAPI application object; the routes and middleware in this module are
# registered on it, and `lifespan` is supplied as its lifespan handler.
app = fastapi.FastAPI(lifespan=lifespan)


52
53
54
55
56
57
58
59
60
61
class LoRAParserAction(argparse.Action):
    """argparse action that turns ``name=path`` strings into ``LoRA`` objects."""

    def __call__(self, parser, namespace, values, option_string=None):
        """Parse every ``name=path`` item and store the result on ``namespace``.

        Args:
            parser: The ArgumentParser invoking this action (unused).
            namespace: Namespace that receives the parsed list under
                ``self.dest``.
            values: Iterable of raw ``name=path`` strings.
            option_string: The option string that triggered the action
                (unused).

        Raises:
            argparse.ArgumentError: If an item contains no ``=`` separator.
        """
        lora_list = []
        for item in values:
            # Split on the first '=' only, so a path that itself contains
            # '=' is kept intact instead of breaking the unpacking.
            name, separator, path = item.partition('=')
            if not separator:
                raise argparse.ArgumentError(
                    self,
                    f"invalid LoRA module {item!r}: expected the form name=path")
            lora_list.append(LoRA(name, path))
        setattr(namespace, self.dest, lora_list)


62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
def parse_args():
    """Build the server's command-line parser and parse ``sys.argv``.

    Server-specific options are declared here; the engine options are then
    added by ``AsyncEngineArgs.add_cli_args`` before parsing.

    Returns:
        The ``argparse.Namespace`` produced by ``parse_args()``.
    """
    cli = argparse.ArgumentParser(
        description="vLLM OpenAI-Compatible RESTful API server.")

    # --- Network binding -------------------------------------------------
    cli.add_argument("--host", type=str, default=None, help="host name")
    cli.add_argument("--port", type=int, default=8000, help="port number")

    # --- CORS settings (list-valued options are given as JSON) -----------
    cli.add_argument(
        "--allow-credentials",
        action="store_true",
        help="allow credentials",
    )
    cli.add_argument(
        "--allowed-origins",
        type=json.loads,
        default=["*"],
        help="allowed origins",
    )
    cli.add_argument(
        "--allowed-methods",
        type=json.loads,
        default=["*"],
        help="allowed methods",
    )
    cli.add_argument(
        "--allowed-headers",
        type=json.loads,
        default=["*"],
        help="allowed headers",
    )

    # --- Authentication --------------------------------------------------
    cli.add_argument(
        "--api-key",
        type=str,
        default=None,
        help=
        "If provided, the server will require this key to be presented in the header.",
    )

    # --- Model naming and LoRA modules -----------------------------------
    cli.add_argument(
        "--served-model-name",
        type=str,
        default=None,
        help="The model name used in the API. If not "
        "specified, the model name will be the same as "
        "the huggingface name.",
    )
    cli.add_argument(
        "--lora-modules",
        type=str,
        default=None,
        nargs='+',
        action=LoRAParserAction,
        help=
        "LoRA module configurations in the format name=path. Multiple modules can be specified.",
    )

    # --- Chat behaviour --------------------------------------------------
    cli.add_argument(
        "--chat-template",
        type=str,
        default=None,
        help="The file path to the chat template, "
        "or the template in single-line form "
        "for the specified model",
    )
    cli.add_argument(
        "--response-role",
        type=str,
        default="assistant",
        help="The role name to return if "
        "`request.add_generation_prompt=true`.",
    )

    # --- TLS -------------------------------------------------------------
    cli.add_argument(
        "--ssl-keyfile",
        type=str,
        default=None,
        help="The file path to the SSL key file",
    )
    cli.add_argument(
        "--ssl-certfile",
        type=str,
        default=None,
        help="The file path to the SSL cert file",
    )

    # --- Deployment / extensibility --------------------------------------
    cli.add_argument(
        "--root-path",
        type=str,
        default=None,
        help="FastAPI root_path when app is behind a path based routing proxy",
    )
    cli.add_argument(
        "--middleware",
        type=str,
        action="append",
        default=[],
        help="Additional ASGI middleware to apply to the app. "
        "We accept multiple --middleware arguments. "
        "The value should be an import path. "
        "If a function is provided, vLLM will add it to the server using @app.middleware('http'). "
        "If a class is provided, vLLM will add it to the server using app.add_middleware(). ",
    )

    # Let the engine register its own options, then parse the command line.
    return AsyncEngineArgs.add_cli_args(cli).parse_args()
Zhuohan Li's avatar
Zhuohan Li committed
142
143


144
145
146
147
# Metrics instrumentation from the aioprometheus package.
app.add_middleware(MetricsMiddleware)  # Trace HTTP server metrics
app.add_route("/metrics", metrics)  # Exposes HTTP metrics


Zhuohan Li's avatar
Zhuohan Li committed
148
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(_, exc):
    """Answer a request validation failure with an HTTP 400 error payload.

    The exception text is wrapped via
    ``openai_serving_chat.create_error_response`` and returned as JSON.
    """
    error_response = openai_serving_chat.create_error_response(
        message=str(exc))
    payload = error_response.model_dump()
    return JSONResponse(payload, status_code=HTTPStatus.BAD_REQUEST)
152
153


154
155
156
157
158
159
@app.get("/health")
async def health() -> Response:
    """Health probe: always replies with an empty HTTP 200 response."""
    ok_response = Response(status_code=200)
    return ok_response


Zhuohan Li's avatar
Zhuohan Li committed
160
161
@app.get("/v1/models")
async def show_available_models():
    """Return the model list reported by the chat serving handler as JSON."""
    model_list = await openai_serving_chat.show_available_models()
    body = model_list.model_dump()
    return JSONResponse(content=body)
Zhuohan Li's avatar
Zhuohan Li committed
164
165


166
@app.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest,
                                 raw_request: Request):
    """Handle a chat completion request.

    Delegates to ``openai_serving_chat.create_chat_completion`` and maps its
    result to a response: an ``ErrorResponse`` becomes JSON with the error's
    own status code, a streaming request becomes a ``text/event-stream``
    response, and anything else is returned as a JSON body.
    """
    result = await openai_serving_chat.create_chat_completion(
        request, raw_request)

    # Errors are reported with the status code carried by the error object.
    if isinstance(result, ErrorResponse):
        return JSONResponse(content=result.model_dump(),
                            status_code=result.code)

    if not request.stream:
        return JSONResponse(content=result.model_dump())

    return StreamingResponse(content=result, media_type="text/event-stream")
179
180


Zhuohan Li's avatar
Zhuohan Li committed
181
@app.post("/v1/completions")
async def create_completion(request: CompletionRequest, raw_request: Request):
    """Handle a text completion request.

    Delegates to ``openai_serving_completion.create_completion`` and maps its
    result to a response: an ``ErrorResponse`` becomes JSON with the error's
    own status code, a streaming request becomes a ``text/event-stream``
    response, and anything else is returned as a JSON body.
    """
    result = await openai_serving_completion.create_completion(
        request, raw_request)

    # Errors are reported with the status code carried by the error object.
    if isinstance(result, ErrorResponse):
        return JSONResponse(content=result.model_dump(),
                            status_code=result.code)

    if not request.stream:
        return JSONResponse(content=result.model_dump())

    return StreamingResponse(content=result, media_type="text/event-stream")
Zhuohan Li's avatar
Zhuohan Li committed
193
194
195


# Script entry point: parse the CLI, install middleware, build the engine and
# the serving handlers, then run the app under uvicorn.
if __name__ == "__main__":
    args = parse_args()

    app.add_middleware(
        CORSMiddleware,
        allow_origins=args.allowed_origins,
        allow_credentials=args.allow_credentials,
        allow_methods=args.allowed_methods,
        allow_headers=args.allowed_headers,
    )

    # The VLLM_API_KEY environment variable takes precedence over --api-key.
    if token := os.environ.get("VLLM_API_KEY") or args.api_key:
        import secrets

        expected_auth = ("Bearer " + token).encode("utf-8")

        @app.middleware("http")
        async def authentication(request: Request, call_next):
            """Require ``Authorization: Bearer <token>`` on ``/v1`` routes."""
            # Only paths under /v1 are protected.
            if not request.url.path.startswith("/v1"):
                return await call_next(request)
            # This middleware is registered after CORSMiddleware and so wraps
            # it; CORS preflight requests carry no credentials and must reach
            # the CORS layer instead of being rejected here.
            if request.method == "OPTIONS":
                return await call_next(request)
            provided_auth = request.headers.get("Authorization", "")
            # Constant-time comparison avoids leaking the key through timing.
            # Bytes are compared because compare_digest rejects non-ASCII str.
            if not secrets.compare_digest(provided_auth.encode("utf-8"),
                                          expected_auth):
                return JSONResponse(content={"error": "Unauthorized"},
                                    status_code=401)
            return await call_next(request)

    # User-supplied middleware, each given as a dotted import path.
    for middleware in args.middleware:
        module_path, object_name = middleware.rsplit(".", 1)
        imported = getattr(importlib.import_module(module_path), object_name)
        if inspect.isclass(imported):
            app.add_middleware(imported)
        elif inspect.iscoroutinefunction(imported):
            app.middleware("http")(imported)
        else:
            raise ValueError(
                f"Invalid middleware {middleware}. Must be a function or a class."
            )

    logger.info("args: %s", args)

    # Fall back to the engine's model name when no served name was given.
    served_model = (args.served_model_name
                    if args.served_model_name is not None else args.model)

    # `engine`, `engine_args` and the two serving handlers must be
    # module-level names: `lifespan` and the route functions read them.
    engine_args = AsyncEngineArgs.from_cli_args(args)
    engine = AsyncLLMEngine.from_engine_args(engine_args)
    openai_serving_chat = OpenAIServingChat(engine, served_model,
                                            args.response_role,
                                            args.lora_modules,
                                            args.chat_template)
    openai_serving_completion = OpenAIServingCompletion(
        engine, served_model, args.lora_modules)

    # Register labels for metrics
    add_global_metrics_labels(model_name=engine_args.model)

    app.root_path = args.root_path
    uvicorn.run(app,
                host=args.host,
                port=args.port,
                log_level="info",
                timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
                ssl_keyfile=args.ssl_keyfile,
                ssl_certfile=args.ssl_certfile)