api_server.py 10.6 KB
Newer Older
1
import asyncio
2
3
import importlib
import inspect
4
import re
5
6
from contextlib import asynccontextmanager
from http import HTTPStatus
7
from typing import Optional, Set
8

Zhuohan Li's avatar
Zhuohan Li committed
9
import fastapi
10
import uvicorn
Ethan Xu's avatar
Ethan Xu committed
11
from fastapi import APIRouter, Request
Zhuohan Li's avatar
Zhuohan Li committed
12
13
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
14
15
from fastapi.responses import JSONResponse, Response, StreamingResponse
from prometheus_client import make_asgi_app
16
from starlette.routing import Mount
Zhuohan Li's avatar
Zhuohan Li committed
17

18
import vllm.envs as envs
Woosuk Kwon's avatar
Woosuk Kwon committed
19
20
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
21
from vllm.entrypoints.openai.cli_args import make_arg_parser
22
23
# yapf conflicts with isort for this block
# yapf: disable
24
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
25
                                              ChatCompletionResponse,
26
                                              CompletionRequest,
27
28
29
30
31
32
                                              DetokenizeRequest,
                                              DetokenizeResponse,
                                              EmbeddingRequest, ErrorResponse,
                                              TokenizeRequest,
                                              TokenizeResponse)
# yapf: enable
33
34
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
35
from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
36
37
from vllm.entrypoints.openai.serving_tokenization import (
    OpenAIServingTokenization)
38
from vllm.logger import init_logger
yhu422's avatar
yhu422 committed
39
from vllm.usage.usage_lib import UsageContext
Ethan Xu's avatar
Ethan Xu committed
40
from vllm.utils import FlexibleArgumentParser
41
from vllm.version import __version__ as VLLM_VERSION
Zhuohan Li's avatar
Zhuohan Li committed
42

43
# Keep-alive timeout handed to uvicorn.run() as ``timeout_keep_alive``.
TIMEOUT_KEEP_ALIVE = 5  # seconds
Zhuohan Li's avatar
Zhuohan Li committed
44

Ethan Xu's avatar
Ethan Xu committed
45
46
# Module-level engine state. These names are only declared here; run_server()
# assigns them (via ``global``) before the app starts serving.
engine: AsyncLLMEngine
engine_args: AsyncEngineArgs
47
48
# Serving handlers used by the route functions in this module. Declared here,
# assigned in run_server() via ``global``.
openai_serving_chat: OpenAIServingChat
openai_serving_completion: OpenAIServingCompletion
49
openai_serving_embedding: OpenAIServingEmbedding
50
openai_serving_tokenization: OpenAIServingTokenization
51

52
logger = init_logger('vllm.entrypoints.openai.api_server')
53

54
# References to background tasks started in lifespan(); each task is removed
# from the set by its done-callback.
_running_tasks: Set[asyncio.Task] = set()
55

56

57
58
59
60
61
62
63
64
65
@asynccontextmanager
async def lifespan(app: fastapi.FastAPI):
    """FastAPI lifespan hook that starts periodic engine stats logging.

    Unless ``engine_args.disable_log_stats`` is set, a background task is
    created that calls ``engine.do_log_stats()`` every 10 seconds. The task is
    tracked in ``_running_tasks`` and removes itself from that set when done.
    Nothing is done after the ``yield`` (no explicit shutdown step).
    """
    stats_enabled = not engine_args.disable_log_stats

    async def _log_stats_forever():
        # Sleep first, then log, so the first report comes after 10 seconds.
        while True:
            await asyncio.sleep(10)
            await engine.do_log_stats()

    if stats_enabled:
        stats_task = asyncio.create_task(_log_stats_forever())
        _running_tasks.add(stats_task)
        stats_task.add_done_callback(_running_tasks.remove)

    yield


Ethan Xu's avatar
Ethan Xu committed
73
# Route handlers in this module register on this router; build_app() includes
# it in the FastAPI application.
router = APIRouter()
Zhuohan Li's avatar
Zhuohan Li committed
74

75
76
77
78
79
80
81

def mount_metrics(app: fastapi.FastAPI):
    """Attach the Prometheus ASGI app to ``app`` under ``/metrics``.

    Args:
        app: The FastAPI application whose route list is extended in place.
    """
    # Add prometheus asgi middleware to route /metrics requests
    prometheus_mount = Mount("/metrics", make_asgi_app())
    # Workaround for 307 Redirect for /metrics: replace the mount's path
    # pattern with one that also matches the bare "/metrics" path.
    prometheus_mount.path_regex = re.compile('^/metrics(?P<path>.*)$')
    app.routes.append(prometheus_mount)
82
83


Ethan Xu's avatar
Ethan Xu committed
84
@router.get("/health")
async def health() -> Response:
    """Health check.

    Awaits ``check_health()`` on the engine held by ``openai_serving_chat``
    and, if that returns without raising, replies with an empty 200 response.
    """
    serving_engine = openai_serving_chat.engine
    await serving_engine.check_health()
    return Response(status_code=200)


Ethan Xu's avatar
Ethan Xu committed
91
@router.post("/tokenize")
async def tokenize(request: TokenizeRequest):
    """Handle ``POST /tokenize``.

    Delegates to ``openai_serving_tokenization.create_tokenize`` and returns
    the result as JSON. An ``ErrorResponse`` is returned with its own ``code``
    as the HTTP status.
    """
    result = await openai_serving_tokenization.create_tokenize(request)

    # Error path first: propagate the handler-provided status code.
    if isinstance(result, ErrorResponse):
        return JSONResponse(content=result.model_dump(),
                            status_code=result.code)

    assert isinstance(result, TokenizeResponse)
    return JSONResponse(content=result.model_dump())


Ethan Xu's avatar
Ethan Xu committed
102
@router.post("/detokenize")
async def detokenize(request: DetokenizeRequest):
    """Handle ``POST /detokenize``.

    Delegates to ``openai_serving_tokenization.create_detokenize`` and returns
    the result as JSON. An ``ErrorResponse`` is returned with its own ``code``
    as the HTTP status.
    """
    result = await openai_serving_tokenization.create_detokenize(request)

    # Error path first: propagate the handler-provided status code.
    if isinstance(result, ErrorResponse):
        return JSONResponse(content=result.model_dump(),
                            status_code=result.code)

    assert isinstance(result, DetokenizeResponse)
    return JSONResponse(content=result.model_dump())


Ethan Xu's avatar
Ethan Xu committed
113
@router.get("/v1/models")
async def show_available_models():
    """Handle ``GET /v1/models``.

    Returns the JSON dump of whatever
    ``openai_serving_completion.show_available_models()`` produces.
    """
    model_list = await openai_serving_completion.show_available_models()
    payload = model_list.model_dump()
    return JSONResponse(content=payload)
Zhuohan Li's avatar
Zhuohan Li committed
117
118


Ethan Xu's avatar
Ethan Xu committed
119
@router.get("/version")
async def show_version():
    """Handle ``GET /version``: report ``VLLM_VERSION`` as ``{"version": ...}``."""
    return JSONResponse(content={"version": VLLM_VERSION})


Ethan Xu's avatar
Ethan Xu committed
125
@router.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest,
                                 raw_request: Request):
    """Handle ``POST /v1/chat/completions``.

    Delegates to ``openai_serving_chat.create_chat_completion``. The result is
    returned as:

    * a JSON error with the error's own ``code`` if it is an ``ErrorResponse``;
    * a ``text/event-stream`` streaming response if ``request.stream`` is set;
    * otherwise a JSON dump of the ``ChatCompletionResponse``.
    """
    result = await openai_serving_chat.create_chat_completion(
        request, raw_request)

    if isinstance(result, ErrorResponse):
        return JSONResponse(content=result.model_dump(),
                            status_code=result.code)

    if not request.stream:
        assert isinstance(result, ChatCompletionResponse)
        return JSONResponse(content=result.model_dump())

    return StreamingResponse(content=result, media_type="text/event-stream")
139
140


Ethan Xu's avatar
Ethan Xu committed
141
@router.post("/v1/completions")
async def create_completion(request: CompletionRequest, raw_request: Request):
    """Handle ``POST /v1/completions``.

    Delegates to ``openai_serving_completion.create_completion``. The result
    is returned as:

    * a JSON error with the error's own ``code`` if it is an ``ErrorResponse``;
    * a ``text/event-stream`` streaming response if ``request.stream`` is set;
    * otherwise a JSON dump of the result.
    """
    result = await openai_serving_completion.create_completion(
        request, raw_request)

    if isinstance(result, ErrorResponse):
        return JSONResponse(content=result.model_dump(),
                            status_code=result.code)

    if not request.stream:
        return JSONResponse(content=result.model_dump())

    return StreamingResponse(content=result, media_type="text/event-stream")
Zhuohan Li's avatar
Zhuohan Li committed
153
154


Ethan Xu's avatar
Ethan Xu committed
155
@router.post("/v1/embeddings")
async def create_embedding(request: EmbeddingRequest, raw_request: Request):
    """Handle ``POST /v1/embeddings``.

    Delegates to ``openai_serving_embedding.create_embedding`` and returns the
    result as JSON. An ``ErrorResponse`` is returned with its own ``code`` as
    the HTTP status.
    """
    result = await openai_serving_embedding.create_embedding(
        request, raw_request)

    body = result.model_dump()
    if isinstance(result, ErrorResponse):
        return JSONResponse(content=body, status_code=result.code)
    return JSONResponse(content=body)


Ethan Xu's avatar
Ethan Xu committed
166
167
168
169
def build_app(args):
    """Create and configure the FastAPI application.

    Wires up the API router, the ``/metrics`` mount, CORS, the request
    validation error handler, optional bearer-token authentication and any
    extra middleware named in ``args.middleware``.

    Args:
        args: Parsed CLI namespace. Reads ``root_path``, ``allowed_origins``,
            ``allow_credentials``, ``allowed_methods``, ``allowed_headers``,
            ``api_key`` and ``middleware``.

    Returns:
        The configured ``fastapi.FastAPI`` instance.

    Raises:
        ValueError: If an entry in ``args.middleware`` resolves to something
            that is neither a class nor a coroutine function.
    """
    # Local import: only needed for the constant-time token comparison below.
    import secrets

    app = fastapi.FastAPI(lifespan=lifespan)
    app.include_router(router)
    app.root_path = args.root_path

    mount_metrics(app)

    app.add_middleware(
        CORSMiddleware,
        allow_origins=args.allowed_origins,
        allow_credentials=args.allow_credentials,
        allow_methods=args.allowed_methods,
        allow_headers=args.allowed_headers,
    )

    @app.exception_handler(RequestValidationError)
    async def validation_exception_handler(_, exc):
        # Report request validation failures as a 400 error payload.
        err = openai_serving_chat.create_error_response(message=str(exc))
        return JSONResponse(err.model_dump(),
                            status_code=HTTPStatus.BAD_REQUEST)

    # The environment variable takes precedence over the CLI argument.
    if token := envs.VLLM_API_KEY or args.api_key:
        # These values do not change between requests, so compute them once.
        root_path = "" if args.root_path is None else args.root_path
        protected_prefix = f"{root_path}/v1"
        expected_credential = ("Bearer " + token).encode("utf-8")

        @app.middleware("http")
        async def authentication(request: Request, call_next):
            # OPTIONS requests and paths outside "<root_path>/v1" are not
            # authenticated.
            if request.method == "OPTIONS":
                return await call_next(request)
            if not request.url.path.startswith(protected_prefix):
                return await call_next(request)

            # Compare as bytes with compare_digest so the check does not
            # short-circuit on the first differing character (timing leak).
            # A missing header is treated as an empty credential.
            supplied = request.headers.get("Authorization", "")
            if not secrets.compare_digest(supplied.encode("utf-8"),
                                          expected_credential):
                return JSONResponse(content={"error": "Unauthorized"},
                                    status_code=401)
            return await call_next(request)

    for middleware in args.middleware:
        # Each entry is a dotted path "package.module.object".
        module_path, object_name = middleware.rsplit(".", 1)
        imported = getattr(importlib.import_module(module_path), object_name)
        if inspect.isclass(imported):
            app.add_middleware(imported)
        elif inspect.iscoroutinefunction(imported):
            app.middleware("http")(imported)
        else:
            raise ValueError(f"Invalid middleware {middleware}. "
                             f"Must be a function or a class.")

    return app


def run_server(args, llm_engine=None):
    """Build the app, initialise the engine and handlers, then run uvicorn.

    Assigns the module-level ``engine``, ``engine_args`` and the four
    ``openai_serving_*`` handlers, logs the available routes, and starts
    ``uvicorn.run`` with the host/port/TLS settings taken from ``args``.

    Args:
        args: Parsed CLI namespace for the server and the engine.
        llm_engine: Optional pre-built engine. When ``None``, an
            ``AsyncLLMEngine`` is created from ``args``.
    """
    global engine, engine_args
    global openai_serving_chat
    global openai_serving_completion
    global openai_serving_embedding
    global openai_serving_tokenization

    app = build_app(args)

    logger.info("vLLM API server version %s", VLLM_VERSION)
    logger.info("args: %s", args)

    # Fall back to the model name when no served names were given.
    served_model_names = (args.served_model_name
                          if args.served_model_name is not None else
                          [args.model])

    engine_args = AsyncEngineArgs.from_cli_args(args)
    if llm_engine is not None:
        engine = llm_engine
    else:
        engine = AsyncLLMEngine.from_engine_args(
            engine_args, usage_context=UsageContext.OPENAI_API_SERVER)

    running_loop: Optional[asyncio.AbstractEventLoop]
    try:
        running_loop = asyncio.get_running_loop()
    except RuntimeError:
        running_loop = None

    if running_loop is None or not running_loop.is_running():
        # When using single vLLM without engine_use_ray
        model_config = asyncio.run(engine.get_model_config())
    else:
        # If the current is instanced by Ray Serve,
        # there is already a running event loop
        # NOTE(review): asyncio documents run_until_complete() as raising
        # RuntimeError on a loop that is already running -- confirm this
        # branch works in the Ray Serve setup it targets.
        model_config = running_loop.run_until_complete(
            engine.get_model_config())

    openai_serving_chat = OpenAIServingChat(engine, model_config,
                                            served_model_names,
                                            args.response_role,
                                            args.lora_modules,
                                            args.chat_template)
    openai_serving_completion = OpenAIServingCompletion(
        engine, model_config, served_model_names, args.lora_modules,
        args.prompt_adapters)
    openai_serving_embedding = OpenAIServingEmbedding(engine, model_config,
                                                      served_model_names)
    openai_serving_tokenization = OpenAIServingTokenization(
        engine, model_config, served_model_names, args.lora_modules,
        args.chat_template)
    app.root_path = args.root_path

    logger.info("Available routes are:")
    for route in app.routes:
        # Only routes exposing a ``methods`` attribute are listed.
        if hasattr(route, 'methods'):
            methods = ', '.join(route.methods)
            logger.info("Route: %s, Methods: %s", route.path, methods)

    uvicorn.run(
        app,
        host=args.host,
        port=args.port,
        log_level=args.uvicorn_log_level,
        timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
        ssl_keyfile=args.ssl_keyfile,
        ssl_certfile=args.ssl_certfile,
        ssl_ca_certs=args.ssl_ca_certs,
        ssl_cert_reqs=args.ssl_cert_reqs,
    )
Ethan Xu's avatar
Ethan Xu committed
283
284
285
286
287
288
289
290
291
292


if __name__ == "__main__":
    # NOTE(simon):
    # This section should be in sync with vllm/scripts.py for CLI entrypoints.
    # Build the CLI parser, parse the command line, then start the server.
    parser = make_arg_parser(
        FlexibleArgumentParser(
            description="vLLM OpenAI-Compatible RESTful API server."))
    args = parser.parse_args()
    run_server(args)