api_server.py 10.2 KB
Newer Older
1
import asyncio
2
3
import importlib
import inspect
4
import re
5
6
from contextlib import asynccontextmanager
from http import HTTPStatus
7
from typing import Optional, Set
8

Zhuohan Li's avatar
Zhuohan Li committed
9
import fastapi
10
import uvicorn
Ethan Xu's avatar
Ethan Xu committed
11
from fastapi import APIRouter, Request
Zhuohan Li's avatar
Zhuohan Li committed
12
13
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
14
15
from fastapi.responses import JSONResponse, Response, StreamingResponse
from prometheus_client import make_asgi_app
16
from starlette.routing import Mount
Zhuohan Li's avatar
Zhuohan Li committed
17

18
import vllm.envs as envs
Woosuk Kwon's avatar
Woosuk Kwon committed
19
20
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
21
from vllm.entrypoints.openai.cli_args import make_arg_parser
22
23
# yapf conflicts with isort for this block
# yapf: disable
24
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
25
                                              ChatCompletionResponse,
26
                                              CompletionRequest,
27
28
29
30
31
32
                                              DetokenizeRequest,
                                              DetokenizeResponse,
                                              EmbeddingRequest, ErrorResponse,
                                              TokenizeRequest,
                                              TokenizeResponse)
# yapf: enable
33
34
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
35
from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
36
from vllm.logger import init_logger
yhu422's avatar
yhu422 committed
37
from vllm.usage.usage_lib import UsageContext
Ethan Xu's avatar
Ethan Xu committed
38
from vllm.utils import FlexibleArgumentParser
39
from vllm.version import __version__ as VLLM_VERSION
Zhuohan Li's avatar
Zhuohan Li committed
40

41
TIMEOUT_KEEP_ALIVE = 5  # seconds

# Module-level singletons. They are only declared here (annotation without
# a value); run_server() assigns them before the app starts serving, and the
# route handlers below read them as globals.
engine: AsyncLLMEngine
engine_args: AsyncEngineArgs
openai_serving_chat: OpenAIServingChat
openai_serving_completion: OpenAIServingCompletion
openai_serving_embedding: OpenAIServingEmbedding

# A single logger with an explicit name. The module can be executed as
# `__main__`, in which case `__name__` would not be the package path; an
# earlier `init_logger(__name__)` assignment was immediately overridden by
# this one and has been dropped as dead code.
logger = init_logger('vllm.entrypoints.openai.api_server')

# References to background tasks created in lifespan(); each task is added
# on creation and taken out again by its done-callback.
_running_tasks: Set[asyncio.Task] = set()
53

54

55
56
57
58
59
60
61
62
63
@asynccontextmanager
async def lifespan(app: fastapi.FastAPI):
    """Run the periodic engine stats logger for the lifetime of the app.

    Unless ``engine_args.disable_log_stats`` is set, a background task calls
    ``engine.do_log_stats()`` every 10 seconds while the application is
    running. The task is cancelled when the application shuts down so that
    it does not outlive the server.

    Args:
        app: The FastAPI application this lifespan is attached to. It is
            not used by the body.
    """

    async def _force_log():
        while True:
            await asyncio.sleep(10)
            await engine.do_log_stats()

    stats_task: Optional[asyncio.Task] = None
    if not engine_args.disable_log_stats:
        stats_task = asyncio.create_task(_force_log())
        # Hold a reference for as long as the task is alive.
        _running_tasks.add(stats_task)
        # discard() instead of remove(): a done-callback must never raise,
        # even if the task has already been taken out of the set.
        stats_task.add_done_callback(_running_tasks.discard)

    try:
        yield
    finally:
        # Stop the infinite logging loop on shutdown instead of leaving a
        # pending task behind.
        if stats_task is not None:
            stats_task.cancel()


Ethan Xu's avatar
Ethan Xu committed
71
router = APIRouter()

# Serve Prometheus metrics by mounting the prometheus_client ASGI app
# under /metrics on the router.
route = Mount("/metrics", make_asgi_app())
# Replace the mount's path regex so that a bare "/metrics" request matches
# directly (workaround for the 307 Redirect otherwise issued for /metrics).
route.path_regex = re.compile('^/metrics(?P<path>.*)$')
router.routes.append(route)
78
79


Ethan Xu's avatar
Ethan Xu committed
80
@router.get("/health")
async def health() -> Response:
    """Health check.

    Awaits the engine's own ``check_health()`` and, if that returns without
    raising, answers with an empty 200 response.
    """
    serving_engine = openai_serving_chat.engine
    await serving_engine.check_health()
    return Response(status_code=200)


Ethan Xu's avatar
Ethan Xu committed
87
@router.post("/tokenize")
async def tokenize(request: TokenizeRequest):
    """Handle POST /tokenize by delegating to the completion serving layer.

    An ``ErrorResponse`` is returned as JSON with its own ``code`` as the
    HTTP status; any other result must be a ``TokenizeResponse`` and is
    returned as JSON with the default status.
    """
    result = await openai_serving_completion.create_tokenize(request)
    if not isinstance(result, ErrorResponse):
        assert isinstance(result, TokenizeResponse)
        return JSONResponse(content=result.model_dump())
    return JSONResponse(content=result.model_dump(),
                        status_code=result.code)


Ethan Xu's avatar
Ethan Xu committed
98
@router.post("/detokenize")
async def detokenize(request: DetokenizeRequest):
    """Handle POST /detokenize by delegating to the completion serving layer.

    An ``ErrorResponse`` is returned as JSON with its own ``code`` as the
    HTTP status; any other result must be a ``DetokenizeResponse`` and is
    returned as JSON with the default status.
    """
    result = await openai_serving_completion.create_detokenize(request)
    if not isinstance(result, ErrorResponse):
        assert isinstance(result, DetokenizeResponse)
        return JSONResponse(content=result.model_dump())
    return JSONResponse(content=result.model_dump(),
                        status_code=result.code)


Ethan Xu's avatar
Ethan Xu committed
109
@router.get("/v1/models")
async def show_available_models():
    """Handle GET /v1/models.

    Returns, as JSON, the ``model_dump()`` of whatever the completion
    serving layer reports from ``show_available_models()``.
    """
    model_list = await openai_serving_completion.show_available_models()
    payload = model_list.model_dump()
    return JSONResponse(content=payload)
Zhuohan Li's avatar
Zhuohan Li committed
113
114


Ethan Xu's avatar
Ethan Xu committed
115
@router.get("/version")
async def show_version():
    """Handle GET /version: report ``VLLM_VERSION`` under the "version" key."""
    return JSONResponse(content={"version": VLLM_VERSION})


Ethan Xu's avatar
Ethan Xu committed
121
@router.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest,
                                 raw_request: Request):
    """Handle POST /v1/chat/completions.

    Delegates to ``openai_serving_chat.create_chat_completion``. The result
    is mapped to a response as follows:

    * ``ErrorResponse``: JSON body with the error's ``code`` as status.
    * ``request.stream`` truthy: the result is streamed back as
      ``text/event-stream``.
    * otherwise: the result must be a ``ChatCompletionResponse`` and is
      returned as JSON.
    """
    result = await openai_serving_chat.create_chat_completion(
        request, raw_request)

    if isinstance(result, ErrorResponse):
        return JSONResponse(content=result.model_dump(),
                            status_code=result.code)

    if not request.stream:
        assert isinstance(result, ChatCompletionResponse)
        return JSONResponse(content=result.model_dump())

    return StreamingResponse(content=result,
                             media_type="text/event-stream")
135
136


Ethan Xu's avatar
Ethan Xu committed
137
@router.post("/v1/completions")
async def create_completion(request: CompletionRequest, raw_request: Request):
    """Handle POST /v1/completions.

    Delegates to ``openai_serving_completion.create_completion``. An
    ``ErrorResponse`` is returned as JSON with its ``code`` as status; when
    ``request.stream`` is truthy the result is streamed back as
    ``text/event-stream``; otherwise its ``model_dump()`` is returned as
    JSON.
    """
    result = await openai_serving_completion.create_completion(
        request, raw_request)

    if isinstance(result, ErrorResponse):
        return JSONResponse(content=result.model_dump(),
                            status_code=result.code)

    if not request.stream:
        return JSONResponse(content=result.model_dump())

    return StreamingResponse(content=result,
                             media_type="text/event-stream")
Zhuohan Li's avatar
Zhuohan Li committed
149
150


Ethan Xu's avatar
Ethan Xu committed
151
@router.post("/v1/embeddings")
async def create_embedding(request: EmbeddingRequest, raw_request: Request):
    """Handle POST /v1/embeddings.

    Delegates to ``openai_serving_embedding.create_embedding`` and returns
    the result's ``model_dump()`` as JSON. For an ``ErrorResponse`` the
    error's ``code`` is used as the HTTP status.
    """
    result = await openai_serving_embedding.create_embedding(
        request, raw_request)

    body = result.model_dump()
    if isinstance(result, ErrorResponse):
        return JSONResponse(content=body, status_code=result.code)
    return JSONResponse(content=body)


Ethan Xu's avatar
Ethan Xu committed
162
163
164
165
def build_app(args):
    """Create the FastAPI application and install its middleware.

    The app is built with ``lifespan`` and the module-level ``router``, then
    configured with, in order:

    * CORS middleware, from ``args.allowed_origins``,
      ``args.allow_credentials``, ``args.allowed_methods`` and
      ``args.allowed_headers``;
    * a handler that turns ``RequestValidationError`` into a 400 JSON error
      built by ``openai_serving_chat.create_error_response``;
    * bearer-token authentication for paths under ``{root_path}/v1``, only
      when ``envs.VLLM_API_KEY`` or ``args.api_key`` is set;
    * every extra middleware named in ``args.middleware`` as a dotted
      ``module.object`` path (a class or a coroutine function).

    Args:
        args: Parsed command-line arguments; the attributes listed above
            plus ``root_path`` are read.

    Returns:
        The configured ``fastapi.FastAPI`` instance.

    Raises:
        ValueError: If an entry of ``args.middleware`` resolves to something
            that is neither a class nor a coroutine function.
    """
    # Local import: only the optional API-key middleware needs it.
    import secrets

    app = fastapi.FastAPI(lifespan=lifespan)
    app.include_router(router)
    app.root_path = args.root_path

    app.add_middleware(
        CORSMiddleware,
        allow_origins=args.allowed_origins,
        allow_credentials=args.allow_credentials,
        allow_methods=args.allowed_methods,
        allow_headers=args.allowed_headers,
    )

    @app.exception_handler(RequestValidationError)
    async def validation_exception_handler(_, exc):
        err = openai_serving_chat.create_error_response(message=str(exc))
        return JSONResponse(err.model_dump(),
                            status_code=HTTPStatus.BAD_REQUEST)

    if token := envs.VLLM_API_KEY or args.api_key:
        # Precompute the exact header value we expect, as bytes, so the
        # per-request check is a single constant-time comparison.
        expected_auth = ("Bearer " + token).encode("utf-8")

        @app.middleware("http")
        async def authentication(request: Request, call_next):
            root_path = "" if args.root_path is None else args.root_path
            # OPTIONS requests are let through without credentials.
            if request.method == "OPTIONS":
                return await call_next(request)
            # Only the /v1 API surface is protected.
            if not request.url.path.startswith(f"{root_path}/v1"):
                return await call_next(request)
            # The header is attacker-controlled: compare in constant time
            # so response timing does not leak how much of the key matched.
            # Comparing bytes also avoids compare_digest's ASCII-only
            # restriction on str arguments.
            provided_auth = request.headers.get("Authorization", "")
            if not secrets.compare_digest(provided_auth.encode("utf-8"),
                                          expected_auth):
                return JSONResponse(content={"error": "Unauthorized"},
                                    status_code=401)
            return await call_next(request)

    for middleware in args.middleware:
        module_path, object_name = middleware.rsplit(".", 1)
        imported = getattr(importlib.import_module(module_path), object_name)
        if inspect.isclass(imported):
            app.add_middleware(imported)
        elif inspect.iscoroutinefunction(imported):
            app.middleware("http")(imported)
        else:
            raise ValueError(f"Invalid middleware {middleware}. "
                             "Must be a function or a class.")

    return app


def run_server(args, llm_engine=None):
    """Build the app, initialise the engine and serving globals, and serve.

    Populates the module-level ``engine``, ``engine_args`` and the three
    ``openai_serving_*`` objects, logs the registered routes, and then
    blocks in ``uvicorn.run``.

    Args:
        args: Parsed command-line arguments (see ``make_arg_parser``).
        llm_engine: Optional pre-built engine. When ``None`` an
            ``AsyncLLMEngine`` is created from ``args``.
    """
    global engine, engine_args
    global openai_serving_chat
    global openai_serving_completion
    global openai_serving_embedding

    app = build_app(args)

    logger.info("vLLM API server version %s", VLLM_VERSION)
    logger.info("args: %s", args)

    # Fall back to the model path when no explicit served names were given.
    served_model_names = (args.served_model_name
                          if args.served_model_name is not None else
                          [args.model])

    engine_args = AsyncEngineArgs.from_cli_args(args)
    if llm_engine is None:
        engine = AsyncLLMEngine.from_engine_args(
            engine_args, usage_context=UsageContext.OPENAI_API_SERVER)
    else:
        engine = llm_engine

    event_loop: Optional[asyncio.AbstractEventLoop]
    try:
        event_loop = asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread.
        event_loop = None

    fetch_config = engine.get_model_config()
    if event_loop is not None and event_loop.is_running():
        # If the current is instanced by Ray Serve,
        # there is already a running event loop
        # NOTE(review): per the asyncio docs, run_until_complete() raises
        # RuntimeError on a loop that is already running - confirm that
        # this branch actually works in that deployment.
        model_config = event_loop.run_until_complete(fetch_config)
    else:
        # When using single vLLM without engine_use_ray
        model_config = asyncio.run(fetch_config)

    openai_serving_chat = OpenAIServingChat(
        engine,
        model_config,
        served_model_names,
        args.response_role,
        args.lora_modules,
        args.chat_template,
    )
    openai_serving_completion = OpenAIServingCompletion(
        engine,
        model_config,
        served_model_names,
        args.lora_modules,
        args.prompt_adapters,
    )
    openai_serving_embedding = OpenAIServingEmbedding(
        engine,
        model_config,
        served_model_names,
    )
    app.root_path = args.root_path

    logger.info("Available routes are:")
    for registered in app.routes:
        # Entries without a `methods` attribute (e.g. mounts) are skipped.
        if hasattr(registered, 'methods'):
            methods = ', '.join(registered.methods)
            logger.info("Route: %s, Methods: %s", registered.path, methods)

    uvicorn_options = dict(
        host=args.host,
        port=args.port,
        log_level=args.uvicorn_log_level,
        timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
        ssl_keyfile=args.ssl_keyfile,
        ssl_certfile=args.ssl_certfile,
        ssl_ca_certs=args.ssl_ca_certs,
        ssl_cert_reqs=args.ssl_cert_reqs,
    )
    uvicorn.run(app, **uvicorn_options)
Ethan Xu's avatar
Ethan Xu committed
273
274
275
276
277
278
279
280
281
282


if __name__ == "__main__":
    # NOTE(simon):
    # This section should be in sync with vllm/scripts.py for CLI entrypoints.
    # Build the CLI parser, let make_arg_parser() register the server
    # options on it, then hand the parsed arguments to run_server().
    parser = make_arg_parser(
        FlexibleArgumentParser(
            description="vLLM OpenAI-Compatible RESTful API server."))
    args = parser.parse_args()
    run_server(args)