api_server.py 10.5 KB
Newer Older
1
import asyncio
2
3
import importlib
import inspect
4
import re
5
6
from contextlib import asynccontextmanager
from http import HTTPStatus
7
from typing import Optional, Set
8

Zhuohan Li's avatar
Zhuohan Li committed
9
import fastapi
10
import uvicorn
Ethan Xu's avatar
Ethan Xu committed
11
from fastapi import APIRouter, Request
Zhuohan Li's avatar
Zhuohan Li committed
12
13
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
14
15
from fastapi.responses import JSONResponse, Response, StreamingResponse
from prometheus_client import make_asgi_app
16
from starlette.routing import Mount
Zhuohan Li's avatar
Zhuohan Li committed
17

18
import vllm.envs as envs
Woosuk Kwon's avatar
Woosuk Kwon committed
19
20
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
21
from vllm.entrypoints.openai.cli_args import make_arg_parser
22
23
# yapf conflicts with isort for this block
# yapf: disable
24
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
25
                                              ChatCompletionResponse,
26
                                              CompletionRequest,
27
28
29
30
31
32
                                              DetokenizeRequest,
                                              DetokenizeResponse,
                                              EmbeddingRequest, ErrorResponse,
                                              TokenizeRequest,
                                              TokenizeResponse)
# yapf: enable
33
34
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
35
from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
36
37
from vllm.entrypoints.openai.serving_tokenization import (
    OpenAIServingTokenization)
38
from vllm.logger import init_logger
yhu422's avatar
yhu422 committed
39
from vllm.usage.usage_lib import UsageContext
Ethan Xu's avatar
Ethan Xu committed
40
from vllm.utils import FlexibleArgumentParser
41
from vllm.version import __version__ as VLLM_VERSION
Zhuohan Li's avatar
Zhuohan Li committed
42

43
TIMEOUT_KEEP_ALIVE = 5  # seconds

# Module-level handles used by the route handlers in this file. They are only
# declared here (annotation without a value); run_server() assigns them via
# `global` before the server starts.
engine: AsyncLLMEngine
engine_args: AsyncEngineArgs
openai_serving_chat: OpenAIServingChat
openai_serving_completion: OpenAIServingCompletion
openai_serving_embedding: OpenAIServingEmbedding
openai_serving_tokenization: OpenAIServingTokenization

# The logger is created once, under a fixed name. An earlier
# `init_logger(__name__)` assignment was dropped because it was overwritten
# by this one before any use.
# NOTE(review): presumably the fixed name is used because __name__ would be
# "__main__" when this file is executed as a script -- confirm.
logger = init_logger('vllm.entrypoints.openai.api_server')

# Background tasks created in lifespan() are held here while they run; each
# task removes itself through a done-callback.
_running_tasks: Set[asyncio.Task] = set()
56

57

58
59
60
61
62
63
64
65
66
@asynccontextmanager
async def lifespan(app: fastapi.FastAPI):
    """Lifespan hook that starts periodic engine stats logging.

    Unless ``engine_args.disable_log_stats`` is set, a background task is
    created that repeatedly sleeps 10 seconds and then awaits
    ``engine.do_log_stats()``. Control is then yielded to the application.
    Nothing is done after the yield.
    """

    async def _log_stats_forever():
        # Sleep comes first, so the first report is emitted 10 seconds after
        # the task starts.
        while True:
            await asyncio.sleep(10)
            await engine.do_log_stats()

    stats_enabled = not engine_args.disable_log_stats
    if stats_enabled:
        stats_task = asyncio.create_task(_log_stats_forever())
        # Hold a reference in the module-level set while the task runs; the
        # done-callback drops it again.
        _running_tasks.add(stats_task)
        stats_task.add_done_callback(_running_tasks.remove)

    yield


Ethan Xu's avatar
Ethan Xu committed
74
router = APIRouter()

# Serve Prometheus metrics by mounting the prometheus_client ASGI app under
# /metrics on the router.
route = Mount("/metrics", app=make_asgi_app())
# Workaround for 307 Redirect for /metrics: override the mount's path regex
# with one that matches the bare "/metrics" path as well as anything after it.
route.path_regex = re.compile('^/metrics(?P<path>.*)$')
router.routes.append(route)
81
82


Ethan Xu's avatar
Ethan Xu committed
83
@router.get("/health")
async def health() -> Response:
    """Health check.

    Awaits ``check_health()`` on the engine held by ``openai_serving_chat``
    and answers with an empty 200 response once that call returns. An
    exception raised by the check is not handled here.
    """
    serving_engine = openai_serving_chat.engine
    await serving_engine.check_health()
    return Response(status_code=200)


Ethan Xu's avatar
Ethan Xu committed
90
@router.post("/tokenize")
async def tokenize(request: TokenizeRequest):
    """Handle POST /tokenize.

    Delegates to ``openai_serving_tokenization.create_tokenize``. An
    ``ErrorResponse`` is returned as JSON with its ``code`` as the HTTP
    status; any other result is asserted to be a ``TokenizeResponse`` and
    returned as JSON.
    """
    outcome = await openai_serving_tokenization.create_tokenize(request)

    # Guard clause: errors carry their own HTTP status code.
    if isinstance(outcome, ErrorResponse):
        return JSONResponse(content=outcome.model_dump(),
                            status_code=outcome.code)

    assert isinstance(outcome, TokenizeResponse)
    return JSONResponse(content=outcome.model_dump())


Ethan Xu's avatar
Ethan Xu committed
101
@router.post("/detokenize")
async def detokenize(request: DetokenizeRequest):
    """Handle POST /detokenize.

    Delegates to ``openai_serving_tokenization.create_detokenize``. An
    ``ErrorResponse`` is returned as JSON with its ``code`` as the HTTP
    status; any other result is asserted to be a ``DetokenizeResponse`` and
    returned as JSON.
    """
    outcome = await openai_serving_tokenization.create_detokenize(request)

    # Guard clause: errors carry their own HTTP status code.
    if isinstance(outcome, ErrorResponse):
        return JSONResponse(content=outcome.model_dump(),
                            status_code=outcome.code)

    assert isinstance(outcome, DetokenizeResponse)
    return JSONResponse(content=outcome.model_dump())


Ethan Xu's avatar
Ethan Xu committed
112
@router.get("/v1/models")
async def show_available_models():
    """Handle GET /v1/models.

    Returns, as JSON, the dumped result of
    ``openai_serving_completion.show_available_models()``.
    """
    model_list = await openai_serving_completion.show_available_models()
    payload = model_list.model_dump()
    return JSONResponse(content=payload)
Zhuohan Li's avatar
Zhuohan Li committed
116
117


Ethan Xu's avatar
Ethan Xu committed
118
@router.get("/version")
async def show_version():
    """Handle GET /version: return ``{"version": VLLM_VERSION}`` as JSON."""
    return JSONResponse(content={"version": VLLM_VERSION})


Ethan Xu's avatar
Ethan Xu committed
124
@router.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest,
                                 raw_request: Request):
    """Handle POST /v1/chat/completions.

    Delegates to ``openai_serving_chat.create_chat_completion``. The result
    is turned into one of three responses:

    * an ``ErrorResponse`` becomes JSON with its ``code`` as HTTP status;
    * when ``request.stream`` is truthy, the result is passed as the content
      of a ``StreamingResponse`` with media type ``text/event-stream``;
    * otherwise it is asserted to be a ``ChatCompletionResponse`` and
      returned as JSON.
    """
    outcome = await openai_serving_chat.create_chat_completion(
        request, raw_request)

    if isinstance(outcome, ErrorResponse):
        return JSONResponse(content=outcome.model_dump(),
                            status_code=outcome.code)

    if not request.stream:
        assert isinstance(outcome, ChatCompletionResponse)
        return JSONResponse(content=outcome.model_dump())

    return StreamingResponse(content=outcome,
                             media_type="text/event-stream")
138
139


Ethan Xu's avatar
Ethan Xu committed
140
@router.post("/v1/completions")
async def create_completion(request: CompletionRequest, raw_request: Request):
    """Handle POST /v1/completions.

    Delegates to ``openai_serving_completion.create_completion``. An
    ``ErrorResponse`` becomes JSON with its ``code`` as HTTP status. When
    ``request.stream`` is truthy the result is passed as the content of a
    ``StreamingResponse`` with media type ``text/event-stream``; otherwise
    its ``model_dump()`` is returned as JSON.
    """
    outcome = await openai_serving_completion.create_completion(
        request, raw_request)

    if isinstance(outcome, ErrorResponse):
        return JSONResponse(content=outcome.model_dump(),
                            status_code=outcome.code)

    if not request.stream:
        return JSONResponse(content=outcome.model_dump())

    return StreamingResponse(content=outcome,
                             media_type="text/event-stream")
Zhuohan Li's avatar
Zhuohan Li committed
152
153


Ethan Xu's avatar
Ethan Xu committed
154
@router.post("/v1/embeddings")
async def create_embedding(request: EmbeddingRequest, raw_request: Request):
    """Handle POST /v1/embeddings.

    Delegates to ``openai_serving_embedding.create_embedding`` and returns
    the dumped result as JSON. An ``ErrorResponse`` additionally sets the
    HTTP status to its ``code``.
    """
    outcome = await openai_serving_embedding.create_embedding(
        request, raw_request)

    if isinstance(outcome, ErrorResponse):
        return JSONResponse(content=outcome.model_dump(),
                            status_code=outcome.code)

    return JSONResponse(content=outcome.model_dump())


Ethan Xu's avatar
Ethan Xu committed
165
166
167
168
def build_app(args):
    """Create and configure the FastAPI application.

    Builds the app with this module's ``lifespan`` hook and ``router``, then
    installs CORS middleware, a handler for request-validation errors,
    optional bearer-token authentication, and any user-supplied middleware.

    Args:
        args: Parsed CLI namespace. Attributes read here: ``root_path``,
            ``allowed_origins``, ``allow_credentials``, ``allowed_methods``,
            ``allowed_headers``, ``api_key`` and ``middleware``.

    Returns:
        The configured ``fastapi.FastAPI`` instance.

    Raises:
        ValueError: If an entry of ``args.middleware`` resolves to an object
            that is neither a class nor a coroutine function.
    """
    # Local import: only needed for the constant-time token comparison below.
    import secrets

    app = fastapi.FastAPI(lifespan=lifespan)
    app.include_router(router)
    app.root_path = args.root_path

    app.add_middleware(
        CORSMiddleware,
        allow_origins=args.allowed_origins,
        allow_credentials=args.allow_credentials,
        allow_methods=args.allowed_methods,
        allow_headers=args.allowed_headers,
    )

    @app.exception_handler(RequestValidationError)
    async def validation_exception_handler(_, exc):
        # Report validation failures in the server's own error format with
        # a 400 status.
        err = openai_serving_chat.create_error_response(message=str(exc))
        return JSONResponse(err.model_dump(),
                            status_code=HTTPStatus.BAD_REQUEST)

    # The environment variable takes precedence over the CLI argument.
    # Authentication is only installed when one of them is non-empty.
    if token := envs.VLLM_API_KEY or args.api_key:
        # Encoded once; compare_digest needs bytes to accept non-ASCII input.
        expected_header = ("Bearer " + token).encode("utf-8")

        @app.middleware("http")
        async def authentication(request: Request, call_next):
            root_path = "" if args.root_path is None else args.root_path
            # OPTIONS requests pass through without a token.
            if request.method == "OPTIONS":
                return await call_next(request)
            # Only paths under <root_path>/v1 are protected.
            if not request.url.path.startswith(f"{root_path}/v1"):
                return await call_next(request)
            # A missing header is treated as empty and therefore rejected.
            supplied_header = request.headers.get("Authorization", "")
            # Constant-time comparison: a plain `!=` on the client-supplied
            # header can leak, through timing, how much of the key matched.
            if not secrets.compare_digest(supplied_header.encode("utf-8"),
                                          expected_header):
                return JSONResponse(content={"error": "Unauthorized"},
                                    status_code=401)
            return await call_next(request)

    # Each entry is a dotted path "<module>.<object>" to import and install.
    for middleware in args.middleware:
        module_path, object_name = middleware.rsplit(".", 1)
        imported = getattr(importlib.import_module(module_path), object_name)
        if inspect.isclass(imported):
            app.add_middleware(imported)
        elif inspect.iscoroutinefunction(imported):
            app.middleware("http")(imported)
        else:
            raise ValueError(f"Invalid middleware {middleware}. "
                             f"Must be a function or a class.")

    return app


def run_server(args, llm_engine=None):
    """Build the app, initialise the engine and serving objects, and serve.

    Populates the module-level ``engine``, ``engine_args`` and
    ``openai_serving_*`` globals, logs the routes that expose ``methods``,
    and finally hands the app to ``uvicorn.run``.

    Args:
        args: Parsed CLI namespace; passed to ``build_app`` and
            ``AsyncEngineArgs.from_cli_args`` and read for model names,
            chat/LoRA/prompt-adapter settings and uvicorn/SSL options.
        llm_engine: Engine to use. When ``None``, one is created with
            ``AsyncLLMEngine.from_engine_args``.
    """
    global engine, engine_args
    global openai_serving_chat
    global openai_serving_completion
    global openai_serving_embedding
    global openai_serving_tokenization

    app = build_app(args)

    logger.info("vLLM API server version %s", VLLM_VERSION)
    logger.info("args: %s", args)

    # Fall back to the model path when no served name was given.
    served_model_names = (args.served_model_name
                          if args.served_model_name is not None else
                          [args.model])

    engine_args = AsyncEngineArgs.from_cli_args(args)
    if llm_engine is not None:
        engine = llm_engine
    else:
        engine = AsyncLLMEngine.from_engine_args(
            engine_args, usage_context=UsageContext.OPENAI_API_SERVER)

    running_loop: Optional[asyncio.AbstractEventLoop]
    try:
        running_loop = asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread.
        running_loop = None

    if running_loop is not None and running_loop.is_running():
        # If the current is instanced by Ray Serve,
        # there is already a running event loop
        # NOTE(review): asyncio documents run_until_complete() as raising
        # RuntimeError on an already-running loop -- confirm this branch
        # works in that setup.
        model_config = running_loop.run_until_complete(
            engine.get_model_config())
    else:
        # When using single vLLM without engine_use_ray
        model_config = asyncio.run(engine.get_model_config())

    openai_serving_chat = OpenAIServingChat(
        engine,
        model_config,
        served_model_names,
        args.response_role,
        args.lora_modules,
        args.chat_template,
    )
    openai_serving_completion = OpenAIServingCompletion(
        engine,
        model_config,
        served_model_names,
        args.lora_modules,
        args.prompt_adapters,
    )
    openai_serving_embedding = OpenAIServingEmbedding(
        engine,
        model_config,
        served_model_names,
    )
    openai_serving_tokenization = OpenAIServingTokenization(
        engine,
        model_config,
        served_model_names,
        args.chat_template,
    )
    app.root_path = args.root_path

    logger.info("Available routes are:")
    for route in app.routes:
        # Routes without a `methods` attribute are skipped.
        if hasattr(route, 'methods'):
            logger.info("Route: %s, Methods: %s", route.path,
                        ', '.join(route.methods))

    uvicorn_options = dict(
        host=args.host,
        port=args.port,
        log_level=args.uvicorn_log_level,
        timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
        ssl_keyfile=args.ssl_keyfile,
        ssl_certfile=args.ssl_certfile,
        ssl_ca_certs=args.ssl_ca_certs,
        ssl_cert_reqs=args.ssl_cert_reqs,
    )
    uvicorn.run(app, **uvicorn_options)
Ethan Xu's avatar
Ethan Xu committed
279
280
281
282
283
284
285
286
287
288


if __name__ == "__main__":
    # NOTE(simon):
    # This section should be in sync with vllm/scripts.py for CLI entrypoints.
    # Build the parser, let make_arg_parser() process it, then parse the
    # command line and start the server.
    parser = make_arg_parser(
        FlexibleArgumentParser(
            description="vLLM OpenAI-Compatible RESTful API server."))
    args = parser.parse_args()
    run_server(args)