api_server.py 23.5 KB
Newer Older
1
import asyncio
2
3
import importlib
import inspect
4
import multiprocessing
5
import os
6
import re
7
import signal
8
import socket
9
import tempfile
10
import uuid
11
from argparse import Namespace
12
from contextlib import asynccontextmanager
13
from functools import partial
14
from http import HTTPStatus
15
from typing import AsyncIterator, Optional, Set
16

17
import uvloop
18
from fastapi import APIRouter, FastAPI, Request
Zhuohan Li's avatar
Zhuohan Li committed
19
20
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
21
from fastapi.responses import JSONResponse, Response, StreamingResponse
22
from starlette.datastructures import State
23
from starlette.routing import Mount
24
from typing_extensions import assert_never
Zhuohan Li's avatar
Zhuohan Li committed
25

26
import vllm.envs as envs
27
from vllm.config import ModelConfig
Woosuk Kwon's avatar
Woosuk Kwon committed
28
from vllm.engine.arg_utils import AsyncEngineArgs
29
30
31
from vllm.engine.multiprocessing.client import MQLLMEngineClient
from vllm.engine.multiprocessing.engine import run_mp_engine
from vllm.engine.protocol import EngineClient
32
from vllm.entrypoints.launcher import serve_http
33
from vllm.entrypoints.logger import RequestLogger
34
35
from vllm.entrypoints.openai.cli_args import (make_arg_parser,
                                              validate_parsed_serve_args)
36
37
# yapf conflicts with isort for this block
# yapf: disable
38
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
39
                                              ChatCompletionResponse,
40
                                              CompletionRequest,
41
                                              CompletionResponse,
42
43
                                              DetokenizeRequest,
                                              DetokenizeResponse,
44
45
                                              EmbeddingRequest,
                                              EmbeddingResponse, ErrorResponse,
46
                                              LoadLoraAdapterRequest,
47
                                              TokenizeRequest,
48
49
50
                                              TokenizeResponse,
                                              UnloadLoraAdapterRequest)
# yapf: enable
51
52
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
53
from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
54
from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing
55
56
from vllm.entrypoints.openai.serving_tokenization import (
    OpenAIServingTokenization)
57
from vllm.entrypoints.openai.tool_parsers import ToolParserManager
58
from vllm.logger import init_logger
yhu422's avatar
yhu422 committed
59
from vllm.usage.usage_lib import UsageContext
60
from vllm.utils import FlexibleArgumentParser, get_open_zmq_ipc_path
61
from vllm.version import __version__ as VLLM_VERSION
Zhuohan Li's avatar
Zhuohan Li committed
62

63
64
65
66
67
# Select the engine implementation from the VLLM_USE_V1 setting. Both variants
# are bound to the same name, so the rest of this module only ever refers to
# ``AsyncLLMEngine``.
if envs.VLLM_USE_V1:
    from vllm.v1.engine.async_llm import AsyncLLMEngine  # type: ignore
else:
    from vllm.engine.async_llm_engine import AsyncLLMEngine  # type: ignore

68
# Passed to serve_http() as ``timeout_keep_alive`` in run_server().
TIMEOUT_KEEP_ALIVE = 5  # seconds
Zhuohan Li's avatar
Zhuohan Li committed
69

70
# Assigned by build_async_engine_client_from_engine_args() when the
# PROMETHEUS_MULTIPROC_DIR environment variable is not already set. Declared
# at module level so the temporary directory object stays referenced.
prometheus_multiproc_dir: tempfile.TemporaryDirectory
71

72
# Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765)
73
logger = init_logger('vllm.entrypoints.openai.api_server')
74

75
# Strong references to background tasks created in lifespan(); each task
# removes itself from this set through its done-callback.
_running_tasks: Set[asyncio.Task] = set()
76

77

78
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook.

    When ``app.state.log_stats`` is truthy, a background task asks the engine
    client to log its stats every 10 seconds for as long as the app runs.
    The task is cancelled on shutdown and ``app.state`` is deleted afterwards.
    """
    try:
        stats_task = None
        if app.state.log_stats:
            client: EngineClient = app.state.engine_client

            async def _force_log():
                while True:
                    await asyncio.sleep(10.)
                    await client.do_log_stats()

            stats_task = asyncio.create_task(_force_log())
            # Keep a strong reference until the task finishes.
            _running_tasks.add(stats_task)
            stats_task.add_done_callback(_running_tasks.remove)

        try:
            yield
        finally:
            if stats_task is not None:
                stats_task.cancel()
    finally:
        # Ensure app state including engine ref is gc'd
        del app.state
102
103


104
@asynccontextmanager
async def build_async_engine_client(
        args: Namespace) -> AsyncIterator[EngineClient]:
    """Yield an engine client built from parsed CLI arguments.

    The CLI namespace is converted to ``AsyncEngineArgs`` and the actual
    construction and teardown are delegated to
    ``build_async_engine_client_from_engine_args``, which handles the
    client's lifecycle on error/exit.
    """
    engine_args = AsyncEngineArgs.from_cli_args(args)

    client_context = build_async_engine_client_from_engine_args(
        engine_args, args.disable_frontend_multiprocessing)
    async with client_context as engine:
        yield engine


@asynccontextmanager
async def build_async_engine_client_from_engine_args(
    engine_args: AsyncEngineArgs,
    disable_frontend_multiprocessing: bool = False,
) -> AsyncIterator[EngineClient]:
    """
    Create EngineClient, either:
        - in-process using the AsyncLLMEngine Directly
        - multiprocess using AsyncLLMEngine RPC

    This is an async context manager: it yields the client and tears the
    engine down when the ``async with`` block exits, including when the
    block exits with an exception.

    Args:
        engine_args: Engine arguments used to build the engine/config.
        disable_frontend_multiprocessing: Force the in-process engine.

    Raises:
        RuntimeError: If the engine process dies before the multiprocessing
            client finishes its setup.
    """

    # Fall back
    # TODO: fill out feature matrix.
    if (MQLLMEngineClient.is_unsupported_config(engine_args)
            or envs.VLLM_USE_V1 or disable_frontend_multiprocessing):

        engine_config = engine_args.create_engine_config()
        uses_ray = getattr(AsyncLLMEngine._get_executor_cls(engine_config),
                           "uses_ray", False)

        build_engine = partial(AsyncLLMEngine.from_engine_args,
                               engine_args=engine_args,
                               engine_config=engine_config,
                               usage_context=UsageContext.OPENAI_API_SERVER)
        if uses_ray:
            # Must run in main thread with ray for its signal handlers to work
            engine_client = build_engine()
        else:
            engine_client = await asyncio.get_running_loop().run_in_executor(
                None, build_engine)

        try:
            yield engine_client
        finally:
            # Run the shutdown even when the caller's block raised.
            if hasattr(engine_client, "shutdown"):
                engine_client.shutdown()
        return

    # Otherwise, use the multiprocessing AsyncLLMEngine.
    if "PROMETHEUS_MULTIPROC_DIR" not in os.environ:
        # Make TemporaryDirectory for prometheus multiprocessing
        # Note: global TemporaryDirectory will be automatically
        #   cleaned up upon exit.
        global prometheus_multiproc_dir
        prometheus_multiproc_dir = tempfile.TemporaryDirectory()
        os.environ["PROMETHEUS_MULTIPROC_DIR"] = prometheus_multiproc_dir.name
    else:
        logger.warning(
            "Found PROMETHEUS_MULTIPROC_DIR was set by user. "
            "This directory must be wiped between vLLM runs or "
            "you will find inaccurate metrics. Unset the variable "
            "and vLLM will properly handle cleanup.")

    # Select random path for IPC.
    ipc_path = get_open_zmq_ipc_path()
    logger.info("Multiprocessing frontend to use %s for IPC Path.", ipc_path)

    # Start RPCServer in separate process (holds the LLMEngine).
    # the current process might have CUDA context,
    # so we need to spawn a new process
    context = multiprocessing.get_context("spawn")

    # The Process can raise an exception during startup, which may
    # not actually result in an exitcode being reported. As a result
    # we use a shared variable to communicate the information.
    engine_alive = multiprocessing.Value('b', True, lock=False)
    engine_process = context.Process(target=run_mp_engine,
                                     args=(engine_args,
                                           UsageContext.OPENAI_API_SERVER,
                                           ipc_path, engine_alive))
    engine_process.start()
    engine_pid = engine_process.pid
    assert engine_pid is not None, "Engine process failed to start."
    logger.info("Started engine process with PID %d", engine_pid)

    # Everything after the process start lives inside the try block so that
    # a failure while building the client still terminates the process.
    mq_engine_client = None
    try:
        # Build RPCClient, which conforms to EngineClient Protocol.
        engine_config = engine_args.create_engine_config()
        build_client = partial(MQLLMEngineClient, ipc_path, engine_config,
                               engine_pid)
        mq_engine_client = await asyncio.get_running_loop().run_in_executor(
            None, build_client)

        # Retry setup until it succeeds or the engine process is found dead.
        while True:
            try:
                await mq_engine_client.setup()
                break
            except TimeoutError:
                if (not engine_process.is_alive() or not engine_alive.value):
                    raise RuntimeError(
                        "Engine process failed to start. See stack "
                        "trace for the root cause.") from None

        yield mq_engine_client  # type: ignore[misc]
    finally:
        # Ensure rpc server process was terminated
        engine_process.terminate()

        # Close all open connections to the backend
        if mq_engine_client is not None:
            mq_engine_client.close()

        # Wait for engine process to join
        engine_process.join(4)
        if engine_process.exitcode is None:
            # Kill if still running after the 4 second grace period
            engine_process.kill()

        # Lazy import for prometheus multiprocessing.
        # We need to set PROMETHEUS_MULTIPROC_DIR environment variable
        # before prometheus_client is imported.
        # See https://prometheus.github.io/client_python/multiprocess/
        from prometheus_client import multiprocess
        multiprocess.mark_process_dead(engine_process.pid)
233

234

Ethan Xu's avatar
Ethan Xu committed
235
# Module-level router: the endpoint functions below register themselves on it
# via decorators, and build_app() includes it in the FastAPI application.
router = APIRouter()
Zhuohan Li's avatar
Zhuohan Li committed
236

237

238
def mount_metrics(app: FastAPI):
    """Attach the Prometheus ``/metrics`` ASGI app to ``app``.

    When PROMETHEUS_MULTIPROC_DIR is set, metrics are served from a fresh
    registry fed by a multiprocess collector; otherwise the default
    Prometheus ASGI app is used.
    """
    # Lazy import for prometheus multiprocessing.
    # We need to set PROMETHEUS_MULTIPROC_DIR environment variable
    # before prometheus_client is imported.
    # See https://prometheus.github.io/client_python/multiprocess/
    from prometheus_client import (CollectorRegistry, make_asgi_app,
                                   multiprocess)

    multiproc_dir = os.getenv("PROMETHEUS_MULTIPROC_DIR", None)
    if multiproc_dir is None:
        metrics_app = make_asgi_app()
    else:
        logger.info("vLLM to use %s as PROMETHEUS_MULTIPROC_DIR",
                    multiproc_dir)
        registry = CollectorRegistry()
        multiprocess.MultiProcessCollector(registry)
        metrics_app = make_asgi_app(registry=registry)

    # Add prometheus asgi middleware to route /metrics requests
    metrics_route = Mount("/metrics", metrics_app)

    # Workaround for 307 Redirect for /metrics
    metrics_route.path_regex = re.compile("^/metrics(?P<path>.*)$")
    app.routes.append(metrics_route)
262
263


264
265
266
267
268
269
def base(request: Request) -> OpenAIServing:
    """Return a serving handler for functionality shared by all endpoints.

    No dedicated instance is created; the tokenization handler stored on the
    app state is reused.
    """
    shared_handler = tokenization(request)
    return shared_handler


def chat(request: Request) -> Optional[OpenAIServingChat]:
    """Return the chat handler stored on the app state (may be None)."""
    app_state = request.app.state
    return app_state.openai_serving_chat


273
def completion(request: Request) -> Optional[OpenAIServingCompletion]:
    """Return the completion handler stored on the app state (may be None)."""
    app_state = request.app.state
    return app_state.openai_serving_completion


277
278
def embedding(request: Request) -> Optional[OpenAIServingEmbedding]:
    """Return the embedding handler stored on the app state (may be None)."""
    app_state = request.app.state
    return app_state.openai_serving_embedding
279
280


281
282
def tokenization(request: Request) -> OpenAIServingTokenization:
    """Return the tokenization handler stored on the app state."""
    app_state = request.app.state
    return app_state.openai_serving_tokenization
283
284


285
def engine_client(request: Request) -> EngineClient:
    """Return the engine client stored on the app state."""
    app_state = request.app.state
    return app_state.engine_client


Ethan Xu's avatar
Ethan Xu committed
289
@router.get("/health")
async def health(raw_request: Request) -> Response:
    """Health check."""
    client = engine_client(raw_request)
    # An exception raised by the check propagates to the framework; an
    # empty 200 response is sent only when the check returns normally.
    await client.check_health()
    return Response(status_code=200)


Ethan Xu's avatar
Ethan Xu committed
296
@router.post("/tokenize")
async def tokenize(request: TokenizeRequest, raw_request: Request):
    """Tokenize the request via the tokenization handler.

    An ``ErrorResponse`` is returned as JSON with its own status code; a
    ``TokenizeResponse`` is returned as plain JSON.
    """
    result = await tokenization(raw_request).create_tokenize(request)

    if isinstance(result, ErrorResponse):
        return JSONResponse(content=result.model_dump(),
                            status_code=result.code)
    if isinstance(result, TokenizeResponse):
        return JSONResponse(content=result.model_dump())

    assert_never(result)

309

Ethan Xu's avatar
Ethan Xu committed
310
@router.post("/detokenize")
async def detokenize(request: DetokenizeRequest, raw_request: Request):
    """Detokenize the request via the tokenization handler.

    An ``ErrorResponse`` is returned as JSON with its own status code; a
    ``DetokenizeResponse`` is returned as plain JSON.
    """
    result = await tokenization(raw_request).create_detokenize(request)

    if isinstance(result, ErrorResponse):
        return JSONResponse(content=result.model_dump(),
                            status_code=result.code)
    if isinstance(result, DetokenizeResponse):
        return JSONResponse(content=result.model_dump())

    assert_never(result)

323

Ethan Xu's avatar
Ethan Xu committed
324
@router.get("/v1/models")
async def show_available_models(raw_request: Request):
    """Return the models reported by the shared serving handler as JSON."""
    model_list = await base(raw_request).show_available_models()
    payload = model_list.model_dump()
    return JSONResponse(content=payload)
Zhuohan Li's avatar
Zhuohan Li committed
330
331


Ethan Xu's avatar
Ethan Xu committed
332
@router.get("/version")
async def show_version():
    """Return the vLLM version as ``{"version": ...}``."""
    return JSONResponse(content={"version": VLLM_VERSION})


Ethan Xu's avatar
Ethan Xu committed
338
@router.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest,
                                 raw_request: Request):
    """Serve a chat completion request.

    Returns a JSON error (with the error's own status code) when the model
    has no chat handler or the handler reports an error, a JSON body for a
    ``ChatCompletionResponse``, and an event stream for anything else the
    handler returns.
    """
    handler = chat(raw_request)
    if handler is None:
        # Send the error with its status code, like the handler-error path
        # below, instead of returning the bare model (which FastAPI would
        # serialize with a 200 status).
        error = base(raw_request).create_error_response(
            message="The model does not support Chat Completions API")
        return JSONResponse(content=error.model_dump(),
                            status_code=error.code)

    generator = await handler.create_chat_completion(request, raw_request)

    if isinstance(generator, ErrorResponse):
        return JSONResponse(content=generator.model_dump(),
                            status_code=generator.code)

    elif isinstance(generator, ChatCompletionResponse):
        return JSONResponse(content=generator.model_dump())

    return StreamingResponse(content=generator, media_type="text/event-stream")

357

Ethan Xu's avatar
Ethan Xu committed
358
@router.post("/v1/completions")
async def create_completion(request: CompletionRequest, raw_request: Request):
    """Serve a text completion request.

    Returns a JSON error (with the error's own status code) when the model
    has no completion handler or the handler reports an error, a JSON body
    for a ``CompletionResponse``, and an event stream for anything else the
    handler returns.
    """
    handler = completion(raw_request)
    if handler is None:
        # Send the error with its status code, like the handler-error path
        # below, instead of returning the bare model (which FastAPI would
        # serialize with a 200 status).
        error = base(raw_request).create_error_response(
            message="The model does not support Completions API")
        return JSONResponse(content=error.model_dump(),
                            status_code=error.code)

    generator = await handler.create_completion(request, raw_request)
    if isinstance(generator, ErrorResponse):
        return JSONResponse(content=generator.model_dump(),
                            status_code=generator.code)
    elif isinstance(generator, CompletionResponse):
        return JSONResponse(content=generator.model_dump())

    return StreamingResponse(content=generator, media_type="text/event-stream")

Zhuohan Li's avatar
Zhuohan Li committed
374

Ethan Xu's avatar
Ethan Xu committed
375
@router.post("/v1/embeddings")
async def create_embedding(request: EmbeddingRequest, raw_request: Request):
    """Serve an embedding request.

    Returns a JSON error (with the error's own status code) when the model
    has no embedding handler or the handler reports an error, and a JSON
    body for an ``EmbeddingResponse``.
    """
    handler = embedding(raw_request)
    if handler is None:
        # Send the error with its status code, like the handler-error path
        # below, instead of returning the bare model (which FastAPI would
        # serialize with a 200 status).
        error = base(raw_request).create_error_response(
            message="The model does not support Embeddings API")
        return JSONResponse(content=error.model_dump(),
                            status_code=error.code)

    generator = await handler.create_embedding(request, raw_request)
    if isinstance(generator, ErrorResponse):
        return JSONResponse(content=generator.model_dump(),
                            status_code=generator.code)
    elif isinstance(generator, EmbeddingResponse):
        return JSONResponse(content=generator.model_dump())

    assert_never(generator)

391

392
393
394
395
396
397
# The profiler endpoints are only registered when VLLM_TORCH_PROFILER_DIR is
# set.
if envs.VLLM_TORCH_PROFILER_DIR:
    logger.warning(
        "Torch Profiler is enabled in the API server. This should ONLY be "
        "used for local development!")

    @router.post("/start_profile")
    async def start_profile(raw_request: Request):
        """Ask the engine client to start profiling."""
        logger.info("Starting profiler...")
        client = engine_client(raw_request)
        await client.start_profile()
        logger.info("Profiler started.")
        return Response(status_code=200)

    @router.post("/stop_profile")
    async def stop_profile(raw_request: Request):
        """Ask the engine client to stop profiling."""
        logger.info("Stopping profiler...")
        client = engine_client(raw_request)
        await client.stop_profile()
        logger.info("Profiler stopped.")
        return Response(status_code=200)


412
413
414
415
416
417
# The LoRA management endpoints are only registered when
# VLLM_ALLOW_RUNTIME_LORA_UPDATING is set.
if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
    logger.warning(
        "Lora dynamic loading & unloading is enabled in the API server. "
        "This should ONLY be used for local development!")

    @router.post("/v1/load_lora_adapter")
    async def load_lora_adapter(request: LoadLoraAdapterRequest,
                                raw_request: Request):
        """Load a LoRA adapter on every available serving handler.

        Stops at, and returns, the first error reported by a handler.
        Otherwise responds 200 with the last handler's response as content.
        """
        handled = False
        response = None
        for route in [chat, completion, embedding]:
            handler = route(raw_request)
            if handler is not None:
                handled = True
                response = await handler.load_lora_adapter(request)
                if isinstance(response, ErrorResponse):
                    return JSONResponse(content=response.model_dump(),
                                        status_code=response.code)

        if not handled:
            # Without this guard ``response`` would be unbound here.
            error = base(raw_request).create_error_response(
                message="No serving handler is available to load the "
                "LoRA adapter")
            return JSONResponse(content=error.model_dump(),
                                status_code=error.code)

        return Response(status_code=200, content=response)

    @router.post("/v1/unload_lora_adapter")
    async def unload_lora_adapter(request: UnloadLoraAdapterRequest,
                                  raw_request: Request):
        """Unload a LoRA adapter on every available serving handler.

        Stops at, and returns, the first error reported by a handler.
        Otherwise responds 200 with the last handler's response as content.
        """
        handled = False
        response = None
        for route in [chat, completion, embedding]:
            handler = route(raw_request)
            if handler is not None:
                handled = True
                response = await handler.unload_lora_adapter(request)
                if isinstance(response, ErrorResponse):
                    return JSONResponse(content=response.model_dump(),
                                        status_code=response.code)

        if not handled:
            # Without this guard ``response`` would be unbound here.
            error = base(raw_request).create_error_response(
                message="No serving handler is available to unload the "
                "LoRA adapter")
            return JSONResponse(content=error.model_dump(),
                                status_code=error.code)

        return Response(status_code=200, content=response)


444
def build_app(args: Namespace) -> FastAPI:
    """Build the FastAPI application from the parsed CLI arguments.

    Sets up, in order: the app itself (optionally without the docs
    endpoints), the module router, the metrics mount, CORS, the request
    validation error handler, optional API-key authentication for paths
    under ``{root_path}/v1``, the ``X-Request-Id`` response header, and any
    user-supplied middleware listed in ``args.middleware``.

    Raises:
        ValueError: If an entry of ``args.middleware`` is neither a class
            nor a coroutine function.
    """
    # Function-scope import so this block does not depend on the module's
    # import section.
    import secrets

    if args.disable_fastapi_docs:
        app = FastAPI(openapi_url=None,
                      docs_url=None,
                      redoc_url=None,
                      lifespan=lifespan)
    else:
        app = FastAPI(lifespan=lifespan)
    app.include_router(router)
    app.root_path = args.root_path

    mount_metrics(app)

    app.add_middleware(
        CORSMiddleware,
        allow_origins=args.allowed_origins,
        allow_credentials=args.allow_credentials,
        allow_methods=args.allowed_methods,
        allow_headers=args.allowed_headers,
    )

    @app.exception_handler(RequestValidationError)
    async def validation_exception_handler(_, exc):
        # Use the tokenization handler: init_app_state() always creates it,
        # whereas the chat handler is None unless the task is "generate".
        handler = app.state.openai_serving_tokenization
        err = handler.create_error_response(message=str(exc))
        return JSONResponse(err.model_dump(),
                            status_code=HTTPStatus.BAD_REQUEST)

    if token := envs.VLLM_API_KEY or args.api_key:
        expected_auth = ("Bearer " + token).encode("utf-8")

        @app.middleware("http")
        async def authentication(request: Request, call_next):
            root_path = "" if args.root_path is None else args.root_path
            if request.method == "OPTIONS":
                return await call_next(request)
            if not request.url.path.startswith(f"{root_path}/v1"):
                return await call_next(request)
            supplied_auth = request.headers.get("Authorization") or ""
            # Constant-time comparison; bytes are compared because
            # compare_digest() rejects non-ASCII str arguments.
            if not secrets.compare_digest(supplied_auth.encode("utf-8"),
                                          expected_auth):
                return JSONResponse(content={"error": "Unauthorized"},
                                    status_code=401)
            return await call_next(request)

    @app.middleware("http")
    async def add_request_id(request: Request, call_next):
        # Echo the caller's request id, or generate one if none was sent.
        request_id = request.headers.get("X-Request-Id") or uuid.uuid4().hex
        response = await call_next(request)
        response.headers["X-Request-Id"] = request_id
        return response

    for middleware in args.middleware:
        # Each entry is a dotted path "package.module.object".
        module_path, object_name = middleware.rsplit(".", 1)
        imported = getattr(importlib.import_module(module_path), object_name)
        if inspect.isclass(imported):
            app.add_middleware(imported)
        elif inspect.iscoroutinefunction(imported):
            app.middleware("http")(imported)
        else:
            raise ValueError(f"Invalid middleware {middleware}. "
                             f"Must be a function or a class.")

    return app


507
def init_app_state(
    engine_client: EngineClient,
    model_config: ModelConfig,
    state: State,
    args: Namespace,
) -> None:
    """Populate ``state`` with the engine client and the serving handlers.

    The chat and completion handlers are created only when
    ``model_config.task`` is ``"generate"``, the embedding handler only when
    it is ``"embedding"``; otherwise the attribute is set to ``None``. The
    tokenization handler is always created.
    """
    served_model_names = ([args.model] if args.served_model_name is None else
                          args.served_model_name)

    request_logger = (None if args.disable_log_requests else
                      RequestLogger(max_log_len=args.max_log_len))

    # Every served name maps to the same underlying model path.
    base_model_paths = [
        BaseModelPath(name=served_name, model_path=args.model)
        for served_name in served_model_names
    ]

    state.engine_client = engine_client
    state.log_stats = not args.disable_log_stats

    if model_config.task == "generate":
        state.openai_serving_chat = OpenAIServingChat(
            engine_client,
            model_config,
            base_model_paths,
            args.response_role,
            lora_modules=args.lora_modules,
            prompt_adapters=args.prompt_adapters,
            request_logger=request_logger,
            chat_template=args.chat_template,
            return_tokens_as_token_ids=args.return_tokens_as_token_ids,
            enable_auto_tools=args.enable_auto_tool_choice,
            tool_parser=args.tool_call_parser,
            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
        )
    else:
        state.openai_serving_chat = None

    if model_config.task == "generate":
        state.openai_serving_completion = OpenAIServingCompletion(
            engine_client,
            model_config,
            base_model_paths,
            lora_modules=args.lora_modules,
            prompt_adapters=args.prompt_adapters,
            request_logger=request_logger,
            return_tokens_as_token_ids=args.return_tokens_as_token_ids,
        )
    else:
        state.openai_serving_completion = None

    if model_config.task == "embedding":
        state.openai_serving_embedding = OpenAIServingEmbedding(
            engine_client,
            model_config,
            base_model_paths,
            request_logger=request_logger,
            chat_template=args.chat_template,
        )
    else:
        state.openai_serving_embedding = None

    state.openai_serving_tokenization = OpenAIServingTokenization(
        engine_client,
        model_config,
        base_model_paths,
        lora_modules=args.lora_modules,
        request_logger=request_logger,
        chat_template=args.chat_template,
    )
569
570


571
async def run_server(args, **uvicorn_kwargs) -> None:
    """Run the API server until it shuts down.

    Validates the tool-parser options, reserves the listening port, builds
    the engine client and the FastAPI app, and serves HTTP.

    Args:
        args: Parsed CLI arguments.
        **uvicorn_kwargs: Extra keyword arguments forwarded to serve_http().

    Raises:
        KeyError: If auto tool choice is enabled with an unknown tool call
            parser.
    """
    logger.info("vLLM API server version %s", VLLM_VERSION)
    logger.info("args: %s", args)

    if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
        ToolParserManager.import_tool_parser(args.tool_parser_plugin)

    valid_tool_parsers = ToolParserManager.tool_parsers.keys()
    if args.enable_auto_tool_choice \
        and args.tool_call_parser not in valid_tool_parsers:
        raise KeyError(f"invalid tool call parser: {args.tool_call_parser} "
                       f"(chose from {{ {','.join(valid_tool_parsers)} }})")

    # workaround to make sure that we bind the port before the engine is set up.
    # This avoids race conditions with ray.
    # see https://github.com/vllm-project/vllm/issues/8204
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # SO_REUSEADDR has to be set before bind() to affect the bind itself.
        sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        sock.bind((args.host or "", args.port))

        def signal_handler(*_) -> None:
            # Interrupt server on sigterm while initializing
            raise KeyboardInterrupt("terminated")

        signal.signal(signal.SIGTERM, signal_handler)

        async with build_async_engine_client(args) as engine_client:
            app = build_app(args)

            model_config = await engine_client.get_model_config()
            init_app_state(engine_client, model_config, app.state, args)

            shutdown_task = await serve_http(
                app,
                host=args.host,
                port=args.port,
                log_level=args.uvicorn_log_level,
                timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
                ssl_keyfile=args.ssl_keyfile,
                ssl_certfile=args.ssl_certfile,
                ssl_ca_certs=args.ssl_ca_certs,
                ssl_cert_reqs=args.ssl_cert_reqs,
                **uvicorn_kwargs,
            )

        # NB: Await server shutdown only after the backend context is exited
        await shutdown_task
    finally:
        # Release the placeholder socket on error paths as well.
        sock.close()

Ethan Xu's avatar
Ethan Xu committed
621
622
623
624
625
626
627
628

if __name__ == "__main__":
    # NOTE(simon):
    # This section should be in sync with vllm/scripts.py for CLI entrypoints.
    parser = make_arg_parser(
        FlexibleArgumentParser(
            description="vLLM OpenAI-Compatible RESTful API server."))
    args = parser.parse_args()
    validate_parsed_serve_args(args)

    # Run the server coroutine to completion on a uvloop event loop.
    uvloop.run(run_server(args))