Unverified Commit ee812580 authored by Daniele's avatar Daniele Committed by GitHub
Browse files

[Frontend] split run_server into build_server and run_server (#6740)

parent 40468b13
...@@ -2,6 +2,7 @@ import asyncio ...@@ -2,6 +2,7 @@ import asyncio
import importlib import importlib
import inspect import inspect
import re import re
import signal
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from http import HTTPStatus from http import HTTPStatus
from typing import Optional, Set from typing import Optional, Set
...@@ -213,12 +214,13 @@ def build_app(args): ...@@ -213,12 +214,13 @@ def build_app(args):
return app return app
def run_server(args, llm_engine=None): async def build_server(
args,
llm_engine: Optional[AsyncLLMEngine] = None,
**uvicorn_kwargs,
) -> uvicorn.Server:
app = build_app(args) app = build_app(args)
logger.info("vLLM API server version %s", VLLM_VERSION)
logger.info("args: %s", args)
if args.served_model_name is not None: if args.served_model_name is not None:
served_model_names = args.served_model_name served_model_names = args.served_model_name
else: else:
...@@ -231,19 +233,7 @@ def run_server(args, llm_engine=None): ...@@ -231,19 +233,7 @@ def run_server(args, llm_engine=None):
if llm_engine is not None else AsyncLLMEngine.from_engine_args( if llm_engine is not None else AsyncLLMEngine.from_engine_args(
engine_args, usage_context=UsageContext.OPENAI_API_SERVER)) engine_args, usage_context=UsageContext.OPENAI_API_SERVER))
event_loop: Optional[asyncio.AbstractEventLoop] model_config = await engine.get_model_config()
try:
event_loop = asyncio.get_running_loop()
except RuntimeError:
event_loop = None
if event_loop is not None and event_loop.is_running():
# If the current is instanced by Ray Serve,
# there is already a running event loop
model_config = event_loop.run_until_complete(engine.get_model_config())
else:
# When using single vLLM without engine_use_ray
model_config = asyncio.run(engine.get_model_config())
if args.disable_log_requests: if args.disable_log_requests:
request_logger = None request_logger = None
...@@ -296,7 +286,8 @@ def run_server(args, llm_engine=None): ...@@ -296,7 +286,8 @@ def run_server(args, llm_engine=None):
methods = ', '.join(route.methods) methods = ', '.join(route.methods)
logger.info("Route: %s, Methods: %s", route.path, methods) logger.info("Route: %s, Methods: %s", route.path, methods)
uvicorn.run(app, config = uvicorn.Config(
app,
host=args.host, host=args.host,
port=args.port, port=args.port,
log_level=args.uvicorn_log_level, log_level=args.uvicorn_log_level,
...@@ -304,7 +295,39 @@ def run_server(args, llm_engine=None): ...@@ -304,7 +295,39 @@ def run_server(args, llm_engine=None):
ssl_keyfile=args.ssl_keyfile, ssl_keyfile=args.ssl_keyfile,
ssl_certfile=args.ssl_certfile, ssl_certfile=args.ssl_certfile,
ssl_ca_certs=args.ssl_ca_certs, ssl_ca_certs=args.ssl_ca_certs,
ssl_cert_reqs=args.ssl_cert_reqs) ssl_cert_reqs=args.ssl_cert_reqs,
**uvicorn_kwargs,
)
return uvicorn.Server(config)
async def run_server(args, llm_engine=None, **uvicorn_kwargs) -> None:
    """Build the HTTP server and serve it until it exits or is cancelled.

    Logs the server version and the arguments, builds the server via
    ``build_server``, runs ``server.serve()`` as a task, and registers
    SIGINT/SIGTERM handlers on the running loop that cancel that task.
    When the task is cancelled, the server is shut down explicitly.

    Args:
        args: Argument namespace, forwarded unchanged to ``build_server``.
        llm_engine: Optional engine instance, forwarded to ``build_server``.
        **uvicorn_kwargs: Extra keyword arguments forwarded to
            ``build_server``.
    """
    logger.info("vLLM API server version %s", VLLM_VERSION)
    logger.info("args: %s", args)

    server = await build_server(
        args,
        llm_engine,
        **uvicorn_kwargs,
    )

    loop = asyncio.get_running_loop()

    # Serve from a dedicated task so the signal handler has something
    # concrete to cancel.
    server_task = loop.create_task(server.serve())

    def signal_handler() -> None:
        # Prevents the uvicorn signal handler from exiting early.
        server_task.cancel()

    loop.add_signal_handler(signal.SIGINT, signal_handler)
    loop.add_signal_handler(signal.SIGTERM, signal_handler)

    try:
        await server_task
    except asyncio.CancelledError:
        # Report through the module logger (like the startup messages above)
        # instead of a bare print, so the message honours logging config.
        logger.info("Gracefully stopping http server")
        await server.shutdown()
if __name__ == "__main__": if __name__ == "__main__":
...@@ -314,4 +337,4 @@ if __name__ == "__main__": ...@@ -314,4 +337,4 @@ if __name__ == "__main__":
description="vLLM OpenAI-Compatible RESTful API server.") description="vLLM OpenAI-Compatible RESTful API server.")
parser = make_arg_parser(parser) parser = make_arg_parser(parser)
args = parser.parse_args() args = parser.parse_args()
run_server(args) asyncio.run(run_server(args))
# The CLI entrypoint to vLLM. # The CLI entrypoint to vLLM.
import argparse import argparse
import asyncio
import os import os
import signal import signal
import sys import sys
...@@ -25,7 +26,7 @@ def serve(args: argparse.Namespace) -> None: ...@@ -25,7 +26,7 @@ def serve(args: argparse.Namespace) -> None:
# EngineArgs expects the model name to be passed as --model. # EngineArgs expects the model name to be passed as --model.
args.model = args.model_tag args.model = args.model_tag
run_server(args) asyncio.run(run_server(args))
def interactive_cli(args: argparse.Namespace) -> None: def interactive_cli(args: argparse.Namespace) -> None:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment