"docs/vscode:/vscode.git/clone" did not exist on "4b612664fdfb4e87af6684403872d83ac04fa496"
Unverified commit 7eb0cb4a, authored by Simon Mo, committed by GitHub
Browse files

Revert "[Frontend] Factor out code for running uvicorn" (#7012)


Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
parent a0dce938
...@@ -60,7 +60,6 @@ files = [ ...@@ -60,7 +60,6 @@ files = [
"vllm/logging", "vllm/logging",
"vllm/multimodal", "vllm/multimodal",
"vllm/platforms", "vllm/platforms",
"vllm/server",
"vllm/transformers_utils", "vllm/transformers_utils",
"vllm/triton_utils", "vllm/triton_utils",
"vllm/usage", "vllm/usage",
......
...@@ -5,12 +5,12 @@ For production use, we recommend using our OpenAI compatible server. ...@@ -5,12 +5,12 @@ For production use, we recommend using our OpenAI compatible server.
We are also not going to accept PRs modifying this file, please We are also not going to accept PRs modifying this file, please
change `vllm/entrypoints/openai/api_server.py` instead. change `vllm/entrypoints/openai/api_server.py` instead.
""" """
import asyncio
import json import json
import ssl import ssl
from argparse import Namespace from typing import AsyncGenerator
from typing import Any, AsyncGenerator, Optional
import uvicorn
from fastapi import FastAPI, Request from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, Response, StreamingResponse from fastapi.responses import JSONResponse, Response, StreamingResponse
...@@ -18,10 +18,8 @@ from vllm.engine.arg_utils import AsyncEngineArgs ...@@ -18,10 +18,8 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.server import serve_http
from vllm.usage.usage_lib import UsageContext from vllm.usage.usage_lib import UsageContext
from vllm.utils import FlexibleArgumentParser, random_uuid from vllm.utils import FlexibleArgumentParser, random_uuid
from vllm.version import __version__ as VLLM_VERSION
logger = init_logger("vllm.entrypoints.api_server") logger = init_logger("vllm.entrypoints.api_server")
...@@ -83,50 +81,6 @@ async def generate(request: Request) -> Response: ...@@ -83,50 +81,6 @@ async def generate(request: Request) -> Response:
return JSONResponse(ret) return JSONResponse(ret)
def build_app(args: Namespace) -> FastAPI:
    """Copy the CLI root path onto the module-level app and return it.

    Args:
        args: Parsed command-line arguments; only ``root_path`` is read.

    Returns:
        The module-level ``app`` object, after its ``root_path`` is set.
    """
    # ``app`` is mutated but never rebound here, so a plain global lookup
    # resolves it.
    root_path = args.root_path
    app.root_path = root_path
    return app
async def init_app(
    args: Namespace,
    llm_engine: Optional[AsyncLLMEngine] = None,
) -> FastAPI:
    """Configure the app and set the module-level ``engine``.

    Args:
        args: Parsed command-line arguments, forwarded to ``build_app`` and
            to ``AsyncEngineArgs.from_cli_args``.
        llm_engine: An engine to use as-is. When ``None``, a new
            ``AsyncLLMEngine`` is built from ``args``.

    Returns:
        The app returned by ``build_app``.
    """
    global engine

    configured_app = build_app(args)

    # The engine args are parsed whether or not an engine was supplied.
    cli_engine_args = AsyncEngineArgs.from_cli_args(args)
    if llm_engine is None:
        engine = AsyncLLMEngine.from_engine_args(
            cli_engine_args, usage_context=UsageContext.API_SERVER)
    else:
        engine = llm_engine

    return configured_app
async def run_server(args: Namespace,
                     llm_engine: Optional[AsyncLLMEngine] = None,
                     **uvicorn_kwargs: Any) -> None:
    """Log startup information, initialise the app and serve it over HTTP.

    Args:
        args: Parsed command-line arguments. The host, port, log level and
            SSL settings are read from it and handed to ``serve_http``.
        llm_engine: Optional engine forwarded to ``init_app``.
        **uvicorn_kwargs: Extra keyword arguments forwarded to
            ``serve_http`` alongside the options taken from ``args``.
    """
    logger.info("vLLM API server version %s", VLLM_VERSION)
    logger.info("args: %s", args)

    http_app = await init_app(args, llm_engine)

    # Options taken from the CLI; caller-supplied kwargs are merged in at
    # the call site, where a duplicate key raises ``TypeError``.
    server_options = {
        "host": args.host,
        "port": args.port,
        "log_level": args.log_level,
        "timeout_keep_alive": TIMEOUT_KEEP_ALIVE,
        "ssl_keyfile": args.ssl_keyfile,
        "ssl_certfile": args.ssl_certfile,
        "ssl_ca_certs": args.ssl_ca_certs,
        "ssl_cert_reqs": args.ssl_cert_reqs,
    }
    await serve_http(http_app, **server_options, **uvicorn_kwargs)
if __name__ == "__main__": if __name__ == "__main__":
parser = FlexibleArgumentParser() parser = FlexibleArgumentParser()
parser.add_argument("--host", type=str, default=None) parser.add_argument("--host", type=str, default=None)
...@@ -151,5 +105,25 @@ if __name__ == "__main__": ...@@ -151,5 +105,25 @@ if __name__ == "__main__":
parser.add_argument("--log-level", type=str, default="debug") parser.add_argument("--log-level", type=str, default="debug")
parser = AsyncEngineArgs.add_cli_args(parser) parser = AsyncEngineArgs.add_cli_args(parser)
args = parser.parse_args() args = parser.parse_args()
engine_args = AsyncEngineArgs.from_cli_args(args)
engine = AsyncLLMEngine.from_engine_args(
engine_args, usage_context=UsageContext.API_SERVER)
app.root_path = args.root_path
asyncio.run(run_server(args)) logger.info("Available routes are:")
for route in app.routes:
if not hasattr(route, 'methods'):
continue
methods = ', '.join(route.methods)
logger.info("Route: %s, Methods: %s", route.path, methods)
uvicorn.run(app,
host=args.host,
port=args.port,
log_level=args.log_level,
timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
ssl_keyfile=args.ssl_keyfile,
ssl_certfile=args.ssl_certfile,
ssl_ca_certs=args.ssl_ca_certs,
ssl_cert_reqs=args.ssl_cert_reqs)
...@@ -2,12 +2,14 @@ import asyncio ...@@ -2,12 +2,14 @@ import asyncio
import importlib import importlib
import inspect import inspect
import re import re
from argparse import Namespace import signal
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from http import HTTPStatus from http import HTTPStatus
from typing import Any, Optional, Set from typing import Optional, Set
from fastapi import APIRouter, FastAPI, Request import fastapi
import uvicorn
from fastapi import APIRouter, Request
from fastapi.exceptions import RequestValidationError from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, Response, StreamingResponse from fastapi.responses import JSONResponse, Response, StreamingResponse
...@@ -36,7 +38,6 @@ from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding ...@@ -36,7 +38,6 @@ from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
from vllm.entrypoints.openai.serving_tokenization import ( from vllm.entrypoints.openai.serving_tokenization import (
OpenAIServingTokenization) OpenAIServingTokenization)
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.server import serve_http
from vllm.usage.usage_lib import UsageContext from vllm.usage.usage_lib import UsageContext
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
from vllm.version import __version__ as VLLM_VERSION from vllm.version import __version__ as VLLM_VERSION
...@@ -56,7 +57,7 @@ _running_tasks: Set[asyncio.Task] = set() ...@@ -56,7 +57,7 @@ _running_tasks: Set[asyncio.Task] = set()
@asynccontextmanager @asynccontextmanager
async def lifespan(app: FastAPI): async def lifespan(app: fastapi.FastAPI):
async def _force_log(): async def _force_log():
while True: while True:
...@@ -74,7 +75,7 @@ async def lifespan(app: FastAPI): ...@@ -74,7 +75,7 @@ async def lifespan(app: FastAPI):
router = APIRouter() router = APIRouter()
def mount_metrics(app: FastAPI): def mount_metrics(app: fastapi.FastAPI):
# Add prometheus asgi middleware to route /metrics requests # Add prometheus asgi middleware to route /metrics requests
metrics_route = Mount("/metrics", make_asgi_app()) metrics_route = Mount("/metrics", make_asgi_app())
# Workaround for 307 Redirect for /metrics # Workaround for 307 Redirect for /metrics
...@@ -164,8 +165,8 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request): ...@@ -164,8 +165,8 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request):
return JSONResponse(content=generator.model_dump()) return JSONResponse(content=generator.model_dump())
def build_app(args: Namespace) -> FastAPI: def build_app(args):
app = FastAPI(lifespan=lifespan) app = fastapi.FastAPI(lifespan=lifespan)
app.include_router(router) app.include_router(router)
app.root_path = args.root_path app.root_path = args.root_path
...@@ -213,8 +214,11 @@ def build_app(args: Namespace) -> FastAPI: ...@@ -213,8 +214,11 @@ def build_app(args: Namespace) -> FastAPI:
return app return app
async def init_app(args: Namespace, async def build_server(
llm_engine: Optional[AsyncLLMEngine] = None) -> FastAPI: args,
llm_engine: Optional[AsyncLLMEngine] = None,
**uvicorn_kwargs,
) -> uvicorn.Server:
app = build_app(args) app = build_app(args)
if args.served_model_name is not None: if args.served_model_name is not None:
...@@ -277,17 +281,14 @@ async def init_app(args: Namespace, ...@@ -277,17 +281,14 @@ async def init_app(args: Namespace,
) )
app.root_path = args.root_path app.root_path = args.root_path
return app logger.info("Available routes are:")
for route in app.routes:
if not hasattr(route, 'methods'):
continue
methods = ', '.join(route.methods)
logger.info("Route: %s, Methods: %s", route.path, methods)
async def run_server(args: Namespace, config = uvicorn.Config(
llm_engine: Optional[AsyncLLMEngine] = None,
**uvicorn_kwargs: Any) -> None:
logger.info("vLLM API server version %s", VLLM_VERSION)
logger.info("args: %s", args)
app = await init_app(args, llm_engine)
await serve_http(
app, app,
host=args.host, host=args.host,
port=args.port, port=args.port,
...@@ -300,6 +301,36 @@ async def run_server(args: Namespace, ...@@ -300,6 +301,36 @@ async def run_server(args: Namespace,
**uvicorn_kwargs, **uvicorn_kwargs,
) )
return uvicorn.Server(config)
async def run_server(args, llm_engine=None, **uvicorn_kwargs) -> None:
    """Build the uvicorn server and run it until it exits or is signalled.

    Args:
        args: Parsed command-line arguments, forwarded to ``build_server``.
        llm_engine: Optional engine forwarded to ``build_server``.
        **uvicorn_kwargs: Extra keyword arguments forwarded to
            ``build_server``.

    SIGINT and SIGTERM cancel the serving task; the resulting
    ``asyncio.CancelledError`` is caught here and the server is shut down.
    """
    logger.info("vLLM API server version %s", VLLM_VERSION)
    logger.info("args: %s", args)

    server = await build_server(
        args,
        llm_engine,
        **uvicorn_kwargs,
    )

    loop = asyncio.get_running_loop()
    server_task = loop.create_task(server.serve())

    def signal_handler() -> None:
        # Cancel the serving task ourselves; this prevents the uvicorn
        # signal handler from exiting early.
        server_task.cancel()

    loop.add_signal_handler(signal.SIGINT, signal_handler)
    loop.add_signal_handler(signal.SIGTERM, signal_handler)

    try:
        await server_task
    except asyncio.CancelledError:
        # Use the module logger rather than print() so the shutdown notice
        # goes through the same logging path as the startup messages.
        logger.info("Gracefully stopping http server")
        await server.shutdown()
if __name__ == "__main__": if __name__ == "__main__":
# NOTE(simon): # NOTE(simon):
...@@ -308,5 +339,4 @@ if __name__ == "__main__": ...@@ -308,5 +339,4 @@ if __name__ == "__main__":
description="vLLM OpenAI-Compatible RESTful API server.") description="vLLM OpenAI-Compatible RESTful API server.")
parser = make_arg_parser(parser) parser = make_arg_parser(parser)
args = parser.parse_args() args = parser.parse_args()
asyncio.run(run_server(args)) asyncio.run(run_server(args))
from .launch import serve_http
__all__ = ["serve_http"]
import asyncio
import signal
from typing import Any
import uvicorn
from fastapi import FastAPI
from vllm.logger import init_logger
logger = init_logger(__name__)
async def serve_http(app: FastAPI, **uvicorn_kwargs: Any) -> None:
    """Log the app's routes, then serve it with uvicorn until cancelled.

    Args:
        app: The FastAPI application to serve.
        **uvicorn_kwargs: Keyword arguments passed to ``uvicorn.Config``.

    SIGINT and SIGTERM cancel the serving task; the resulting
    ``asyncio.CancelledError`` is caught here and the server is shut down.
    """
    logger.info("Available routes are:")
    for entry in app.routes:
        route_methods = getattr(entry, "methods", None)
        route_path = getattr(entry, "path", None)
        # Only entries that expose both attributes are listed.
        if route_methods is not None and route_path is not None:
            logger.info("Route: %s, Methods: %s", route_path,
                        ', '.join(route_methods))

    server = uvicorn.Server(uvicorn.Config(app, **uvicorn_kwargs))

    running_loop = asyncio.get_running_loop()
    serve_task = running_loop.create_task(server.serve())

    def cancel_serve_task() -> None:
        # prevents the uvicorn signal handler to exit early
        serve_task.cancel()

    for signum in (signal.SIGINT, signal.SIGTERM):
        running_loop.add_signal_handler(signum, cancel_serve_task)

    try:
        await serve_task
    except asyncio.CancelledError:
        logger.info("Gracefully stopping http server")
        await server.shutdown()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment