#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# mypy: ignore-errors
"""
vLLM gRPC Server

Starts a gRPC server backed by AsyncLLM, using the VllmEngineServicer
from the smg-grpc-servicer package.

Usage:
    python -m vllm.entrypoints.grpc_server --model <model_path>

Example:
    python -m vllm.entrypoints.grpc_server \
        --model meta-llama/Llama-2-7b-hf \
        --host 0.0.0.0 \
        --port 50051
"""

import argparse
import asyncio
import signal
import sys
import time

try:
    import grpc
    from grpc_reflection.v1alpha import reflection
    from smg_grpc_proto import vllm_engine_pb2, vllm_engine_pb2_grpc
    from smg_grpc_servicer.vllm.servicer import VllmEngineServicer
except ImportError:
    raise ImportError(
        "smg-grpc-servicer is required for gRPC mode. "
        "Install it with: pip install vllm[grpc]"
    ) from None

import uvloop

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.utils import log_version_and_model
from vllm.logger import init_logger
from vllm.usage.usage_lib import UsageContext
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.v1.engine.async_llm import AsyncLLM
from vllm.version import __version__ as VLLM_VERSION

# Module-level logger shared by serve_grpc() and main().
logger = init_logger(__name__)


async def serve_grpc(args: argparse.Namespace) -> None:
    """
    Main gRPC serving function.

    Builds an AsyncLLM engine from the parsed CLI arguments, registers a
    VllmEngineServicer (plus server reflection) on a ``grpc.aio`` server,
    binds it to ``args.host:args.port`` and then blocks until SIGTERM or
    SIGINT is received. On the way out, the gRPC server is stopped with a
    5 second grace period and the engine is shut down.

    Once the engine has been created, it is always shut down again, even if
    a later setup step (servicer construction, reflection, port binding,
    server start) fails.

    Args:
        args: Parsed command line arguments
    """
    log_version_and_model(logger, VLLM_VERSION, args.model)
    logger.info("vLLM gRPC server args: %s", args)

    # Captured before engine construction and handed to the servicer.
    start_time = time.time()

    # Create engine args
    engine_args = AsyncEngineArgs.from_cli_args(args)

    # Build vLLM config
    vllm_config = engine_args.create_engine_config(
        usage_context=UsageContext.OPENAI_API_SERVER,
    )

    # Create AsyncLLM
    async_llm = AsyncLLM.from_vllm_config(
        vllm_config=vllm_config,
        usage_context=UsageContext.OPENAI_API_SERVER,
        enable_log_requests=args.enable_log_requests,
        disable_log_stats=args.disable_log_stats,
    )

    # From here on the engine exists, so everything below is guarded: any
    # failure while wiring up or running the server must still reach
    # async_llm.shutdown() in the finally clause.
    server = None
    try:
        # Create servicer
        servicer = VllmEngineServicer(async_llm, start_time)

        # Create gRPC server
        server = grpc.aio.server(
            options=[
                ("grpc.max_send_message_length", -1),
                ("grpc.max_receive_message_length", -1),
                # Tolerate client keepalive pings every 10s (default 300s is
                # too strict for non-streaming requests where no DATA frames
                # flow during generation)
                ("grpc.http2.min_recv_ping_interval_without_data_ms", 10000),
                ("grpc.keepalive_permit_without_calls", True),
            ],
        )

        # Add servicer to server
        vllm_engine_pb2_grpc.add_VllmEngineServicer_to_server(servicer, server)

        # Enable reflection for grpcurl and other tools
        service_names = (
            vllm_engine_pb2.DESCRIPTOR.services_by_name["VllmEngine"].full_name,
            reflection.SERVICE_NAME,
        )
        reflection.enable_server_reflection(service_names, server)

        # Bind to address
        host = args.host or "0.0.0.0"
        address = f"{host}:{args.port}"
        server.add_insecure_port(address)

        # Start server
        await server.start()
        logger.info("vLLM gRPC server started on %s", address)
        logger.info("Server is ready to accept requests")

        # Handle shutdown signals
        loop = asyncio.get_running_loop()
        stop_event = asyncio.Event()

        def signal_handler():
            logger.info("Received shutdown signal")
            stop_event.set()

        for sig in (signal.SIGTERM, signal.SIGINT):
            loop.add_signal_handler(sig, signal_handler)

        try:
            await stop_event.wait()
        except KeyboardInterrupt:
            logger.info("Interrupted by user")
    finally:
        logger.info("Shutting down vLLM gRPC server...")
        try:
            # server is None when setup failed before the server object
            # was created; there is nothing to stop in that case.
            if server is not None:
                await server.stop(grace=5.0)
                logger.info("gRPC server stopped")
        finally:
            # Nested so the engine is released even if server.stop() raises.
            async_llm.shutdown()
            logger.info("AsyncLLM engine stopped")
        logger.info("Shutdown complete")


def main() -> None:
    """Main entry point for python -m vllm.entrypoints.grpc_server.

    Builds the argument parser (server ``--host``/``--port`` options plus
    the options added by ``AsyncEngineArgs.add_cli_args``), parses the
    command line and runs ``serve_grpc`` under uvloop. Any exception raised
    while serving is logged with its traceback and the process exits with
    status 1.
    """
    parser = FlexibleArgumentParser(
        description="vLLM gRPC Server",
    )

    # Server args
    parser.add_argument(
        "--host",
        type=str,
        default="0.0.0.0",
        help="Host to bind gRPC server to",
    )
    parser.add_argument(
        "--port",
        type=int,
        default=50051,
        help="Port to bind gRPC server to",
    )
    # Engine args are registered on the same parser as the server args.
    parser = AsyncEngineArgs.add_cli_args(parser)

    args = parser.parse_args()

    # Run server
    try:
        uvloop.run(serve_grpc(args))
    except Exception as e:
        # Top-level boundary: log the traceback and report failure through
        # the exit status.
        logger.exception("Server failed: %s", e)
        sys.exit(1)


# Run the server only when this file is executed as a script or via
# ``python -m``, not when it is imported.
if __name__ == "__main__":
    main()