grpc_server.py 5.23 KB
Newer Older
1
2
3
4
5
6
7
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# mypy: ignore-errors
"""
vLLM gRPC Server

Starts a gRPC server backed by AsyncLLM, using the VllmEngineServicer
from the smg-grpc-servicer package.

Usage:
    python -m vllm.entrypoints.grpc_server --model <model_path>

Example:
    python -m vllm.entrypoints.grpc_server \
        --model meta-llama/Llama-2-7b-hf \
        --host 0.0.0.0 \
        --port 50051
"""

import argparse
import asyncio
import signal
import sys
import time

27
28
29
30
31
32
33
34
35
36
37
try:
    import grpc
    from grpc_reflection.v1alpha import reflection
    from smg_grpc_proto import vllm_engine_pb2, vllm_engine_pb2_grpc
    from smg_grpc_servicer.vllm.servicer import VllmEngineServicer
except ImportError:
    raise ImportError(
        "smg-grpc-servicer is required for gRPC mode. "
        "Install it with: pip install vllm[grpc]"
    ) from None

38
39
import uvloop

40
from vllm import envs
41
from vllm.engine.arg_utils import AsyncEngineArgs
42
from vllm.entrypoints.utils import log_version_and_model
43
44
45
46
47
48
49
50
51
52
53
from vllm.logger import init_logger
from vllm.usage.usage_lib import UsageContext
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.v1.engine.async_llm import AsyncLLM
from vllm.version import __version__ as VLLM_VERSION

logger = init_logger(__name__)


async def serve_grpc(args: argparse.Namespace):
    """
    Main gRPC serving function.

    Creates an AsyncLLM engine from the parsed CLI arguments, registers a
    VllmEngineServicer (plus server reflection) on a ``grpc.aio`` server,
    binds it to ``args.host``/``args.port`` and then waits until SIGTERM or
    SIGINT is received. On exit -- normal or exceptional -- the stats task is
    cancelled, the gRPC server is stopped and the engine is shut down.

    Args:
        args: Parsed command line arguments
    """
    log_version_and_model(logger, VLLM_VERSION, args.model)
    logger.info("vLLM gRPC server args: %s", args)

    # Captured before engine start-up and handed to the servicer below.
    start_time = time.time()

    # Create engine args
    engine_args = AsyncEngineArgs.from_cli_args(args)

    # Build vLLM config
    vllm_config = engine_args.create_engine_config(
        usage_context=UsageContext.OPENAI_API_SERVER,
    )

    # Create AsyncLLM
    async_llm = AsyncLLM.from_vllm_config(
        vllm_config=vllm_config,
        usage_context=UsageContext.OPENAI_API_SERVER,
        enable_log_requests=args.enable_log_requests,
        disable_log_stats=args.disable_log_stats,
    )

    # Pre-initialise everything the cleanup path reads so that a failure
    # anywhere below (servicer creation, port binding, server start) cannot
    # raise UnboundLocalError in ``finally`` and hide the real error.
    server = None
    stats_task = None

    # From here on the engine exists, so every step runs under try/finally to
    # guarantee that ``async_llm.shutdown()`` is reached.
    try:
        # Create servicer
        servicer = VllmEngineServicer(async_llm, start_time)

        # Create gRPC server
        server = grpc.aio.server(
            options=[
                ("grpc.max_send_message_length", -1),
                ("grpc.max_receive_message_length", -1),
                # Tolerate client keepalive pings every 10s (default 300s is
                # too strict for non-streaming requests where no DATA frames
                # flow during generation)
                ("grpc.http2.min_recv_ping_interval_without_data_ms", 10000),
                ("grpc.keepalive_permit_without_calls", True),
            ],
        )

        # Add servicer to server
        vllm_engine_pb2_grpc.add_VllmEngineServicer_to_server(servicer, server)

        # Enable reflection for grpcurl and other tools
        service_names = (
            vllm_engine_pb2.DESCRIPTOR.services_by_name["VllmEngine"].full_name,
            reflection.SERVICE_NAME,
        )
        reflection.enable_server_reflection(service_names, server)

        # Bind to address
        host = args.host or "0.0.0.0"
        address = f"{host}:{args.port}"
        server.add_insecure_port(address)

        # Start server
        await server.start()
        logger.info("vLLM gRPC server started on %s", address)
        logger.info("Server is ready to accept requests")

        # Start periodic stats logging (mirrors the HTTP server's lifespan task)
        if not args.disable_log_stats:

            async def _force_log():
                while True:
                    await asyncio.sleep(envs.VLLM_LOG_STATS_INTERVAL)
                    await async_llm.do_log_stats()

            stats_task = asyncio.create_task(_force_log())

        # Handle shutdown signals
        loop = asyncio.get_running_loop()
        stop_event = asyncio.Event()

        def signal_handler():
            logger.info("Received shutdown signal")
            stop_event.set()

        for sig in (signal.SIGTERM, signal.SIGINT):
            loop.add_signal_handler(sig, signal_handler)

        try:
            await stop_event.wait()
        except KeyboardInterrupt:
            logger.info("Interrupted by user")
    finally:
        logger.info("Shutting down vLLM gRPC server...")
        try:
            if stats_task is not None:
                stats_task.cancel()
            if server is not None:
                await server.stop(grace=5.0)
                logger.info("gRPC server stopped")
        finally:
            # Always release the engine, even if stopping the server failed.
            async_llm.shutdown()
            logger.info("AsyncLLM engine stopped")
            logger.info("Shutdown complete")


def main():
    """Main entry point for python -m vllm.entrypoints.grpc_server.

    Parses the server flags (``--host``, ``--port``) together with the
    arguments registered by ``AsyncEngineArgs.add_cli_args``, then runs
    ``serve_grpc`` under uvloop. Any ``Exception`` escaping the server is
    logged with its traceback and the process exits with status 1.
    """
    cli = FlexibleArgumentParser(description="vLLM gRPC Server")

    # Server args: (flag, type, default, help)
    server_options = (
        ("--host", str, "0.0.0.0", "Host to bind gRPC server to"),
        ("--port", int, 50051, "Port to bind gRPC server to"),
    )
    for flag, value_type, default_value, help_text in server_options:
        cli.add_argument(
            flag,
            type=value_type,
            default=default_value,
            help=help_text,
        )

    # add_cli_args returns the parser that is used for parsing below.
    cli = AsyncEngineArgs.add_cli_args(cli)
    parsed = cli.parse_args()

    # Run server
    try:
        uvloop.run(serve_grpc(parsed))
    except Exception as exc:
        logger.exception("Server failed: %s", exc)
        sys.exit(1)


# Start the server only when this module is executed directly (e.g. via
# `python -m vllm.entrypoints.grpc_server`), not when it is imported.
if __name__ == "__main__":
    main()