grpc_server.py 5.93 KB
Newer Older
1
2
3
4
5
6
7
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# mypy: ignore-errors
"""
vLLM gRPC Server

Starts a gRPC server backed by AsyncLLM, using the VllmEngineServicer
from the smg-grpc-servicer package.

Usage:
    python -m vllm.entrypoints.grpc_server --model <model_path>

Example:
    python -m vllm.entrypoints.grpc_server \
        --model meta-llama/Llama-2-7b-hf \
        --host 0.0.0.0 \
        --port 50051
"""

import argparse
import asyncio
import signal
import sys
import time

27
28
try:
    import grpc
29
    from grpc_health.v1 import health_pb2_grpc
30
31
    from grpc_reflection.v1alpha import reflection
    from smg_grpc_proto import vllm_engine_pb2, vllm_engine_pb2_grpc
32
    from smg_grpc_servicer.vllm.health_servicer import VllmHealthServicer
33
    from smg_grpc_servicer.vllm.servicer import VllmEngineServicer
34
except ImportError as e:
35
    raise ImportError(
36
37
38
39
40
        "gRPC mode requires smg-grpc-servicer. "
        "If not installed, run: pip install vllm[grpc]. "
        "If already installed, there may be a broken import due to a "
        "version mismatch — see the chained exception above for details."
    ) from e
41

42
43
import uvloop

44
from vllm import envs
45
from vllm.engine.arg_utils import AsyncEngineArgs
46
from vllm.entrypoints.utils import log_version_and_model
47
48
49
50
51
52
53
54
55
56
57
from vllm.logger import init_logger
from vllm.usage.usage_lib import UsageContext
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.v1.engine.async_llm import AsyncLLM
from vllm.version import __version__ as VLLM_VERSION

logger = init_logger(__name__)


async def serve_grpc(args: argparse.Namespace):
    """
    Main gRPC serving function.

    Builds an AsyncLLM engine from the parsed CLI arguments, registers the
    engine, health and reflection services on a ``grpc.aio`` server, binds
    an insecure port, and then blocks until SIGTERM or SIGINT is received.
    On the way out (signal or error after the server object exists) it
    cancels the stats task, marks the health service as not serving, stops
    the gRPC server and shuts the engine down.

    Args:
        args: Parsed command line arguments. ``model``, ``host``, ``port``,
            ``enable_log_requests`` and ``disable_log_stats`` are read
            directly; the namespace is also handed to
            ``AsyncEngineArgs.from_cli_args``.
    """
    log_version_and_model(logger, VLLM_VERSION, args.model)
    logger.info("vLLM gRPC server args: %s", args)

    # Captured before engine construction and handed to the servicer.
    start_time = time.time()

    # Create engine args
    engine_args = AsyncEngineArgs.from_cli_args(args)

    # Build vLLM config
    vllm_config = engine_args.create_engine_config(
        usage_context=UsageContext.OPENAI_API_SERVER,
    )

    # Create AsyncLLM
    async_llm = AsyncLLM.from_vllm_config(
        vllm_config=vllm_config,
        usage_context=UsageContext.OPENAI_API_SERVER,
        enable_log_requests=args.enable_log_requests,
        disable_log_stats=args.disable_log_stats,
    )

    # Create servicer
    servicer = VllmEngineServicer(async_llm, start_time)

    # Create gRPC server
    server = grpc.aio.server(
        options=[
            # -1 lifts the message size limits (see gRPC channel arguments).
            ("grpc.max_send_message_length", -1),
            ("grpc.max_receive_message_length", -1),
            # Tolerate client keepalive pings every 10s (default 300s is too
            # strict for non-streaming requests where no DATA frames flow
            # during generation)
            ("grpc.http2.min_recv_ping_interval_without_data_ms", 10000),
            ("grpc.keepalive_permit_without_calls", True),
        ],
    )

    # Add servicer to server
    vllm_engine_pb2_grpc.add_VllmEngineServicer_to_server(servicer, server)

    # Add standard gRPC health service for Kubernetes probes
    health_servicer = VllmHealthServicer(async_llm)
    health_pb2_grpc.add_HealthServicer_to_server(health_servicer, server)

    # Enable reflection for grpcurl and other tools
    service_names = (
        vllm_engine_pb2.DESCRIPTOR.services_by_name["VllmEngine"].full_name,
        "grpc.health.v1.Health",
        reflection.SERVICE_NAME,
    )
    reflection.enable_server_reflection(service_names, server)

    # Bind to address
    host = args.host or "0.0.0.0"
    address = f"{host}:{args.port}"
    server.add_insecure_port(address)

    # Bound before the ``try`` so the ``finally`` clause can always read it,
    # even when ``server.start()`` fails before the task would be created.
    # Otherwise the cleanup path raises UnboundLocalError, hiding the real
    # startup error and skipping server/engine shutdown.
    stats_task = None

    try:
        # Start server
        await server.start()
        logger.info("vLLM gRPC server started on %s", address)
        logger.info("Server is ready to accept requests")

        # Start periodic stats logging (mirrors the HTTP server's lifespan task)
        if not args.disable_log_stats:

            async def _force_log():
                while True:
                    await asyncio.sleep(envs.VLLM_LOG_STATS_INTERVAL)
                    await async_llm.do_log_stats()

            stats_task = asyncio.create_task(_force_log())

        # Handle shutdown signals
        loop = asyncio.get_running_loop()
        stop_event = asyncio.Event()

        def signal_handler():
            logger.info("Received shutdown signal")
            stop_event.set()

        for sig in (signal.SIGTERM, signal.SIGINT):
            loop.add_signal_handler(sig, signal_handler)

        try:
            await stop_event.wait()
        except KeyboardInterrupt:
            logger.info("Interrupted by user")
    finally:
        logger.info("Shutting down vLLM gRPC server...")
        if stats_task is not None:
            stats_task.cancel()
        try:
            health_servicer.set_not_serving()
        except Exception:  # broad: must not prevent server.stop() / shutdown()
            logger.warning("Failed to set health status to NOT_SERVING", exc_info=True)
        try:
            await server.stop(grace=5.0)
            logger.info("gRPC server stopped")
        finally:
            # Always release the engine, even if stopping the server raised.
            async_llm.shutdown()
            logger.info("AsyncLLM engine stopped")
        logger.info("Shutdown complete")


def main():
    """Command-line entry point for ``python -m vllm.entrypoints.grpc_server``.

    Defines the ``--host`` / ``--port`` options, lets ``AsyncEngineArgs``
    register its own options on the same parser, parses the command line and
    runs ``serve_grpc`` under uvloop. Any ``Exception`` raised while serving
    is logged with its traceback and the process exits with status 1.
    """
    cli = FlexibleArgumentParser(description="vLLM gRPC Server")

    # Options that control where the gRPC server listens.
    cli.add_argument(
        "--host", type=str, default="0.0.0.0", help="Host to bind gRPC server to"
    )
    cli.add_argument(
        "--port", type=int, default=50051, help="Port to bind gRPC server to"
    )

    # Engine options are registered by AsyncEngineArgs on the same parser.
    cli = AsyncEngineArgs.add_cli_args(cli)
    parsed = cli.parse_args()

    # Serve until serve_grpc returns; failures become a non-zero exit code.
    try:
        uvloop.run(serve_grpc(parsed))
    except Exception as exc:
        logger.exception("Server failed: %s", exc)
        sys.exit(1)


# Run the CLI only when this module is executed as the main program
# (e.g. via ``python -m``); importing the module does not start the server.
if __name__ == "__main__":
    main()