# async_llm_server.py
import asyncio
import time
from typing import AsyncIterator, Dict, Optional

import ray

from cacheflow.outputs import RequestOutput
from cacheflow.sampling_params import SamplingParams
from cacheflow.server.arg_utils import ServerArgs
from cacheflow.server.llm_server import LLMServer
from cacheflow.server.ray_utils import initialize_cluster
from cacheflow.utils import random_uuid
13
14
15
16

# Maximum time, in seconds, that a coroutine in AsyncLLMServer.generate waits
# for new output before it wakes up and re-checks whether the server has to be
# kicked again. Used as the `timeout` of asyncio.wait_for.
TIMEOUT_TO_PREVENT_DEADLOCK = 1 # seconds


Zhuohan Li's avatar
Zhuohan Li committed
17
class AsyncLLMServer:
    """Asyncio front end for an `LLMServer` running as a Ray actor.

    The wrapped server has no background loop of its own. Every coroutine
    that is waiting for output in `generate` therefore "kicks" the server
    by calling `server_step` whenever no step is currently in flight.

    Attributes are documented inline in `__init__`.
    """

    def __init__(self, server_use_ray: bool, *args, **kwargs) -> None:
        """Create the remote `LLMServer` actor.

        Args:
            server_use_ray: Selects the resources reserved for the actor:
                no CPUs when true, one GPU when false.
            *args: Forwarded unchanged to the `LLMServer` constructor.
            **kwargs: Forwarded unchanged to the `LLMServer` constructor.
        """
        if server_use_ray:
            # NOTE(review): presumably the GPUs are held by separate worker
            # actors in this mode, so the server actor reserves nothing --
            # confirm against LLMServer.
            remote_server_class = ray.remote(num_cpus=0)(LLMServer)
        else:
            remote_server_class = ray.remote(num_gpus=1)(LLMServer)
        self.server = remote_server_class.remote(*args, **kwargs)

        # Request id -> most recent output produced for that request.
        self.request_outputs: Dict[str, RequestOutput] = {}
        # Request id -> event that is set whenever a new output is stored.
        self.request_events: Dict[str, asyncio.Event] = {}
        # True while a `step` call on the remote server is in flight.
        self.is_server_running = False

    async def server_step(self):
        """Run one step on the remote server and publish its outputs.

        Each returned output is stored in `request_outputs` and the event of
        the owning request is set. Outputs that belong to a request nobody
        is waiting for any more (its `generate` call has ended) are dropped.

        Raises:
            Whatever the remote `step` call raises. The running flag is
            reset in that case too, so that other waiters can kick the
            server again instead of spinning on their timeout forever.
        """
        self.is_server_running = True
        try:
            request_outputs = await self.server.step.remote()
        finally:
            self.is_server_running = False

        # Notify the waiting coroutines that there are new outputs ready.
        for request_output in request_outputs:
            request_id = request_output.request_id
            request_event = self.request_events.get(request_id)
            if request_event is None:
                # The consumer of this request is gone; keeping the output
                # would only grow `request_outputs` without bound.
                continue
            self.request_outputs[request_id] = request_output
            request_event.set()

    def _release_request(self, request_id: str) -> None:
        """Drop all bookkeeping for `request_id`; safe to call repeatedly."""
        self.request_outputs.pop(request_id, None)
        self.request_events.pop(request_id, None)

    async def generate(self, prompt: str, sampling_params: SamplingParams,
                       request_id: Optional[str] = None
                       ) -> AsyncIterator[RequestOutput]:
        """Submit a request and stream its outputs as they become available.

        This is an async generator: iterate it with `async for`.

        Args:
            prompt: Prompt text, passed to the server's `add_request`.
            sampling_params: Sampling parameters, passed to `add_request`.
            request_id: Identifier for the request. A random one is
                generated when omitted.

        Yields:
            The latest `RequestOutput` of the request each time the server
            reports new output. The last yielded item is the one for which
            `finished()` is true.

        The bookkeeping of the request is released when the generator ends
        for any reason: normal completion, an exception, cancellation, or
        the consumer closing the generator early.
        """
        # Preprocess the request.
        arrival_time = time.time()
        if request_id is None:
            request_id = random_uuid()

        # Create an event to notify us that there is new output from the
        # cacheflow server. It is registered before the request is added so
        # that no output can arrive without an event to signal.
        request_event = asyncio.Event()
        self.request_events[request_id] = request_event

        try:
            # Add the request into the cacheflow server's waiting queue.
            await self.server.add_request.remote(
                request_id, prompt, sampling_params,
                arrival_time=arrival_time)

            # The cacheflow server does not have a background loop that
            # keeps processing incoming requests. Therefore, we need to keep
            # kicking the server to process the requests.
            while True:
                # Kick the server if the server is not running.
                if not self.is_server_running:
                    await self.server_step()

                # Wait for new output. The request_event will be set in
                # server_step when there is new output available for this
                # request. The timeout prevents a deadlock when the step
                # that was in flight produced nothing for this request and
                # nobody else kicks the server afterwards.
                try:
                    await asyncio.wait_for(
                        request_event.wait(),
                        timeout=TIMEOUT_TO_PREVENT_DEADLOCK)
                except asyncio.TimeoutError:
                    continue
                # Reset the event to wait for the next output.
                request_event.clear()

                # Hand the newest output to the consumer.
                request_output = self.request_outputs[request_id]
                yield request_output

                if request_output.finished():
                    # Once finished, release the resources of the request.
                    self._release_request(request_id)
                    # Kick the server if the server is not running. This is
                    # to prevent that there are still requests in server's
                    # waiting queue to be executed.
                    if not self.is_server_running:
                        await self.server_step()
                    break
        finally:
            # Also reached on cancellation, errors and early generator
            # close, where the request would otherwise leak its entries.
            self._release_request(request_id)

    @classmethod
    def from_server_args(cls, server_args: ServerArgs) -> "AsyncLLMServer":
        """Build an `AsyncLLMServer` from parsed server arguments.

        Args:
            server_args: Source of the server configs, the `use_ray` flag
                and the `disable_log_stats` flag.

        Returns:
            A new instance whose remote server was constructed from the
            configs, the distributed init method and the devices.
        """
        # Create the server configs.
        server_configs = server_args.create_server_configs()
        # Index 2 is used as the parallel config.
        # NOTE(review): relies on the ordering returned by
        # create_server_configs -- confirm if that tuple ever changes.
        parallel_config = server_configs[2]
        # Initialize the cluster.
        distributed_init_method, devices = initialize_cluster(parallel_config)
        # Create the LLM server.
        server = cls(server_args.use_ray, *server_configs,
                     distributed_init_method, devices,
                     log_stats=not server_args.disable_log_stats)
        return server