"""SRT: SGLang Runtime"""
import asyncio
import json
import multiprocessing as mp
import os
import sys
import threading
import time
from typing import List, Optional, Union

# Work around a Python bug: make threading._register_atexit a no-op.
setattr(threading, "_register_atexit", lambda *args, **kwargs: None)

import aiohttp
import psutil
import pydantic
import requests
import uvicorn
import uvloop
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import Response, StreamingResponse
from pydantic import BaseModel
from sglang.backend.runtime_endpoint import RuntimeEndpoint
from sglang.srt.conversation import (
    Conversation,
    SeparatorStyle,
    chat_template_exists,
    generate_chat_conv,
    register_conv_template,
)
from sglang.srt.hf_transformers_utils import get_tokenizer
from sglang.srt.managers.detokenizer_manager import start_detokenizer_process
from sglang.srt.managers.io_struct import DetokenizeReqInput, GenerateReqInput
from sglang.srt.managers.openai_protocol import (
    ChatCompletionRequest,
    ChatCompletionResponse,
    ChatCompletionResponseChoice,
    ChatCompletionResponseStreamChoice,
    ChatCompletionStreamResponse,
    ChatMessage,
    CompletionRequest,
    CompletionResponse,
    CompletionResponseChoice,
    CompletionResponseStreamChoice,
    CompletionStreamResponse,
    DeltaMessage,
    LogProbs,
    UsageInfo,
)
from sglang.srt.managers.router.manager import start_router_process
from sglang.srt.managers.tokenizer_manager import TokenizerManager
from sglang.srt.server_args import PortArgs, ServerArgs
from sglang.srt.utils import alloc_usable_network_port, handle_port_init

asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())


app = FastAPI()
tokenizer_manager = None
chat_template_name = None


# FIXME: Remove this once we drop support for pydantic 1.x
IS_PYDANTIC_1 = int(pydantic.VERSION.split(".")[0]) == 1

def jsonify_pydantic_model(obj: BaseModel):
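    """Dump a pydantic model to a JSON string, supporting both pydantic 1.x and 2.x."""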
    if IS_PYDANTIC_1:
        return obj.json(ensure_ascii=False)
    return obj.model_dump_json()


@app.get("/health")
async def health() -> Response:
    """Health check."""
    return Response(status_code=200)


@app.get("/get_model_info")
async def get_model_info():
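    """Return the path of the served model."""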
    result = {
        "model_path": tokenizer_manager.model_path,
    }
    return result


@app.get("/flush_cache")
async def flush_cache():
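    """Flush the cache of the inference backend."""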
    await tokenizer_manager.flush_cache()
    return Response(
        content="Cache flushed.\nPlease check backend logs for more details. "
        "(When there are running or waiting requests, the operation will not be performed.)\n",
        status_code=200,
    )


async def stream_generator(obj):
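    """Forward a request to the tokenizer manager and yield its streamed outputs."""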
    async for out in tokenizer_manager.generate_request(obj):
        yield out


async def make_openai_style_logprobs(token_logprobs):
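    """Convert (token_id, logprob) pairs into an OpenAI-style ``LogProbs`` object."""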
    ret_logprobs = LogProbs()

    # Detokenize
    token_ids = [tid for tid, _ in token_logprobs]
    token_texts = await tokenizer_manager.detokenize(DetokenizeReqInput(token_ids))

    for token_text, (_, token_logprob) in zip(token_texts, token_logprobs):
        ret_logprobs.tokens.append(token_text)
        ret_logprobs.token_logprobs.append(token_logprob)

        # Not supported yet.
        ret_logprobs.top_logprobs.append({})
        ret_logprobs.text_offset.append(-1)
    return ret_logprobs


@app.post("/generate")
async def generate_request(obj: GenerateReqInput):
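    """Handle a generation request, streaming the output as server-sent events when requested."""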
    obj.post_init()

    if obj.stream:

        async def stream_results():
            async for out in stream_generator(obj):
                yield f"data: {json.dumps(out, ensure_ascii=False)}\n\n"
            yield "data: [DONE]\n\n"

        return StreamingResponse(stream_results(), media_type="text/event-stream")

    ret = await tokenizer_manager.generate_request(obj).__anext__()
    return ret


@app.post("/v1/completions")
async def v1_completions(raw_request: Request):
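    """OpenAI-compatible completions endpoint (``/v1/completions``)."""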
    request_json = await raw_request.json()
    request = CompletionRequest(**request_json)

    # TODO: Validate the request and return HTTPStatus.BAD_REQUEST if invalid.
    assert request.n == 1

    adapted_request = GenerateReqInput(
        text=request.prompt,
        sampling_params={
            "temperature": request.temperature,
            "max_new_tokens": request.max_tokens,
            "stop": request.stop,
            "top_p": request.top_p,
            "presence_penalty": request.presence_penalty,
            "frequency_penalty": request.frequency_penalty,
        },
        return_logprob=request.logprobs is not None,
        stream=request.stream,
    )
    adapted_request.post_init()

    if adapted_request.stream:

        async def generate_stream_resp():
            stream_buffer = ""
            n_prev_token = 0
            async for content in stream_generator(adapted_request):
                text = content["text"]
                prompt_tokens = content["meta_info"]["prompt_tokens"]
                completion_tokens = content["meta_info"]["completion_tokens"]

                if not stream_buffer:  # The first chunk
                    if request.echo:
                        # Prepend prompt in response text.
                        text = request.prompt + text
                    else:
                        # Skip prompt tokens if echo is disabled.
                        n_prev_token = prompt_tokens

                if request.logprobs is not None:
                    logprobs = await make_openai_style_logprobs(
                        content["meta_info"]["token_logprob"][n_prev_token:]
                    )
                    n_prev_token = len(content["meta_info"]["token_logprob"])
                else:
                    logprobs = None

                delta = text[len(stream_buffer) :]
                stream_buffer = content["text"]
                choice_data = CompletionResponseStreamChoice(
                    index=0,
                    text=delta,
                    logprobs=logprobs,
                    finish_reason=None,
                )
                chunk = CompletionStreamResponse(
                    id=content["meta_info"]["id"],
                    object="text_completion",
                    choices=[choice_data],
                    model=request.model,
                    usage=UsageInfo(
                        prompt_tokens=prompt_tokens,
                        completion_tokens=completion_tokens,
                        total_tokens=prompt_tokens + completion_tokens,
                    ),
                )
                yield f"data: {jsonify_pydantic_model(chunk)}\n\n"
            yield "data: [DONE]\n\n"

        return StreamingResponse(generate_stream_resp(), media_type="text/event-stream")

    # Non-streaming response.
    ret = await generate_request(adapted_request)

    prompt_tokens = ret["meta_info"]["prompt_tokens"]
    completion_tokens = ret["meta_info"]["completion_tokens"]
    text = ret["text"]
    if request.echo:
        token_logprob_pos = 0
        text = request.prompt + text
    else:
        token_logprob_pos = prompt_tokens

    logprobs = (
        await make_openai_style_logprobs(ret["meta_info"]["token_logprob"][token_logprob_pos:])
        if request.logprobs is not None
        else None
    )
    choice_data = CompletionResponseChoice(
        index=0,
        text=text,
        logprobs=logprobs,
        finish_reason=None,  # TODO(comaniac): Add finish reason.
    )

    response = CompletionResponse(
        id=ret["meta_info"]["id"],
        model=request.model,
        choices=[choice_data],
        usage=UsageInfo(
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=prompt_tokens + completion_tokens,
        ),
    )
    return response


@app.post("/v1/chat/completions")
async def v1_chat_completions(raw_request: Request):
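    """OpenAI-compatible chat completions endpoint (``/v1/chat/completions``)."""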
    request_json = await raw_request.json()
    request = ChatCompletionRequest(**request_json)

    # TODO: Validate the request and return HTTPStatus.BAD_REQUEST if invalid.
    assert request.n == 1

    # Prep the data needed for the underlying GenerateReqInput:
    #  - prompt: The full prompt string.
    #  - stop: Custom stop tokens.
    #  - image_data: None or a list of image strings (URLs or base64 strings).
    #    None skips any image processing in GenerateReqInput.
    if not isinstance(request.messages, str):
        # Apply chat template and its stop strings.
        if chat_template_name is None:
            # This flow doesn't support the full OpenAI spec.  Verify messages
            # has the right type before proceeding:
            for m in request.messages:
                if not isinstance(m.content, str):
                    raise HTTPException(
                        status_code=503,
                        detail="Structured content requests not supported with "
                        "HuggingFace Chat Templates. "
                        "Make sure the server specifies a sglang chat template.",
                    )
            prompt = tokenizer_manager.tokenizer.apply_chat_template(
                request.messages, tokenize=False, add_generation_prompt=True
            )
            stop = request.stop
            image_data = None
        else:
            conv = generate_chat_conv(request, chat_template_name)
            prompt = conv.get_prompt()
            image_data = conv.image_data
            stop = conv.stop_str or []
            if request.stop:
                if isinstance(request.stop, str):
                    stop.append(request.stop)
                else:
                    stop.extend(request.stop)
    else:
        # Use the raw prompt and stop strings if messages is already a string.
        prompt = request.messages
        stop = request.stop
        image_data = None

    adapted_request = GenerateReqInput(
        text=prompt,
        image_data=image_data,
        sampling_params={
            "temperature": request.temperature,
            "max_new_tokens": request.max_tokens,
            "stop": stop,
            "top_p": request.top_p,
            "presence_penalty": request.presence_penalty,
            "frequency_penalty": request.frequency_penalty,
        },
        stream=request.stream,
    )
    adapted_request.post_init()

    if adapted_request.stream:

        async def generate_stream_resp():
            is_first = True

            stream_buffer = ""
            async for content in stream_generator(adapted_request):
                if is_first:
                    # First chunk with role
                    is_first = False
                    choice_data = ChatCompletionResponseStreamChoice(
                        index=0,
                        delta=DeltaMessage(role="assistant"),
                        finish_reason=None,
                    )
                    chunk = ChatCompletionStreamResponse(
                        id=content["meta_info"]["id"],
                        choices=[choice_data],
                        model=request.model,
                    )
                    yield f"data: {jsonify_pydantic_model(chunk)}\n\n"

                text = content["text"]
                delta = text[len(stream_buffer) :]
                stream_buffer = text
                choice_data = ChatCompletionResponseStreamChoice(
                    index=0, delta=DeltaMessage(content=delta), finish_reason=None
                )
                chunk = ChatCompletionStreamResponse(
                    id=content["meta_info"]["id"],
                    choices=[choice_data],
                    model=request.model,
                )
                yield f"data: {jsonify_pydantic_model(chunk)}\n\n"
            yield "data: [DONE]\n\n"

        return StreamingResponse(generate_stream_resp(), media_type="text/event-stream")

    # Non-streaming response.
    ret = await generate_request(adapted_request)
    prompt_tokens = ret["meta_info"]["prompt_tokens"]
    completion_tokens = ret["meta_info"]["completion_tokens"]
    choice_data = ChatCompletionResponseChoice(
        index=0,
        message=ChatMessage(role="assistant", content=ret["text"]),
        finish_reason=None,  # TODO(comaniac): Add finish reason.
    )
    response = ChatCompletionResponse(
        id=ret["meta_info"]["id"],
        model=request.model,
        choices=[choice_data],
        usage=UsageInfo(
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=prompt_tokens + completion_tokens,
        ),
    )
    return response


def launch_server(server_args, pipe_finish_writer):
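    """Launch the tokenizer manager, the router and detokenizer subprocesses, and the HTTP API server."""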
    global tokenizer_manager
    global chat_template_name

    # Handle ports
    server_args.port, server_args.additional_ports = handle_port_init(
        server_args.port, server_args.additional_ports, server_args.tp_size
    )

    port_args = PortArgs(
        tokenizer_port=server_args.additional_ports[0],
        router_port=server_args.additional_ports[1],
        detokenizer_port=server_args.additional_ports[2],
        nccl_port=server_args.additional_ports[3],
        model_rpc_ports=server_args.additional_ports[4:],
    )

    # Load chat template if needed
    if server_args.chat_template is not None:
        print(f"Use chat template: {server_args.chat_template}")
        if not chat_template_exists(server_args.chat_template):
            if not os.path.exists(server_args.chat_template):
                raise RuntimeError(
                    f"Chat template {server_args.chat_template} is not a built-in template name "
                    "or a valid chat template file path."
                )
            with open(server_args.chat_template, "r") as filep:
                template = json.load(filep)
                try:
                    sep_style = SeparatorStyle[template["sep_style"]]
                except KeyError:
                    raise ValueError(
                        f"Unknown separator style: {template['sep_style']}"
                    ) from None
                register_conv_template(
                    Conversation(
                        name=template["name"],
                        system_template=template["system"] + "\n{system_message}",
                        system_message=template.get("system_message", ""),
                        roles=(template["user"], template["assistant"]),
                        sep_style=sep_style,
                        sep=template.get("sep", "\n"),
                        stop_str=template["stop_str"],
                    ),
                    override=True,
                )
            chat_template_name = template["name"]
        else:
            chat_template_name = server_args.chat_template

    # Launch processes
    tokenizer_manager = TokenizerManager(server_args, port_args)
    pipe_router_reader, pipe_router_writer = mp.Pipe(duplex=False)
    pipe_detoken_reader, pipe_detoken_writer = mp.Pipe(duplex=False)

    proc_router = mp.Process(
        target=start_router_process,
        args=(
            server_args,
            port_args,
            pipe_router_writer,
        ),
    )
    proc_router.start()
    proc_detoken = mp.Process(
        target=start_detokenizer_process,
        args=(
            server_args,
            port_args,
            pipe_detoken_writer,
        ),
    )
    proc_detoken.start()

    # Wait for the model to finish loading
    router_init_state = pipe_router_reader.recv()
    detoken_init_state = pipe_detoken_reader.recv()

    if router_init_state != "init ok" or detoken_init_state != "init ok":
        proc_router.kill()
        proc_detoken.kill()
        print("router init state:", router_init_state)
        print("detoken init state:", detoken_init_state)
        sys.exit(1)

    assert proc_router.is_alive() and proc_detoken.is_alive()

    def _launch_server():
        # Launch api server
        uvicorn.run(
            app,
            host=server_args.host,
            port=server_args.port,
            log_level=server_args.log_level,
            timeout_keep_alive=5,
            loop="uvloop",
        )

    t = threading.Thread(target=_launch_server)
    t.start()

    # Wait until the server is ready
    url = server_args.url()
    last_error = ""
    for _ in range(60):
        time.sleep(1)
        try:
            requests.get(url + "/get_model_info", timeout=5)
            break
        except requests.exceptions.RequestException as e:
            last_error = str(e)
    else:
        if pipe_finish_writer is not None:
            pipe_finish_writer.send(last_error)
        else:
            print(last_error, flush=True)
        return

    # Warmup
    try:
        print("Warmup...", flush=True)
        res = requests.post(
            url + "/generate",
            json={
                "text": "Say this is a warmup request.",
                "sampling_params": {
                    "temperature": 0,
                    "max_new_tokens": 16,
                },
            },
            timeout=60,
        )
        print(f"Warmup done. model response: {res.json()['text']}")
    except requests.exceptions.RequestException as e:
        if pipe_finish_writer is not None:
            pipe_finish_writer.send(str(e))
        else:
            print(e, flush=True)
        return

    if pipe_finish_writer is not None:
        pipe_finish_writer.send("init ok")


class Runtime:
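    """Launch the SRT server in a background process and expose it as an SGLang ``RuntimeEndpoint``."""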
    def __init__(
        self,
        model_path: str,
        tokenizer_path: Optional[str] = None,
        load_format: str = "auto",
        tokenizer_mode: str = "auto",
        trust_remote_code: bool = True,
        mem_fraction_static: float = ServerArgs.mem_fraction_static,
        max_prefill_num_token: int = ServerArgs.max_prefill_num_token,
        tp_size: int = 1,
        model_mode: List[str] = (),
        schedule_heuristic: str = "lpm",
        random_seed: int = 42,
        log_level: str = "error",
        port: Optional[int] = None,
        additional_ports: Optional[Union[List[int], int]] = None,
    ):
        host = "127.0.0.1"
        port, additional_ports = handle_port_init(port, additional_ports, tp_size)
        self.server_args = ServerArgs(
            model_path=model_path,
            tokenizer_path=tokenizer_path,
            host=host,
            port=port,
            additional_ports=additional_ports,
            load_format=load_format,
            tokenizer_mode=tokenizer_mode,
            trust_remote_code=trust_remote_code,
            mem_fraction_static=mem_fraction_static,
            max_prefill_num_token=max_prefill_num_token,
            tp_size=tp_size,
            model_mode=model_mode,
            schedule_heuristic=schedule_heuristic,
            random_seed=random_seed,
            log_level=log_level,
        )

        self.url = self.server_args.url()
        self.generate_url = (
            f"http://{self.server_args.host}:{self.server_args.port}/generate"
        )

        self.pid = None
        pipe_reader, pipe_writer = mp.Pipe(duplex=False)
        proc = mp.Process(target=launch_server, args=(self.server_args, pipe_writer))
        proc.start()
        pipe_writer.close()
        self.pid = proc.pid

        try:
            init_state = pipe_reader.recv()
        except EOFError:
            init_state = ""

        if init_state != "init ok":
            self.shutdown()
            raise RuntimeError("Launch failed. Please see the error messages above.")

        self.endpoint = RuntimeEndpoint(self.url)

    def shutdown(self):
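        """Kill the server process and all of its child processes."""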
        if self.pid is not None:
            try:
                parent = psutil.Process(self.pid)
            except psutil.NoSuchProcess:
                return
            children = parent.children(recursive=True)
            for child in children:
                child.kill()
            psutil.wait_procs(children, timeout=5)
            parent.kill()
            parent.wait(timeout=5)
            self.pid = None

    def get_tokenizer(self):
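        """Return the tokenizer configured for this runtime."""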
        return get_tokenizer(
            self.server_args.tokenizer_path,
            tokenizer_mode=self.server_args.tokenizer_mode,
            trust_remote_code=self.server_args.trust_remote_code,
        )

    async def add_request(
        self,
        prompt: str,
        sampling_params,
    ) -> None:
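        """Send a streaming generation request to the server and yield incremental text chunks."""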
        json_data = {
            "text": prompt,
            "sampling_params": sampling_params,
            "stream": True,
        }

        pos = 0

        timeout = aiohttp.ClientTimeout(total=3 * 3600)
        async with aiohttp.ClientSession(timeout=timeout, trust_env=True) as session:
            async with session.post(self.generate_url, json=json_data) as response:
                async for chunk, _ in response.content.iter_chunks():
                    chunk = chunk.decode("utf-8")
                    if chunk and chunk.startswith("data:"):
                        if chunk == "data: [DONE]\n\n":
                            break
                        data = json.loads(chunk[5:].strip("\n"))
                        cur = data["text"][pos:]
                        if cur:
                            yield cur
                        pos += len(cur)

    def __del__(self):
        self.shutdown()