"vscode:/vscode.git/clone" did not exist on "7a61afb952ba15f68ac6863284231a42eb71643f"
"""SRT: SGLang Runtime"""

import asyncio
import dataclasses
import json
import multiprocessing as mp
import os
import sys
import threading
import time
from typing import List, Optional, Union

# Work around a Python bug: make threading's atexit registration a no-op so
# lingering background threads cannot block interpreter shutdown.
setattr(threading, "_register_atexit", lambda *args, **kwargs: None)

import aiohttp
import psutil
import pydantic
import requests
import uvicorn
import uvloop
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import Response, StreamingResponse
from pydantic import BaseModel
from starlette.middleware.base import BaseHTTPMiddleware
from starlette.responses import JSONResponse

from sglang.backend.runtime_endpoint import RuntimeEndpoint
from sglang.srt.constrained import disable_cache
from sglang.srt.conversation import (
    Conversation,
    SeparatorStyle,
    chat_template_exists,
    generate_chat_conv,
    register_conv_template,
)
from sglang.srt.hf_transformers_utils import get_tokenizer
from sglang.srt.managers.detokenizer_manager import start_detokenizer_process
from sglang.srt.managers.io_struct import DetokenizeReqInput, GenerateReqInput
from sglang.srt.managers.openai_protocol import (
    ChatCompletionRequest,
    ChatCompletionResponse,
    ChatCompletionResponseChoice,
    ChatCompletionResponseStreamChoice,
    ChatCompletionStreamResponse,
    ChatMessage,
    CompletionRequest,
    CompletionResponse,
    CompletionResponseChoice,
    CompletionResponseStreamChoice,
    CompletionStreamResponse,
    DeltaMessage,
    LogProbs,
    UsageInfo,
)
from sglang.srt.managers.router.manager import start_router_process
from sglang.srt.managers.tokenizer_manager import TokenizerManager
from sglang.srt.server_args import PortArgs, ServerArgs
from sglang.srt.utils import enable_show_time_cost, handle_port_init

asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())

API_KEY_HEADER_NAME = "X-API-Key"


class APIKeyValidatorMiddleware(BaseHTTPMiddleware):
    def __init__(self, app, api_key: str):
        super().__init__(app)
        self.api_key = api_key

    async def dispatch(self, request: Request, call_next):
        # Extract the API key from the request headers.
        api_key_header = request.headers.get(API_KEY_HEADER_NAME)
        if not api_key_header or api_key_header != self.api_key:
            return JSONResponse(
                status_code=403,
                content={"detail": "Invalid API Key"},
            )
        response = await call_next(request)
        return response
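

# When the server is launched with an API key, every request must send it in
# the X-API-Key header. A minimal client sketch (host, port, and key are
# hypothetical):
#
#   import requests
#
#   resp = requests.get(
#       "http://127.0.0.1:30000/get_model_info",
#       headers={"X-API-Key": "my-secret-key"},
#   )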


app = FastAPI()
tokenizer_manager = None
chat_template_name = None


# FIXME: Remove this once we drop support for pydantic 1.x
IS_PYDANTIC_1 = int(pydantic.VERSION.split(".")[0]) == 1


def jsonify_pydantic_model(obj: BaseModel):
    if IS_PYDANTIC_1:
        return obj.json(ensure_ascii=False)
    return obj.model_dump_json()
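

# Example (a sketch; the exact whitespace of the output differs between
# pydantic 1.x and 2.x):
#
#   class Point(BaseModel):
#       x: int = 1
#
#   jsonify_pydantic_model(Point())  # -> '{"x":1}'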


@app.get("/health")
async def health() -> Response:
    """Health check."""
    return Response(status_code=200)


@app.get("/get_model_info")
async def get_model_info():
    result = {
        "model_path": tokenizer_manager.model_path,
    }
    return result


@app.get("/get_server_args")
async def get_server_args():
    return dataclasses.asdict(tokenizer_manager.server_args)


@app.get("/flush_cache")
async def flush_cache():
    await tokenizer_manager.flush_cache()
    return Response(
        content="Cache flushed.\nPlease check backend logs for more details. "
        "(When there are running or waiting requests, the operation will not be performed.)\n",
        status_code=200,
    )


async def detokenize_logprob_tokens(token_logprobs, decode_to_text):
    if not decode_to_text:
        return [(logprob, token_id, None) for logprob, token_id in token_logprobs]

    token_ids = [tid for _, tid in token_logprobs]
    token_texts = await tokenizer_manager.detokenize(DetokenizeReqInput(token_ids))
    return [
        (logprob, token_id, token_text)
        for (logprob, token_id), token_text in zip(token_logprobs, token_texts)
    ]


async def detokenize_top_logprobs_tokens(top_logprobs, decode_to_text):
    for i, t in enumerate(top_logprobs):
        if t is not None:
            top_logprobs[i] = await detokenize_logprob_tokens(t, decode_to_text)
    return top_logprobs


async def handle_token_logprobs_results(obj: GenerateReqInput, ret):
    """Handle the token logprobs results, convert token ids to text if needed.

    Args:
        obj (GenerateReqInput): The request object.
        ret (Union[Dict, List[Dict]]): The response object.
    """
    # NOTE: A single HTTP request may batch multiple generation requests.

    async def convert_style(r, return_text):
        r["meta_info"]["prefill_token_logprobs"] = await detokenize_logprob_tokens(
            r["meta_info"]["prefill_token_logprobs"], return_text
        )
        r["meta_info"]["decode_token_logprobs"] = await detokenize_logprob_tokens(
            r["meta_info"]["decode_token_logprobs"], return_text
        )
        r["meta_info"]["prefill_top_logprobs"] = await detokenize_top_logprobs_tokens(
            r["meta_info"]["prefill_top_logprobs"], return_text
        )
        r["meta_info"]["decode_top_logprobs"] = await detokenize_top_logprobs_tokens(
            r["meta_info"]["decode_top_logprobs"], return_text
        )

    if isinstance(obj.text, str):
        if obj.return_logprob:
            await convert_style(ret, obj.return_text_in_logprobs)
    else:
        for i, r in enumerate(ret):
            if obj.return_logprob[i]:
                await convert_style(r, obj.return_text_in_logprobs)


async def stream_generator(obj: GenerateReqInput):
    async for out in tokenizer_manager.generate_request(obj):
        await handle_token_logprobs_results(obj, out)
        yield out


async def make_openai_style_logprobs(
    prefill_token_logprobs=None,
    decode_token_logprobs=None,
    prefill_top_logprobs=None,
    decode_top_logprobs=None,
):
    ret_logprobs = LogProbs()

    def append_token_logprobs(token_logprobs):
        for logprob, _, token_text in token_logprobs:
            ret_logprobs.tokens.append(token_text)
            ret_logprobs.token_logprobs.append(logprob)

            # Text offsets are not supported yet.
            ret_logprobs.text_offset.append(-1)

    def append_top_logprobs(top_logprobs):
        for tokens in top_logprobs:
            if tokens is not None:
                ret_logprobs.top_logprobs.append(
                    {token[2]: token[0] for token in tokens}
                )
            else:
                ret_logprobs.top_logprobs.append(None)

    if prefill_token_logprobs is not None:
        append_token_logprobs(prefill_token_logprobs)
    if decode_token_logprobs is not None:
        append_token_logprobs(decode_token_logprobs)
    if prefill_top_logprobs is not None:
        append_top_logprobs(prefill_top_logprobs)
    if decode_top_logprobs is not None:
        append_top_logprobs(decode_top_logprobs)

    return ret_logprobs


@app.post("/generate")
async def generate_request(obj: GenerateReqInput):
    obj.post_init()

    if obj.stream:

        async def stream_results():
            async for out in stream_generator(obj):
                yield f"data: {json.dumps(out, ensure_ascii=False)}\n\n"
            yield "data: [DONE]\n\n"

        return StreamingResponse(stream_results(), media_type="text/event-stream")

    ret = await tokenizer_manager.generate_request(obj).__anext__()
    await handle_token_logprobs_results(obj, ret)

    return ret
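

# Example call to the /generate endpoint (a minimal sketch with `requests`;
# the host and port are hypothetical):
#
#   import requests
#
#   resp = requests.post(
#       "http://127.0.0.1:30000/generate",
#       json={
#           "text": "The capital of France is",
#           "sampling_params": {"temperature": 0, "max_new_tokens": 16},
#       },
#   )
#   print(resp.json()["text"])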


@app.post("/v1/completions")
async def v1_completions(raw_request: Request):
    request_json = await raw_request.json()
    request = CompletionRequest(**request_json)

    # TODO: Validate the request and return HTTPStatus.BAD_REQUEST if invalid.
    assert request.n == 1

    adapted_request = GenerateReqInput(
        text=request.prompt,
        sampling_params={
            "temperature": request.temperature,
            "max_new_tokens": request.max_tokens,
            "stop": request.stop,
            "top_p": request.top_p,
            "presence_penalty": request.presence_penalty,
            "frequency_penalty": request.frequency_penalty,
            "regex": request.regex,
        },
        return_logprob=request.logprobs is not None and request.logprobs > 0,
        top_logprobs_num=request.logprobs if request.logprobs is not None else 0,
        return_text_in_logprobs=True,
        stream=request.stream,
    )
    adapted_request.post_init()

    if adapted_request.stream:

        async def generate_stream_resp():
            stream_buffer = ""
            n_prev_token = 0
            async for content in stream_generator(adapted_request):
                text = content["text"]
                prompt_tokens = content["meta_info"]["prompt_tokens"]
                completion_tokens = content["meta_info"]["completion_tokens"]

                if not stream_buffer:  # The first chunk
                    if request.echo:
                        # Prepend prompt in response text.
                        text = request.prompt + text

                if request.logprobs:
                    # The first chunk and echo is enabled.
                    if not stream_buffer and request.echo:
                        prefill_token_logprobs = content["meta_info"][
                            "prefill_token_logprobs"
                        ]
                        prefill_top_logprobs = content["meta_info"][
                            "prefill_top_logprobs"
                        ]
                    else:
                        prefill_token_logprobs = None
                        prefill_top_logprobs = None

                    logprobs = await make_openai_style_logprobs(
                        prefill_token_logprobs=prefill_token_logprobs,
                        prefill_top_logprobs=prefill_top_logprobs,
                        decode_token_logprobs=content["meta_info"][
                            "decode_token_logprobs"
                        ][n_prev_token:],
                        decode_top_logprobs=content["meta_info"]["decode_top_logprobs"][
                            n_prev_token:
                        ],
                    )

                    n_prev_token = len(content["meta_info"]["decode_token_logprobs"])
                else:
                    logprobs = None

                delta = text[len(stream_buffer) :]
                stream_buffer = content["text"]
                choice_data = CompletionResponseStreamChoice(
                    index=0,
                    text=delta,
                    logprobs=logprobs,
                    finish_reason=None,
                )
                chunk = CompletionStreamResponse(
                    id=content["meta_info"]["id"],
                    object="text_completion",
                    choices=[choice_data],
                    model=request.model,
                    usage=UsageInfo(
                        prompt_tokens=prompt_tokens,
                        completion_tokens=completion_tokens,
                        total_tokens=prompt_tokens + completion_tokens,
                    ),
                )
                yield f"data: {jsonify_pydantic_model(chunk)}\n\n"
            yield "data: [DONE]\n\n"

        return StreamingResponse(generate_stream_resp(), media_type="text/event-stream")

    # Non-streaming response.
    ret = await generate_request(adapted_request)
    ret = ret[0] if isinstance(ret, list) else ret

    prompt_tokens = ret["meta_info"]["prompt_tokens"]
    completion_tokens = ret["meta_info"]["completion_tokens"]
    text = ret["text"]
    if request.echo:
        text = request.prompt + text

    if request.logprobs:
        if request.echo:
            prefill_token_logprobs = ret["meta_info"]["prefill_token_logprobs"]
            prefill_top_logprobs = ret["meta_info"]["prefill_top_logprobs"]
        else:
            prefill_token_logprobs = None
            prefill_top_logprobs = None

        logprobs = await make_openai_style_logprobs(
            prefill_token_logprobs=prefill_token_logprobs,
            prefill_top_logprobs=prefill_top_logprobs,
            decode_token_logprobs=ret["meta_info"]["decode_token_logprobs"],
            decode_top_logprobs=ret["meta_info"]["decode_top_logprobs"],
        )
    else:
        logprobs = None

    choice_data = CompletionResponseChoice(
        index=0,
        text=text,
        logprobs=logprobs,
        finish_reason=None,  # TODO(comaniac): Add finish reason.
    )

    response = CompletionResponse(
        id=ret["meta_info"]["id"],
        model=request.model,
        choices=[choice_data],
        usage=UsageInfo(
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=prompt_tokens + completion_tokens,
        ),
    )
    return response
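

# Example OpenAI-style completion call (a minimal sketch; the host, port, and
# model name are hypothetical):
#
#   import requests
#
#   resp = requests.post(
#       "http://127.0.0.1:30000/v1/completions",
#       json={"model": "default", "prompt": "Say hello.", "max_tokens": 16},
#   )
#   print(resp.json()["choices"][0]["text"])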


@app.post("/v1/chat/completions")
async def v1_chat_completions(raw_request: Request):
    request_json = await raw_request.json()
    request = ChatCompletionRequest(**request_json)

    # TODO: Validate the request and return HTTPStatus.BAD_REQUEST if invalid.
    assert request.n == 1

    # Prep the data needed for the underlying GenerateReqInput:
    #  - prompt: The full prompt string.
    #  - stop: Custom stop tokens.
    #  - image_data: None or a list of image strings (URLs or base64 strings).
    #    None skips any image processing in GenerateReqInput.
    if not isinstance(request.messages, str):
        # Apply chat template and its stop strings.
        if chat_template_name is None:
            # This flow doesn't support the full OpenAI spec. Verify that each
            # message content is a plain string before proceeding:
            for m in request.messages:
                if not isinstance(m.content, str):
                    raise HTTPException(
                        status_code=503,
                        detail="Structured content requests not supported with "
                        "HuggingFace Chat Templates. "
                        "Make sure the server specifies a sglang chat template.",
                    )
            prompt = tokenizer_manager.tokenizer.apply_chat_template(
                request.messages, tokenize=False, add_generation_prompt=True
            )
            stop = request.stop
            image_data = None
        else:
            conv = generate_chat_conv(request, chat_template_name)
            prompt = conv.get_prompt()
            image_data = conv.image_data
            stop = conv.stop_str or []
            if request.stop:
                if isinstance(request.stop, str):
                    stop.append(request.stop)
                else:
                    stop.extend(request.stop)
    else:
        # Use the raw prompt and stop strings if the messages is already a string.
        prompt = request.messages
        stop = request.stop
        image_data = None

    adapted_request = GenerateReqInput(
        text=prompt,
        image_data=image_data,
        sampling_params={
            "temperature": request.temperature,
            "max_new_tokens": request.max_tokens,
            "stop": stop,
            "top_p": request.top_p,
            "presence_penalty": request.presence_penalty,
            "frequency_penalty": request.frequency_penalty,
            "regex": request.regex,
        },
        stream=request.stream,
    )
    adapted_request.post_init()

    if adapted_request.stream:

        async def generate_stream_resp():
            is_first = True

            stream_buffer = ""
            async for content in stream_generator(adapted_request):
                if is_first:
                    # First chunk with role
                    is_first = False
                    choice_data = ChatCompletionResponseStreamChoice(
                        index=0,
                        delta=DeltaMessage(role="assistant"),
                        finish_reason=None,
                    )
                    chunk = ChatCompletionStreamResponse(
                        id=content["meta_info"]["id"],
                        choices=[choice_data],
                        model=request.model,
                    )
                    yield f"data: {jsonify_pydantic_model(chunk)}\n\n"

                text = content["text"]
                delta = text[len(stream_buffer) :]
                stream_buffer = text
                choice_data = ChatCompletionResponseStreamChoice(
                    index=0, delta=DeltaMessage(content=delta), finish_reason=None
                )
                chunk = ChatCompletionStreamResponse(
                    id=content["meta_info"]["id"],
                    choices=[choice_data],
                    model=request.model,
                )
                yield f"data: {jsonify_pydantic_model(chunk)}\n\n"
            yield "data: [DONE]\n\n"

        return StreamingResponse(generate_stream_resp(), media_type="text/event-stream")

    # Non-streaming response.
    ret = await generate_request(adapted_request)
    prompt_tokens = ret["meta_info"]["prompt_tokens"]
    completion_tokens = ret["meta_info"]["completion_tokens"]
    choice_data = ChatCompletionResponseChoice(
        index=0,
        message=ChatMessage(role="assistant", content=ret["text"]),
        finish_reason=None,  # TODO(comaniac): Add finish reason.
    )
    response = ChatCompletionResponse(
        id=ret["meta_info"]["id"],
        model=request.model,
        choices=[choice_data],
        usage=UsageInfo(
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=prompt_tokens + completion_tokens,
        ),
    )
    return response
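

# Example OpenAI-style chat completion call (a minimal sketch; the host, port,
# and model name are hypothetical):
#
#   import requests
#
#   resp = requests.post(
#       "http://127.0.0.1:30000/v1/chat/completions",
#       json={
#           "model": "default",
#           "messages": [{"role": "user", "content": "Say hello."}],
#       },
#   )
#   print(resp.json()["choices"][0]["message"]["content"])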


def launch_server(server_args: ServerArgs, pipe_finish_writer):
    global tokenizer_manager
    global chat_template_name

    # Enable time cost logging if requested
    if server_args.show_time_cost:
        enable_show_time_cost()

    # Disable the disk cache if requested
    if server_args.disable_disk_cache:
        disable_cache()

    # Handle ports
    server_args.port, server_args.additional_ports = handle_port_init(
        server_args.port, server_args.additional_ports, server_args.tp_size
    )

    port_args = PortArgs(
        tokenizer_port=server_args.additional_ports[0],
        router_port=server_args.additional_ports[1],
        detokenizer_port=server_args.additional_ports[2],
        nccl_port=server_args.additional_ports[3],
        model_rpc_ports=server_args.additional_ports[4:],
    )

    # Load chat template if needed
    if server_args.chat_template is not None:
        print(f"Use chat template: {server_args.chat_template}")
        if not chat_template_exists(server_args.chat_template):
            if not os.path.exists(server_args.chat_template):
                raise RuntimeError(
                    f"Chat template {server_args.chat_template} is not a built-in template name "
                    "or a valid chat template file path."
                )
            with open(server_args.chat_template, "r") as filep:
                template = json.load(filep)
                try:
                    sep_style = SeparatorStyle[template["sep_style"]]
                except KeyError:
                    raise ValueError(
                        f"Unknown separator style: {template['sep_style']}"
                    ) from None
                register_conv_template(
                    Conversation(
                        name=template["name"],
                        system_template=template["system"] + "\n{system_message}",
                        system_message=template.get("system_message", ""),
                        roles=(template["user"], template["assistant"]),
                        sep_style=sep_style,
                        sep=template.get("sep", "\n"),
                        stop_str=template["stop_str"],
                    ),
                    override=True,
                )
            chat_template_name = template["name"]
        else:
            chat_template_name = server_args.chat_template
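
    # Example chat template file for the branch above (hypothetical values;
    # the keys mirror what the JSON parsing expects):
    #
    #   {
    #     "name": "my_template",
    #     "system": "SYSTEM:",
    #     "system_message": "You are a helpful assistant.",
    #     "user": "USER:",
    #     "assistant": "ASSISTANT:",
    #     "sep_style": "ADD_COLON_SINGLE",
    #     "sep": "\n",
    #     "stop_str": "USER:"
    #   }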

    # Launch processes
    tokenizer_manager = TokenizerManager(server_args, port_args)
    pipe_router_reader, pipe_router_writer = mp.Pipe(duplex=False)
    pipe_detoken_reader, pipe_detoken_writer = mp.Pipe(duplex=False)

    proc_router = mp.Process(
        target=start_router_process,
        args=(
            server_args,
            port_args,
            pipe_router_writer,
        ),
    )
    proc_router.start()
    proc_detoken = mp.Process(
        target=start_detokenizer_process,
        args=(
            server_args,
            port_args,
            pipe_detoken_writer,
        ),
    )
    proc_detoken.start()

    # Wait for the model to finish loading
    router_init_state = pipe_router_reader.recv()
    detoken_init_state = pipe_detoken_reader.recv()

    if router_init_state != "init ok" or detoken_init_state != "init ok":
        proc_router.kill()
        proc_detoken.kill()
        print("router init state:", router_init_state)
        print("detoken init state:", detoken_init_state)
        sys.exit(1)

    assert proc_router.is_alive() and proc_detoken.is_alive()

598
599
600
    if server_args.api_key:
        app.add_middleware(APIKeyValidatorMiddleware, api_key=server_args.api_key)

    def _launch_server():
        uvicorn.run(
            app,
            host=server_args.host,
            port=server_args.port,
            log_level=server_args.log_level,
            timeout_keep_alive=5,
            loop="uvloop",
        )

    def _wait_and_warmup():
        headers = {}
        url = server_args.url()
        if server_args.api_key:
            headers[API_KEY_HEADER_NAME] = server_args.api_key

        last_error = None
        for _ in range(120):
            time.sleep(0.5)
            try:
                requests.get(url + "/get_model_info", timeout=5, headers=headers)
                break
            except requests.exceptions.RequestException as e:
                # Keep a reference; `e` is unbound after the except block.
                last_error = e
        else:
            if pipe_finish_writer is not None:
                pipe_finish_writer.send(str(last_error))
            else:
                print(last_error, flush=True)
            return

        # Warmup
        try:
            # print("Warmup...", flush=True)
            res = requests.post(
                url + "/generate",
                json={
                    "text": "Say this is a warmup request.",
                    "sampling_params": {
                        "temperature": 0,
                        "max_new_tokens": 16,
                    },
                },
                headers=headers,
                timeout=60,
            )
            # print(f"Warmup done. model response: {res.json()['text']}")
            # print("=" * 20, "Server is ready", "=" * 20, flush=True)
        except requests.exceptions.RequestException as e:
            if pipe_finish_writer is not None:
                pipe_finish_writer.send(str(e))
            else:
                print(e, flush=True)
            return

        if pipe_finish_writer is not None:
            pipe_finish_writer.send("init ok")

    t = threading.Thread(target=_wait_and_warmup)
    t.start()
    try:
        _launch_server()
    finally:
        t.join()


class Runtime:
    def __init__(
        self,
        model_path: str,
        tokenizer_path: Optional[str] = None,
        load_format: str = "auto",
        tokenizer_mode: str = "auto",
        trust_remote_code: bool = True,
        mem_fraction_static: float = ServerArgs.mem_fraction_static,
        max_prefill_num_token: int = ServerArgs.max_prefill_num_token,
        context_length: int = ServerArgs.context_length,
        tp_size: int = 1,
        schedule_heuristic: str = "lpm",
        attention_reduce_in_fp32: bool = False,
        random_seed: int = 42,
        log_level: str = "error",
        disable_radix_cache: bool = False,
        enable_flashinfer: bool = False,
        disable_regex_jump_forward: bool = False,
        disable_disk_cache: bool = False,
        api_key: str = "",
        port: Optional[int] = None,
        additional_ports: Optional[Union[List[int], int]] = None,
    ):
        host = "127.0.0.1"
        port, additional_ports = handle_port_init(port, additional_ports, tp_size)
        self.server_args = ServerArgs(
            model_path=model_path,
            tokenizer_path=tokenizer_path,
            host=host,
            port=port,
            additional_ports=additional_ports,
            load_format=load_format,
            tokenizer_mode=tokenizer_mode,
            trust_remote_code=trust_remote_code,
            mem_fraction_static=mem_fraction_static,
            max_prefill_num_token=max_prefill_num_token,
            context_length=context_length,
            tp_size=tp_size,
            schedule_heuristic=schedule_heuristic,
            attention_reduce_in_fp32=attention_reduce_in_fp32,
            random_seed=random_seed,
            log_level=log_level,
            disable_radix_cache=disable_radix_cache,
            enable_flashinfer=enable_flashinfer,
            disable_regex_jump_forward=disable_regex_jump_forward,
            disable_disk_cache=disable_disk_cache,
            api_key=api_key,
        )

        self.url = self.server_args.url()
        self.generate_url = (
            f"http://{self.server_args.host}:{self.server_args.port}/generate"
        )

        self.pid = None
        pipe_reader, pipe_writer = mp.Pipe(duplex=False)
        proc = mp.Process(target=launch_server, args=(self.server_args, pipe_writer))
        proc.start()
        pipe_writer.close()
        self.pid = proc.pid

        try:
            init_state = pipe_reader.recv()
        except EOFError:
            init_state = ""

        if init_state != "init ok":
            self.shutdown()
            raise RuntimeError("Launch failed. Please see the error messages above.")

        self.endpoint = RuntimeEndpoint(self.url)

    def shutdown(self):
        if self.pid is not None:
            try:
                parent = psutil.Process(self.pid)
            except psutil.NoSuchProcess:
                return
            children = parent.children(recursive=True)
            for child in children:
                child.kill()
            psutil.wait_procs(children, timeout=5)
            parent.kill()
            parent.wait(timeout=5)
            self.pid = None

    def get_tokenizer(self):
        return get_tokenizer(
            self.server_args.tokenizer_path,
            tokenizer_mode=self.server_args.tokenizer_mode,
            trust_remote_code=self.server_args.trust_remote_code,
        )

    async def add_request(
        self,
        prompt: str,
        sampling_params,
    ):
        json_data = {
            "text": prompt,
            "sampling_params": sampling_params,
            "stream": True,
        }

        pos = 0

        timeout = aiohttp.ClientTimeout(total=3 * 3600)
        async with aiohttp.ClientSession(timeout=timeout, trust_env=True) as session:
            async with session.post(self.generate_url, json=json_data) as response:
                async for chunk, _ in response.content.iter_chunks():
                    chunk = chunk.decode("utf-8")
                    if chunk and chunk.startswith("data:"):
                        if chunk == "data: [DONE]\n\n":
                            break
                        data = json.loads(chunk[5:].strip("\n"))
                        cur = data["text"][pos:]
                        if cur:
                            yield cur
                        pos += len(cur)
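
    # Example (a minimal sketch; must be consumed inside an event loop):
    #
    #   async for chunk in runtime.add_request("Hello", {"max_new_tokens": 8}):
    #       print(chunk, end="", flush=True)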

    def __del__(self):
        self.shutdown()
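

# Example usage of Runtime (a minimal sketch; the model path is hypothetical):
#
#   runtime = Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
#   print(runtime.url)
#   runtime.shutdown()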