# custom_server.py
from sanic import Sanic, text
from sanic.response import json

import sglang as sgl

# Shared sglang engine used by the request handlers below. It stays None
# until run_server() assigns it.
engine = None

# Create an instance of the Sanic app
app = Sanic("sanic-server")


# Define an asynchronous route handler
@app.route("/generate", methods=["POST"])
async def generate(request):
    """Generate a completion for the posted prompt and return it as plain text.

    Expects a JSON object body with a non-empty "prompt" field. Responds
    with HTTP 400 and a JSON error when the prompt is missing or empty, or
    when the body is empty or is not a JSON object.
    """
    # request.json is None for an empty body and may be a list, string, or
    # number for valid non-object JSON; calling .get() on those would raise
    # and surface as a 500 instead of a 400.
    payload = request.json
    prompt = payload.get("prompt") if isinstance(payload, dict) else None
    if not prompt:
        return json({"error": "Prompt is required"}, status=400)

    # Awaiting async_generate without stream=True yields a dict; its "text"
    # entry is sent back as the response body.
    result = await engine.async_generate(prompt)

    return text(result["text"])


@app.route("/generate_stream", methods=["POST"])
async def generate_stream(request):
    """Generate a completion for the posted prompt and stream it to the client.

    Expects a JSON object body with a non-empty "prompt" field. Responds
    with HTTP 400 and a JSON error when the prompt is missing or empty, or
    when the body is empty or is not a JSON object. Otherwise the "text"
    entry of every chunk produced by the engine is sent as it arrives.
    """
    # request.json is None for an empty body and may be a list, string, or
    # number for valid non-object JSON; calling .get() on those would raise
    # and surface as a 500 instead of a 400.
    payload = request.json
    prompt = payload.get("prompt") if isinstance(payload, dict) else None

    if not prompt:
        return json({"error": "Prompt is required"}, status=400)

    # With stream=True, awaiting async_generate yields an async generator
    # of chunks rather than a single dict.
    result = await engine.async_generate(prompt, stream=True)

    # https://sanic.dev/en/guide/advanced/streaming.md#streaming
    # init the response
    response = await request.respond()

    # Forward the "text" entry of each chunk to the client as it arrives.
    async for chunk in result:
        await response.send(chunk["text"])

    # Signal the end of the streamed body.
    await response.eof()


def run_server(
    model_path: str = "meta-llama/Meta-Llama-3.1-8B-Instruct",
    host: str = "0.0.0.0",
    port: int = 8000,
) -> None:
    """Create the sglang engine and start the Sanic app.

    The defaults reproduce the previous hard-coded configuration, so calling
    ``run_server()`` with no arguments behaves as before.

    Args:
        model_path: Model identifier or path passed to ``sgl.Engine``.
        host: Address the Sanic app binds to.
        port: Port the Sanic app listens on.
    """
    # The handlers read the module-level ``engine``, so it must be assigned
    # before the app starts accepting requests.
    global engine
    engine = sgl.Engine(model_path=model_path)
    # NOTE(review): presumably single_process=True keeps the handlers in the
    # same process as the engine assigned above - confirm before changing.
    app.run(host=host, port=port, single_process=True)


# Start the server only when this file is executed directly as a script.
if __name__ == "__main__":
    run_server()