srt_example_complete.py 1.46 KB
Newer Older
1
2
3
4
5
"""
Usage:
python3 srt_example_complete.py
"""
import sglang as sgl
Lianmin Zheng's avatar
Lianmin Zheng committed
6
7


8
@sgl.function
def few_shot_qa(s, question):
    """Answer *question* using a few-shot capital-city prompt.

    Appends three worked Q/A examples, then the caller's question, and
    generates the answer (stopping at the first newline, greedy decoding).
    """
    examples = (
        "The following are questions with answers.\n"
        "Q: What is the capital of France?\n"
        "A: Paris\n"
        "Q: What is the capital of Germany?\n"
        "A: Berlin\n"
        "Q: What is the capital of Italy?\n"
        "A: Rome\n"
    )
    s += examples
    s += "Q: " + question + "\n"
    # temperature=0 makes the generation deterministic; stop at end of line.
    s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
Lianmin Zheng's avatar
Lianmin Zheng committed
21
22


23
24
25
def single():
    """Run one synchronous request and print the full prompt + answer."""
    question = "What is the capital of the United States?"
    state = few_shot_qa.run(question=question)

    # Normalize before checking so casing/whitespace don't matter.
    normalized = state["answer"].strip().lower()
    assert "washington" in normalized, f"answer: {state['answer']}"

    print(state.text())
Lianmin Zheng's avatar
Lianmin Zheng committed
30

31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68

def stream():
    """Stream the generated answer to stdout chunk by chunk."""
    state = few_shot_qa.run(
        question="What is the capital of the United States?",
        stream=True,
    )

    # text_iter yields pieces of the "answer" variable as they arrive.
    for chunk in state.text_iter("answer"):
        print(chunk, end="", flush=True)
    print()


def batch():
    """Run two questions as a single batch and print each answer."""
    questions = [
        "What is the capital of the United States?",
        "What is the capital of China?",
    ]
    states = few_shot_qa.run_batch([{"question": q} for q in questions])

    for s in states:
        print(s["answer"])


if __name__ == "__main__":
    # Launch a local runtime serving the model and make it the default
    # backend for all @sgl.function calls below.
    runtime = sgl.Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
    sgl.set_default_backend(runtime)

    try:
        # Run a single request
        print("\n========== single ==========\n")
        single()

        # Stream output
        print("\n========== stream ==========\n")
        stream()

        # Run a batch of requests
        print("\n========== batch ==========\n")
        batch()
    finally:
        # Always stop the runtime, even if a demo raises (e.g. the assert
        # in single()), so the server subprocess is not left running.
        runtime.shutdown()