# Benchmark: multi-turn chat requests that share a long random system prompt.
import json
import random
import time
from argparse import ArgumentParser
from pathlib import Path

from tqdm import tqdm

import sglang as sgl
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
from sglang.test.test_utils import (
    add_common_sglang_args_and_parse,
    select_sglang_backend,
)
from sglang.utils import dump_state_text


def gen_prompt(tokenizer, token_num):
    """Build a synthetic prompt by decoding `token_num` uniformly sampled vocab ids."""
    vocab_ids = list(tokenizer.get_vocab().values())
    sampled = random.choices(vocab_ids, k=token_num)
    return tokenizer.decode(sampled)


def get_cache_path(args):
    """Return the per-configuration JSON cache file under ~/.cache/sglang.

    The filename encodes every argument that affects generation, so a change
    in any of them produces a distinct cache entry.
    """
    # Slashes in HF model names would otherwise create subdirectories.
    safe_tokenizer = args.tokenizer.replace("/", "_")
    filename = (
        f"qa_{args.num_qa}_{args.turns}_{args.system_prompt_len}"
        f"_{args.len_q}_{args.len_a}_{safe_tokenizer}.json"
    )
    return Path.home() / ".cache" / "sglang" / filename


def gen_arguments(args, tokenizer):
    """Load (or generate and cache) the benchmark request arguments.

    Returns a list with one dict per request:
        {"system_prompt": str, "qas": [{"prompt": str, "new_tokens": int}, ...]}
    """
    cache_path = get_cache_path(args)

    # Reuse a previous generation when an identical configuration was cached.
    if cache_path.exists():
        print(f"Loading cached arguments from {cache_path}")
        with open(cache_path, "r") as f:
            return json.load(f)

    print("Generating new arguments...")
    # First pass: one long system prompt per request.
    multi_qas = [
        {"system_prompt": gen_prompt(tokenizer, args.system_prompt_len), "qas": []}
        for _ in tqdm(range(args.num_qa), desc="Generating system prompts")
    ]

    # Second pass: fill in the per-turn question prompts for every request.
    for entry in tqdm(multi_qas, desc="Generating QA pairs"):
        entry["qas"] = [
            {
                "prompt": gen_prompt(tokenizer, args.len_q),
                "new_tokens": args.len_a,
            }
            for _ in range(args.turns)
        ]

    # Persist so the next run with the same configuration skips generation.
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    with open(cache_path, "w") as f:
        json.dump(multi_qas, f)
    print(f"Cached arguments saved to {cache_path}")

    return multi_qas


@sgl.function
def multi_turns(s, system_prompt, qas):
    """Drive one conversation: the system prompt, then each QA turn in order."""
    s += system_prompt

    for qa in qas:
        s += qa["prompt"]
        # ignore_eos forces exactly `new_tokens` tokens per answer.
        s += sgl.gen(max_tokens=qa["new_tokens"], ignore_eos=True)


def main(args):
    """Run the multi-turn benchmark batch and append one result line to the result file."""
    tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
    multi_qas = gen_arguments(args, tokenizer)
    backend = select_sglang_backend(args)

    # Time the whole batch, including scheduling overhead.
    tic = time.perf_counter()
    states = multi_turns.run_batch(
        multi_qas,
        temperature=0,
        backend=backend,
        num_threads="auto",
        progress_bar=True,
    )
    latency = time.perf_counter() - tic

    print(f"Latency: {latency:.3f}")

    # Keep the raw generations around for manual inspection.
    dump_state_text(f"tmp_output_{args.backend}.txt", states)

    record = {
        "task": "multi_turn_system_prompt_chat",
        "backend": args.backend,
        "latency": round(latency, 3),
        "num_requests": args.num_qa,
        "num_turns": args.turns,
        "other": {
            "parallel": args.parallel,
        },
    }
    with open(args.result_file, "a") as fout:
        fout.write(json.dumps(record) + "\n")


if __name__ == "__main__":
    parser = ArgumentParser()
    # Benchmark shape: turns per request, request count, and token lengths.
    int_options = [
        ("--turns", 8),
        ("--num-qa", 128),
        ("--system-prompt-len", 2048),
        ("--len-q", 32),
        ("--len-a", 128),
    ]
    for flag, default in int_options:
        parser.add_argument(flag, type=int, default=default)
    parser.add_argument(
        "--tokenizer", type=str, default="meta-llama/Meta-Llama-3-8B-Instruct"
    )
    parser.add_argument("--trust-remote-code", action="store_true")
    args = add_common_sglang_args_and_parse(parser)

    print(args)
    main(args)