import argparse
import json
import time
from concurrent.futures import ThreadPoolExecutor
from functools import partial

from tqdm import tqdm

from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
from sglang.utils import dump_state_text, read_jsonl

system_prompt = "Please serve as an impartial judge and rigorously evaluate the quality of the following article. Apply the most stringent standards possible, showing no leniency."

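# Rubric dimensions for judging; the text before the colon is reused as the metric name in the summary.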
dimension_prompts = [
    "Content: This refers to the essences of the essay. The substance should be well researched, accurate, relevant to the topic and should show a thorough understanding of the subject. The essay should also reflect a clear goal or purpose.",
    "Organization and Structure: An essay needs to be properly structured with a clear introduction, body, and conclusion. The essay should flow naturally, with one paragraph leading seamlessly into the next.",
    "Argument and Analysis: The argument made in the essay should be logical, coherent and clearly articulated. Each point made should be backed up by solid evidence and thorough analysis.",
    "Clarity and Precision: The essay should be written in a clear and concise manner. The points made should be easily understood by the reader. The language used should also be precise and unambiguous.",
    "Grammar and Punctuation: Proper use of grammar and punctuation is vital in an academic essay. Errors in grammar and punctuation not only distract the reader but can also negatively impact the meaning and interpretation of the content.",
    "Referencing and Citation: An essay should contain proper citations and references for all sources used. This not only prevents accusations of plagiarism but also gives credit to the authors of the works that have contributed to the essay. The citation should adhere to a specific format as required by the academic institution or specified by the professor.",
]


def multi_dimension_judge(article, generate):
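    """Judge one article against every rubric dimension, then elicit a 1-10 score.

    `generate` is a synchronous completion function called as
    generate(prompt, max_tokens=..., stop=...). Each per-dimension judgement is
    appended to the prompt before the final scoring call.
    """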
    s = system_prompt
    s += "\n```\n" + article + "\n```\n\n"

    judges = []
    for i in range(len(dimension_prompts)):
        comp = generate(
            s
            + "USER: Please judge the quality based on the following metric. "
            + dimension_prompts[i]
            + " Please provide a single-paragraph judgement. "
            + "Focus on the provided metric and do not say other things. "
            'End your judgement paragraph with the word "END"\nJUDGE:',
            max_tokens=256,
            stop="END",
        )
        judges.append(comp)

    s += "I will judge the quality based on the following metrics.\n"
    for i in range(len(dimension_prompts)):
        s += dimension_prompts[i].split(":")[0] + ": " + judges[i].strip() + "\n"

    s += "In summary, on a scale of 1 to 10, I would give the article a score of"
    s += generate(s, max_tokens=2, stop=None)

    return s


async def multi_dimension_judge_async(article, generate):
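    """Async variant of multi_dimension_judge; `generate` must be an awaitable completion function."""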
    s = system_prompt
    s += "\n```\n" + article + "\n```\n\n"

    judges = []
    for i in range(len(dimension_prompts)):
        comp = await generate(
            s
            + "USER: Please judge the quality based on the following metric. "
            + dimension_prompts[i]
            + " Please provide a single-paragraph judgement. "
            + "Focus on the provided metric and do not say other things. "
            'End your judgement paragraph with the word "END"\nJUDGE:',
            max_tokens=256,
            stop="END",
        )
        judges.append(comp)

    s += "I will judge the quality based on the following metrics.\n"
    for i in range(len(dimension_prompts)):
        s += dimension_prompts[i].split(":")[0] + ": " + judges[i].strip() + "\n"

    s += "In summary, on a scale of 1 to 10, I would give the article a score of"
    s += await generate(s, max_tokens=2, stop=None)

    return s


def main(args):
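    # Load the articles to judge, truncated to the first `num_questions` entries.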
    lines = read_jsonl(args.data_path)[: args.num_questions]
    states = [None] * len(lines)

    # Select backend
    call_generate = partial(get_call_generate(args), temperature=0)

    # Run requests
    tic = time.perf_counter()

    if args.backend != "lmql":

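        # Threaded path: each call judges one article synchronously;
        # `args.parallel` controls the thread pool size.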
        def get_one_answer(i):
            states[i] = multi_dimension_judge(lines[i], call_generate)

        if args.parallel == 1:
            for i in tqdm(range(len(lines))):
                get_one_answer(i)
        else:
            with ThreadPoolExecutor(args.parallel) as executor:
                list(
                    tqdm(
                        executor.map(get_one_answer, list(range(len(lines)))),
                        total=len(lines),
                    )
                )

    else:
        import asyncio

        async def get_one_answer_async(i):
            states[i] = await multi_dimension_judge_async(lines[i], call_generate)

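        # Split requests into batches of `args.parallel` and run each batch concurrently.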
        batches = []
        for i in range(0, len(lines), args.parallel):
            batches.append(list(range(i, min(i + args.parallel, len(lines)))))

        loop = asyncio.get_event_loop()
        for bt in tqdm(batches):
            loop.run_until_complete(
                asyncio.gather(*[get_one_answer_async(i) for i in bt])
            )

    latency = time.perf_counter() - tic

    # Report latency
    print(f"Latency: {latency:.3f}")

    # Write results
    dump_state_text(f"tmp_output_{args.backend}.txt", states)

    with open(args.result_file, "a") as fout:
        value = {
            "task": "llm_judge",
            "backend": args.backend,
            "num_gpus": 1,
            "latency": round(latency, 3),
            "num_requests": args.num_questions,
            "other": {
                "num_questions": args.num_questions,
                "parallel": args.parallel,
            },
        }
        fout.write(json.dumps(value) + "\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--data-path", type=str, default="articles.jsonl")
    parser.add_argument("--num-questions", type=int, default=20)
    args = add_common_other_args_and_parse(parser)
    main(args)