# eval_math_llm_judge.py
# Most of the code in this file is copied from https://github.com/openai/simple-evals/blob/main/math_eval.py
from mmengine.config import read_base

with read_base():
    from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import models as hf_llama3_8b_instruct_model  # noqa: F401, F403
    from opencompass.configs.models.hf_llama.hf_llama3_70b_instruct import models as hf_llama3_70b_instruct_model  # noqa: F401, F403
    from opencompass.configs.datasets.math.math_llm_judge import math_datasets  # noqa: F401, F403

from opencompass.datasets import math_judement_preprocess
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.summarizers import AllObjSummarizer
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
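
# Overall flow of this config:
#   1) Inference: the models in `models` generate answers to the MATH problems
#      from `math_datasets`.
#   2) Evaluation: `judge_models` act as an LLM judge, comparing each prediction
#      with the gold answer via the equivalence prompt below and replying
#      [Yes] or [No].
#   3) Summarization: `AllObjSummarizer` aggregates the judge verdicts.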

# -------------Prompt Settings ----------------------------------------
eng_obj_prompt = """
Look at the following two expressions (answers to a math problem) and judge whether they are equivalent. Only perform trivial simplifications

Examples:

    Expression 1: $2x+3$
    Expression 2: $3+2x$

[Yes]

    Expression 1: 3/2
    Expression 2: 1.5

[Yes]

    Expression 1: $x^2+2x+1$
    Expression 2: $y^2+2y+1$

[No]

    Expression 1: $x^2+2x+1$
    Expression 2: $(x+1)^2$

[Yes]

    Expression 1: 3245/5
    Expression 2: 649

[No]
(these are actually equal, don't mark them equivalent if you need to do nontrivial simplifications)

    Expression 1: 2/(-3)
    Expression 2: -2/3

[Yes]
(trivial simplifications are allowed)

    Expression 1: 72 degrees
    Expression 2: 72

[Yes]
(give benefit of the doubt to units)

    Expression 1: 64
    Expression 2: 64 square feet

[Yes]
(give benefit of the doubt to units)

    Expression 1: 64
    Expression 2:

[No]
(only mark as equivalent if both expressions are nonempty)

---

YOUR TASK


Respond with only "[Yes]" or "[No]" (without quotes). Do not include a rationale.
    Expression 1: {obj_gold}
    Expression 2: {prediction}

"""

# ------------- Inference Stage ----------------------------------------
# eval models
models = [*hf_llama3_8b_instruct_model]
# judge models
judge_models = hf_llama3_70b_instruct_model

eng_datasets = [*math_datasets]
chn_datasets = []
datasets = eng_datasets + chn_datasets
work_dir = 'outputs/obj_all/'

for d in eng_datasets:
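    # Attach an LLM-as-judge eval_cfg to every dataset: the judge model is shown
    # the equivalence prompt above with the gold answer and the model's
    # prediction, and answers [Yes] or [No].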
    d['eval_cfg'] = dict(
        evaluator=dict(
            type=LMEvaluator,
            # If the prediction needs preprocessing before judging (e.g.
            # extracting the final answer from the raw model output),
            # specify the pred_postprocessor here.
            pred_postprocessor=dict(type=math_judement_preprocess),
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(role='HUMAN', prompt=eng_obj_prompt),
                ]),
            ),
        ),
        pred_role='BOT',
    )

infer = dict(
    partitioner=dict(type=SizePartitioner, max_task_size=40000),
    runner=dict(type=LocalRunner,
                max_num_workers=256,
                task=dict(type=OpenICLInferTask)),
)
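
# SizePartitioner splits the inference workload above into sub-tasks of roughly
# `max_task_size` samples each; LocalRunner runs up to `max_num_workers` of
# these sub-tasks in parallel on the local machine.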

# ------------- Evaluation Configuration --------------------------------
eval = dict(
    partitioner=dict(
        type=SubjectiveSizePartitioner,
        max_task_size=80000,
        mode='singlescore',
        models=models,
        judge_models=judge_models,
    ),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=SubjectiveEvalTask)),
)
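
# 'singlescore' mode scores each model's predictions independently with the
# judge model (no pairwise comparison between models); the judging workload is
# again chunked by `max_task_size`.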

summarizer = dict(type=AllObjSummarizer)
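
# Typical launch (illustrative; assumes this file sits under `configs/` of an
# OpenCompass checkout):
#   python run.py configs/eval_math_llm_judge.py
# Results and judge outputs are written under `work_dir` ('outputs/obj_all/').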