# Supports AIME-2024 with 8 repeated runs (Repeat8)
# Supports MATH-500
# Supports OlympiadBench
# Supports OmniMath
# Supports LiveMathBench-202412-Hard
# (Only AIME-2024 is enabled below; sketches for the others appear as
# commented-out imports in PART 1.)

import os.path as osp
from itertools import product
from opencompass.models import OpenAISDK, TurboMindModelwithChatTemplate
from mmengine.config import read_base
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
from opencompass.runners import LocalRunner

#######################################################################
#                          PART 1  Datasets List                      #
#######################################################################
with read_base():
    from opencompass.configs.datasets.aime2024.aime2024_llmverify_repeat8_gen_e8fcee import aime2024_datasets  # 8 runs
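    # The other benchmarks named in the header would be imported here as well.
    # A hedged sketch follows: the module names (including the hash suffixes,
    # shown as xxxxxx) are assumptions, not verified paths -- check
    # opencompass/configs/datasets for the actual generated file names.
    # from opencompass.configs.datasets.math.math_500_llmverify_gen_xxxxxx import math_datasets
    # from opencompass.configs.datasets.OlympiadBench.OlympiadBench_llmverify_gen_xxxxxx import olympiadbench_datasets
    # from opencompass.configs.datasets.omni_math.omni_math_llmverify_gen_xxxxxx import omnimath_datasets
    # from opencompass.configs.datasets.livemathbench.livemathbench_hard_llmverify_gen_xxxxxx import livemathbench_datasets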
    # Summarizer
    from opencompass.configs.summarizers.groups.OlympiadBench import OlympiadBenchMath_summary_groups

datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')),
    [],
)

# Set the LLM verifier (judge model) used by each dataset's evaluator

verifier_cfg = dict(
    abbr='Qwen3-32B',
    type=OpenAISDK,
    path='/models/qwen3/Qwen3-32B/', # You need to set your own judge model path
    key='EMPTY', # You need to set your own API key
    openai_api_base=[
        'http://0.0.0.0:8000/v1', # You need to set your own API base
    ],
    meta_template=dict(
        round=[
            dict(role='HUMAN', api_role='HUMAN'),
            dict(role='BOT', api_role='BOT', generate=True),
        ],
    ),
    query_per_second=16,
    batch_size=64,
    temperature=0.001,
    tokenizer_path='/models/qwen3/Qwen3-32B/',
    verbose=True,
    max_out_len=16384,
    max_seq_len=40960,
    pred_postprocessor=dict(type=extract_non_reasoning_content),
)
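
# The verifier above assumes an OpenAI-compatible endpoint is already serving
# the judge model at openai_api_base. A hypothetical launch command (LMDeploy
# shown; any OpenAI-compatible server such as vLLM works, and the port is a
# placeholder):
#   lmdeploy serve api_server /models/qwen3/Qwen3-32B/ --server-port 8000
# Note: the evaluated model in PART 2 also points at port 8000; if the judge
# and the candidate model are served separately, give them distinct ports.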

for item in datasets:
    if 'judge_cfg' in item['eval_cfg']['evaluator']:
        item['eval_cfg']['evaluator']['judge_cfg'] = verifier_cfg
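
# Optional sanity check (a hypothetical snippet, not part of the original
# config): uncomment to confirm every dataset picked up the judge config.
# for item in datasets:
#     assert item['eval_cfg']['evaluator'].get('judge_cfg', {}).get('abbr') == 'Qwen3-32B'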


#######################################################################
#                          PART 2  Model List                         #
#######################################################################

models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ],
)

models += [
    dict(
        abbr='DeepSeek-R1-INT8',
        type=OpenAISDK,
        path='/nvme/models/DeepSeek-R1-INT8/',
        openai_api_base='http://0.0.0.0:8000/v1',
        tokenizer_path='/nvme/models/DeepSeek-R1-INT8/',
        key='EMPTY',
        meta_template=api_meta_template,
        query_per_second=64,
        max_out_len=32768,
        max_seq_len=32768,
        temperature=0.7,
        pred_postprocessor=dict(type=extract_non_reasoning_content),
        batch_size=32,
    ),
]
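
# TurboMindModelwithChatTemplate (imported above) evaluates a local model via
# LMDeploy instead of an API endpoint. A minimal commented-out sketch, assuming
# LMDeploy is installed; the model path and all parameter values are
# placeholders, not values from this config:
# models += [
#     dict(
#         type=TurboMindModelwithChatTemplate,
#         abbr='deepseek-r1-distill-qwen-7b-turbomind',
#         path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
#         engine_config=dict(session_len=32768, max_batch_size=16, tp=1),
#         gen_config=dict(temperature=0.6, top_p=0.95, max_new_tokens=16384),
#         max_seq_len=32768,
#         max_out_len=16384,
#         batch_size=16,
#         run_cfg=dict(num_gpus=1),
#     ),
# ]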

#######################################################################
#                          PART 3  Inference/Evaluation               #
#######################################################################

# Inference configuration
infer = dict(
    partitioner=dict(
        type=NumWorkerPartitioner,
        # Similar to data parallelism: each worker runs inference on a slice
        # of the dataset. Total GPUs used = num_worker * num_gpus_per_worker.
        # For example, with 8 GPUs and a 7B model that fits on 1 GPU per
        # instance, set num_worker=8 to fully utilize the GPUs; with a 14B
        # model that needs 2 GPUs per instance, set num_worker=4.
        num_worker=1,
    ),
    runner=dict(
        type=LocalRunner,
        task=dict(type=OpenICLInferTask)
    ),
)

# Evaluation configuration
eval = dict(
    partitioner=dict(type=NaivePartitioner, n=8),
    runner=dict(
        type=LocalRunner,
        task=dict(type=OpenICLEvalTask),
    ),
)


#######################################################################
#                          PART 4  Summarizer                         #
#######################################################################


summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], []
)

summary_groups.extend([
    {
        'name': 'AIME2024-Average8',
        'subsets': [[f'aime2024-run{idx}', 'accuracy'] for idx in range(8)],
    },
    {
        'name': 'LiveMathBench-v202412-Hard-Average8',
        'subsets': [
            [f'livemathbench_hard_custom_{split}_run{run_idx}', 'accuracy']
            for split, run_idx in product(['hard_cn', 'hard_en'], range(8))
        ],
    },
])

# Summarizer
summarizer = dict(
    dataset_abbrs=[
        ['AIME2024-Average8', 'naive_average'],
    ],
    summary_groups=summary_groups,
)
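
# Note: the LiveMathBench-v202412-Hard-Average8 group is defined above but not
# listed in dataset_abbrs. A hypothetical entry, relevant only once the
# LiveMathBench datasets are actually imported in PART 1:
#     ['LiveMathBench-v202412-Hard-Average8', 'naive_average'],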


#######################################################################
#                          PART 5  Utils                              #
#######################################################################

work_dir = "/workspace/logs/aime_r1_int8/"
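
# Hypothetical launch command (the standard OpenCompass entry point; adjust
# the path to wherever this file is saved):
#   opencompass eval_aime24.py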