import argparse
import os.path as osp
import time
from collections import Counter
from typing import Optional

import mmengine
from mmengine.config import Config, ConfigDict
from mmengine.utils import mkdir_or_exist

from opencompass.registry import (ICL_EVALUATORS, MODELS, TASKS,
                                  TEXT_POSTPROCESSORS)
from opencompass.tasks.base import BaseTask
from opencompass.utils import (build_dataset_from_cfg, get_infer_output_path,
                               get_logger, task_abbr_from_cfg)


# ``force=True`` when this file is run directly as a script avoids a
# duplicate-registration error, since the class may already have been
# registered through the package import.
@TASKS.register_module(force=(__name__ == '__main__'))
class OpenICLEvalTask(BaseTask):
    """OpenICL Evaluation Task.

    This task computes evaluation metrics between model predictions and
    references.
    """

    name_prefix = 'OpenICLEval'
    log_subdir = 'logs/eval'
    output_subdir = 'results'

    def __init__(self, cfg: ConfigDict):
        super().__init__(cfg)
        self.num_gpus = 0
        self.logger = get_logger()

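    # ``template`` is a shell command string containing a ``{task_cmd}``
    # placeholder into which the evaluation command is substituted.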
    def get_command(self, cfg_path, template):
        script_path = __file__
        command = f'python3 {script_path} {cfg_path}'
        return template.format(task_cmd=command)

    def run(self):
        for model_cfg, dataset_cfgs in zip(self.model_cfgs, self.dataset_cfgs):
            for dataset_cfg in dataset_cfgs:
                self.model_cfg = model_cfg
                self.dataset_cfg = dataset_cfg

                # Load Dataset
                self.eval_cfg = self.dataset_cfg.get('eval_cfg')
                self.output_column = dataset_cfg['reader_cfg']['output_column']

                out_path = get_infer_output_path(
                    self.model_cfg, self.dataset_cfg,
                    osp.join(self.work_dir, 'results'))
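                # Skip datasets whose results already exist, so that a
                # rerun does not repeat finished work.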
                if osp.exists(out_path):
                    continue
                self._score()

    def _score(self):
        test_set = build_dataset_from_cfg(self.dataset_cfg).test
        # Postprocess dataset if necessary
        if 'dataset_postprocessor' in self.eval_cfg:
            proc = TEXT_POSTPROCESSORS.get(
                self.eval_cfg['dataset_postprocessor']['type'])

            def postprocess(sample):
                s = sample[self.output_column]
                sample[self.output_column] = proc(s)
                return sample

            test_set = test_set.map(postprocess)

        # Load predictions
        filename = get_infer_output_path(
            self.model_cfg, self.dataset_cfg,
            osp.join(self.work_dir, 'predictions'))
        # in case the prediction is partial
        root, ext = osp.splitext(filename)
        partial_filename = root + '_0' + ext

        # Get sc_size if Self-Consistency is used
        sc_size = self.eval_cfg.get('sc_size')
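        # With Self-Consistency, each sample carries ``sc_size`` sampled
        # predictions; a majority vote over them is taken further below.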

        if not osp.exists(osp.realpath(filename)) and not osp.exists(
                osp.realpath(partial_filename)):
            result = {'error': 'No predictions found.'}
        else:
            if osp.exists(osp.realpath(filename)):
                preds = mmengine.load(filename)
                pred_strs = [
                    preds[str(i)]['prediction'] for i in range(len(preds))
                ]
            else:
                # Merge sharded prediction files, which are named
                # ``<root>_0<ext>``, ``<root>_1<ext>``, ... in order.
                filename = partial_filename
                pred_strs = []
                i = 1
                while osp.exists(osp.realpath(filename)):
                    preds = mmengine.load(filename)
                    filename = root + f'_{i}' + ext
                    i += 1
                    pred_strs += [
                        preds[str(j)]['prediction'] for j in range(len(preds))
                    ]

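            # If the model defines a meta template with roles, strip the
            # configured role's begin/end markers from each raw prediction
            # (API models are excluded via the ``is_api`` check).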
            if ('pred_role' in self.eval_cfg
                    and 'meta_template' in self.model_cfg
                    and not MODELS.get(self.model_cfg['type']).is_api):
                # Create a prompt template for role config parsing
                from opencompass.models.base import LMTemplateParser
                parser = LMTemplateParser(self.model_cfg['meta_template'])
                role = parser.roles[self.eval_cfg['pred_role']]
                if sc_size is not None:
                    for pred in pred_strs:
                        if not isinstance(pred, list):
                            raise TypeError(
                                'The prediction for Self-Consistency '
                                'must be a list.')
                    # Build a new list rather than appending to
                    # ``pred_strs`` while iterating over it, which would
                    # never terminate.
                    pred_strs = [[
                        self._extract_role_pred(sc_pred,
                                                role.get('begin', None),
                                                role.get('end', None))
                        for sc_pred in pred
                    ] for pred in pred_strs]

            # Postprocess predictions if necessary
            if 'pred_postprocessor' in self.eval_cfg:
                # Copy so that popping ``type`` below does not mutate the
                # task config in place.
                kwargs = self.eval_cfg['pred_postprocessor'].copy()
                proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type'))
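                # The remaining entries of ``kwargs`` are forwarded to each
                # postprocessor call below.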
                if sc_size is not None:
                    pred_strs = [[proc(s, **kwargs) for s in preds]
                                 for preds in pred_strs]
                else:
                    pred_strs = [proc(s, **kwargs) for s in pred_strs]

            # Get majority voting predictions if use self-consistency
            if sc_size is not None:
                pred_strs = [
                    Counter(s).most_common(1)[0][0] for s in pred_strs
                ]

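            # Build the configured evaluator and score the predictions
            # against the dataset's reference column.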
            icl_evaluator = ICL_EVALUATORS.build(self.eval_cfg['evaluator'])
            result = icl_evaluator.score(
                predictions=pred_strs, references=test_set[self.output_column])

        if 'error' in result:
            self.logger.error(
                f'Task {task_abbr_from_cfg(self.cfg)}: {result["error"]}')
            return
        self.logger.info(f'Task {task_abbr_from_cfg(self.cfg)}: {result}')

        # Save result
        out_path = get_infer_output_path(self.model_cfg, self.dataset_cfg,
                                         osp.join(self.work_dir, 'results'))
        mkdir_or_exist(osp.split(out_path)[0])
        mmengine.dump(result, out_path)

    def _extract_role_pred(self, s: str, begin_str: Optional[str],
                           end_str: Optional[str]) -> str:
        """Extract the role prediction from the full prediction string. The
        role prediction may be the substring between the begin and end string.

        Args:
            s (str): Full prediction string.
            begin_str (Optional[str]): The beginning string of the role.
            end_str (Optional[str]): The ending string of the role.

        Returns:
            str: The extracted role prediction.
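
        Example (illustrative):
            ``_extract_role_pred('Answer: 42</s>', 'Answer: ', '</s>')``
            returns ``'42'``; only the first character of ``end_str`` is
            matched (see the TODO in the method body).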
        """
        start = 0
        end = len(s)

        if begin_str:
            begin_idx = s.find(begin_str)
            if begin_idx != -1:
                start = begin_idx + len(begin_str)

        if end_str:
            # TODO: Support calling tokenizer for the accurate eos token
            # and avoid such hardcode
            end_idx = s.find(end_str[:1], start)
            if end_idx != -1:
                end = end_idx

        return s[start:end]


def parse_args():
    parser = argparse.ArgumentParser(description='Score Calculator')
    parser.add_argument('config', help='Config file path')
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = parse_args()
    cfg = Config.fromfile(args.config)
    start_time = time.time()
    inferencer = OpenICLEvalTask(cfg)
    inferencer.run()
    end_time = time.time()
    get_logger().info(f'time elapsed: {end_time - start_time:.2f}s')
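
# Example standalone invocation (the config path is illustrative):
#   python3 openicl_eval.py path/to/eval_config.py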