import collections
import itertools
import random

import numpy as np

import lm_eval.api
import lm_eval.api.metrics
import lm_eval.api.model
import lm_eval.api.task
import lm_eval.models
import lm_eval.tasks
from lm_eval.utils import positional_deprecated, run_task_tests, make_table


@positional_deprecated
def simple_evaluate(
    model,
    model_args=None,
    tasks=[],
    num_fewshot=0,
    batch_size=None,
    device=None,
    no_cache=False,
    limit=None,
    bootstrap_iters=100000,
    check_integrity=False,
    decontamination_ngrams_path=None,
):
    """Instantiate and evaluate a model on a list of tasks.

    :param model: Union[str, LM]
        Name of model or LM object, see lm_eval.models.get_model
    :param model_args: Optional[str]
        String arguments for each model class, see LM.create_from_arg_string.
        Ignored if `model` argument is a LM object.
    :param tasks: list[Union[str, Task]]
        List of task names or Task objects. Task objects will be taken to have
        name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
    :param num_fewshot: int
        Number of examples in few-shot context
    :param batch_size: int, optional
        Batch size for model
    :param device: str, optional
        PyTorch device (e.g. "cpu" or "cuda:0") for running models
    :param no_cache: bool
        If True, do not cache model responses
    :param limit: int, optional
        Limit the number of examples per task (only use this for testing)
    :param bootstrap_iters: int
        Number of iterations for bootstrap statistics
    :param check_integrity: bool
        Whether to run the relevant part of the test suite for the tasks
    :param decontamination_ngrams_path: str, optional
        Path to n-grams used for decontamination; decontamination is skipped when None
    :return
        Dictionary of results
    """
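    # seed RNGs so runs are reproducible (e.g. fewshot example sampling)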
    random.seed(1234)
    np.random.seed(1234)

    assert tasks != [], "No tasks specified"

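    # `model` may be a registered model name (built from `model_args`) or an already-constructed LM object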
    if isinstance(model, str):
        if model_args is None:
            model_args = ""
        lm = lm_eval.api.model.get_model(model).create_from_arg_string(
            model_args, {"batch_size": batch_size, "device": device}
        )
    else:
        assert isinstance(model, lm_eval.api.model.LM)
        lm = model

    task_dict = lm_eval.api.task.get_task_dict(tasks, num_fewshot=num_fewshot)

    if check_integrity:
        run_task_tests(task_list=tasks)

    results = evaluate(
        lm=lm,
        task_dict=task_dict,
        num_fewshot=num_fewshot,
        limit=limit,
        bootstrap_iters=bootstrap_iters,
        decontamination_ngrams_path=decontamination_ngrams_path,
    )

    # add info about the model and few shot config
    results["config"] = {
        "model": model,
        "model_args": model_args,
        "num_fewshot": num_fewshot,
        "batch_size": batch_size,
        "device": device,
        "no_cache": no_cache,
        "limit": limit,
        "bootstrap_iters": bootstrap_iters,
    }

    return results


decontaminate_suffix = "_decontaminate"


@positional_deprecated
def evaluate(
    lm,
    task_dict,
    num_fewshot=0,
    limit=None,
    bootstrap_iters=100000,
    decontamination_ngrams_path=None,
):
    """Evaluate an already-instantiated language model on a list of tasks.

    :param lm: obj
        Language Model
    :param task_dict: dict[str, Task]
        Dictionary of tasks. Tasks will be taken to have name task.EVAL_HARNESS_NAME
        if defined and type(task).__name__ otherwise.
    :param num_fewshot: int
        Number of examples in few-shot context
    :param limit: int, optional
        Limit the number of examples per task (only use this for testing)
    :param bootstrap_iters: int
        Number of iterations for bootstrap statistics
    :param decontamination_ngrams_path: str, optional
        Path to n-grams used for decontamination; decontamination is skipped when None
    :return
        Dictionary of results
    """
    decontaminate = decontamination_ngrams_path is not None

    results = collections.defaultdict(dict)
    versions = collections.defaultdict(dict)

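    # Instances to run, keyed by the LM method (output type) used to obtain their outputs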
    requests = collections.defaultdict(list)
    requests_origin = collections.defaultdict(list)

    docs = {}

    # get lists of each type of request
    for task_name, task in task_dict.items():
        versions[task_name] = task.VERSION

        # deterministically shuffle docs and chop off the first `limit` because sometimes docs are in some kind of order
        # task_docs = list(task_doc_func())
        # rnd = random.Random()
        # rnd.seed(42)
        # rnd.shuffle(task_docs)

        # for doc_id, doc in enumerate(itertools.islice(task_docs, 0, limit)):
        task.build_all_requests(limit=limit)
        # aggregate Instances by LM method requested to get output.
        requests[task.OUTPUT_TYPE].extend(task.instances)

    ### Run LM on inputs, get all outputs ###
    # execute each type of request
    for reqtype, reqs in requests.items():
        print("Running", reqtype, "requests")
        # create `K` copies of each request `req` based off `K = req.repeats`
        cloned_reqs = []
        for req in reqs:
            cloned_reqs.extend([req] * req.repeats)
        
        # run requests through model
        resps = getattr(lm, reqtype)(cloned_reqs)

        # put responses from model into a list of length K for each request.
        for x, req in zip(resps, cloned_reqs):
            req.resps.append(x)

    ### Postprocess outputs ###
    # TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
    for task_name, task in task_dict.items():
        task.apply_filters()


    ### Collect values of metrics on all datapoints ###
    # TODO: make metric configurable, add metric registry 
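    # maps (task_name, filter_key, metric_name) -> list of per-document metric values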
    vals = collections.defaultdict(list)

    # unpack results and sort back in order and return control to Task
    for task_name, task in task_dict.items():
        # calculate values for each filter setup (TODO: make getting list of keys cleaner)
        # TODO: make it possible to use a different metric per key
        for key in task.instances[0].filtered_resps.keys():
            # evaluate on the test split when present, otherwise fall back to validation; apply `limit` either way
            doc_iterator = (
                task.test_docs() if task.has_test_docs() else task.validation_docs()
            )
            for doc_id, doc in enumerate(itertools.islice(doc_iterator, 0, limit)):
                # subset instances to only this document id ; sort by idx
                requests = list(filter(lambda x: x.doc_id == doc_id, task.instances))
                requests.sort(key=lambda x: x.id_)
                metrics = task.process_results(doc, [req.filtered_resps[key] for req in requests])
                for metric, value in metrics.items():
                    vals[(task_name, key, metric)].append(value)


    ### Aggregate results over all datapoints ###
    # aggregate results ; run bootstrap CIs
    for (task_name, key, metric), items in vals.items():
        task = task_dict[task_name]
        results[task_name][metric + " - filter=" + key] = task.aggregation()[metric](items)

        # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
        # so we run them for fewer iterations. still looking for a cleaner way to do this
        stderr = lm_eval.api.metrics.stderr_for_metric(
            metric=task.aggregation()[metric],
            bootstrap_iters=min(bootstrap_iters, 1000)
            if metric in ["bleu", "chrf", "ter"]
            else bootstrap_iters,
        )

        if stderr is not None:
            results[task_name][metric + " - filter=" + key + "_stderr"] = stderr(items)

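    # shape of the return value:
    #   results:  {task_name: {"<metric> - filter=<key>": value, "<metric> - filter=<key>_stderr": value, ...}}
    #   versions: {task_name: task.VERSION}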
    return {"results": dict(results), "versions": dict(versions)}
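

# Minimal usage sketch (runs only when this file is executed directly, never on
# import). The model name "hf-causal", the `pretrained=gpt2` argument string, and
# the task name "lambada_openai" are illustrative assumptions -- substitute
# identifiers that are actually registered in your install of the harness.
if __name__ == "__main__":
    demo_results = simple_evaluate(
        model="hf-causal",  # assumed registered model name
        model_args="pretrained=gpt2",  # assumed model arguments
        tasks=["lambada_openai"],  # assumed task name
        num_fewshot=0,
        limit=10,  # keep the demo cheap
    )
    print(make_table(demo_results))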