evaluator.py 7.49 KB
Newer Older
lintangsutawika's avatar
lintangsutawika committed
1
import random
Leo Gao's avatar
Leo Gao committed
2
import itertools
lintangsutawika's avatar
lintangsutawika committed
3
4
import collections

5
import numpy as np
lintangsutawika's avatar
lintangsutawika committed
6
7

import lm_eval.api
8
import lm_eval.api.metrics
lintangsutawika's avatar
lintangsutawika committed
9

10
import lm_eval.tasks
lintangsutawika's avatar
lintangsutawika committed
11
12
import lm_eval.models

lintangsutawika's avatar
lintangsutawika committed
13
14
15
16
17
18
from lm_eval.utils import (
    positional_deprecated,
    run_task_tests,
    make_table,
    get_git_commit_hash,
)
19

lintangsutawika's avatar
lintangsutawika committed
20
21
from lm_eval.logger import eval_logger

Fabrizio Milo's avatar
Fabrizio Milo committed
22

23
@positional_deprecated
Fabrizio Milo's avatar
Fabrizio Milo committed
24
25
26
27
28
29
30
31
32
33
34
35
36
def simple_evaluate(
    model,
    model_args=None,
    tasks=[],
    num_fewshot=0,
    batch_size=None,
    device=None,
    no_cache=False,
    limit=None,
    bootstrap_iters=100000,
    check_integrity=False,
    decontamination_ngrams_path=None,
):
    """Instantiate and evaluate a model on a list of tasks.

    :param model: Union[str, LM]
        Name of model or LM object, see lm_eval.api.model.get_model
    :param model_args: Optional[str]
        String arguments for each model class, see LM.create_from_arg_string.
        Ignored if `model` argument is a LM object.
    :param tasks: list[Union[str, Task]]
        List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
        Must not be empty.
    :param num_fewshot: int
        Number of examples in few-shot context
    :param batch_size: int, optional
        Batch size for model
    :param device: str, optional
        PyTorch device (e.g. "cpu" or "cuda:0") for running models
    :param no_cache: bool
        Recorded in the returned config; this function does not otherwise
        act on it.
    :param limit: int, optional
        Limit the number of examples per task (only use this for testing)
    :param bootstrap_iters:
        Number of iterations for bootstrap statistics
    :param check_integrity: bool
        Whether to run the relevant part of the test suite for the tasks
    :param decontamination_ngrams_path:
        Forwarded unchanged to `evaluate`.
    :return:
        Dictionary of results: the dictionary returned by `evaluate`
        ("results" and "versions") extended with "config" and "git_hash".
    :raises AssertionError:
        If `tasks` is empty, or if `model` is neither a string nor an LM.
    """
    # Fixed seeds so repeated runs are reproducible.
    random.seed(1234)
    np.random.seed(1234)

    # Explicit raise instead of `assert`: the check must survive `python -O`,
    # and `not tasks` also rejects empty tuples and None, which the previous
    # `tasks != []` comparison let through.
    if not tasks:
        raise AssertionError("No tasks specified")

    if isinstance(model, str):
        if model_args is None:
            model_args = ""
        lm = lm_eval.api.model.get_model(model).create_from_arg_string(
            model_args, {"batch_size": batch_size, "device": device}
        )
    else:
        if not isinstance(model, lm_eval.api.model.LM):
            raise AssertionError(
                "model must be a model name or an lm_eval.api.model.LM instance"
            )
        lm = model

    task_dict = lm_eval.tasks.get_task_dict(tasks, num_fewshot=num_fewshot)

    if check_integrity:
        run_task_tests(task_list=tasks)

    results = evaluate(
        lm=lm,
        task_dict=task_dict,
        limit=limit,
        bootstrap_iters=bootstrap_iters,
        decontamination_ngrams_path=decontamination_ngrams_path,
    )

    # add info about the model and few shot config
    results["config"] = {
        "model": model,
        "model_args": model_args,
        "num_fewshot": num_fewshot,
        "batch_size": batch_size,
        "device": device,
        "no_cache": no_cache,
        "limit": limit,
        "bootstrap_iters": bootstrap_iters,
    }
    results["git_hash"] = get_git_commit_hash()

    return results
Leo Gao's avatar
Leo Gao committed
106

Fabrizio Milo's avatar
Fabrizio Milo committed
107

108
# Suffix string for decontamination-related names. Not referenced by the
# functions visible in this file; presumably appended to task/metric names
# when decontaminated results are reported -- TODO confirm against callers.
decontaminate_suffix = "_decontaminate"
Leo Gao's avatar
Leo Gao committed
109

Fabrizio Milo's avatar
Fabrizio Milo committed
110

111
@positional_deprecated
Fabrizio Milo's avatar
Fabrizio Milo committed
112
113
114
115
116
117
118
def evaluate(
    lm,
    task_dict,
    limit=None,
    bootstrap_iters=100000,
    decontamination_ngrams_path=None,
):
    """Run every task in `task_dict` through `lm` and aggregate the metrics.

    :param lm: obj
        Language Model
    :param task_dict: dict[str, Task]
        Dictionary of tasks. Tasks will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
    :param limit: int, optional
        Limit the number of examples per task (only use this for testing)
    :param bootstrap_iters:
        Number of iterations for bootstrap statistics
    :param decontamination_ngrams_path:
        Accepted for interface compatibility; currently unused by this
        function.
    :return:
        Dictionary with two keys: "results" (per-task mapping of
        "<metric> - filter=<key>" to the aggregated value, plus a matching
        "..._stderr" entry when a stderr function is available) and
        "versions" (per-task `VERSION`).
    """
    results = collections.defaultdict(dict)
    versions = collections.defaultdict(dict)

    # Instances grouped by the LM method that must be called to answer them.
    requests = collections.defaultdict(list)

    # get lists of each type of request
    for task_name, task in task_dict.items():
        versions[task_name] = task.VERSION

        task.build_all_requests(limit=limit)
        # aggregate Instances by LM method requested to get output.
        reqtype = (
            "loglikelihood"
            if task.OUTPUT_TYPE == "multiple_choice"
            else task.OUTPUT_TYPE
        )  # TODO: this is hacky, fix in task.py
        requests[reqtype].extend(task.instances)

    ### Run LM on inputs, get all outputs ###
    # execute each type of request
    for reqtype, reqs in requests.items():
        eval_logger.info("Running {} requests".format(reqtype))
        # create `K` copies of each request `req` based off `K = req.repeats`
        cloned_reqs = []
        for req in reqs:
            cloned_reqs.extend([req] * req.repeats)

        # run requests through model
        resps = getattr(lm, reqtype)(cloned_reqs)

        # put responses from model into a list of length K for each request.
        for resp, req in zip(resps, cloned_reqs):
            req.resps.append(resp)

    ### Postprocess outputs ###
    # TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
    for task in task_dict.values():
        task.apply_filters()

    ### Collect values of metrics on all datapoints ###
    # TODO: make metric configurable, add metric registry
    vals = collections.defaultdict(list)

    # unpack results and sort back in order and return control to Task
    for task_name, task in task_dict.items():
        # Group the task's instances by document once, sorted by idx, instead
        # of re-filtering the full instance list for every document and key.
        instances_by_doc = collections.defaultdict(list)
        for instance in task.instances:
            instances_by_doc[instance.doc_id].append(instance)
        for doc_instances in instances_by_doc.values():
            doc_instances.sort(key=lambda inst: inst.idx)

        # calculate values for each filter setup (TODO: make getting list of keys cleaner)
        # TODO: make it possible to use a different metric per key
        for key in task.instances[0].filtered_resps.keys():
            docs = task.test_docs() if task.has_test_docs() else task.validation_docs()
            # Apply `limit` to whichever split is used. Previously only the
            # test split was truncated, so validation-only tasks iterated over
            # documents for which no requests had been built.
            for doc_id, doc in enumerate(itertools.islice(docs, 0, limit)):
                # `.get` keeps the defaultdict from growing on missing ids.
                doc_instances = instances_by_doc.get(doc_id, [])
                metrics = task.process_results(
                    doc, [inst.filtered_resps[key] for inst in doc_instances]
                )
                for metric, value in metrics.items():
                    vals[(task_name, key, metric)].append(value)

    ### Aggregate results over all datapoints ###
    # aggregate results ; run bootstrap CIs
    for (task_name, key, metric), items in vals.items():
        task = task_dict[task_name]
        aggregate = task.aggregation()[metric]
        result_key = metric + " - filter=" + key
        results[task_name][result_key] = aggregate(items)

        # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
        # so we run them less iterations. still looking for a cleaner way to do this
        stderr = lm_eval.api.metrics.stderr_for_metric(
            metric=aggregate,
            bootstrap_iters=min(bootstrap_iters, 1000)
            if metric in ["bleu", "chrf", "ter"]
            else bootstrap_iters,
        )

        if stderr is not None:
            results[task_name][result_key + "_stderr"] = stderr(items)

    return {"results": dict(results), "versions": dict(versions)}