import collections
import itertools
import numpy as np
import random
import lm_eval.api.metrics
import lm_eval.models
import lm_eval.tasks
import lm_eval.api
import lm_eval.api.model
import lm_eval.api.task
from lm_eval.utils import positional_deprecated, run_task_tests, make_table
import torch

@positional_deprecated
def simple_evaluate(
    model,
    model_args=None,
    tasks=[],
    num_fewshot=0,
    batch_size=None,
    device=None,
    no_cache=False,
    limit=None,
    bootstrap_iters=100000,
    check_integrity=False,
    decontamination_ngrams_path=None,
):
    """Instantiate and evaluate a model on a list of tasks.

    :param model: Union[str, LM]
        Name of model or LM object, see lm_eval.models.get_model
    :param model_args: Optional[str]
        String arguments for each model class, see LM.create_from_arg_string.
        Ignored if `model` argument is a LM object.
    :param tasks: list[Union[str, Task]]
        List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
    :param num_fewshot: int
        Number of examples in few-shot context
    :param batch_size: int, optional
        Batch size for model
    :param device: str, optional
        PyTorch device (e.g. "cpu" or "cuda:0") for running models
    :param no_cache: bool
        Whether or not to cache model responses
    :param limit: int, optional
        Limit the number of examples per task (only use this for testing)
    :param bootstrap_iters:
        Number of iterations for bootstrap statistics
    :param check_integrity: bool
        Whether to run the relevant part of the test suite for the tasks
    :return:
        Dictionary of results
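
    Example (illustrative only -- the model and task names below are placeholders and
    must match what is actually registered in lm_eval.models / lm_eval.tasks):

        results = simple_evaluate(
            model="hf-causal",
            model_args="pretrained=gpt2",
            tasks=["lambada"],
            num_fewshot=0,
        )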
    """
    random.seed(1234)
    np.random.seed(1234)

    assert tasks != [], "No tasks specified"

    if isinstance(model, str):
        if model_args is None:
            model_args = ""
        lm = lm_eval.api.model.get_model(model).create_from_arg_string(
            model_args, {"batch_size": batch_size, "device": device}
        )
    else:
        assert isinstance(model, lm_eval.api.model.LM)
        lm = model

    task_dict = lm_eval.api.task.get_task_dict(tasks, num_fewshot=num_fewshot)

    if check_integrity:
        run_task_tests(task_list=tasks)

    results = evaluate(
        lm=lm,
        task_dict=task_dict,
        num_fewshot=num_fewshot,
        limit=limit,
        bootstrap_iters=bootstrap_iters,
        decontamination_ngrams_path=decontamination_ngrams_path,
    )

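    # in the data-parallel (multi-process) case only rank 0 holds the aggregated results;
    # every other rank returns None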
    if lm.rank == 0:
        # add info about the model and few shot config
        results["config"] = {
            "model": model,
            "model_args": model_args,
            "num_fewshot": num_fewshot,
            "batch_size": batch_size,
            "device": device,
            "no_cache": no_cache,
            "limit": limit,
            "bootstrap_iters": bootstrap_iters,
        }

        return results
    else:
        return None


decontaminate_suffix = "_decontaminate"


@positional_deprecated
def evaluate(
    lm,
    task_dict,
    num_fewshot=0,
    limit=None,
    bootstrap_iters=100000,
    decontamination_ngrams_path=None,
):
    """Instantiate and evaluate a model on a list of tasks.

    :param lm: obj
        Language Model
    :param task_dict: dict[str, Task]
        Dictionary of tasks. Tasks will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
    :param num_fewshot: int
        Number of examples in few-shot context
    :param limit: int, optional
        Limit the number of examples per task (only use this for testing)
    :param bootstrap_iters:
        Number of iterations for bootstrap statistics
    :return:
        Dictionary of results
    """
    decontaminate = decontamination_ngrams_path is not None

    results = collections.defaultdict(dict)
    versions = collections.defaultdict(dict)

    requests = collections.defaultdict(list)
    requests_origin = collections.defaultdict(list)

    docs = {}

    # get lists of each type of request
    for task_name, task in task_dict.items():
        versions[task_name] = task.VERSION
    
        # deterministically shuffle docs and chop off the first `limit` because sometimes docs are in some kind of order
        # task_docs = list(task_doc_func())
        # rnd = random.Random()
        # rnd.seed(42)
        # rnd.shuffle(task_docs)

        # for doc_id, doc in enumerate(itertools.islice(task_docs, 0, limit)):
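        # construct the Instance objects (requests) for this rank's shard of the task's documents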
        task.build_all_requests(limit=limit, rank=lm.rank, world_size=lm.world_size)
        # aggregate Instances by LM method requested to get output.
        # TODO: this is hacky, fix in task.py
        reqtype = "loglikelihood" if task.OUTPUT_TYPE == "multiple_choice" else task.OUTPUT_TYPE
        requests[reqtype].extend(task.instances)

        if lm.world_size > 1:
            instances_rnk = torch.tensor(len(task._instances), device=lm.device)
            gathered_item = (
                lm.accelerator.gather(instances_rnk).cpu().detach().numpy().tolist()
            )

            # compute number of pseudobatches to pad with (FSDP/DDP require even batches + can't use join)
            # we assume rank 0 always has largest iterator
            numpad = gathered_item[0] - gathered_item[lm.rank]
            if numpad > 0:
                print(f"{task_name} / balancing iterators across ranks / rank: {lm.rank} / + {numpad} sample")
    
    ### Run LM on inputs, get all outputs ###
    # execute each type of request
    for reqtype, reqs in requests.items():
        print("Running", reqtype, "requests")
        # create `K` copies of each request `req` based off `K = req.repeats`
        cloned_reqs = []
        for req in reqs:
            cloned_reqs.extend([req] * req.repeats)
        
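        # if this rank received fewer requests than rank 0 (see `numpad` above), pad with
        # copies of the last request so every rank issues the same number of model calls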
        if (lm.rank > 0) and (numpad > 0):
            for _ in range(numpad):
                cloned_reqs.extend([req] * req.repeats)

        # run requests through model
        resps = getattr(lm, reqtype)(cloned_reqs)

        # put responses from model into a list of length K for each request.
        for x, req in zip(resps, cloned_reqs):
            req.resps.append(x)

    if lm.world_size > 1:
        lm.accelerator.wait_for_everyone()

    ### Postprocess outputs ###
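    # each task's filter pipeline turns raw model responses into per-filter-key `filtered_resps`
    # on its Instances; those filtered responses are what gets scored below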
    # TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
    for task_name, task in task_dict.items():
        task.apply_filters()


    ### Collect values of metrics on all datapoints ###
    # TODO: make metric configurable, add metric registry 
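    # vals maps (task_name, filter_key, metric_name) -> list of per-document metric values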
    vals = collections.defaultdict(list)

    # unpack results and sort back in order and return control to Task
    for task_name, task in task_dict.items():
        # calculate values for each filter setup (TODO: make getting list of keys cleaner)
        # TODO: make it possible to use a different metric per key
        for key in task.instances[0].filtered_resps.keys():
            # iterate over this rank's shard of the eval docs (test split if present, otherwise validation)
            eval_docs = task.test_docs() if task.has_test_docs() else task.validation_docs()
            for doc_id, doc in itertools.islice(enumerate(eval_docs), lm.rank, None, lm.world_size):
                # subset instances to only this document id ; sort by idx
                requests = list(filter(lambda x: x.doc_id == doc_id, task.instances))
                requests.sort(key=lambda x: x.idx)
                metrics = task.process_results(
                    doc, [req.filtered_resps[key] for req in requests]
                )
                for metric, value in metrics.items():
                    vals[(task_name, key, metric)].append(value)
    
    if lm.world_size > 1:
        # if multigpu, then gather data across all ranks    
        vals_torch = collections.defaultdict(list)
        for (task_name, key, metric), items in vals.items():
            numitem = 0
            if isinstance(items[0], tuple):
                numitem = len(items[0])
    
            # distributed gather requires all ranks to have same dimensionality -> pad out with float32 min value
            pad_value = torch.finfo(torch.float32).min
            metrics_tensor = torch.tensor(items, device=lm.device)

            original_dtype = metrics_tensor.dtype  # store original dtype
            torch_device_tensor = lm.accelerator.pad_across_processes(
                metrics_tensor.to(torch.float32), pad_index=pad_value
            )
            gathered_item = lm.accelerator.gather(torch_device_tensor)
    
            #TODO: This is required when we get a tensor with a tuple of info like (ppl, _bytes) from wikitext
            if numitem > 0:
                gathered_filtered = gathered_item[gathered_item[:,0] != pad_value]
            else:
                gathered_filtered = gathered_item[gathered_item != pad_value]
                
            gathered_item = gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist()
            # reconvert if we were passed a tuple of values
            if numitem > 0:
                gathered_item = [tuple(g) for g in gathered_item]
    
            if lm.rank == 0:
                vals_torch[(task_name, key, metric)] = gathered_item
    
        vals = vals_torch


    if lm.rank == 0:
        ### Aggregate results over all datapoints ###
        # aggregate results ; run bootstrap CIs
        for (task_name, key, metric), items in vals.items():
            task = task_dict[task_name]
            results[task_name][metric + " - filter=" + key] = task.aggregation()[metric](items)

            # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
            # so we run them less iterations. still looking for a cleaner way to do this
            stderr = lm_eval.api.metrics.stderr_for_metric(
                metric=task.aggregation()[metric],
                bootstrap_iters=min(bootstrap_iters, 1000)
                if metric in ["bleu", "chrf", "ter"]
                else bootstrap_iters,
            )

            # stderr_for_metric returns None when no bootstrap stderr is defined for this aggregation
            if stderr is not None:
                results[task_name][metric + " - filter=" + key + "_stderr"] = stderr(items)

        return {"results": dict(results), "versions": dict(versions)}
    else:
        return None