"torchvision/tv_tensors/_dataset_wrapper.py" did not exist on "b570f2c17130c30be56a276aa0d1ed11a096dad1"
evaluator.py 9.59 KB
Newer Older
Leo Gao's avatar
Leo Gao committed
1
2
import collections
import itertools
3
import numpy as np
Leo Gao's avatar
Leo Gao committed
4
import random
5
import lm_eval.api.metrics
6
7
import lm_eval.models
import lm_eval.tasks
8
import lm_eval.api
9
from lm_eval.utils import positional_deprecated, run_task_tests, make_table, create_iterator
10
import torch 
Fabrizio Milo's avatar
Fabrizio Milo committed
11

12
@positional_deprecated
def simple_evaluate(
    model,
    model_args=None,
    tasks=None,
    num_fewshot=0,
    batch_size=None,
    device=None,
    no_cache=False,
    limit=None,
    bootstrap_iters=100000,
    check_integrity=False,
    decontamination_ngrams_path=None,
):
    """Instantiate and evaluate a model on a list of tasks.

    Seeds `random` and `numpy.random` with 1234, resolves the model and the
    tasks, delegates the actual work to `evaluate`, and (on rank 0) attaches
    the run configuration to the results.

    :param model: Union[str, LM]
        Name of model or LM object, see lm_eval.api.model.get_model
    :param model_args: Optional[str]
        String arguments for each model class, see LM.create_from_arg_string.
        Ignored if `model` argument is a LM object.
    :param tasks: list[Union[str, Task]]
        List of task names or Task objects. Task objects will be taken to have
        name task.EVAL_HARNESS_NAME if defined and type(task).__name__
        otherwise. Must be non-empty; omitting it (or passing None) is treated
        like an empty list and fails the "No tasks specified" assertion.
    :param num_fewshot: int
        Number of examples in few-shot context
    :param batch_size: int, optional
        Batch size for model
    :param device: str, optional
        PyTorch device (e.g. "cpu" or "cuda:0") for running models
    :param no_cache: bool
        Whether or not to cache. NOTE(review): within this function the value
        is only recorded in the returned config; no caching logic is visible
        here.
    :param limit: int, optional
        Limit the number of examples per task (only use this for testing)
    :param bootstrap_iters:
        Number of iterations for bootstrap statistics
    :param check_integrity: bool
        Whether to run the relevant part of the test suite for the tasks
    :param decontamination_ngrams_path: optional
        Forwarded unchanged to `evaluate`.
    :return
        On rank 0 (`lm.rank == 0`): the dictionary returned by `evaluate`
        with an added "config" entry describing this run. On every other
        rank: None.
    """
    random.seed(1234)
    np.random.seed(1234)

    # `None` replaces the former mutable `[]` default so that no list object
    # is shared between calls; the emptiness check itself is unchanged.
    if tasks is None:
        tasks = []
    assert tasks != [], "No tasks specified"

    if isinstance(model, str):
        if model_args is None:
            model_args = ""
        lm = lm_eval.api.model.get_model(model).create_from_arg_string(
            model_args, {"batch_size": batch_size, "device": device}
        )
    else:
        assert isinstance(model, lm_eval.api.model.LM)
        lm = model

    task_dict = lm_eval.api.task.get_task_dict(tasks, num_fewshot=num_fewshot)

    if check_integrity:
        run_task_tests(task_list=tasks)

    results = evaluate(
        lm=lm,
        task_dict=task_dict,
        num_fewshot=num_fewshot,
        limit=limit,
        bootstrap_iters=bootstrap_iters,
        decontamination_ngrams_path=decontamination_ngrams_path,
    )

    # `evaluate` only returns a results dict on rank 0.
    if lm.rank != 0:
        return None

    # add info about the model and few shot config
    results["config"] = {
        "model": model,
        "model_args": model_args,
        "num_fewshot": num_fewshot,
        "batch_size": batch_size,
        "device": device,
        "no_cache": no_cache,
        "limit": limit,
        "bootstrap_iters": bootstrap_iters,
    }
    return results
98

Leo Gao's avatar
Leo Gao committed
99

Fabrizio Milo's avatar
Fabrizio Milo committed
100

101
# Module-level suffix constant.
# NOTE(review): not referenced in the visible part of this file; presumably
# appended to task/metric names for decontaminated results -- confirm against
# callers before changing the value.
decontaminate_suffix = "_decontaminate"
Leo Gao's avatar
Leo Gao committed
102

Fabrizio Milo's avatar
Fabrizio Milo committed
103

104
@positional_deprecated
def evaluate(
    lm,
    task_dict,
    num_fewshot=0,
    limit=None,
    bootstrap_iters=100000,
    decontamination_ngrams_path=None,
):
    """Evaluate an already-instantiated model on a dictionary of tasks.

    Builds the requests of every task, runs them through the LM method named
    after the request type, applies each task's filters, computes per-document
    metrics and finally aggregates them (with bootstrap standard errors where
    available). When `lm.world_size > 1` the per-document metric values are
    gathered across ranks through `lm.accelerator` before aggregation.

    :param lm: obj
        Language Model. Must expose `rank`, `world_size` and one method per
        request type (e.g. `loglikelihood`); when `world_size > 1` it must
        also expose `device` and `accelerator`.
    :param task_dict: dict[str, Task]
        Dictionary of tasks. Tasks will be taken to have name
        task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
    :param num_fewshot: int
        Number of examples in few-shot context. NOTE(review): accepted for
        interface compatibility but not read by this function body.
    :param limit: int, optional
        Limit the number of examples per task (only use this for testing)
    :param bootstrap_iters:
        Number of iterations for bootstrap statistics. Capped at 1000 for the
        "bleu", "chrf" and "ter" metrics.
    :param decontamination_ngrams_path: optional
        NOTE(review): accepted for interface compatibility but not read by
        this function body.
    :return
        On rank 0: `{"results": {...}, "versions": {...}}`, where each task's
        results are keyed by `"<metric> - filter=<key>"` (plus a matching
        `"..._stderr"` entry when a stderr function exists). On every other
        rank: None.
    """
    results = collections.defaultdict(dict)
    versions = collections.defaultdict(dict)

    requests = collections.defaultdict(list)
    # Number of dummy requests this rank has to add, per request type, so that
    # all ranks issue the same number of requests. It is accumulated over the
    # tasks: a single scalar overwritten on every task would keep only the
    # last task's value and apply it to every request type.
    padding_requests = collections.defaultdict(int)

    # get lists of each type of request
    for task_name, task in task_dict.items():
        versions[task_name] = task.VERSION

        task.build_all_requests(limit=limit, rank=lm.rank, world_size=lm.world_size)
        # aggregate Instances by LM method requested to get output.
        reqtype = (
            "loglikelihood"
            if task.OUTPUT_TYPE == "multiple_choice"
            else task.OUTPUT_TYPE
        )  # TODO: this is hacky, fix in task.py
        requests[reqtype].extend(task.instances)

        if lm.world_size > 1:
            instances_rnk = torch.tensor(len(task._instances), device=lm.device)
            gathered_item = (
                lm.accelerator.gather(instances_rnk).cpu().detach().numpy().tolist()
            )

            # compute number of pseudobatches to pad with (FSDP/DDP require even batches among ranks)
            padding_requests[reqtype] += max(gathered_item) - gathered_item[lm.rank]

    ### Run LM on inputs, get all outputs ###
    # execute each type of request
    for reqtype, reqs in requests.items():
        print("Running", reqtype, "requests")
        # create `K` copies of each request `req` based off `K = req.repeats`
        cloned_reqs = []
        for req in reqs:
            cloned_reqs.extend([req] * req.repeats)
        num_real = len(cloned_reqs)

        numpad = padding_requests[reqtype] if lm.world_size > 1 else 0
        if numpad > 0:
            if not reqs:
                # There is no request of this type on this rank to reuse as
                # padding; fail loudly instead of relying on a leaked loop
                # variable (NameError or a stale request of another type).
                raise RuntimeError(
                    "Cannot pad '%s' requests on rank %s: this rank has no "
                    "requests of that type to duplicate." % (reqtype, lm.rank)
                )
            pad_req = reqs[-1]
            for _ in range(numpad):
                cloned_reqs.extend([pad_req] * pad_req.repeats)

        # run requests through model
        resps = getattr(lm, reqtype)(cloned_reqs)

        # put responses from model into a list of length K for each request.
        # Responses to the padding requests are dropped: they only exist to
        # keep the ranks in lockstep and must not be attached to a real request.
        for x, req in zip(resps, cloned_reqs[:num_real]):
            req.resps.append(x)

    if lm.world_size > 1:
        lm.accelerator.wait_for_everyone()

    ### Postprocess outputs ###
    # TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
    for task in task_dict.values():
        task.apply_filters()

    ### Collect values of metrics on all datapoints ###
    # TODO: make metric configurable, add metric registry
    vals = collections.defaultdict(list)

    # unpack results and sort back in order and return control to Task
    for task_name, task in task_dict.items():
        # Group the instances by document once per task (sorted by idx) rather
        # than re-filtering the full instance list for every document.
        instances_by_doc = collections.defaultdict(list)
        for inst in task.instances:
            instances_by_doc[inst.doc_id].append(inst)
        for doc_instances in instances_by_doc.values():
            doc_instances.sort(key=lambda inst: inst.idx)

        docs_fn = task.test_docs if task.has_test_docs() else task.validation_docs

        # calculate values for each filter setup (TODO: make getting list of keys cleaner)
        # TODO: make it possible to use a different metric per key
        for key in task.instances[0].filtered_resps.keys():
            # Each rank takes documents rank, rank + world_size, ... below `limit`.
            doc_iterator = itertools.islice(
                enumerate(docs_fn()), lm.rank, limit, lm.world_size
            )
            for doc_id, doc in doc_iterator:
                doc_instances = instances_by_doc.get(doc_id, [])
                metrics = task.process_results(
                    doc, [inst.filtered_resps[key] for inst in doc_instances]
                )
                for metric, value in metrics.items():
                    vals[(task_name, key, metric)].append(value)

    if lm.world_size > 1:
        # if multigpu, then gather data across all ranks
        vals_torch = collections.defaultdict(list)
        for (task_name, key, metric), items in vals.items():
            # Non-zero only when each metric value is a tuple of numbers.
            numitem = len(items[0]) if isinstance(items[0], tuple) else 0

            # distributed gather requires all ranks to have same dimensions
            # so we pad out with float32 min value
            pad_value = torch.finfo(torch.float32).min
            metrics_tensor = torch.tensor(items, device=lm.device)

            original_dtype = metrics_tensor.dtype  # store original dtype
            torch_device_tensor = lm.accelerator.pad_across_processes(
                metrics_tensor.to(torch.float32), pad_index=pad_value
            )
            gathered_item = lm.accelerator.gather(torch_device_tensor)

            # Strip the padding again: whole rows for tuple-valued metrics,
            # single elements otherwise.
            if numitem > 0:
                gathered_filtered = gathered_item[gathered_item[:, 0] != pad_value]
            else:
                gathered_filtered = gathered_item[gathered_item != pad_value]

            gathered_item = (
                gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist()
            )
            # reconvert if we were passed a tuple of values
            if numitem > 0:
                gathered_item = [tuple(g) for g in gathered_item]

            if lm.rank == 0:
                vals_torch[(task_name, key, metric)] = gathered_item

        vals = vals_torch

    if lm.rank != 0:
        return None

    ### Aggregate results over all datapoints ###
    # aggregate results ; run bootstrap CIs
    for (task_name, key, metric), items in vals.items():
        task = task_dict[task_name]
        aggregate = task.aggregation()[metric]
        result_key = metric + " - filter=" + key
        results[task_name][result_key] = aggregate(items)

        # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
        # so we run them less iterations. still looking for a cleaner way to do this
        stderr = lm_eval.api.metrics.stderr_for_metric(
            metric=aggregate,
            bootstrap_iters=min(bootstrap_iters, 1000)
            if metric in ["bleu", "chrf", "ter"]
            else bootstrap_iters,
        )

        if stderr is not None:
            results[task_name][result_key + "_stderr"] = stderr(items)

    return {"results": dict(results), "versions": dict(versions)}