evaluator.py 12.4 KB
Newer Older
lintangsutawika's avatar
lintangsutawika committed
1
import random
Leo Gao's avatar
Leo Gao committed
2
import itertools
FarzanehNakhaee's avatar
FarzanehNakhaee committed
3
import json
lintangsutawika's avatar
lintangsutawika committed
4
import collections
FarzanehNakhaee's avatar
FarzanehNakhaee committed
5
6
import logging
import sys
lintangsutawika's avatar
lintangsutawika committed
7

8
9
import torch

10
import numpy as np
lintangsutawika's avatar
lintangsutawika committed
11
12

import lm_eval.api
13
import lm_eval.tasks
lintangsutawika's avatar
lintangsutawika committed
14
import lm_eval.models
lintangsutawika's avatar
lintangsutawika committed
15
import lm_eval.api.metrics
lintangsutawika's avatar
lintangsutawika committed
16
import lm_eval.api.registry
lintangsutawika's avatar
lintangsutawika committed
17

lintangsutawika's avatar
lintangsutawika committed
18
19
20
21
from lm_eval.utils import (
    positional_deprecated,
    run_task_tests,
    make_table,
22
    create_iterator,
lintangsutawika's avatar
lintangsutawika committed
23
24
    get_git_commit_hash,
)
25

lintangsutawika's avatar
lintangsutawika committed
26
27
from lm_eval.logger import eval_logger

FarzanehNakhaee's avatar
FarzanehNakhaee committed
28
29
30
31
# Module-level logger; a dedicated handler sends its records to standard output.
_stdout_handler = logging.StreamHandler(stream=sys.stdout)
logger = logging.getLogger(__name__)
logger.addHandler(_stdout_handler)
logger.setLevel(logging.INFO)

Fabrizio Milo's avatar
Fabrizio Milo committed
32

33
@positional_deprecated
def simple_evaluate(
    model,
    model_args=None,
    tasks=None,
    num_fewshot=0,
    batch_size=None,
    max_batch_size=None,
    device=None,
    use_cache=None,
    limit=None,
    bootstrap_iters=100000,
    check_integrity=False,
    decontamination_ngrams_path=None,
    write_out=False,
    output_base_path=None,
):
    """Instantiate and evaluate a model on a list of tasks.

    :param model: Union[str, LM]
        Name of model or LM object, see lm_eval.models.get_model
    :param model_args: Optional[str]
        String arguments for each model class, see LM.create_from_arg_string.
        Ignored if `model` argument is a LM object.
    :param tasks: list[Union[str, Task]]
        List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
        Must not be empty; omitting it is treated the same as passing an empty list.
    :param num_fewshot: int
        Number of examples in few-shot context
    :param batch_size: int or str, optional
        Batch size for model
    :param max_batch_size: int, optional
        Maximal batch size to try with automatic batch size detection
    :param device: str, optional
        PyTorch device (e.g. "cpu" or "cuda:0") for running models
    :param use_cache: str, optional
        A path to a sqlite db file for caching model responses. `None` if not caching.
        The suffix "_rank<rank>.db" is appended so every rank gets its own file.
    :param limit: int or float, optional
        Limit the number of examples per task (only use this for testing), If <1, limit is a percentage of the total number of examples.
    :param bootstrap_iters:
        Number of iterations for bootstrap statistics
    :param check_integrity: bool
        Whether to run the relevant part of the test suite for the tasks
    :param decontamination_ngrams_path:
        Forwarded unchanged to `evaluate`.
    :param write_out: bool
        If True, write details about prompts and logits to json for all tasks
    :param output_base_path: str, optional
        Directory to which detailed eval info will be written. Defaults to present working dir.
    :return
        Dictionary of results on rank 0 (with added "config" and "git_hash"
        entries); `None` on every other rank.
    """
    random.seed(1234)
    np.random.seed(1234)

    # `None` replaces the former mutable `[]` default; both mean "nothing given".
    if tasks is None:
        tasks = []
    assert tasks != [], "No tasks specified"

    if isinstance(model, str):
        if model_args is None:
            model_args = ""
        lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
            model_args,
            {
                "batch_size": batch_size,
                "max_batch_size": max_batch_size,
                "device": device,
            },
        )
    else:
        assert isinstance(model, lm_eval.api.model.LM)
        lm = model

    if use_cache is not None:
        # each rank receives a different cache db.
        # necessary to avoid multiple writes to cache at once
        cache_path = f"{use_cache}_rank{lm.rank}.db"
        eval_logger.info(f"Using cache at {cache_path}")
        lm = lm_eval.api.model.CachingLM(lm, cache_path)

    task_dict = lm_eval.tasks.get_task_dict(tasks, num_fewshot=num_fewshot)

    if check_integrity:
        run_task_tests(task_list=tasks)

    results = evaluate(
        lm=lm,
        task_dict=task_dict,
        limit=limit,
        bootstrap_iters=bootstrap_iters,
        decontamination_ngrams_path=decontamination_ngrams_path,
        write_out=write_out,
        output_base_path=output_base_path,
    )

    # Only rank 0 receives aggregated results from `evaluate`.
    if lm.rank != 0:
        return None

    if isinstance(model, str):
        model_name = model
    else:
        # Not every LM object wraps a HF-style `.model.config`; fall back to
        # the class name instead of losing finished results to AttributeError.
        try:
            model_name = model.model.config._name_or_path
        except AttributeError:
            model_name = type(model).__name__

    # add info about the model and few shot config
    results["config"] = {
        "model": model_name,
        "model_args": model_args,
        "num_fewshot": num_fewshot,
        "batch_size": batch_size,
        "batch_sizes": (
            list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else []
        ),
        "device": device,
        "use_cache": use_cache,
        "limit": limit,
        "bootstrap_iters": bootstrap_iters,
    }
    results["git_hash"] = get_git_commit_hash()
    return results
148

Leo Gao's avatar
Leo Gao committed
149

150
# Suffix string for decontamination-related names. It is not referenced in the
# visible part of this module — presumably consumed elsewhere; verify before removing.
decontaminate_suffix = "_decontaminate"
Leo Gao's avatar
Leo Gao committed
151

Fabrizio Milo's avatar
Fabrizio Milo committed
152

153
@positional_deprecated
def evaluate(
    lm,
    task_dict,
    limit=None,
    bootstrap_iters=100000,
    decontamination_ngrams_path=None,
    write_out=False,
    output_base_path=None,
):
    """Run an already-instantiated model on a dictionary of tasks.

    :param lm: obj
        Language Model
    :param task_dict: dict[str, Task]
        Dictionary of tasks. Tasks will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
    :param limit: int or float, optional
        Limit the number of examples per task (only use this for testing).
        A value below 1 is a fraction of each task's own document count.
    :param bootstrap_iters:
        Number of iterations for bootstrap statistics
    :param decontamination_ngrams_path:
        Accepted for interface compatibility; not used by this function.
    :param write_out: bool
        If True, write all prompts, logits and metrics to json for offline analysis.
        Accepted for interface compatibility; not used by this function.
    :param output_base_path: str, optional
        Directory to which detailed eval info will be written. Defaults to present working dir.
        Accepted for interface compatibility; not used by this function.
    :return
        On rank 0, a dictionary with "results", "configs", "versions" and
        "samples"; `None` on every other rank.
    """

    # decontaminate = decontamination_ngrams_path is not None

    results = collections.defaultdict(dict)
    versions = collections.defaultdict(dict)
    configs = collections.defaultdict(dict)
    samples = collections.defaultdict(list)
    requests = collections.defaultdict(list)

    # Resolved document limit per task. Kept separate from `limit` so that a
    # fractional limit is applied to every task's own size, not only the first.
    task_limits = {}
    # Number of padding requests this rank must add, per request type.
    padding_requests = collections.defaultdict(int)

    # get lists of each type of request
    for task_name, task in task_dict.items():
        versions[task_name] = task.VERSION
        configs[task_name] = dict(task.dump_config())

        task_limit = limit
        if limit is not None:
            if task.has_test_docs():
                task_docs = task.test_docs()
            elif task.has_validation_docs():
                task_docs = task.validation_docs()
            else:
                raise RuntimeError("Task has neither test_docs nor validation_docs")
            task_limit = int(len(task_docs) * limit) if limit < 1.0 else int(limit)
        task_limits[task_name] = task_limit

        task.build_all_requests(
            limit=task_limit, rank=lm.rank, world_size=lm.world_size
        )

        # aggregate Instances by LM method requested to get output.
        reqtype = (
            "loglikelihood"
            if task.OUTPUT_TYPE == "multiple_choice"
            else task.OUTPUT_TYPE
        )  # TODO: this is hacky, fix in task.py
        requests[reqtype].extend(task.instances)

        if lm.world_size > 1:
            instances_rnk = torch.tensor(len(task._instances), device=lm.device)
            gathered_item = (
                lm.accelerator.gather(instances_rnk).cpu().detach().numpy().tolist()
            )

            # compute number of pseudobatches to pad with (FSDP/DDP require even batches among ranks)
            # and accumulate it for the request type this task contributes to.
            padding_requests[reqtype] += max(gathered_item) - gathered_item[lm.rank]

    ### Run LM on inputs, get all outputs ###
    # execute each type of request
    for reqtype, reqs in requests.items():
        eval_logger.info("Running {} requests".format(reqtype))
        # create `K` copies of each request `req` based off `K = req.repeats`
        cloned_reqs = []
        for req in reqs:
            cloned_reqs.extend([req] * req.repeats)

        numpad = padding_requests[reqtype]
        if (lm.world_size > 1) and (numpad > 0):
            # Pad with copies of the last real request of this type.
            # NOTE(review): responses for these padding copies are appended to
            # that request's `resps` below, as before — confirm this is intended.
            pad_req = reqs[-1]
            for _ in range(numpad):
                cloned_reqs.extend([pad_req] * pad_req.repeats)

        # run requests through model
        resps = getattr(lm, reqtype)(cloned_reqs)

        # put responses from model into a list of length K for each request.
        for x, req in zip(resps, cloned_reqs):
            req.resps.append(x)

    if lm.world_size > 1:
        lm.accelerator.wait_for_everyone()

    ### Postprocess outputs ###
    # TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
    for task_name, task in task_dict.items():
        task.apply_filters()

    ### Collect values of metrics on all datapoints ###
    vals = collections.defaultdict(list)

    # unpack results and sort back in order and return control to Task
    for task_name, task in task_dict.items():
        task_limit = task_limits[task_name]

        # Group this task's instances by document once (sorted by idx) instead
        # of re-scanning every instance for every document.
        instances_by_doc = {}
        for instance in task.instances:
            instances_by_doc.setdefault(instance.doc_id, []).append(instance)
        for doc_instances in instances_by_doc.values():
            doc_instances.sort(key=lambda x: x.idx)

        # TODO: make it possible to use a different metric per filter
        # iterate over different filters used
        for key in task.instances[0].filtered_resps.keys():
            docs = task.test_docs() if task.has_test_docs() else task.validation_docs()
            # Each rank visits documents rank, rank + world_size, ... below the limit.
            doc_iterator = itertools.islice(
                enumerate(docs), lm.rank, task_limit, lm.world_size
            )

            for doc_id, doc in doc_iterator:
                doc_requests = instances_by_doc.get(doc_id, [])
                filtered_resps = [req.filtered_resps[key] for req in doc_requests]
                metrics = task.process_results(doc, filtered_resps)
                target = task.doc_to_target(doc)
                example = {
                    "doc_id": doc_id,
                    "doc": doc,
                    "target": target,
                    # arguments of this document's own requests (previously a
                    # stale request left over from the model-execution loop)
                    "arguments": [req.args for req in doc_requests],
                    "resps": [req.resps for req in doc_requests],
                    "filtered_resps": filtered_resps,
                }
                example.update(metrics)
                samples[task_name].append(example)
                for metric, value in metrics.items():
                    vals[(task_name, key, metric)].append(value)

    if lm.world_size > 1:
        # if multigpu, then gather data across all ranks
        vals_torch = collections.defaultdict(list)
        # distributed gather requires all ranks to have same dimensions
        # so we pad out with float32 min value
        pad_value = torch.finfo(torch.float32).min

        for (task_name, key, metric), items in vals.items():
            # numitem > 0 marks metrics whose per-document value is a tuple
            numitem = 0
            if isinstance(items[0], tuple):
                numitem = len(items[0])

            metrics_tensor = torch.tensor(items, device=lm.device)

            original_dtype = metrics_tensor.dtype  # store original dtype
            torch_device_tensor = lm.accelerator.pad_across_processes(
                metrics_tensor.to(torch.float32), pad_index=pad_value
            )
            gathered_item = lm.accelerator.gather(torch_device_tensor)

            # drop the padding entries again
            if numitem > 0:
                gathered_filtered = gathered_item[gathered_item[:, 0] != pad_value]
            else:
                gathered_filtered = gathered_item[gathered_item != pad_value]

            gathered_item = (
                gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist()
            )
            # reconvert if we were passed a tuple of values
            if numitem > 0:
                gathered_item = [tuple(g) for g in gathered_item]

            if lm.rank == 0:
                vals_torch[(task_name, key, metric)] = gathered_item

        vals = vals_torch

    if lm.rank != 0:
        return None

    ### Aggregate results over all datapoints ###
    # aggregate results ; run bootstrap CIs
    for (task_name, key, metric), items in vals.items():
        task = task_dict[task_name]
        aggregate = task.aggregation()[metric]
        results[task_name][metric + "," + key] = aggregate(items)

        # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
        # so we run them less iterations. still looking for a cleaner way to do this
        stderr = lm_eval.api.metrics.stderr_for_metric(
            metric=aggregate,
            bootstrap_iters=min(bootstrap_iters, 1000)
            if metric in ["bleu", "chrf", "ter"]
            else bootstrap_iters,
        )

        if stderr is not None:
            results[task_name][metric + "_stderr" + "," + key] = stderr(items)

    return {
        "results": dict(results),
        "configs": dict(configs),
        "versions": dict(versions),
        "samples": samples,
    }