import collections
import itertools
import json
import logging
import random
import sys

import matplotlib.pyplot as plt
import numpy as np
import torch

import lm_eval.api
import lm_eval.api.metrics
import lm_eval.api.registry
import lm_eval.benchmarks
import lm_eval.models
import lm_eval.tasks
from lm_eval.logger import eval_logger
from lm_eval.utils import (
    create_iterator,
    get_git_commit_hash,
    make_table,
    positional_deprecated,
    run_task_tests,
)

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(sys.stdout))


@positional_deprecated
def simple_evaluate(
    model,
    model_args=None,
    tasks=[],
    num_fewshot=None,
    batch_size=None,
    max_batch_size=None,
    device=None,
    use_cache=None,
    limit=None,
    bootstrap_iters=100000,
    check_integrity=False,
    decontamination_ngrams_path=None,
    write_out=False,
    log_samples=True,
):
    """Instantiate and evaluate a model on a list of tasks.

    :param model: Union[str, LM]
        Name of model or LM object, see lm_eval.models.get_model
    :param model_args: Optional[str]
        String arguments for each model class, see LM.create_from_arg_string.
        Ignored if `model` argument is an LM object.
    :param tasks: list[Union[str, Task]]
        List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
    :param num_fewshot: int
        Number of examples in few-shot context
    :param batch_size: int or str, optional
        Batch size for model
    :param max_batch_size: int, optional
        Maximal batch size to try with automatic batch size detection
    :param device: str, optional
        PyTorch device (e.g. "cpu" or "cuda:0") for running models
    :param use_cache: str, optional
        A path to a sqlite db file for caching model responses. `None` if not caching.
    :param limit: int or float, optional
        Limit the number of examples per task (only use this for testing). If <1, limit is interpreted as a fraction of the total number of examples.
    :param bootstrap_iters: int
        Number of iterations for bootstrap statistics
    :param check_integrity: bool
        Whether to run the relevant part of the test suite for the tasks
    :param write_out: bool
        If True, write out an example document and model input for checking task integrity
    :param log_samples: bool
        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
    :return
        Dictionary of results
    """
    random.seed(0)
    np.random.seed(1234)
    torch.manual_seed(
        1234
    )  # TODO: this may affect training runs that are run with evaluation mid-run.

    assert (
        tasks != []
    ), "No tasks specified, or no tasks found. Please verify the task names."

    if isinstance(model, str):
        if model_args is None:
            model_args = ""
        lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
            model_args,
            {
                "batch_size": batch_size,
                "max_batch_size": max_batch_size,
                "device": device,
            },
        )
    else:
        assert isinstance(model, lm_eval.api.model.LM)
        lm = model

    if use_cache is not None:
        print(f"Using cache at {use_cache + '_rank' + str(lm.rank) + '.db'}")
        lm = lm_eval.api.model.CachingLM(
            lm,
            use_cache
            # each rank receives a different cache db.
            # necessary to avoid multiple writes to cache at once
            + "_rank" + str(lm.rank) + ".db",
        )
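        # e.g. use_cache="lm_cache/model" yields "lm_cache/model_rank0.db" on rank 0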

    task_dict = lm_eval.tasks.get_task_dict(tasks)
    for task_name in task_dict.keys():
        task_obj = task_dict[task_name]
        if type(task_obj) == tuple:
            group, task_obj = task_obj

        config = task_obj._config
        if num_fewshot is not None:
            if config["num_fewshot"] > 0:
                default_num_fewshot = config["num_fewshot"]
                eval_logger.warning(
                    f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
                )

            task_obj._config["num_fewshot"] = num_fewshot

    if check_integrity:
        run_task_tests(task_list=tasks)

    results = evaluate(
        lm=lm,
        task_dict=task_dict,
        limit=limit,
        bootstrap_iters=bootstrap_iters,
        decontamination_ngrams_path=decontamination_ngrams_path,
        write_out=write_out,
        log_samples=log_samples,
    )

    if lm.rank == 0:
        # add info about the model and few shot config
        results["config"] = {
            "model": model
            if isinstance(model, str)
            else model.model.config._name_or_path,
            "model_args": model_args,
            "batch_size": batch_size,
            "batch_sizes": list(lm.batch_sizes.values())
            if hasattr(lm, "batch_sizes")
            else [],
            "device": device,
            "use_cache": use_cache,
            "limit": limit,
            "bootstrap_iters": bootstrap_iters,
        }
        results["git_hash"] = get_git_commit_hash()
        return results
    else:
        return None


decontaminate_suffix = "_decontaminate"


@positional_deprecated
def evaluate(
    lm,
    task_dict,
    limit=None,
    bootstrap_iters=100000,
    decontamination_ngrams_path=None,
    write_out=False,
    log_samples=True,
):
    """Evaluate an instantiated language model on a dictionary of tasks.

    :param lm: obj
        Language Model
    :param task_dict: dict[str, Task]
        Dictionary of tasks. Tasks will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
    :param limit: int, optional
        Limit the number of examples per task (only use this for testing)
    :param bootstrap_iters: int
        Number of iterations for bootstrap statistics
    :param write_out: bool
        If True, write out an example document and model input for checking task integrity
    :param log_samples: bool
        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
    :return
        Dictionary of results
    """

    # decontaminate = decontamination_ngrams_path is not None

    # stores the final result for each task, for each metric/filter pair.
    results = collections.defaultdict(dict)
    # Tracks each task's version.
    versions = collections.defaultdict(dict)
    # Tracks the YAML configs of all chosen tasks.
    configs = collections.defaultdict(dict)
    # logs info about each document evaluated.
    samples = collections.defaultdict(list)
    # tracks all Instances/requests a model must generate output on.
    requests = collections.defaultdict(list)
    # Stores task scores based on task grouping.
    aggregate = collections.defaultdict(dict)
    # tracks if a task was chosen via user selecting a group containing it
    task_groups = collections.defaultdict(dict)
    # stores the amount to pad out reqs per req. type so that
    # number of fwd passes per distributed rank is equal
    padding_requests = collections.defaultdict(int)

    # get lists of each type of request
    for task_name, task in task_dict.items():
        if type(task) == tuple:
            group, task = task
            task_groups[task_name] = group

        versions[task_name] = task.VERSION
        configs[task_name] = dict(task.dump_config())

        if limit is not None:
            if task.has_test_docs():
                task_docs = task.test_docs()
            elif task.has_validation_docs():
                task_docs = task.validation_docs()
            else:
                raise RuntimeError("Task has neither test_docs nor validation_docs")
            limit = int(len(task_docs) * limit) if limit < 1.0 else int(limit)
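            # e.g. limit=0.1 on a 1000-doc task keeps 100 docs; limit=50 keeps 50 docs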

        task.build_all_requests(limit=limit, rank=lm.rank, world_size=lm.world_size)

        eval_logger.info(
            f"Task: {task_name}; number of requests on this rank: {len(task.instances)}"
        )

        if write_out:
            for inst in task.instances:
                # print the prompt for the first few documents
                if inst.doc_id < 1:
                    eval_logger.info(
                        f"Task: {task_name}; document {inst.doc_id}; context prompt (starting on next line):\n{inst.args[0]}\n(end of prompt on previous line)"
                    )
                    eval_logger.info(f"Request: {str(inst)}")

        # aggregate Instances by LM method requested to get output.
        reqtype = (
            "loglikelihood"
            if task.OUTPUT_TYPE == "multiple_choice"
            else task.OUTPUT_TYPE
        )  # TODO: this is hacky, fix in task.py
        requests[reqtype].extend(task.instances)
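        # multiple_choice tasks emit one loglikelihood Instance per answer choice,
        # so they are scored with the model's `loglikelihood` method.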

        if lm.world_size > 1:
            instances_rnk = torch.tensor(len(task._instances), device=lm.device)
            gathered_item = (
                lm.accelerator.gather(instances_rnk).cpu().detach().numpy().tolist()
            )

            # compute number of pseudobatches to pad with (FSDP/DDP require even batches among ranks)
            numpad = max(gathered_item) - gathered_item[lm.rank]
            padding_requests[task.OUTPUT_TYPE] += numpad
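            # e.g. if ranks hold [13, 12, 12] requests, ranks 1 and 2 each pad by 1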

    ### Run LM on inputs, get all outputs ###
    # execute each type of request
    for reqtype, reqs in requests.items():
        eval_logger.info("Running {} requests".format(reqtype))
        # create `K` copies of each request `req` based off `K = req.repeats`
        cloned_reqs = []
        for req in reqs:
            cloned_reqs.extend([req] * req.repeats)

        if (lm.world_size > 1) and (padding_requests[reqtype] > 0):
            for _ in range(padding_requests[reqtype]):
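                # pad by repeating the final request so every rank runs the same
                # number of forward passes (keeps FSDP/DDP ranks in sync)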
                cloned_reqs.extend([req] * req.repeats)

        # run requests through model
        resps = getattr(lm, reqtype)(cloned_reqs)

        # put responses from model into a list of length K for each request.
        for x, req in zip(resps, cloned_reqs):
            req.resps.append(x)

        if lm.world_size > 1:
            lm.accelerator.wait_for_everyone()

    ### Postprocess outputs ###
    # TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
    for task_name, task in task_dict.items():
        if type(task) == tuple:
            group, task = task
        task.apply_filters()

    ### Collect values of metrics on all datapoints ###
    vals = collections.defaultdict(list)
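    # keyed by (task_name, filter_key, metric_name) -> list of per-document values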

    # unpack results and sort back in order and return control to Task
    for task_name, task in task_dict.items():
        if type(task) == tuple:
            group, task = task
        # TODO: make it possible to use a different metric per filter
        # iterate over different filters used
        for key in task.instances[0].filtered_resps.keys():
            doc_iterator = (
                itertools.islice(
                    enumerate(task.test_docs()), lm.rank, limit, lm.world_size
                )
                if task.has_test_docs()
                else itertools.islice(
                    enumerate(task.validation_docs()), lm.rank, limit, lm.world_size
                )
            )
            for doc_id, doc in doc_iterator:
                # subset instances to only this document id ; sort by idx
                requests = list(filter(lambda x: x.doc_id == doc_id, task.instances))
                requests.sort(key=lambda x: x.idx)
                metrics = task.process_results(
                    doc, [req.filtered_resps[key] for req in requests]
                )
                if log_samples:
                    target = task.doc_to_target(doc)
                    example = {
                        "doc_id": doc_id,
                        "doc": doc,
                        "target": target,
                        "arguments": [req.args for req in requests],
                        "resps": [req.resps for req in requests],
                        "filtered_resps": [req.filtered_resps[key] for req in requests],
                    }
                    example.update(metrics)
                    samples[task_name].append(example)
                for metric, value in metrics.items():
                    vals[(task_name, key, metric)].append(value)

    # Plot a calibration curve for the last task processed: bin the (probability, outcome)
    # pairs in `task.calibrations` into 10 buckets and plot mean predicted probability vs.
    # mean observed frequency against the y = x diagonal, saving "<model_name>-long.png".
    calibs = sorted(task.calibrations, key=lambda x: x[0])

    def bin_list_into_subsets(input_list, num_subsets=10):
        subset_size = len(input_list) // num_subsets
        remainder = len(input_list) % num_subsets
        subsets = []
        start = 0
        for _ in range(num_subsets):
            subset_end = start + subset_size + (1 if remainder > 0 else 0)
            subsets.append(input_list[start:subset_end])
            start = subset_end
            remainder -= 1
        return subsets

    subsets = bin_list_into_subsets(calibs, 10)
    x_coords = [np.mean([x[0] for x in subset]) for subset in subsets]
    y_coords = [np.mean([x[1] for x in subset]) for subset in subsets]
    model_name = lm.config._name_or_path.split("/")[1]
    plt.plot(x_coords, y_coords, label=model_name)
    plt.plot([0, 1], [0, 1], linestyle="--", color="black")
    plt.xlabel("Probabilities")
    plt.ylabel("Frequencies")
    plt.title("Calibration")
    plt.legend()
    plt.savefig(f"{model_name}-long.png")

    if lm.world_size > 1:
        # if multigpu, then gather data across all ranks
        # first gather logged samples across all ranks
        for task_name, task_samples in list(samples.items()):
            full_samples = [None] * lm.world_size
            torch.distributed.all_gather_object(full_samples, task_samples)

            samples[task_name] = list(itertools.chain.from_iterable(full_samples))

        # then collect metrics across all ranks
        vals_torch = collections.defaultdict(list)
        for (task_name, key, metric), items in vals.items():
            numitem = 0
            if type(items[0]) == tuple:
                numitem = len(items[0])

            # distributed gather requires all ranks to have same dimensions
            # so we pad out with float32 min value
            pad_value = torch.finfo(torch.float32).min
            metrics_tensor = torch.tensor(items, device=lm.device)

            original_dtype = metrics_tensor.dtype  # store original dtype
            torch_device_tensor = lm.accelerator.pad_across_processes(
                metrics_tensor.to(torch.float32), pad_index=pad_value
            )
            gathered_item = lm.accelerator.gather(torch_device_tensor)

            if numitem > 0:
                gathered_filtered = gathered_item[gathered_item[:, 0] != pad_value]
            else:
                gathered_filtered = gathered_item[gathered_item != pad_value]

            gathered_item = (
                gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist()
            )
            # reconvert if we were passed a tuple of values
            if numitem > 0:
                gathered_item = [tuple(g) for g in gathered_item]

            if lm.rank == 0:
                vals_torch[(task_name, key, metric)] = gathered_item

        vals = vals_torch

    if lm.rank == 0:
        ### Aggregate results over all datapoints ###
        # aggregate results ; run bootstrap CIs
        for (task_name, key, metric), items in vals.items():
            task = task_dict[task_name]
            if type(task) == tuple:
                group, task = task
            task_score = task.aggregation()[metric](items)
            results[task_name][metric + "," + key] = task_score

            # Need to put back in results
            # pythia | acc
            #        | perplexity
            #        | word_perplexity
            #        | byte_perplexity
            #        | bits_per_byte
            if bool(task_groups):
                group_name = task_groups[task_name]
                if metric not in aggregate[group_name]:
                    aggregate[group_name][metric] = [task_score]
                else:
                    aggregate[group_name][metric].append(task_score)

            # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap,
            # so we run them for fewer iterations. still looking for a cleaner way to do this
            if bootstrap_iters > 0:
                stderr = lm_eval.api.metrics.stderr_for_metric(
                    metric=task.aggregation()[metric],
                    bootstrap_iters=min(bootstrap_iters, 1000)
                    if metric in ["bleu", "chrf", "ter"]
                    else bootstrap_iters,
                )

                if stderr is not None:
                    results[task_name][metric + "_stderr" + "," + key] = stderr(items)

        if bool(aggregate):
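            # a group's score is the unweighted mean of its member tasks' scores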
            for group in aggregate.keys():
                for metric in aggregate[group].keys():
                    aggregate[group][metric] = np.average(aggregate[group][metric])
                    versions[group] = "N/A"

        results_dict = {
            "results": dict(sorted(results.items())),
            **(
                {"aggregate": dict(sorted(aggregate.items()))}
                if bool(aggregate)
                else {}
            ),
            "configs": dict(sorted(configs.items())),
            "versions": dict(sorted(versions.items())),
        }
        if log_samples:
            results_dict["samples"] = dict(samples)

        return results_dict

    else:
        return None