import random
import itertools
import json
import collections

import logging
import sys

import torch

import numpy as np

import lm_eval.api
import lm_eval.tasks
import lm_eval.benchmarks
import lm_eval.models
import lm_eval.api.metrics
import lm_eval.api.model
import lm_eval.api.registry

from lm_eval.utils import (
    positional_deprecated,
    run_task_tests,
    make_table,
    create_iterator,
    get_git_commit_hash,
)

from lm_eval.logger import eval_logger

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(sys.stdout))


@positional_deprecated
def simple_evaluate(
    model,
    model_args=None,
    tasks=[],
    num_fewshot=None,
    batch_size=None,
    max_batch_size=None,
    device=None,
    use_cache=None,
    limit=None,
    bootstrap_iters=100000,
    check_integrity=False,
    decontamination_ngrams_path=None,
    write_out=False,
    log_samples=True,
):
    """Instantiate and evaluate a model on a list of tasks.

    :param model: Union[str, LM]
        Name of model or LM object, see lm_eval.models.get_model
    :param model_args: Optional[str]
        String arguments for each model class, see LM.create_from_arg_string.
        Ignored if `model` argument is a LM object.
    :param tasks: list[Union[str, Task]]
        List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
    :param num_fewshot: int
        Number of examples in few-shot context
    :param batch_size: int or str, optional
        Batch size for model
    :param max_batch_size: int, optional
        Maximal batch size to try with automatic batch size detection
    :param device: str, optional
        PyTorch device (e.g. "cpu" or "cuda:0") for running models
    :param use_cache: str, optional
        A path to a sqlite db file for caching model responses. `None` if not caching.
    :param limit: int or float, optional
        Limit the number of examples per task (only use this for testing). If <1, limit is a percentage of the total number of examples.
    :param bootstrap_iters:
        Number of iterations for bootstrap statistics
    :param check_integrity: bool
        Whether to run the relevant part of the test suite for the tasks
    :param write_out: bool
        If True, write out an example document and model input for checking task integrity
    :param log_samples: bool
        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
    :return:
        Dictionary of results
    """
    random.seed(0)
    np.random.seed(1234)
    torch.manual_seed(
        1234
    )  # TODO: this may affect training runs that are run with evaluation mid-run.

    assert (
        tasks != []
    ), "No tasks specified, or no tasks found. Please verify the task names."

    if isinstance(model, str):
        if model_args is None:
            model_args = ""
        lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
            model_args,
            {
                "batch_size": batch_size,
                "max_batch_size": max_batch_size,
                "device": device,
            },
        )
    else:
        assert isinstance(model, lm_eval.api.model.LM)
        lm = model

    if use_cache is not None:
        print(f"Using cache at {use_cache + '_rank' + str(lm.rank) + '.db'}")
        lm = lm_eval.api.model.CachingLM(
            lm,
            use_cache
            # each rank receives a different cache db.
            # necessary to avoid multiple writes to cache at once
            + "_rank" + str(lm.rank) + ".db",
        )

    task_dict = lm_eval.tasks.get_task_dict(tasks)
    for task_name in task_dict.keys():
        task_obj = task_dict[task_name]
        if isinstance(task_obj, tuple):
            group, task_obj = task_obj

        config = task_obj._config
        if num_fewshot is not None:
            if config["num_fewshot"] > 0:
                default_num_fewshot = config["num_fewshot"]
                eval_logger.warning(
                    f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
                )
            task_obj._config["num_fewshot"] = num_fewshot

    if check_integrity:
        run_task_tests(task_list=tasks)

    results = evaluate(
        lm=lm,
        task_dict=task_dict,
        limit=limit,
        bootstrap_iters=bootstrap_iters,
        decontamination_ngrams_path=decontamination_ngrams_path,
        write_out=write_out,
        log_samples=log_samples,
    )

    if lm.rank == 0:
        # add info about the model and few shot config
        results["config"] = {
            "model": model
            if isinstance(model, str)
            else model.model.config._name_or_path,
            "model_args": model_args,
            "batch_size": batch_size,
            "batch_sizes": list(lm.batch_sizes.values())
            if hasattr(lm, "batch_sizes")
            else [],
            "device": device,
            "use_cache": use_cache,
            "limit": limit,
            "bootstrap_iters": bootstrap_iters,
        }
        results["git_hash"] = get_git_commit_hash()
        return results
    else:
        return None
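
# Example usage (an illustrative sketch, not executed by this module; the model name,
# model_args and task list below are placeholders and must correspond to entries that
# actually exist in the model/task registries):
#
#   results = simple_evaluate(
#       model="hf",  # assumed registry key for a HuggingFace-style model wrapper
#       model_args="pretrained=EleutherAI/pythia-160m",
#       tasks=["lambada_openai"],
#       num_fewshot=0,
#       batch_size=8,
#       device="cuda:0",
#   )
#   if results is not None:  # non-zero ranks return None
#       print(make_table(results))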


decontaminate_suffix = "_decontaminate"


@positional_deprecated
def evaluate(
    lm,
    task_dict,
    limit=None,
    bootstrap_iters=100000,
    decontamination_ngrams_path=None,
    write_out=False,
    log_samples=True,
):
    """Evaluate an instantiated language model on a list of tasks.

    :param lm: obj
        Language Model
    :param task_dict: dict[str, Task]
        Dictionary of tasks. Tasks will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
    :param limit: int, optional
        Limit the number of examples per task (only use this for testing)
    :param bootstrap_iters:
        Number of iterations for bootstrap statistics
    :param write_out: bool
        If True, write out an example document and model input for checking task integrity
    :param log_samples: bool
        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
    :return:
        Dictionary of results
    """

    # decontaminate = decontamination_ngrams_path is not None

    # stores the final result for each task, for each metric/filter pair.
    results = collections.defaultdict(dict)
    # Tracks each task's version.
    versions = collections.defaultdict(dict)
    # Tracks the YAML configs of all chosen tasks.
    configs = collections.defaultdict(dict)
    # logs info about each document evaluated.
    samples = collections.defaultdict(list)
    # tracks all Instances/requests a model must generate output on.
    requests = collections.defaultdict(list)
    # Stores per-task scores, keyed by task grouping, for group-level aggregation.
    aggregate = collections.defaultdict(dict)
    # Maps a task to the group it was selected through (if the user chose a group containing it).
    task_groups = collections.defaultdict(dict)
    # stores the amount to pad out reqs per req. type so that
    # number of fwd passes per distributed rank is equal
    padding_requests = collections.defaultdict(int)

    # get lists of each type of request
    for task_name, task in task_dict.items():
        if isinstance(task, tuple):
            group, task = task
            task_groups[task_name] = group
            aggregate[group] = {}

        versions[task_name] = task.VERSION
        configs[task_name] = dict(task.dump_config())

        if limit is not None:
            if task.has_test_docs():
                task_docs = task.test_docs()
            elif task.has_validation_docs():
                task_docs = task.validation_docs()
            else:
                raise RuntimeError("Task has neither test_docs nor validation_docs")
            limit = int(len(task_docs) * limit) if limit < 1.0 else int(limit)

        task.build_all_requests(limit=limit, rank=lm.rank, world_size=lm.world_size)

        eval_logger.info(
            f"Task: {task_name}; number of requests on this rank: {len(task.instances)}"
        )

        if write_out:
            for inst in task.instances:
                # print the prompt for the first few documents
                if inst.doc_id < 1:
                    eval_logger.info(
                        f"Task: {task_name}; document {inst.doc_id}; context prompt (starting on next line):\n{inst.args[0]}\n(end of prompt on previous line)"
                    )
                    eval_logger.info(f"Request: {str(inst)}")

        # aggregate Instances by LM method requested to get output.
        reqtype = (
            "loglikelihood"
            if task.OUTPUT_TYPE == "multiple_choice"
            else task.OUTPUT_TYPE
        )  # TODO: this is hacky, fix in task.py
        requests[reqtype].extend(task.instances)
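        # NOTE: "multiple_choice" tasks are dispatched to the LM's `loglikelihood`
        # method above (typically one Instance per answer choice); the per-choice
        # scores are assumed to be recombined into final metrics by
        # task.process_results() further down.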

        if lm.world_size > 1:
            instances_rnk = torch.tensor(len(task._instances), device=lm.device)
            gathered_item = (
                lm.accelerator.gather(instances_rnk).cpu().detach().numpy().tolist()
            )

            # compute number of pseudobatches to pad with (FSDP/DDP require even batches among ranks)
            numpad = max(gathered_item) - gathered_item[lm.rank]
            padding_requests[task.OUTPUT_TYPE] += numpad
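            # Worked example (assumed numbers): with two ranks holding [10, 8]
            # requests, max(gathered_item) == 10, so rank 1 gets numpad == 2 and will
            # later append two extra copies of its final request as pure padding,
            # keeping the number of forward passes identical across ranks for FSDP/DDP.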

    ### Run LM on inputs, get all outputs ###
    # execute each type of request
    for reqtype, reqs in requests.items():
        eval_logger.info("Running {} requests".format(reqtype))

        # create `K` copies of each request `req` based off `K = req.repeats`
        cloned_reqs = []
        for req in reqs:
            cloned_reqs.extend([req] * req.repeats)

        if (lm.world_size > 1) and (padding_requests[reqtype] > 0):
            for _ in range(padding_requests[reqtype]):
                cloned_reqs.extend([req] * req.repeats)

        # run requests through model
        resps = getattr(lm, reqtype)(cloned_reqs)

        # put responses from model into a list of length K for each request.
        for x, req in zip(resps, cloned_reqs):
            req.resps.append(x)
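        # At this point each Instance in `reqs` carries `repeats` raw model responses
        # in req.resps; task.apply_filters() below is expected to post-process these
        # into req.filtered_resps, which is what the metric computation consumes.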

        if lm.world_size > 1:
            lm.accelerator.wait_for_everyone()

    ### Postprocess outputs ###
    # TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
    for task_name, task in task_dict.items():
        if isinstance(task, tuple):
            group, task = task

        task.apply_filters()

    ### Collect values of metrics on all datapoints ###
    vals = collections.defaultdict(list)

    # unpack results and sort back in order and return control to Task
    for task_name, task in task_dict.items():
        if isinstance(task, tuple):
            group, task = task

        # TODO: make it possible to use a different metric per filter
        # iterate over different filters used
        for key in task.instances[0].filtered_resps.keys():
            doc_iterator = (
                itertools.islice(
                    enumerate(task.test_docs()), lm.rank, limit, lm.world_size
                )
                if task.has_test_docs()
                else itertools.islice(
                    enumerate(task.validation_docs()), lm.rank, limit, lm.world_size
                )
            )
            for doc_id, doc in doc_iterator:
                # subset instances to only this document id ; sort by idx
                requests = list(filter(lambda x: x.doc_id == doc_id, task.instances))
                requests.sort(key=lambda x: x.idx)
                metrics = task.process_results(
                    doc, [req.filtered_resps[key] for req in requests]
                )
                if log_samples:
                    target = task.doc_to_target(doc)
                    example = {
                        "doc_id": doc_id,
                        "doc": doc,
                        "target": target,
                        "arguments": [req.args for req in requests],
                        "resps": [req.resps for req in requests],
                        "filtered_resps": [req.filtered_resps[key] for req in requests],
                    }
                    example.update(metrics)
                    samples[task_name].append(example)
                for metric, value in metrics.items():
                    vals[(task_name, key, metric)].append(value)
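    # `vals` now maps (task_name, filter_key, metric_name) -> a list of per-document
    # metric values computed on this rank; everything below is gathering/aggregation.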

    if lm.world_size > 1:
        # if multigpu, then gather data across all ranks
        # first gather logged samples across all ranks
        for task_name, task_samples in list(samples.items()):
            full_samples = [None] * lm.world_size
            torch.distributed.all_gather_object(full_samples, task_samples)

            samples[task_name] = list(itertools.chain.from_iterable(full_samples))

        # then collect metrics across all ranks
        vals_torch = collections.defaultdict(list)
        for (task_name, key, metric), items in vals.items():
            numitem = 0
            if isinstance(items[0], tuple):
                numitem = len(items[0])

            # distributed gather requires all ranks to have same dimensions
            # so we pad out with float32 min value
            pad_value = torch.finfo(torch.float32).min
            metrics_tensor = torch.tensor(items, device=lm.device)

            original_dtype = metrics_tensor.dtype  # store original dtype
            torch_device_tensor = lm.accelerator.pad_across_processes(
                metrics_tensor.to(torch.float32), pad_index=pad_value
            )
            gathered_item = lm.accelerator.gather(torch_device_tensor)

            if numitem > 0:
                gathered_filtered = gathered_item[gathered_item[:, 0] != pad_value]
            else:
                gathered_filtered = gathered_item[gathered_item != pad_value]

            gathered_item = (
                gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist()
            )
            # reconvert if we were passed a tuple of values
            if numitem > 0:
                gathered_item = [tuple(g) for g in gathered_item]

            if lm.rank == 0:
                vals_torch[(task_name, key, metric)] = gathered_item

        vals = vals_torch

    if lm.rank == 0:
        ### Aggregate results over all datapoints ###
        # aggregate results ; run bootstrap CIs
        for (task_name, key, metric), items in vals.items():
            task = task_dict[task_name]
            if isinstance(task, tuple):
                group, task = task

            task_score = task.aggregation()[metric](items)
            results[task_name][metric + "," + key] = task_score
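            # Result keys follow the "<metric>,<filter>" convention, e.g. an accuracy
            # score under a filter named "none" would be stored as
            # results[task_name]["acc,none"] (the concrete filter names depend on the
            # task configs, so these are examples rather than guarantees).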

            # Need to put back in results
            # pythia | acc
            #        | perplexity
            #        | word_perplexity
            #        | byte_perplexity
            #        | bits_per_byte
            if task_name in task_groups:
                group_name = task_groups[task_name]
                if metric in aggregate[group_name]:
                    aggregate[group_name][metric].append(task_score)
                else:
                    aggregate[group_name][metric] = [task_score]

            # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
            # so we run them for fewer iterations. still looking for a cleaner way to do this
            if bootstrap_iters > 0:
                stderr = lm_eval.api.metrics.stderr_for_metric(
                    metric=task.aggregation()[metric],
                    bootstrap_iters=min(bootstrap_iters, 1000)
                    if metric in ["bleu", "chrf", "ter"]
                    else bootstrap_iters,
                )

                if stderr is not None:
                    results[task_name][metric + "_stderr" + "," + key] = stderr(items)

        if bool(aggregate):
            for group in aggregate.keys():
                for metric in aggregate[group].keys():
                    aggregate[group][metric] = np.average(aggregate[group][metric])
                    versions[group] = "N/A"

        results_dict = {
            "results": dict(sorted(results.items())),
            **(
                {"aggregate": dict(sorted(aggregate.items()))}
                if bool(aggregate)
                else {}
            ),
            "configs": dict(sorted(configs.items())),
            "versions": dict(sorted(versions.items())),
        }
        if log_samples:
            results_dict["samples"] = dict(samples)

        return results_dict

    else:
        return None
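
# For reference, the dictionary returned by `evaluate` on rank 0 (and, with "config"
# and "git_hash" added, by `simple_evaluate`) has roughly this shape; the nested keys
# shown are examples rather than guarantees:
#
#   {
#       "results": {"<task>": {"<metric>,<filter>": value,
#                              "<metric>_stderr,<filter>": value}},
#       "aggregate": {"<group>": {"<metric>": value}},  # only if tasks were selected via groups
#       "configs": {"<task>": {...}},
#       "versions": {"<task>": version, "<group>": "N/A"},
#       "samples": {"<task>": [...]},  # only if log_samples=True
#   }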