"official/projects/basnet/evaluation/metrics.py" did not exist on "2b676a9b106ffdd305b63bd27b7f4d0b74036eb2"
import collections
import itertools
import json
import logging
import random
import sys

import numpy as np
import torch
from accelerate.utils.operations import _gpu_gather

import lm_eval.api
import lm_eval.api.metrics
import lm_eval.api.registry
import lm_eval.benchmarks
import lm_eval.models
import lm_eval.tasks
from lm_eval.logger import eval_logger
from lm_eval.utils import (
    create_iterator,
    eval_logger,
    get_git_commit_hash,
    make_table,
    positional_deprecated,
    run_task_tests,
)

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(sys.stdout))


@positional_deprecated
def simple_evaluate(
    model,
    model_args=None,
    tasks=[],
    num_fewshot=None,
    batch_size=None,
    max_batch_size=None,
    device=None,
    use_cache=None,
    limit=None,
    bootstrap_iters: int = 100000,
    check_integrity: bool = False,
    decontamination_ngrams_path=None,
    write_out: bool = False,
    log_samples: bool = True,
):
    """Instantiate and evaluate a model on a list of tasks.

    :param model: Union[str, LM]
        Name of model or LM object, see lm_eval.models.get_model
    :param model_args: Optional[str]
        String arguments for each model class, see LM.create_from_arg_string.
        Ignored if `model` argument is a LM object.
    :param tasks: list[Union[str, Task]]
        List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
    :param num_fewshot: int
        Number of examples in few-shot context
    :param batch_size: int or str, optional
        Batch size for model
    :param max_batch_size: int, optional
        Maximal batch size to try with automatic batch size detection
    :param device: str, optional
        PyTorch device (e.g. "cpu" or "cuda:0") for running models
    :param use_cache: str, optional
        A path to a sqlite db file for caching model responses. `None` if not caching.
    :param limit: int or float, optional
        Limit the number of examples per task (only use this for testing). If <1, limit is interpreted as a fraction of the total number of examples.
    :param bootstrap_iters: int
        Number of iterations for bootstrap statistics
    :param check_integrity: bool
        Whether to run the relevant part of the test suite for the tasks
    :param write_out: bool
        If True, write out an example document and model input for checking task integrity
    :param log_samples: bool
        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
    :return
        Dictionary of results
    """
    random.seed(0)
    np.random.seed(1234)
    torch.manual_seed(
        1234
    )  # TODO: this may affect training runs that are run with evaluation mid-run.

    assert (
        tasks != []
    ), "No tasks specified, or no tasks found. Please verify the task names."

    if isinstance(model, str):
        if model_args is None:
            model_args = ""
        lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
            model_args,
            {
                "batch_size": batch_size,
                "max_batch_size": max_batch_size,
                "device": device,
            },
        )
    else:
        assert isinstance(model, lm_eval.api.model.LM)
        lm = model

    if use_cache is not None:
        print(f"Using cache at {use_cache + '_rank' + str(lm.rank) + '.db'}")
        lm = lm_eval.api.model.CachingLM(
            lm,
            use_cache
            # each rank receives a different cache db.
            # necessary to avoid multiple writes to cache at once
            + "_rank" + str(lm.rank) + ".db",
        )

    task_dict = lm_eval.tasks.get_task_dict(tasks)
    for task_name in task_dict.keys():
        task_obj = task_dict[task_name]
        if type(task_obj) == tuple:
            group, task_obj = task_obj
            if task_obj is None:
                continue

        config = task_obj._config
        if num_fewshot is not None:
            if config["num_fewshot"] > 0:
                default_num_fewshot = config["num_fewshot"]
                eval_logger.warning(
                    f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
                )

            task_obj._config["num_fewshot"] = num_fewshot

    if check_integrity:
        run_task_tests(task_list=tasks)

    results = evaluate(
        lm=lm,
        task_dict=task_dict,
        limit=limit,
        bootstrap_iters=bootstrap_iters,
        decontamination_ngrams_path=decontamination_ngrams_path,
        write_out=write_out,
        log_samples=log_samples,
    )

    if lm.rank == 0:
        # add info about the model and few shot config
        results["config"] = {
            "model": model
            if isinstance(model, str)
            else model.model.config._name_or_path,
            "model_args": model_args,
            "batch_size": batch_size,
            "batch_sizes": list(lm.batch_sizes.values())
            if hasattr(lm, "batch_sizes")
            else [],
            "device": device,
            "use_cache": use_cache,
            "limit": limit,
            "bootstrap_iters": bootstrap_iters,
        }
        results["git_hash"] = get_git_commit_hash()
        return results
    else:
        return None


decontaminate_suffix = "_decontaminate"
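# Illustrative usage sketch for `simple_evaluate` above (not executed here).
# The model type, model_args string, and task name are placeholders and assume
# an "hf" model class and a "lambada_openai" task are registered in this
# installation:
#
#   results = simple_evaluate(
#       model="hf",
#       model_args="pretrained=gpt2",
#       tasks=["lambada_openai"],
#       batch_size=8,
#       device="cuda:0",
#   )
#   print(results["results"])  # per-task metric/filter scores
#   print(results["config"])   # run configuration recorded above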


@positional_deprecated
def evaluate(
    lm,
    task_dict,
    limit=None,
    bootstrap_iters: int = 100000,
    decontamination_ngrams_path=None,
    write_out: bool = False,
    log_samples: bool = True,
):
    """Instantiate and evaluate a model on a list of tasks.

    :param lm: obj
        Language Model
    :param task_dict: dict[str, Task]
        Dictionary of tasks. Tasks will be taken to have name type(task).config.task.
    :param limit: int, optional
        Limit the number of examples per task (only use this for testing)
    :param bootstrap_iters:
        Number of iterations for bootstrap statistics
    :param write_out: bool
        If True, write out an example document and model input for checking task integrity
    :param log_samples: bool
        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
    :return
        Dictionary of results
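        On rank 0 this is a dict with "results" (per-task metric/filter scores),
        "groups" (only when group roll-ups exist), "configs", "versions", and,
        when log_samples is True, "samples" (per-document logs); all other
        ranks return None.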
    """

    # decontaminate = decontamination_ngrams_path is not None

    # stores the final result for each task, for each metric/filter pair.
    results = collections.defaultdict(dict)
    # Tracks each task's version.
    versions = collections.defaultdict(dict)
    # Tracks the YAML configs of all chosen tasks.
    configs = collections.defaultdict(dict)
    # logs info about each document evaluated.
    samples = collections.defaultdict(list)
    # tracks all Instances/requests a model must generate output on.
    requests = collections.defaultdict(list)
    # Aggregated task scores presented with groups
    results_agg = collections.defaultdict(dict)
    # Aggregated groups scores only
    groups_agg = collections.defaultdict(dict)
    # stores the amount to pad out reqs per req. type so that
    # number of fwd passes per distributed rank is equal
    padding_requests = collections.defaultdict(int)
    # store the hierarchy to do proper ordering
    task_hierarchy = collections.defaultdict(list)
    # store the ordering of tasks and groups
    task_order = collections.defaultdict(int)
    task_group_alias = collections.defaultdict(dict)

    # get lists of each type of request
    for task_name, task in task_dict.items():
        if type(task) == tuple:
            group_name, task = task
            task_hierarchy[group_name].append(task_name)
            versions[group_name] = "N/A"

        else:
            group_name = None
            task_hierarchy[task_name] = []

        if task is None:
            continue

Leo Gao's avatar
Leo Gao committed
240
        versions[task_name] = task.VERSION
haileyschoelkopf's avatar
haileyschoelkopf committed
241
242
        configs[task_name] = dict(task.dump_config())

lintangsutawika's avatar
lintangsutawika committed
243
244
245
        if "task_alias" in configs[task_name]:
            task_group_alias[task_name] = configs[task_name]["task_alias"]

lintangsutawika's avatar
format  
lintangsutawika committed
246
247
248
249
        if (
            ("group_alias" in configs[task_name])
            and (group_name not in task_group_alias)
            and (group_name is not None)
lintangsutawika's avatar
lintangsutawika committed
250
251
252
        ):
            task_group_alias[group_name] = configs[task_name]["group_alias"]

Hailey Schoelkopf's avatar
Hailey Schoelkopf committed
253
        if limit is not None:
254
255
256
257
258
259
            if task.has_test_docs():
                task_docs = task.test_docs()
            elif task.has_validation_docs():
                task_docs = task.validation_docs()
            else:
                raise RuntimeError("Task has neither test_docs nor validation_docs")
260
            limit = int(len(task_docs) * limit) if limit < 1.0 else int(limit)
261

        task.build_all_requests(limit=limit, rank=lm.rank, world_size=lm.world_size)
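        # rank and world_size are forwarded so each rank only builds requests
        # for its own shard of the documents (matching the doc_iterator
        # slicing used when results are collected below)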

        eval_logger.debug(
            f"Task: {task_name}; number of requests on this rank: {len(task.instances)}"
        )

        if write_out:
            for inst in task.instances:
                # print the prompt for the first few documents
                if inst.doc_id < 1:
                    eval_logger.info(
                        f"Task: {task_name}; document {inst.doc_id}; context prompt (starting on next line):\
\n{inst.args[0]}\n(end of prompt on previous line)\ntarget string or answer choice index (starting on next line):\n{task.doc_to_target(inst.doc)}\n(end of target on previous line)"
                    )
                    eval_logger.info(f"Request: {str(inst)}")

        # aggregate Instances by LM method requested to get output.
        for instance in task.instances:
            reqtype = instance.request_type
            requests[reqtype].append(instance)

        if lm.world_size > 1:
            instances_rnk = torch.tensor(len(task._instances), device=lm.device)
            gathered_item = (
                lm.accelerator.gather(instances_rnk).cpu().detach().numpy().tolist()
            )

            # compute number of pseudobatches to pad with (FSDP/DDP require even batches among ranks)
            numpad = max(gathered_item) - gathered_item[lm.rank]
            padding_requests[task.OUTPUT_TYPE] += numpad
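            # the matching "pseudo" requests are appended to cloned_reqs in the
            # request loop below, so every rank runs the same number of batches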

    ### Run LM on inputs, get all outputs ###
    # execute each type of request
    for reqtype, reqs in requests.items():
        eval_logger.info("Running {} requests".format(reqtype))
        # create `K` copies of each request `req` based off `K = req.repeats`
        cloned_reqs = []
        for req in reqs:
            cloned_reqs.extend([req] * req.repeats)

        if (lm.world_size > 1) and (padding_requests[reqtype] > 0):
            for _ in range(padding_requests[reqtype]):
                cloned_reqs.extend([req] * req.repeats)

        # run requests through model
        resps = getattr(lm, reqtype)(cloned_reqs)

        # put responses from model into a list of length K for each request.
        for x, req in zip(resps, cloned_reqs):
            req.resps.append(x)

        if lm.world_size > 1:
            lm.accelerator.wait_for_everyone()

    ### Postprocess outputs ###
    # TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
    for task_name, task in task_dict.items():
        if type(task) == tuple:
            group, task = task
            if task is None:
                continue
        task.apply_filters()

    ### Collect values of metrics on all datapoints ###
    vals = collections.defaultdict(list)

    # unpack results and sort back in order and return control to Task
    for task_name, task in task_dict.items():
        if type(task) == tuple:
            group, task = task
            if task is None:
                continue
        # TODO: make it possible to use a different metric per filter
        # iterate over different filters used
        for key in task.instances[0].filtered_resps.keys():
            num_requests = 0
            doc_iterator = (
                itertools.islice(
                    enumerate(task.test_docs()), lm.rank, limit, lm.world_size
                )
                if task.has_test_docs()
                else itertools.islice(
                    enumerate(task.validation_docs()), lm.rank, limit, lm.world_size
                )
            )
            for doc_id, doc in doc_iterator:
                # subset instances to only this document id ; sort by idx
                requests = list(filter(lambda x: x.doc_id == doc_id, task.instances))
                requests.sort(key=lambda x: x.idx)
                metrics = task.process_results(
                    doc, [req.filtered_resps[key] for req in requests]
                )
                if log_samples:
                    target = task.doc_to_target(doc)
                    example = {
                        "doc_id": doc_id,
                        "doc": doc,
                        "target": target,
                        "arguments": [req.args for req in requests],
                        "resps": [req.resps for req in requests],
                        "filtered_resps": [req.filtered_resps[key] for req in requests],
                    }
                    example.update(metrics)
                    samples[task_name].append(example)
                for metric, value in metrics.items():
                    vals[(task_name, key, metric)].append(value)
                num_requests += 1
        num_requests = torch.tensor(num_requests, device=lm.device)
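        # `num_requests` above is only consumed by the commented-out
        # size-gathering debug code further down; it does not affect results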

    ### Aggregate results over all datapoints (per-rank, before any gather) ###
    # aggregate results ; run bootstrap CIs
    # NOTE: `task_groups` and `aggregate` are never populated in this module,
    # so they are initialized empty here to keep this block runnable; the
    # group-level roll-up below is effectively a no-op and the authoritative
    # aggregation happens later on rank 0.
    task_groups = {}
    aggregate = collections.defaultdict(dict)
    for (task_name, key, metric), items in vals.items():
        task = task_dict[task_name]
        if type(task) == tuple:
            group, task = task
        task_score = task.aggregation()[metric](items)
        results[task_name][metric + "," + key] = task_score

        # Need to put back in results
        # pythia | acc
        #        | perplexity
        #        | word_perplexity
        #        | byte_perplexity
        #        | bits_per_byte
        if bool(task_groups):
            group_name = task_groups[task_name]
            if metric not in aggregate[group_name]:
                aggregate[group_name][metric] = [task_score]
            else:
                aggregate[group_name][metric].append(task_score)

        # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
        # so we run them for fewer iterations. still looking for a cleaner way to do this
        if bootstrap_iters > 0:
            stderr = lm_eval.api.metrics.stderr_for_metric(
                metric=task.aggregation()[metric],
                bootstrap_iters=min(bootstrap_iters, 1000)
                if metric in ["bleu", "chrf", "ter"]
                else bootstrap_iters,
            )

            if stderr is not None:
                results[task_name][metric + "_stderr" + "," + key] = stderr(items)

    if bool(aggregate):
        for group in aggregate.keys():
            for metric in aggregate[group].keys():
                aggregate[group][metric] = np.average(aggregate[group][metric])
                versions[group] = "N/A"

    results_dict = {
        "results": dict(sorted(results.items())),
        **({"aggregate": dict(sorted(aggregate.items()))} if bool(aggregate) else {}),
        "configs": dict(sorted(configs.items())),
        "versions": dict(sorted(versions.items())),
    }
    if log_samples:
        results_dict["samples"] = dict(samples)
    print("Rank: ", lm.rank, " Results: ", results_dict)

    if lm.world_size > 1:
        # if multigpu, then gather data across all ranks
        # first gather logged samples across all ranks
        for task_name, task_samples in list(samples.items()):
            full_samples = [None] * lm.world_size
            torch.distributed.all_gather_object(full_samples, task_samples)

            samples[task_name] = list(itertools.chain.from_iterable(full_samples))

        # then collect metrics across all ranks
        vals_torch = collections.defaultdict(list)
        for (task_name, key, metric), items in vals.items():
            numitem = 0
            if type(items[0]) == tuple:
                numitem = len(items[0])

            if isinstance(items[0], (str, list)):
                # handle the string case
                gathered_items = [None] * lm.accelerator.num_processes
                torch.distributed.all_gather_object(gathered_items, items)

                gathered_item = list(itertools.chain.from_iterable(gathered_items))
            else:
                # distributed gather requires all ranks to have same dimensions
                # so we pad out with float32 min value
                pad_value = torch.finfo(torch.float32).min
                metrics_tensor = torch.tensor(items, device=lm.device)
                original_dtype = metrics_tensor.dtype  # store original dtype
                # pad to a common length across ranks before gathering
                torch_device_tensor = lm.accelerator.pad_across_processes(
                    metrics_tensor.to(torch.float32), pad_index=pad_value
                )

                gathered_item = lm.accelerator.gather(torch_device_tensor)
                if numitem > 0:
                    gathered_filtered = gathered_item[gathered_item[:, 0] != pad_value]
                else:
                    gathered_filtered = gathered_item[gathered_item != pad_value]
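                # drop the pad values introduced by pad_across_processes so
                # only the ranks' real metric entries remain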
                # gathered_sizes = lm.accelerator.gather(num_requests)
                # sizes = torch.stack(output_tensors)
                # if lm.rank == 0:
                #     print(gathered_sizes)
                # max_size = 26834
                # # Use max size to pad
                # metrics_tensor = metrics_tensor.to(torch.float32)
                # if max_size != metrics_tensor.shape[0]:
                #     old_size = metrics_tensor.shape
                #     new_size = list(old_size)
                #     new_size[0] = max_size
                #     device_tensor = metrics_tensor.new_zeros(tuple(new_size)) + pad_value
                #     indices = tuple(
                #         slice(0, old_size[0]) if i == 0 else slice(None)
                #         for i in range(len(new_size))
                #     )
                #     device_tensor[indices] = metrics_tensor
                # else:
                #     device_tensor = metrics_tensor
                # gathered_item = lm.accelerator.gather(device_tensor)

                gathered_item = (
                    gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist()
                )
                # reconvert if we were passed a tuple of values
                if numitem > 0:
                    gathered_item = [tuple(g) for g in gathered_item]

            if lm.rank == 0:
                vals_torch[(task_name, key, metric)] = gathered_item

        vals = vals_torch

    if lm.rank == 0:
        ### Get task ordering for correct sample-wide aggregation
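        # task_order records nesting depth (groups default to 0, member tasks
        # sit one level deeper); group_to_task expands any nested group into
        # its member tasks, and task_to_group (built next) inverts that
        # mapping so each task knows which groups it rolls up into.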
        group_to_task = {}
        for group in task_hierarchy.keys():
            if group not in task_order:
                task_order[group] = 0

            if len(task_hierarchy[group]) > 0:
                group_to_task[group] = task_hierarchy[group].copy()

            for task in task_hierarchy[group]:
                if task in task_order:
                    task_order[task] += 1
                else:
                    task_order[task] = 1 + task_order[group]

                if task in task_hierarchy:
                    group_to_task[group].remove(task)
                    group_to_task[group].extend(task_hierarchy[task])

        task_to_group = {}
        for group in group_to_task:
            for task in group_to_task[group]:
                if task in task_to_group:
                    task_to_group[task].append(group)
                else:
                    task_to_group[task] = [group]

        ### Aggregate results over all datapoints ###
        # aggregate results ; run bootstrap CIs
        for (task_name, key, metric), items in vals.items():
            task = task_dict[task_name]
            metric_key = metric + "," + key

            if type(task) == tuple:
                group_name, task = task
            else:
                group_name = None

            agg_fn = task.aggregation()[metric]
            results[task_name][metric_key] = agg_fn(items)
            results[task_name]["samples"] = len(items)

            # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
            # so we run them for fewer iterations. still looking for a cleaner way to do this
            if bootstrap_iters > 0:
                stderr = lm_eval.api.metrics.stderr_for_metric(
                    metric=task.aggregation()[metric],
                    bootstrap_iters=min(bootstrap_iters, 100)
                    if metric in ["bleu", "chrf", "ter"]
                    else bootstrap_iters,
                )

                if stderr is not None:
                    results[task_name][metric + "_stderr" + "," + key] = stderr(items)

        if bool(results):
            for group, task_list in reversed(task_hierarchy.items()):
                if task_list == []:
                    total_size = results[group]["samples"]
                else:
                    total_size = 0

                    for task in task_list:
                        metrics = results[task]

                        current_size = metrics.pop("samples")
                        # TODO: There should be a way for users
                        #       to toggle between weighted and
                        #       unweighted averaging
                        # For unweighted averaging, use:
                        #     current_size = 1

                        all_stderr = []
                        for metric in [
                            key for key in metrics.keys() if "_stderr" not in key
                        ]:
                            stderr = "_stderr,".join(metric.split(","))
                            stderr_score = results[task][stderr]
                            var_score = stderr_score**2
                            metric_score = results[task][metric]

                            all_stderr.append(stderr)
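                            # fold this task into the group's running totals:
                            # the metric becomes a size-weighted mean, and the
                            # stderr slot accumulates a pooled variance (plus a
                            # between-task mean-shift term, per the formula
                            # comment below); it is square-rooted back into a
                            # stderr once every task has been added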

                            if metric in results[group]:
                                results[group][metric] = (
                                    results[group][metric] * total_size
                                    + metric_score * current_size
                                ) / (total_size + current_size)
                                # $$s_z^2 = \frac{(n-1) s_x^2 + (m-1) s_y^2}{n+m-1} + \frac{nm(\bar x - \bar y)^2}{(n+m)(n+m-1)}.$$
                                results[group][stderr] = (
                                    (total_size - 1) * results[group][stderr]
                                    + (current_size - 1) * var_score
                                ) / (
                                    total_size + current_size - 1
                                ) + total_size * current_size / (
                                    (total_size + current_size)
                                    * (total_size + current_size - 1)
                                ) * (
                                    results[group][metric] - metric_score
                                ) ** 2
                            else:
                                results[group][metric] = metric_score
                                results[group][stderr] = var_score

                        total_size += current_size

                    for stderr in all_stderr:
                        results[group][stderr] = np.sqrt(results[group][stderr])

                results[group]["samples"] = total_size
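        # `print_tasks` below recursively walks the task hierarchy to build the
        # display tables: each entry keeps its aggregated scores plus a "tab"
        # field recording its nesting depth, and groups that still contain
        # sub-entries are mirrored into `groups_agg` for the groups-only table.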

        def print_tasks(task_hierarchy, task_order, task_version, task_group_alias):
            results_agg = collections.defaultdict(dict)
            groups_agg = collections.defaultdict(dict)
            for group_name, task_list in task_hierarchy.items():
                order = task_order[group_name]
                results_agg[group_name] = results[group_name].copy()
                results_agg[group_name]["tab"] = order

                if (order < max(task_order.values())) and (len(task_list) > 0):
                    groups_agg[group_name] = results[group_name].copy()
                    groups_agg[group_name]["tab"] = order

                if task_list != []:
                    for task in sorted(task_list):
                        if task in task_hierarchy:
                            _task_hierarchy = {task: task_hierarchy[task]}
                        else:
                            _task_hierarchy = {task: []}

                        _results_agg, _groups_agg, task_version = print_tasks(
                            _task_hierarchy, task_order, task_version, task_group_alias
                        )

                        results_agg = {**results_agg, **_results_agg}
                        groups_agg = {**groups_agg, **_groups_agg}

            return results_agg, groups_agg, task_version

        results_agg, groups_agg, versions = print_tasks(
            task_hierarchy, task_order, versions, task_group_alias
        )

        for task in results_agg:
            task_results = results_agg[task]

            if "samples" in task_results:
                task_results.pop("samples")

            tab_string = ""
            if "tab" in task_results:
                tab = task_results.pop("tab")
                tab_string = " " * tab + "- " if tab > 0 else ""

            if task in task_group_alias:
                task_alias = task_group_alias[task]
                results_agg[task]["alias"] = tab_string + task_alias
            else:
                results_agg[task]["alias"] = tab_string + task

        for group in groups_agg:
            group_results = groups_agg[group]

            if "samples" in group_results:
                group_results.pop("samples")

            tab_string = ""
            if "tab" in group_results:
                tab = group_results.pop("tab")
                tab_string = " " * tab + "- " if tab > 0 else ""

            if group in task_group_alias:
                group_alias = task_group_alias[group]
                groups_agg[group]["alias"] = tab_string + group_alias
            else:
                groups_agg[group]["alias"] = tab_string + group

        results_dict = {
            "results": dict(results_agg.items()),
            **({"groups": dict(groups_agg.items())} if bool(groups_agg) else {}),
            "configs": dict(sorted(configs.items())),
            "versions": dict(sorted(versions.items())),
        }
        if log_samples:
            results_dict["samples"] = dict(samples)

        return results_dict

    else:
        return None