import random
import itertools
import json
import collections
import sys

import torch
import numpy as np

import lm_eval.api
import lm_eval.tasks
import lm_eval.models
import lm_eval.api.metrics
import lm_eval.api.model
import lm_eval.api.registry

from lm_eval.utils import (
    positional_deprecated,
    run_task_tests,
    make_table,
    create_iterator,
    get_git_commit_hash,
    simple_parse_args_string,
    eval_logger,
)


@positional_deprecated
def simple_evaluate(
    model,
    model_args=None,
    tasks=[],
    num_fewshot=None,
    batch_size=None,
    max_batch_size=None,
    device=None,
    use_cache=None,
    limit=None,
    bootstrap_iters: int = 100000,
    check_integrity: bool = False,
    decontamination_ngrams_path=None,
    write_out: bool = False,
    log_samples: bool = True,
    gen_kwargs: str = None,
):
    """Instantiate and evaluate a model on a list of tasks.

    :param model: Union[str, LM]
        Name of model or LM object, see lm_eval.api.registry.get_model
    :param model_args: Optional[str]
        String arguments for each model class, see LM.create_from_arg_string.
        Ignored if `model` argument is an LM object.
    :param tasks: list[Union[str, Task]]
        List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
    :param num_fewshot: int
        Number of examples in few-shot context
    :param batch_size: int or str, optional
        Batch size for model
    :param max_batch_size: int, optional
        Maximal batch size to try with automatic batch size detection
    :param device: str, optional
        PyTorch device (e.g. "cpu" or "cuda:0") for running models
    :param use_cache: str, optional
        A path to a sqlite db file for caching model responses. `None` if not caching.
    :param limit: int or float, optional
        Limit the number of examples per task (only use this for testing). If <1, limit is a percentage of the total number of examples.
    :param bootstrap_iters:
        Number of iterations for bootstrap statistics
    :param check_integrity: bool
        Whether to run the relevant part of the test suite for the tasks
    :param write_out: bool
        If True, write out an example document and model input for checking task integrity
    :param log_samples: bool
        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
    :param gen_kwargs: str
        String arguments for model generation
        Ignored for all tasks with loglikelihood output_type
    :return
        Dictionary of results
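    Example (illustrative sketch; the model string, model_args, and task name below
    are assumptions about a typical HuggingFace-backed setup, not requirements):

        >>> results = simple_evaluate(
        ...     model="hf",
        ...     model_args="pretrained=gpt2",
        ...     tasks=["lambada_openai"],
        ...     num_fewshot=0,
        ...     batch_size=8,
        ... )
        >>> results["results"]  # per-task metrics keyed by task name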
    """
    random.seed(0)
    np.random.seed(1234)
    torch.manual_seed(
        1234
    )  # TODO: this may affect training runs that are run with evaluation mid-run.

    assert (
        tasks != []
    ), "No tasks specified, or no tasks found. Please verify the task names."

    if gen_kwargs is not None:
        gen_kwargs = simple_parse_args_string(gen_kwargs)
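        # `simple_parse_args_string` turns a comma-separated "key=value" string into a
        # dict, e.g. "temperature=0,top_k=10" -> {"temperature": 0, "top_k": 10}
        # (illustrative values only).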
        eval_logger.warning(
            "generation_kwargs were specified through the CLI; they will override any generation_kwargs set in YAML task configs."
        )
        if not gen_kwargs:
            gen_kwargs = None

    if isinstance(model, str):
        if model_args is None:
            model_args = ""
        lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
            model_args,
            {
                "batch_size": batch_size,
                "max_batch_size": max_batch_size,
                "device": device,
            },
        )
    else:
        assert isinstance(model, lm_eval.api.model.LM)
        lm = model

    if use_cache is not None:
        print(f"Using cache at {use_cache + '_rank' + str(lm.rank) + '.db'}")
        lm = lm_eval.api.model.CachingLM(
            lm,
            use_cache
            # each rank receives a different cache db.
            # necessary to avoid multiple writes to cache at once
            + "_rank" + str(lm.rank) + ".db",
        )

    task_dict = lm_eval.tasks.get_task_dict(tasks)
    for task_name in task_dict.keys():
        task_obj = task_dict[task_name]
        if type(task_obj) == tuple:
            group, task_obj = task_obj
            if task_obj is None:
                continue

        config = task_obj._config
        if config["output_type"] == "generate_until" and gen_kwargs is not None:
            config["generation_kwargs"].update(gen_kwargs)

        if num_fewshot is not None:
            if config["num_fewshot"] == 0:
                eval_logger.info(
                    f"num_fewshot has been set to 0 for {task_name} in its config. Manual configuration will be ignored."
                )
            else:
                default_num_fewshot = config["num_fewshot"]
                eval_logger.warning(
                    f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
                )

                task_obj._config["num_fewshot"] = num_fewshot

    if check_integrity:
        run_task_tests(task_list=tasks)

    results = evaluate(
        lm=lm,
        task_dict=task_dict,
        limit=limit,
        bootstrap_iters=bootstrap_iters,
        decontamination_ngrams_path=decontamination_ngrams_path,
        write_out=write_out,
        log_samples=log_samples,
    )

    if lm.rank == 0:
        # add info about the model and few shot config
        results["config"] = {
            "model": model
            if isinstance(model, str)
            else model.model.config._name_or_path,
            "model_args": model_args,
            "batch_size": batch_size,
            "batch_sizes": list(lm.batch_sizes.values())
            if hasattr(lm, "batch_sizes")
            else [],
            "device": device,
            "use_cache": use_cache,
            "limit": limit,
            "bootstrap_iters": bootstrap_iters,
            "gen_kwargs": gen_kwargs,
        }
        results["git_hash"] = get_git_commit_hash()
        return results
    else:
        return None


decontaminate_suffix = "_decontaminate"


@positional_deprecated
def evaluate(
    lm,
    task_dict,
    limit=None,
    bootstrap_iters: int = 100000,
    decontamination_ngrams_path=None,
    write_out: bool = False,
    log_samples: bool = True,
):
    """Evaluate an instantiated language model on a list of tasks.

    :param lm: obj
        Language Model
    :param task_dict: dict[str, Task]
        Dictionary of tasks. Tasks will be taken to have name type(task).config.task.
    :param limit: int, optional
        Limit the number of examples per task (only use this for testing)
    :param bootstrap_iters:
        Number of iterations for bootstrap statistics
    :param write_out: bool
        If True, write out an example document and model input for checking task integrity
    :param log_samples: bool
        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
    :return
        Dictionary of results
    """
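    # High-level flow (mirrors the section markers below): build all requests for each
    # task, run the LM once per request type, apply filters to the raw responses, score
    # per-document metrics, then aggregate (bootstrap stderrs and group-level pooling).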

    # decontaminate = decontamination_ngrams_path is not None

    # stores the final result for each task, for each metric/filter pair.
    results = collections.defaultdict(dict)
    # Tracks each task's version.
    versions = collections.defaultdict(dict)
    # Tracks the YAML configs of all chosen tasks.
    configs = collections.defaultdict(dict)
    # logs info about each document evaluated.
    samples = collections.defaultdict(list)
    # tracks all Instances/requests a model must generate output on.
    requests = collections.defaultdict(list)
    # Aggregated task scores presented with groups
    results_agg = collections.defaultdict(dict)
    # Aggregated group scores only
    groups_agg = collections.defaultdict(dict)
    # stores the amount to pad out reqs per req. type so that
    # number of fwd passes per distributed rank is equal
    padding_requests = collections.defaultdict(int)
    # store the hierarchy to do proper ordering
    task_hierarchy = collections.defaultdict(list)
    # store the ordering of tasks and groups
    task_order = collections.defaultdict(int)
    task_group_alias = collections.defaultdict(dict)
    # store num-fewshot value per task
    num_fewshot = collections.defaultdict(int)

    # get lists of each type of request
    for task_name, task in task_dict.items():
        if type(task) == tuple:
            group_name, task = task
            task_hierarchy[group_name].append(task_name)
            versions[group_name] = "N/A"

        else:
            group_name = None
            task_hierarchy[task_name] = []

        if task is None:
            continue

        versions[task_name] = task.VERSION
        configs[task_name] = dict(task.dump_config())

        if "num_fewshot" in configs[task_name]:
            n_shot = configs[task_name]["num_fewshot"]
        else:
            n_shot = 0
        num_fewshot[task_name] = n_shot

        if "task_alias" in configs[task_name]:
            task_group_alias[task_name] = configs[task_name]["task_alias"]

        if (
            ("group_alias" in configs[task_name])
            and (group_name not in task_group_alias)
            and (group_name is not None)
        ):
            task_group_alias[group_name] = configs[task_name]["group_alias"]

        if limit is not None:
            if task.has_test_docs():
                task_docs = task.test_docs()
            elif task.has_validation_docs():
                task_docs = task.validation_docs()
            else:
                raise RuntimeError("Task has neither test_docs nor validation_docs")
            limit = int(len(task_docs) * limit) if limit < 1.0 else int(limit)

        task.build_all_requests(limit=limit, rank=lm.rank, world_size=lm.world_size)

        eval_logger.debug(
            f"Task: {task_name}; number of requests on this rank: {len(task.instances)}"
        )

        if write_out:
            for inst in task.instances:
                # print the prompt for the first few documents
                if inst.doc_id < 1:
                    eval_logger.info(
                        f"Task: {task_name}; document {inst.doc_id}; context prompt (starting on next line):\
\n{inst.args[0]}\n(end of prompt on previous line)\ntarget string or answer choice index (starting on next line):\n{task.doc_to_target(inst.doc)}\n(end of target on previous line)"
                    )
                    eval_logger.info(f"Request: {str(inst)}")

        # aggregate Instances by LM method requested to get output.
        for instance in task.instances:
            reqtype = instance.request_type
            requests[reqtype].append(instance)

        if lm.world_size > 1:
            instances_rnk = torch.tensor(len(task._instances), device=lm.device)
            gathered_item = (
                lm.accelerator.gather(instances_rnk).cpu().detach().numpy().tolist()
            )

            # compute number of pseudobatches to pad with (FSDP/DDP require even batches among ranks)
            numpad = max(gathered_item) - gathered_item[lm.rank]
            padding_requests[task.OUTPUT_TYPE] += numpad

    ### Run LM on inputs, get all outputs ###
    # execute each type of request
    for reqtype, reqs in requests.items():
        eval_logger.info("Running {} requests".format(reqtype))
        # create `K` copies of each request `req` based off `K = req.repeats`
        cloned_reqs = []
        for req in reqs:
            cloned_reqs.extend([req] * req.repeats)

        if (lm.world_size > 1) and (padding_requests[reqtype] > 0):
            for _ in range(padding_requests[reqtype]):
                cloned_reqs.extend([req] * req.repeats)

        # run requests through model
        resps = getattr(lm, reqtype)(cloned_reqs)

        # put responses from model into a list of length K for each request.
        for x, req in zip(resps, cloned_reqs):
            req.resps.append(x)

        if lm.world_size > 1:
            lm.accelerator.wait_for_everyone()

    ### Postprocess outputs ###
    # TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
    for task_name, task in task_dict.items():
        if type(task) == tuple:
            group, task = task
            if task is None:
                continue
        task.apply_filters()

    ### Collect values of metrics on all datapoints ###
    vals = collections.defaultdict(list)

    # unpack results and sort back in order and return control to Task
    for task_name, task in task_dict.items():
        if type(task) == tuple:
            group, task = task
            if task is None:
                continue
        # TODO: make it possible to use a different metric per filter
        # iterate over different filters used
        for key in task.instances[0].filtered_resps.keys():
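            # Shard documents across ranks: islice(enumerate(docs), rank, limit, world_size)
            # starts at this rank's offset and takes every world_size-th doc up to `limit`.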
            doc_iterator = (
                itertools.islice(
                    enumerate(task.test_docs()), lm.rank, limit, lm.world_size
                )
                if task.has_test_docs()
                else itertools.islice(
                    enumerate(task.validation_docs()), lm.rank, limit, lm.world_size
                )
            )
            for doc_id, doc in doc_iterator:
                # subset instances to only this document id ; sort by idx
                requests = list(filter(lambda x: x.doc_id == doc_id, task.instances))
                requests.sort(key=lambda x: x.idx)
                metrics = task.process_results(
                    doc, [req.filtered_resps[key] for req in requests]
                )
                if log_samples:
                    target = task.doc_to_target(doc)
                    example = {
                        "doc_id": doc_id,
                        "doc": doc,
                        "target": target,
                        "arguments": [req.args for req in requests],
                        "resps": [req.resps for req in requests],
                        "filtered_resps": [req.filtered_resps[key] for req in requests],
                    }
                    example.update(metrics)
                    samples[task_name].append(example)
                for metric, value in metrics.items():
                    vals[(task_name, key, metric)].append(value)

    if lm.world_size > 1:
        # if multigpu, then gather data across all ranks
        # first gather logged samples across all ranks
        for task_name, task_samples in list(samples.items()):
            full_samples = [None] * lm.world_size
            torch.distributed.all_gather_object(full_samples, task_samples)

            samples[task_name] = list(itertools.chain.from_iterable(full_samples))

        # then collect metrics across all ranks
        vals_torch = collections.defaultdict(list)
        for (task_name, key, metric), items in vals.items():
            numitem = 0
            if type(items[0]) == tuple:
                numitem = len(items[0])

            if isinstance(items[0], (str, list)):
                # handle the string case
                gathered_items = [None] * lm.accelerator.num_processes
                torch.distributed.all_gather_object(gathered_items, items)

                gathered_item = list(itertools.chain.from_iterable(gathered_items))
            else:
                # distributed gather requires all ranks to have same dimensions
                # so we pad out with float32 min value
                pad_value = torch.finfo(torch.float32).min
                metrics_tensor = torch.tensor(items, device=lm.device)

                original_dtype = metrics_tensor.dtype  # store original dtype
                torch_device_tensor = lm.accelerator.pad_across_processes(
                    metrics_tensor.to(torch.float32), pad_index=pad_value
                )
                gathered_item = lm.accelerator.gather(torch_device_tensor)

                if numitem > 0:
                    gathered_filtered = gathered_item[gathered_item[:, 0] != pad_value]
                else:
                    gathered_filtered = gathered_item[gathered_item != pad_value]

                gathered_item = (
                    gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist()
                )
                # reconvert if we were passed a tuple of values
                if numitem > 0:
                    gathered_item = [tuple(g) for g in gathered_item]

            if lm.rank == 0:
                vals_torch[(task_name, key, metric)] = gathered_item

        vals = vals_torch

    if lm.rank == 0:
        ### Get task ordering for correct sample-wide aggregation
        group_to_task = {}
        for group in task_hierarchy.keys():
            if group not in task_order:
                task_order[group] = 0

            if len(task_hierarchy[group]) > 0:
                group_to_task[group] = task_hierarchy[group].copy()

            for task in task_hierarchy[group]:
                if task in task_order:
                    task_order[task] += 1
                else:
                    task_order[task] = 1 + task_order[group]

                if task in task_hierarchy:
                    group_to_task[group].remove(task)
                    group_to_task[group].extend(task_hierarchy[task])

        task_to_group = {}
        for group in group_to_task:
            for task in group_to_task[group]:
                if task in task_to_group:
                    task_to_group[task].append(group)
                else:
                    task_to_group[task] = [group]

        ### Aggregate results over all datapoints ###
        # aggregate results ; run bootstrap CIs
        for (task_name, key, metric), items in vals.items():
            task = task_dict[task_name]
            metric_key = metric + "," + key

            if type(task) == tuple:
                group_name, task = task
            else:
                group_name = None

            agg_fn = task.aggregation()[metric]
            results[task_name][metric_key] = agg_fn(items)
            results[task_name]["samples"] = len(items)

            # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
            # so we run them for fewer iterations. still looking for a cleaner way to do this
            if bootstrap_iters > 0:
                stderr = lm_eval.api.metrics.stderr_for_metric(
                    metric=task.aggregation()[metric],
                    bootstrap_iters=min(bootstrap_iters, 100)
                    if metric in ["bleu", "chrf", "ter"]
                    else bootstrap_iters,
                )

                if stderr is not None and len(items) > 1:
                    results[task_name][metric + "_stderr" + "," + key] = stderr(items)
                else:
                    results[task_name][metric + "_stderr" + "," + key] = "N/A"

        if bool(results):
            for group, task_list in reversed(task_hierarchy.items()):
                if task_list == []:
                    total_size = results[group]["samples"]
                else:
                    total_size = 0

                    for task in task_list:
                        metrics = results[task]

                        current_size = metrics.pop("samples")
                        # TODO: There should be a way for users
                        #       to toggle between weighted and
                        #       unweighted averaging
                        # For unweighted averaging, use:
                        #     current_size = 1
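                        # The loop below folds each subtask's score into the running group
                        # aggregate: means are combined as a size-weighted average, and the
                        # stderr slot accumulates a pooled variance (see the formula comment
                        # below) until the final np.sqrt turns it back into a stderr.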

                        all_stderr = []
                        for metric in [
                            key for key in metrics.keys() if "_stderr" not in key
                        ]:
                            stderr = "_stderr,".join(metric.split(","))
                            stderr_score = results[task][stderr]
                            var_score = stderr_score**2
                            metric_score = results[task][metric]

                            all_stderr.append(stderr)

                            if metric in results[group]:
                                results[group][metric] = (
                                    results[group][metric] * total_size
                                    + metric_score * current_size
                                ) / (total_size + current_size)
                                # $$s_z^2 = \frac{(n-1) s_x^2 + (m-1) s_y^2}{n+m-1} + \frac{nm(\bar x - \bar y)^2}{(n+m)(n+m-1)}.$$
                                results[group][stderr] = (
                                    (total_size - 1) * results[group][stderr]
                                    + (current_size - 1) * var_score
                                ) / (
                                    total_size + current_size - 1
                                ) + total_size * current_size / (
                                    (total_size + current_size)
                                    * (total_size + current_size - 1)
                                ) * (
                                    results[group][metric] - metric_score
                                ) ** 2
                            else:
                                results[group][metric] = metric_score
                                results[group][stderr] = var_score

                        total_size += current_size

                    for stderr in all_stderr:
                        results[group][stderr] = np.sqrt(results[group][stderr])

                results[group]["samples"] = total_size

        def print_tasks(task_hierarchy, task_order, task_version, task_group_alias):
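            # Recursively walk the group/task hierarchy, copying each entry's results
            # into results_agg (and into groups_agg for non-leaf groups) and recording
            # an indentation level under "tab" so the final table reflects the nesting.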
            results_agg = collections.defaultdict(dict)
            groups_agg = collections.defaultdict(dict)
            for group_name, task_list in task_hierarchy.items():
                order = task_order[group_name]
                results_agg[group_name] = results[group_name].copy()
                results_agg[group_name]["tab"] = order

                if (order < max(task_order.values())) and (len(task_list) > 0):
                    groups_agg[group_name] = results[group_name].copy()
                    groups_agg[group_name]["tab"] = order

                if task_list != []:
                    for task in sorted(task_list):
                        if task in task_hierarchy:
                            _task_hierarchy = {task: task_hierarchy[task]}
                        else:
                            _task_hierarchy = {task: []}

                        _results_agg, _groups_agg, task_version = print_tasks(
                            _task_hierarchy, task_order, task_version, task_group_alias
                        )

                        results_agg = {**results_agg, **_results_agg}
                        groups_agg = {**groups_agg, **_groups_agg}

            return results_agg, groups_agg, task_version

        results_agg, groups_agg, versions = print_tasks(
            task_hierarchy, task_order, versions, task_group_alias
        )

        for task in results_agg:
            task_results = results_agg[task]

            if "samples" in task_results:
                task_results.pop("samples")

            tab_string = ""
            if "tab" in task_results:
                tab = task_results.pop("tab")
                tab_string = " " * tab + "- " if tab > 0 else ""

            if task in task_group_alias:
                task_alias = task_group_alias[task]
                results_agg[task]["alias"] = tab_string + task_alias
            else:
                results_agg[task]["alias"] = tab_string + task

        for group in groups_agg:
            group_results = groups_agg[group]

            if "samples" in group_results:
                group_results.pop("samples")

            tab_string = ""
            if "tab" in group_results:
                tab = group_results.pop("tab")
                tab_string = " " * tab + "- " if tab > 0 else ""

            if group in task_group_alias:
                group_alias = task_group_alias[group]
                groups_agg[group]["alias"] = tab_string + group_alias
            else:
                groups_agg[group]["alias"] = tab_string + group

        for group_name, task_list in task_hierarchy.items():
            if task_list != []:
                num_fewshot[group_name] = num_fewshot[task_list[0]]

        results_dict = {
            "results": dict(results_agg.items()),
            **({"groups": dict(groups_agg.items())} if bool(groups_agg) else {}),
            "configs": dict(sorted(configs.items())),
            "versions": dict(sorted(versions.items())),
            "n-shot": dict(sorted(num_fewshot.items())),
        }
        if log_samples:
            results_dict["samples"] = dict(samples)

        return results_dict

    else:
        return None