"configs/second/hv_second_secfpn_6x8_80e_kitti-3d-car.py" did not exist on "8c5dd99825fe464e0f73a008a9bc79caadc88014"
evaluator.py 22.8 KB
Newer Older
Herbie Bradley's avatar
Herbie Bradley committed
1
import collections
Leo Gao's avatar
Leo Gao committed
2
import itertools
FarzanehNakhaee's avatar
FarzanehNakhaee committed
3
4
import json
import logging
Herbie Bradley's avatar
Herbie Bradley committed
5
import random
FarzanehNakhaee's avatar
FarzanehNakhaee committed
6
import sys
lintangsutawika's avatar
lintangsutawika committed
7

8
import numpy as np
Herbie Bradley's avatar
Herbie Bradley committed
9
import torch
Herbie Bradley's avatar
Herbie Bradley committed
10
from accelerate.utils.operations import _gpu_gather
lintangsutawika's avatar
lintangsutawika committed
11
12

import lm_eval.api
lintangsutawika's avatar
lintangsutawika committed
13
import lm_eval.api.metrics
lintangsutawika's avatar
lintangsutawika committed
14
import lm_eval.api.registry
Herbie Bradley's avatar
Herbie Bradley committed
15
16
17
18
import lm_eval.benchmarks
import lm_eval.models
import lm_eval.tasks
from lm_eval.logger import eval_logger
lintangsutawika's avatar
lintangsutawika committed
19
from lm_eval.utils import (
20
    create_iterator,
lintangsutawika's avatar
lintangsutawika committed
21
    get_git_commit_hash,
Herbie Bradley's avatar
Herbie Bradley committed
22
23
24
    make_table,
    positional_deprecated,
    run_task_tests,
lintangsutawika's avatar
lintangsutawika committed
25
)
26

FarzanehNakhaee's avatar
FarzanehNakhaee committed
27
28
29
30
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(sys.stdout))

Fabrizio Milo's avatar
Fabrizio Milo committed
31

32
@positional_deprecated
Fabrizio Milo's avatar
Fabrizio Milo committed
33
34
35
36
def simple_evaluate(
    model,
    model_args=None,
    tasks=[],
37
    num_fewshot=None,
Fabrizio Milo's avatar
Fabrizio Milo committed
38
    batch_size=None,
39
    max_batch_size=None,
Fabrizio Milo's avatar
Fabrizio Milo committed
40
    device=None,
haileyschoelkopf's avatar
haileyschoelkopf committed
41
    use_cache=None,
Fabrizio Milo's avatar
Fabrizio Milo committed
42
    limit=None,
Ethan Smith's avatar
Ethan Smith committed
43
44
    bootstrap_iters: int = 100000,
    check_integrity: bool = False,
Fabrizio Milo's avatar
Fabrizio Milo committed
45
    decontamination_ngrams_path=None,
Ethan Smith's avatar
Ethan Smith committed
46
47
    write_out: bool = False,
    log_samples: bool = True,
Fabrizio Milo's avatar
Fabrizio Milo committed
48
):
49
    """Instantiate and evaluate a model on a list of tasks.
50

51
52
53
    :param model: Union[str, LM]
        Name of model or LM object, see lm_eval.models.get_model
    :param model_args: Optional[str]
Fabrizio Milo's avatar
Fabrizio Milo committed
54
        String arguments for each model class, see LM.create_from_arg_string.
55
56
        Ignored if `model` argument is a LM object.
    :param tasks: list[Union[str, Task]]
Leo Gao's avatar
Leo Gao committed
57
        List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
58
59
    :param num_fewshot: int
        Number of examples in few-shot context
60
    :param batch_size: int or str, optional
61
        Batch size for model
62
63
    :param max_batch_size: int, optional
        Maximal batch size to try with automatic batch size detection
64
    :param device: str, optional
65
        PyTorch device (e.g. "cpu" or "cuda:0") for running models
haileyschoelkopf's avatar
haileyschoelkopf committed
66
67
    :param use_cache: str, optional
        A path to a sqlite db file for caching model responses. `None` if not caching.
68
69
    :param limit: int or float, optional
        Limit the number of examples per task (only use this for testing). If <1, limit is interpreted as a fraction of the total number of examples.
    :param bootstrap_iters:
        Number of iterations for bootstrap statistics
    :param check_integrity: bool
        Whether to run the relevant part of the test suite for the tasks
    :param write_out: bool
        If True, write out an example document and model input for checking task integrity
    :param log_samples: bool
        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
    :return
        Dictionary of results
    """
    random.seed(0)
    np.random.seed(1234)
    torch.manual_seed(
        1234
    )  # TODO: this may affect training runs that are run with evaluation mid-run.

    assert (
        tasks != []
    ), "No tasks specified, or no tasks found. Please verify the task names."

    if isinstance(model, str):
        if model_args is None:
            model_args = ""
        lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
            model_args,
            {
                "batch_size": batch_size,
                "max_batch_size": max_batch_size,
                "device": device,
            },
        )
    else:
        assert isinstance(model, lm_eval.api.model.LM)
        lm = model

    if use_cache is not None:
        print(f"Using cache at {use_cache + '_rank' + str(lm.rank) + '.db'}")
        lm = lm_eval.api.model.CachingLM(
            lm,
            use_cache
            # each rank receives a different cache db.
            # necessary to avoid multiple writes to cache at once
            + "_rank" + str(lm.rank) + ".db",
        )

    task_dict = lm_eval.tasks.get_task_dict(tasks)
    for task_name in task_dict.keys():
        task_obj = task_dict[task_name]
        if type(task_obj) == tuple:
            group, task_obj = task_obj
            if task_obj is None:
                continue

        config = task_obj._config
        if num_fewshot is not None:
            if config["num_fewshot"] > 0:
                default_num_fewshot = config["num_fewshot"]
                eval_logger.warning(
                    f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
                )

            task_obj._config["num_fewshot"] = num_fewshot

    if check_integrity:
        run_task_tests(task_list=tasks)

    results = evaluate(
        lm=lm,
        task_dict=task_dict,
        limit=limit,
        bootstrap_iters=bootstrap_iters,
        decontamination_ngrams_path=decontamination_ngrams_path,
        write_out=write_out,
        log_samples=log_samples,
    )

    if lm.rank == 0:
        # add info about the model and few shot config
        results["config"] = {
            "model": model
            if isinstance(model, str)
            else model.model.config._name_or_path,
            "model_args": model_args,
            "batch_size": batch_size,
            "batch_sizes": list(lm.batch_sizes.values())
            if hasattr(lm, "batch_sizes")
            else [],
            "device": device,
            "use_cache": use_cache,
            "limit": limit,
            "bootstrap_iters": bootstrap_iters,
        }
        results["git_hash"] = get_git_commit_hash()
        return results
    else:
        return None


decontaminate_suffix = "_decontaminate"


@positional_deprecated
def evaluate(
    lm,
    task_dict,
    limit=None,
    bootstrap_iters: int = 100000,
    decontamination_ngrams_path=None,
    write_out: bool = False,
    log_samples: bool = True,
):
    """Instantiate and evaluate a model on a list of tasks.

    :param lm: obj
        Language Model
    :param task_dict: dict[str, Task]
        Dictionary of tasks. Tasks will be taken to have name type(task).config.task.
    :param limit: int, optional
        Limit the number of examples per task (only use this for testing)
    :param bootstrap_iters:
        Number of iterations for bootstrap statistics
    :param write_out: bool
        If True, write out an example document and model input for checking task integrity
    :param log_samples: bool
        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
    :return
        Dictionary of results
    """

    # decontaminate = decontamination_ngrams_path is not None

    # stores the final result for each task, for each metric/filter pair.
    results = collections.defaultdict(dict)
    # Tracks each task's version.
    versions = collections.defaultdict(dict)
    # Tracks the YAML configs of all chosen tasks.
    configs = collections.defaultdict(dict)
    # logs info about each document evaluated.
    samples = collections.defaultdict(list)
    # tracks all Instances/requests a model must generate output on.
    requests = collections.defaultdict(list)
    # Aggregated task scores presented with groups
    results_agg = collections.defaultdict(dict)
    # Aggregated groups scores only
    groups_agg = collections.defaultdict(dict)
    # stores the amount to pad out reqs per req. type so that
    # number of fwd passes per distributed rank is equal
    padding_requests = collections.defaultdict(int)
    # store the hierarchy to do proper ordering
    task_hierarchy = collections.defaultdict(list)
    # store the ordering of tasks and groups
    task_order = collections.defaultdict(int)
    # store the aggregation for aggregating across tasks in the same group
    sample_agg_fn = collections.defaultdict(dict)

    # get lists of each type of request
    for task_name, task in task_dict.items():
        if type(task) == tuple:
            group_name, task = task
            task_hierarchy[group_name].append(task_name)
        else:
            task_hierarchy[task_name] = []

        if task is None:
            continue

        versions[task_name] = task.VERSION
        configs[task_name] = dict(task.dump_config())

        if limit is not None:
            if task.has_test_docs():
                task_docs = task.test_docs()
            elif task.has_validation_docs():
                task_docs = task.validation_docs()
            else:
                raise RuntimeError("Task has neither test_docs nor validation_docs")
            limit = int(len(task_docs) * limit) if limit < 1.0 else int(limit)

        task.build_all_requests(limit=limit, rank=lm.rank, world_size=lm.world_size)

        eval_logger.info(
            f"Task: {task_name}; number of requests on this rank: {len(task.instances)}"
        )

        if write_out:
            for inst in task.instances:
                # print the prompt for the first few documents
                if inst.doc_id < 1:
                    eval_logger.info(
                        f"Task: {task_name}; document {inst.doc_id}; context prompt (starting on next line):\
\n{inst.args[0]}\n(end of prompt on previous line)\ntarget string or answer choice index (starting on next line):\n{task.doc_to_target(inst.doc)}\n(end of target on previous line)"
                    )
                    eval_logger.info(f"Request: {str(inst)}")

        # aggregate Instances by LM method requested to get output.
        reqtype = (
            "loglikelihood"
            if task.OUTPUT_TYPE == "multiple_choice"
            else task.OUTPUT_TYPE
        )  # TODO: this is hacky, fix in task.py
        requests[reqtype].extend(task.instances)

        if lm.world_size > 1:
            instances_rnk = torch.tensor(len(task._instances), device=lm.device)
            gathered_item = (
                lm.accelerator.gather(instances_rnk).cpu().detach().numpy().tolist()
            )

            # compute number of pseudobatches to pad with (FSDP/DDP require even batches among ranks)
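            # e.g. (illustrative numbers): if gathered_item == [10, 8] across two ranks,
            # rank 1 gets numpad == 2 and pads with duplicates of its last request so
            # per-rank request counts match.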
            numpad = max(gathered_item) - gathered_item[lm.rank]
            padding_requests[task.OUTPUT_TYPE] += numpad

    ### Run LM on inputs, get all outputs ###
    # execute each type of request
    for reqtype, reqs in requests.items():
        eval_logger.info("Running {} requests".format(reqtype))
        # create `K` copies of each request `req` based off `K = req.repeats`
        cloned_reqs = []
        for req in reqs:
            cloned_reqs.extend([req] * req.repeats)

        if (lm.world_size > 1) and (padding_requests[reqtype] > 0):
            for _ in range(padding_requests[reqtype]):
                cloned_reqs.extend([req] * req.repeats)

        # run requests through model
        resps = getattr(lm, reqtype)(cloned_reqs)

        # put responses from model into a list of length K for each request.
        for x, req in zip(resps, cloned_reqs):
            req.resps.append(x)

        if lm.world_size > 1:
            lm.accelerator.wait_for_everyone()

    ### Postprocess outputs ###
    # TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
    for task_name, task in task_dict.items():
        if type(task) == tuple:
            group, task = task
            if task is None:
                continue
        task.apply_filters()

    ### Collect values of metrics on all datapoints ###
    vals = collections.defaultdict(list)

    # unpack results and sort back in order and return control to Task
    for task_name, task in task_dict.items():
        if type(task) == tuple:
            group, task = task
            if task is None:
                continue
        # TODO: make it possible to use a different metric per filter
        # iterate over different filters used
        for key in task.instances[0].filtered_resps.keys():
            num_requests = 0
            doc_iterator = (
                itertools.islice(
                    enumerate(task.test_docs()), lm.rank, limit, lm.world_size
                )
                if task.has_test_docs()
                else itertools.islice(
                    enumerate(task.validation_docs()), lm.rank, limit, lm.world_size
                )
            )
            for doc_id, doc in doc_iterator:
                # subset instances to only this document id ; sort by idx
                requests = list(filter(lambda x: x.doc_id == doc_id, task.instances))
                requests.sort(key=lambda x: x.idx)
                metrics = task.process_results(
                    doc, [req.filtered_resps[key] for req in requests]
                )
                if log_samples:
                    target = task.doc_to_target(doc)
                    example = {
                        "doc_id": doc_id,
                        "doc": doc,
                        "target": target,
                        "arguments": [req.args for req in requests],
                        "resps": [req.resps for req in requests],
                        "filtered_resps": [req.filtered_resps[key] for req in requests],
                    }
                    example.update(metrics)
                    samples[task_name].append(example)
                for metric, value in metrics.items():
                    vals[(task_name, key, metric)].append(value)
                num_requests += 1
        num_requests = torch.tensor(num_requests, device=lm.device)

    if lm.world_size > 1:
        # if multigpu, then gather data across all ranks
        # first gather logged samples across all ranks
        for task_name, task_samples in list(samples.items()):
            full_samples = [None] * lm.world_size
            torch.distributed.all_gather_object(full_samples, task_samples)

            samples[task_name] = list(itertools.chain.from_iterable(full_samples))

        # then collect metrics across all ranks
        vals_torch = collections.defaultdict(list)
        for (task_name, key, metric), items in vals.items():
            numitem = 0
            if type(items[0]) == tuple:
                numitem = len(items[0])

            if isinstance(items[0], (str, list)):
                # handle the string case
                gathered_items = [None] * lm.accelerator.num_processes
                torch.distributed.all_gather_object(gathered_items, items)

                gathered_item = list(itertools.chain.from_iterable(gathered_items))
            else:
                # distributed gather requires all ranks to have same dimensions
                # so we pad out with float32 min value
                pad_value = torch.finfo(torch.float32).min
                metrics_tensor = torch.tensor(items, device=lm.device)
                original_dtype = metrics_tensor.dtype  # store original dtype
                # pad each rank's tensor to a common length, then gather across ranks
                torch_device_tensor = lm.accelerator.pad_across_processes(
                    metrics_tensor.to(torch.float32), pad_index=pad_value
                )

                gathered_item = lm.accelerator.gather(torch_device_tensor)
                if numitem > 0:
                    gathered_filtered = gathered_item[gathered_item[:, 0] != pad_value]
                else:
                    gathered_filtered = gathered_item[gathered_item != pad_value]
                # gathered_sizes = lm.accelerator.gather(num_requests)
                # sizes = torch.stack(output_tensors)
                # if lm.rank == 0:
                #     print(gathered_sizes)
                # max_size = 26834
                # # Use max size to pad
                # metrics_tensor = metrics_tensor.to(torch.float32)
                # if max_size != metrics_tensor.shape[0]:
                #     old_size = metrics_tensor.shape
                #     new_size = list(old_size)
                #     new_size[0] = max_size
                #     device_tensor = metrics_tensor.new_zeros(tuple(new_size)) + pad_value
                #     indices = tuple(
                #         slice(0, old_size[0]) if i == 0 else slice(None)
                #         for i in range(len(new_size))
                #     )
                #     device_tensor[indices] = metrics_tensor
                # else:
                #     device_tensor = metrics_tensor
                # gathered_item = lm.accelerator.gather(device_tensor)

                gathered_item = (
                    gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist()
                )
                # reconvert if we were passed a tuple of values
                if numitem > 0:
                    gathered_item = [tuple(g) for g in gathered_item]

            if lm.rank == 0:
                vals_torch[(task_name, key, metric)] = gathered_item

        vals = vals_torch

    if lm.rank == 0:
        ### Get task ordering for correct sample-wide aggregation
        group_to_task = {}
        for group in task_hierarchy.keys():
            if group not in task_order:
                task_order[group] = 0

            if len(task_hierarchy[group]) > 0:
                group_to_task[group] = task_hierarchy[group].copy()

            for task in task_hierarchy[group]:
                if task in task_order:
                    task_order[task] += 1
                else:
                    task_order[task] = 1 + task_order[group]

                if task in task_hierarchy:
                    group_to_task[group].remove(task)
                    group_to_task[group].extend(task_hierarchy[task])

        task_to_group = {}
        for group in group_to_task:
            for task in group_to_task[group]:
                if task in task_to_group:
                    task_to_group[task].append(group)
                else:
                    task_to_group[task] = [group]
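        # Illustrative (assumed names) example: for task_hierarchy == {"glue": ["cola", "sst2"]}
        # the loops above yield group_to_task == {"glue": ["cola", "sst2"]},
        # task_to_group == {"cola": ["glue"], "sst2": ["glue"]}, and
        # task_order == {"glue": 0, "cola": 1, "sst2": 1}.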

        ### Aggregate results over all datapoints ###
        # aggregate results ; run bootstrap CIs
        for (task_name, key, metric), items in vals.items():
            task = task_dict[task_name]
            metric_key = metric + "," + key

            if type(task) == tuple:
                group_name, task = task
            else:
                group_name = None

            agg_fn = task.aggregation()[metric]
            task_score = agg_fn(items)

            if group_name is not None:
                sample_metric_key = metric + "(sample agg)," + key
                for grouping in task_to_group[task_name]:
                    if metric_key in results[grouping]:
                        results[grouping][metric_key].append(task_score)
                    else:
                        results[grouping][metric_key] = [task_score]

                    if sample_metric_key in results[grouping]:
                        results[grouping][sample_metric_key] += items
                    else:
                        results[grouping][sample_metric_key] = items.copy()
                        sample_agg_fn[grouping][sample_metric_key] = agg_fn

            results[task_name][metric_key] = task_score

            # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
            # so we run them less iterations. still looking for a cleaner way to do this
            if bootstrap_iters > 0:
                stderr = lm_eval.api.metrics.stderr_for_metric(
                    metric=task.aggregation()[metric],
                    bootstrap_iters=min(bootstrap_iters, 1000)
                    if metric in ["bleu", "chrf", "ter"]
                    else bootstrap_iters,
                )

                if stderr is not None:
                    results[task_name][metric + "_stderr" + "," + key] = stderr(items)

        if bool(results):
            for task_or_group in results.keys():
                for metric in results[task_or_group].keys():
                    if type(results[task_or_group][metric]) == list:
                        if "(sample agg)" in metric:
                            results[task_or_group][metric] = sample_agg_fn[
                                task_or_group
                            ][metric](results[task_or_group][metric])
                        else:
                            results[task_or_group][metric] = np.average(
                                results[task_or_group][metric]
                            )
                        versions[task_or_group] = "N/A"

        for task_name, task in task_dict.items():
            if type(task) == tuple:
                group_name, task = task
                order = task_order[group_name]
                tabbed_name = "-" * order + group_name
                results_agg[tabbed_name] = results[group_name]
                versions[tabbed_name] = versions[group_name]
                if order == 0:
                    groups_agg[group_name] = results[group_name]

            order = task_order[task_name]
            tabbed_name = "-" * order + task_name
            results_agg[tabbed_name] = results[task_name]
            versions[tabbed_name] = versions[task_name]

        results_dict = {
            "results": dict(results_agg.items()),
            **({"groups": dict(groups_agg.items())} if bool(groups_agg) else {}),
            "configs": dict(sorted(configs.items())),
            "versions": dict(sorted(versions.items())),
        }
        if log_samples:
            results_dict["samples"] = dict(samples)

        return results_dict

    else:
        return None