import itertools
import json
import logging
import os
import random
import time
from collections import defaultdict
from typing import TYPE_CHECKING, List, Optional, Union

import numpy as np
import torch

import lm_eval.api.metrics
import lm_eval.api.registry
import lm_eval.api.task
import lm_eval.models
from lm_eval.caching.cache import delete_cache
from lm_eval.evaluator_utils import (
    consolidate_group_results,
    consolidate_results,
    get_sample_size,
    get_subtask_list,
    get_task_list,
    prepare_print_tasks,
    print_writeout,
    run_task_tests,
)
from lm_eval.loggers import EvaluationTracker
from lm_eval.loggers.utils import add_env_info, add_tokenizer_info, get_git_commit_hash
from lm_eval.tasks import TaskManager, get_task_dict
from lm_eval.utils import (
    get_logger,
    handle_non_serializable,
    hash_dict_images,
    hash_string,
    positional_deprecated,
    simple_parse_args_string,
)


if TYPE_CHECKING:
    from lm_eval.api.model import LM
    from lm_eval.api.task import Task

eval_logger = logging.getLogger(__name__)

@positional_deprecated
def simple_evaluate(
    model,
    model_args: Optional[Union[str, dict]] = None,
    tasks: Optional[List[Union[str, dict, object]]] = None,
    num_fewshot: Optional[int] = None,
    batch_size: Optional[Union[int, str]] = None,
    max_batch_size: Optional[int] = None,
    device: Optional[str] = None,
    use_cache: Optional[str] = None,
    cache_requests: bool = False,
    rewrite_requests_cache: bool = False,
    delete_requests_cache: bool = False,
    limit: Optional[Union[int, float]] = None,
    samples: Optional[dict] = None,
    bootstrap_iters: int = 100000,
    check_integrity: bool = False,
    write_out: bool = False,
    log_samples: bool = True,
    evaluation_tracker: Optional[EvaluationTracker] = None,
    system_instruction: Optional[str] = None,
    apply_chat_template: Union[bool, str] = False,
    fewshot_as_multiturn: bool = False,
    gen_kwargs: Union[str, dict, None] = None,
    task_manager: Optional[TaskManager] = None,
    verbosity=None,
    predict_only: bool = False,
    random_seed: int = 0,
    numpy_random_seed: int = 1234,
    torch_random_seed: int = 1234,
    fewshot_random_seed: int = 1234,
    confirm_run_unsafe_code: bool = False,
    metadata: Optional[dict] = None,
):
    """Instantiate and evaluate a model on a list of tasks.

    :param model: Union[str, LM]
        Name of model or LM object, see lm_eval.models.get_model
    :param model_args: Optional[str, dict]
        String or dict arguments for each model class, see LM.create_from_arg_string and LM.create_from_arg_object.
        Ignored if `model` argument is an LM object.
    :param tasks: list[Union[str, dict, Task]]
        List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
    :param num_fewshot: int
        Number of examples in few-shot context
    :param batch_size: int or str, optional
        Batch size for model
    :param max_batch_size: int, optional
        Maximal batch size to try with automatic batch size detection
    :param device: str, optional
        PyTorch device (e.g. "cpu" or "cuda:0") for running models
    :param use_cache: str, optional
        A path to a sqlite db file for caching model responses. `None` if not caching.
    :param cache_requests: bool, optional
        Speed up evaluation by caching the building of dataset requests. `None` if not caching.
    :param rewrite_requests_cache: bool, optional
        Rewrites all the request cache if set to `True`. `None` if not desired.
    :param delete_requests_cache: bool, optional
        Deletes all the request cache if set to `True`. `None` if not desired.
    :param limit: int or float, optional
        Limit the number of examples per task (only use this for testing). If <1, limit is interpreted as a percentage of the total number of examples.
    :param samples: dictionary, optional
        Dictionary indicating which examples should be tested in each task, e.g., {"mmlu_astronomy":[0,3,6],"mmlu_anatomy":[1,4,7,10]}.
    :param bootstrap_iters:
        Number of iterations for bootstrap statistics, used when calculating stderrs. Set to 0 to skip all stderr calculations.
    :param check_integrity: bool
        Whether to run the relevant part of the test suite for the tasks
    :param write_out: bool
        If True, write out an example document and model input for checking task integrity
    :param log_samples: bool
        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
    :param system_instruction: str
        System instruction to be applied to the prompt
    :param apply_chat_template: Union[bool, str]
        Specifies whether to apply a chat template to the prompt.
        - If set to True, the default chat template is applied.
        - If set to a string, applies the specified chat template by name.
        Defaults to False (no chat template applied).
    :param fewshot_as_multiturn: bool
        Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
    :param gen_kwargs: dict or comma-separated string
        Arguments for model generation. Ignored for all tasks with loglikelihood output_type.
    :param verbosity: str
        Verbosity level for logging
    :param predict_only: bool
        If True, only model outputs will be generated and returned; metrics will not be evaluated.
    :param random_seed: int
        Random seed for python's random module. If set to None, the seed will not be set.
    :param numpy_random_seed: int
        Random seed for numpy. If set to None, the seed will not be set.
    :param torch_random_seed: int
        Random seed for torch. If set to None, the seed will not be set.
    :param fewshot_random_seed: int
        Random seed for fewshot sampler random generator. If set to None, the seed of generator will be set to None.
    :param metadata: dict
        Additional metadata to be added to the task manager. Will get passed to the download function of the task.
    :return:
        Dictionary of results
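
    Example (illustrative sketch, not part of the original docstring; assumes the `hf`
    backend is installed and the `hellaswag` task and `EleutherAI/pythia-160m` checkpoint
    are available locally -- adjust model, task, and batch size to your setup):

        >>> import lm_eval
        >>> results = lm_eval.simple_evaluate(
        ...     model="hf",
        ...     model_args="pretrained=EleutherAI/pythia-160m",
        ...     tasks=["hellaswag"],
        ...     num_fewshot=0,
        ...     batch_size=8,
        ... )
        >>> results["results"]["hellaswag"]["acc,none"]  # aggregated accuracy for the task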
    """
    if verbosity is not None:
        get_logger(verbosity)
    start_date = time.time()

    if limit is not None and samples is not None:
        raise ValueError(
            "Either 'limit' or 'samples' must be None, but both are not None."
        )

    _NEEDS_CHAT_TEMPLATE = ("inst", "chat")
    if (
        (
            isinstance(model_args, str)
            and any(kw in model_args.lower() for kw in _NEEDS_CHAT_TEMPLATE)
        )
        or (
            isinstance(model_args, dict)
            and any(
                any(kw in str(v).lower() for kw in _NEEDS_CHAT_TEMPLATE)
                for v in model_args.values()
            )
        )
    ) and not apply_chat_template:
        eval_logger.warning(
            "Model appears to be an instruct or chat variant but chat template is not applied. "
            "Recommend setting `apply_chat_template` (optionally `fewshot_as_multiturn`)."
        )

    if delete_requests_cache:
        eval_logger.info("Deleting requests cache...")
        delete_cache()

    seed_message = []
    if random_seed is not None:
        # See https://github.com/EleutherAI/lm-evaluation-harness/pull/1412
        seed_message.append(f"Setting random seed to {random_seed}")
        random.seed(random_seed)

    if numpy_random_seed is not None:
        seed_message.append(f"Setting numpy seed to {numpy_random_seed}")
        np.random.seed(numpy_random_seed)

    if torch_random_seed is not None:
        seed_message.append(f"Setting torch manual seed to {torch_random_seed}")
        torch.manual_seed(torch_random_seed)

    if fewshot_random_seed is not None:
        seed_message.append(f"Setting fewshot manual seed to {fewshot_random_seed}")

    if seed_message:
        eval_logger.info(" | ".join(seed_message))

    if tasks is None:
        tasks = []
    if len(tasks) == 0:
        raise ValueError(
            "No tasks specified, or no tasks found. Please verify the task names."
        )

    if gen_kwargs is not None:
        if isinstance(gen_kwargs, str):
            gen_kwargs = simple_parse_args_string(gen_kwargs)
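            # e.g. a string such as "temperature=0.7,top_p=0.95,do_sample=True" parses to
            # {"temperature": 0.7, "top_p": 0.95, "do_sample": True} (illustrative values)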
        eval_logger.warning(
            f"generation_kwargs: {gen_kwargs} specified through CLI; these settings will update the generation parameters set in task YAML configs. "
            "Ensure 'do_sample=True' for non-greedy decoding!"
        )
        if not gen_kwargs:
            gen_kwargs = None

    if isinstance(model, str):
        if model_args is None:
            eval_logger.warning("model_args not specified. Using defaults.")
            model_args = ""

        if isinstance(model_args, dict):
            eval_logger.info(
                f"Initializing {model} model, with arguments: {model_args}"
            )
            lm = lm_eval.api.registry.get_model(model).create_from_arg_obj(
                model_args,
                {
                    "batch_size": batch_size,
                    "max_batch_size": max_batch_size,
                    "device": device,
                },
            )

        else:
            eval_logger.info(
                f"Initializing {model} model, with arguments: {simple_parse_args_string(model_args)}"
            )
            lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
                model_args,
                {
                    "batch_size": batch_size,
                    "max_batch_size": max_batch_size,
                    "device": device,
                },
            )
    else:
        if not isinstance(model, lm_eval.api.model.LM):
            raise TypeError(
                f"The value of `model` passed to simple_evaluate() was of type {type(model)}, but is required to be a subclass of lm_eval.api.model.LM . This may be because you are passing an initialized Hugging Face PreTrainedModel without having wrapped it in `lm_eval.models.huggingface.HFLM(pretrained=my_model)` first."
            )
        eval_logger.info("Using pre-initialized model")
        lm = model

    if use_cache is not None:
        eval_logger.info(f"Using cache at {use_cache + '_rank' + str(lm.rank) + '.db'}")
        lm = lm_eval.api.model.CachingLM(
            lm,
            use_cache
            # each rank receives a different cache db.
            # necessary to avoid multiple writes to cache at once
            + "_rank"
            + str(lm.rank)
            + ".db",
        )
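        # note: each rank gets its own sqlite file, e.g. use_cache="/tmp/lm_cache"
        # resolves to "/tmp/lm_cache_rank0.db" on rank 0 (illustrative path)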

    if task_manager is None:
        metadata = (
            simple_parse_args_string(model_args)
            if isinstance(model_args, str)
            else model_args
            if isinstance(model_args, dict)
            else {}
        ) | (metadata or {})
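        # keys in the user-supplied `metadata` dict take precedence over keys parsed
        # from model_args (dict union: the right-hand operand wins on conflicts)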
        task_manager = TaskManager(metadata=metadata)

    task_dict = get_task_dict(
        tasks,
        task_manager,
    )

    # helper function to recursively apply config overrides to leaf subtasks, skipping their constituent groups.
    # (setting of num_fewshot ; bypassing metric calculation ; setting fewshot seed)
    def _adjust_config(task_dict):
        adjusted_task_dict = {}
        for task_name, task_obj in task_dict.items():
            if isinstance(task_obj, dict):
                adjusted_task_dict = {
                    **adjusted_task_dict,
                    **{task_name: _adjust_config(task_obj)},
                }

            else:
                if task_obj.get_config("output_type") == "generate_until":
                    if gen_kwargs is not None:
                        task_obj.set_config(
                            key="generation_kwargs", value=gen_kwargs, update=True
                        )
                    eval_logger.info(
                        f"{task_obj.config.task}: Using gen_kwargs: {task_obj.config.generation_kwargs}"
                    )

                if predict_only:
                    eval_logger.info(
                        f"Processing {task_name} in output-only mode. Metrics will not be calculated!"
                    )
                    # we have to change the class properties post-hoc. This is pretty hacky.
                    task_obj.override_metric(metric_name="bypass")

                # override tasks' fewshot values to the provided num_fewshot arg value
                # except if tasks have it set to 0 manually in their configs--then we should never overwrite that
                if num_fewshot is not None:
                    if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0:
                        eval_logger.info(
                            f"num_fewshot has been set to 0 for {task_name} in its config. Manual configuration will be ignored."
                        )
                    else:
                        eval_logger.warning(
                            f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
                        )
                        task_obj.set_config(key="num_fewshot", value=num_fewshot)
                else:
                    # if num_fewshot not provided, and the task does not define a default one, default to 0
                    if (
                        default_num_fewshot := task_obj.get_config("num_fewshot")
                    ) is None:
                        task_obj.set_config(key="num_fewshot", value=0)
                # fewshot_random_seed set for tasks, even with a default num_fewshot (e.g. in the YAML file)
                task_obj.set_fewshot_seed(seed=fewshot_random_seed)

                adjusted_task_dict[task_name] = task_obj

        return adjusted_task_dict

    task_dict = _adjust_config(task_dict)

    if check_integrity:
        run_task_tests(task_list=tasks)

    if evaluation_tracker is not None:
        evaluation_tracker.general_config_tracker.log_experiment_args(
            model_source=model,
            model_args=model_args,
            system_instruction=system_instruction,
            chat_template=lm.chat_template(apply_chat_template)
            if apply_chat_template
            else None,
            fewshot_as_multiturn=fewshot_as_multiturn,
        )

    results = evaluate(
        lm=lm,
        task_dict=task_dict,
        limit=limit,
        samples=samples,
        cache_requests=cache_requests,
        rewrite_requests_cache=rewrite_requests_cache,
        bootstrap_iters=bootstrap_iters,
        write_out=write_out,
        log_samples=True if predict_only else log_samples,
        system_instruction=system_instruction,
        apply_chat_template=apply_chat_template,
        fewshot_as_multiturn=fewshot_as_multiturn,
        verbosity=verbosity,
        confirm_run_unsafe_code=confirm_run_unsafe_code,
    )

    if lm.rank == 0:
        if isinstance(model, str):
            model_name = model
        elif hasattr(model, "config") and hasattr(model.config, "_name_or_path"):
            model_name = model.config._name_or_path
        else:
            model_name = type(model).__name__

        # add info about the model and few shot config
        results["config"] = {
            "model": model_name,
            "model_args": model_args,
        }
        # add more detailed model info if available
        if isinstance(lm, lm_eval.models.huggingface.HFLM):
            results["config"].update(lm.get_model_info())
        # add info about execution
        results["config"].update(
            {
                "batch_size": batch_size,
                "batch_sizes": (
                    list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else []
                ),
                "device": device,
                "use_cache": use_cache,
                "limit": limit,
                "bootstrap_iters": bootstrap_iters,
                "gen_kwargs": gen_kwargs,
                "random_seed": random_seed,
                "numpy_seed": numpy_random_seed,
                "torch_seed": torch_random_seed,
                "fewshot_seed": fewshot_random_seed,
            }
        )
        results["git_hash"] = get_git_commit_hash()
        results["date"] = start_date
        add_env_info(results)  # additional environment info to results
        add_tokenizer_info(results, lm)  # additional info about tokenizer
        return results
    else:
        return None


@positional_deprecated
def evaluate(
    lm: "LM",
    task_dict,
    limit: Optional[int] = None,
    samples: Optional[dict] = None,
    cache_requests: bool = False,
    rewrite_requests_cache: bool = False,
    bootstrap_iters: Optional[int] = 100000,
    write_out: bool = False,
    log_samples: bool = True,
    system_instruction: Optional[str] = None,
    apply_chat_template: Union[bool, str] = False,
    fewshot_as_multiturn: bool = False,
    verbosity: str = "INFO",
    confirm_run_unsafe_code: bool = False,
):
    """Instantiate and evaluate a model on a list of tasks.

    :param lm: obj
        Language Model
    :param task_dict: dict[str, Task]
        Dictionary of tasks. Tasks will be taken to have name type(task).config.task.
    :param limit: int, optional
        Limit the number of examples per task (only use this for testing)
    :param samples: dictionary, optional
        Dictionary indicating which examples should be tested in each task, e.g., {"mmlu_astronomy":[0,3,6],"mmlu_anatomy":[1,4,7,10]}.
    :param cache_requests: bool, optional
        Speed up evaluation by caching the building of dataset requests.
    :param rewrite_requests_cache: bool, optional
        Rewrites all the request cache if set to `True`.
    :param bootstrap_iters:
        Number of iterations for bootstrap statistics, used when calculating stderr. Set to 0 to skip all stderr calculations.
    :param write_out: bool
        If True, write out an example document and model input for checking task integrity
    :param log_samples: bool
        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
    :param system_instruction: str
        System instruction to be applied to the prompt
    :param apply_chat_template: Union[bool, str]
        Specifies whether to apply a chat template to the prompt.
        - If set to True, the default chat template is applied.
        - If set to a string, applies the specified chat template by name.
        Defaults to False (no chat template applied).
    :param fewshot_as_multiturn: bool
        Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
    :param verbosity: str
        Verbosity level for logging
    :param confirm_run_unsafe_code: bool
        Whether to confirm running tasks marked as unsafe.
    :return:
        Dictionary of results
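
    Example (illustrative sketch, not part of the original docstring; assumes `lm` is an
    already-initialized `lm_eval.api.model.LM` instance and the `hellaswag` task is
    available; `simple_evaluate` is the higher-level entry point and applies additional
    config adjustments before calling this function):

        >>> from lm_eval.tasks import TaskManager, get_task_dict
        >>> task_dict = get_task_dict(["hellaswag"], TaskManager())
        >>> results = evaluate(lm=lm, task_dict=task_dict, limit=10)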
    """

    if limit is not None and samples is not None:
        raise ValueError(
            "Either 'limit' or 'samples' must be None, but both are not None."
        )
    if samples is not None:
        eval_logger.info(f"Evaluating examples for tasks {list(samples.keys())}")
    if apply_chat_template:
        eval_logger.warning(
            "Chat template formatting change affects loglikelihood and multiple-choice tasks. See docs/chat-template-readme.md for details."
        )
    # tracks all Instances/requests a model must generate output on.
    requests = defaultdict(list)
    # stores the amount to pad out reqs per req. type so that
    # number of fwd passes per distributed rank is equal
    padding_requests = defaultdict(int)

    # get lists of group hierarchy and each type of request
    eval_tasks = get_task_list(task_dict)
    if not log_samples:
        if not all(
            "bypass" not in getattr(task_output.task, "_metric_fn_list", {}).keys()
            for task_output in eval_tasks
        ):
            raise ValueError("log_samples must be True for 'bypass' metric-only tasks")

    # validation checks:
    # 1.are we running multimodal task <-> non-multimodal model class, or vice-versa.
    # 2.are we running code that is marked as unsafe.
    incompatible_tasks = []
    for task_output in eval_tasks:
        task: Task = task_output.task

        if getattr(task, "MULTIMODAL", False) and not getattr(lm, "MULTIMODAL", False):
            incompatible_tasks.append(task_output.task_name)
        elif getattr(task, "UNSAFE_CODE", False) and not confirm_run_unsafe_code:
            raise ValueError(
                f"Attempted to run task: {task_output.task_name} which is marked as unsafe. Set confirm_run_unsafe_code=True to run this task."
            )
    if len(incompatible_tasks) > 0:
        if not getattr(lm, "MULTIMODAL", False):
            raise ValueError(
                f"Attempted to run tasks: {incompatible_tasks} which require multimodal input, but the selected model type does not currently implement this. Multimodal support is currently restricted to the ['hf-multimodal', 'vllm-vlm'] model type."
            )
    # end validation check

    # Cache the limit arg.
    limit_arg = limit
    limits = []
    for task_output in eval_tasks:
        task: Task = task_output.task

        limit = get_sample_size(task, limit_arg)
        limits.append(limit)
        task.build_all_requests(
            limit=limit,
            samples=samples.get(task_output.task_name, None)
            if samples is not None
            else samples,
            rank=lm.rank,
            world_size=lm.world_size,
            cache_requests=cache_requests,
            rewrite_requests_cache=rewrite_requests_cache,
            system_instruction=system_instruction,
            apply_chat_template=bool(apply_chat_template),
            fewshot_as_multiturn=fewshot_as_multiturn,
            chat_template=getattr(lm, "apply_chat_template")
            if apply_chat_template
            else None,
            tokenizer_name=getattr(lm, "tokenizer_name", "")
            if apply_chat_template
            else "",
        )
        eval_logger.debug(
            f"Task: {task_output.task_name}; number of requests on this rank: {len(task.instances)}"
        )
        if write_out:
            print_writeout(task)
        # aggregate Instances by LM method requested to get output.
        for instance in task.instances:
            reqtype = instance.request_type
            requests[reqtype].append(instance)

        if lm.world_size > 1:
            instances_rnk = torch.tensor(len(task._instances), device=lm.device)
            gathered_item = (
                lm.accelerator.gather(instances_rnk).cpu().detach().numpy().tolist()
            )
            # "multiple_choice" task types dispatch (several) "loglikelihood" request types
            reqtype = (
                "loglikelihood"
                if task.OUTPUT_TYPE == "multiple_choice"
                else task.OUTPUT_TYPE
            )
            # compute number of pseudo-batches to pad with (FSDP/DDP require even batches among ranks)
            numpad = max(gathered_item) - gathered_item[lm.rank]
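            # e.g. with per-rank instance counts [10, 12], rank 0 pads by 2 dummy
            # requests and rank 1 pads by 0 (illustrative counts)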
            # todo: may not account for padding in cases like SquadV2 which has multiple req types
            padding_requests[reqtype] += numpad

    ### Run LM on inputs, get all outputs ###
    # execute each type of request
    for reqtype, reqs in requests.items():
        eval_logger.info(f"Running {reqtype} requests")
        # create `K` copies of each request `req` based off `K = req.repeats`
        cloned_reqs = []
        for req in reqs:
            cloned_reqs.extend([req] * req.repeats)

        if (lm.world_size > 1) and (padding_requests[reqtype] > 0):
            for _ in range(padding_requests[reqtype]):
                cloned_reqs.extend([req] * req.repeats)

        # run requests through model
        resps = getattr(lm, reqtype)(cloned_reqs)

        # put responses from model into a list of length K for each request.
        for x, req in zip(resps, cloned_reqs):
            req.resps.append(x)

        if lm.world_size > 1:
            lm.accelerator.wait_for_everyone()

    RANK = lm.rank
    WORLD_SIZE = lm.world_size
    ### Postprocess outputs ###
    # TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
    for task_output, limit in zip(eval_tasks, limits):
        task = task_output.task
        task.apply_filters()

        ### Collect values of metrics on all datapoints ###
        # # unpack results and sort back in order and return control to Task
        # TODO: make it possible to use a different metric per filter
        # Pre-process task.instances to group by doc_id
        instances_by_doc_id = defaultdict(list)
        for instance in task.instances:
            instances_by_doc_id[instance.doc_id].append(instance)
        # Sort instances within each group
        for instances in instances_by_doc_id.values():
            instances.sort(key=lambda x: x.idx)
        # iterate over different filters used
        for filter_key in task.instances[0].filtered_resps.keys():
            indices = (
                samples.get(task_output.task_name, None)
                if samples is not None
                else None
            )
            doc_iterator = task.doc_iterator(
                rank=RANK,
                limit=limit,
                world_size=WORLD_SIZE,
                samples=indices,
            )
            for doc_id, doc in doc_iterator:
                if indices:
                    doc_id_true = indices[doc_id]
                else:
                    doc_id_true = doc_id
                requests = instances_by_doc_id[doc_id]
                metrics = task.process_results(
                    doc, [req.filtered_resps[filter_key] for req in requests]
                )
                if log_samples:
                    target = task.doc_to_target(doc)
                    example = {
                        "doc_id": doc_id_true,
                        "doc": doc,
                        "target": target,
                        "arguments": [req.args for req in requests],
                        "resps": [req.resps for req in requests],
                        "filtered_resps": [
                            req.filtered_resps[filter_key] for req in requests
                        ],
                        "filter": filter_key,
                        "metrics": list(metrics.keys()),
                        "doc_hash": hash_string(
                            json.dumps(
                                requests[0].doc,
                                indent=2,
                                default=handle_non_serializable,
                                ensure_ascii=False,
                            )
                        ),
                        "prompt_hash": hash_string(requests[0].arguments[0]),
                        "target_hash": hash_string(str(target)),
                    }
                    example.update(metrics)
                    task_output.logged_samples.append(example)
                for metric, value in metrics.items():
                    task_output.sample_metrics[(metric, filter_key)].append(value)

    if WORLD_SIZE > 1:
        # if multigpu, then gather data across all ranks to rank 0
        # first gather logged samples across all ranks
        for task_output in eval_tasks:
            if log_samples:
                # for task_name, task_samples in list(samples.items()):
                full_samples = [None] * WORLD_SIZE if RANK == 0 else None
                torch.distributed.gather_object(
                    obj=task_output.logged_samples,
                    object_gather_list=full_samples,
                    dst=0,
                )

                if RANK == 0:
                    task_output.logged_samples = list(
                        itertools.chain.from_iterable(full_samples)
                    )

            # then collect metrics across all ranks
            for metrics in task_output.sample_metrics:
                metric_list = [None] * WORLD_SIZE if RANK == 0 else None
                torch.distributed.gather_object(
                    obj=task_output.sample_metrics[metrics],
                    object_gather_list=metric_list,
                    dst=0,
                )
                if RANK == 0:
                    task_output.sample_metrics[metrics] = list(
                        itertools.chain.from_iterable(metric_list)
                    )

    if RANK == 0:
        ### Aggregate results over all datapoints ###
        # aggregate results ; run bootstrap CIs
        for task_output in eval_tasks:
            task_output.calculate_aggregate_metric(bootstrap_iters=bootstrap_iters)
        (
            results,
            samples,
            configs,
            versions,
            num_fewshot,
            higher_is_better,
        ) = consolidate_results(eval_tasks)

        ### Calculate group metrics ###
        if bool(results):
            results, versions, show_group_table, *_ = consolidate_group_results(
                results, versions, task_dict
            )

        results_agg, group_agg = prepare_print_tasks(task_dict, results)
        subtask_list = get_subtask_list(task_dict)

        # collect all higher_is_better values for metrics
        # in the group's subtasks.
        # TODO: clean this up ; unify with the below metric_list loop?
        _higher_is_better = {}
        for group, task_list in subtask_list.items():
            if (
                len(task_list) != 0
            ):  # subtask list will list "task_name": [] for solo tasks
                for task in task_list:
                    for m, h in higher_is_better[task].items():
                        if m not in _higher_is_better.keys():
                            _higher_is_better[m] = h

                        if (
                            m in _higher_is_better
                            and _higher_is_better[m] is not None
                            and _higher_is_better[m] != h
                        ):
                            eval_logger.warning(
                                f"Higher_is_better values for metric {m} in group {group} are not consistent. Defaulting to None."
                            )
                            _higher_is_better[m] = None
                higher_is_better[group] = _higher_is_better

        results_dict = {
            "results": dict(results_agg.items()),
            **(
                {"groups": dict(group_agg.items())}
                if (bool(group_agg) & show_group_table)
                else {}
            ),
            "group_subtasks": dict(reversed(subtask_list.items())),
            "configs": dict(sorted(configs.items())),
            "versions": dict(sorted(versions.items())),
            "n-shot": dict(sorted(num_fewshot.items())),
            "higher_is_better": dict(sorted(higher_is_better.items())),
            "n-samples": {
                task_output.task_name: {
                    "original": len(task_output.task.eval_docs),
                    "effective": min(
                        limit if limit else len(task_output.task.eval_docs),
                        len(task_output.task.eval_docs),
                    ),
                }
                for task_output, limit in zip(eval_tasks, limits)
            },
        }
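        # e.g. results_dict["results"] might look like
        # {"hellaswag": {"alias": "hellaswag", "acc,none": 0.51, "acc_stderr,none": 0.01}} (illustrative values)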
        if log_samples:
            # default: hash images
            samples = (
                hash_dict_images(samples)
                if os.environ.get("LMEVAL_HASHMM", "1") != "0"
                and (hasattr(lm, "MULTIMODAL"))
                else samples
            )
            results_dict["samples"] = dict(samples)

        return results_dict
    else:
        return None