__main__.py 20.7 KB
Newer Older
1
2
3
import argparse
import json
import logging
lintangsutawika's avatar
lintangsutawika committed
4
import os
5
import sys
6
from functools import partial
7
from pathlib import Path
haileyschoelkopf's avatar
haileyschoelkopf committed
8
from typing import Union
Leo Gao's avatar
Leo Gao committed
9

Baber's avatar
nit  
Baber committed
10
11
12
13
14
15
from lm_eval.api.eval_config import (
    EvaluationConfig,
    TrackExplicitAction,
    TrackExplicitStoreTrue,
)

Baber Abbasi's avatar
Baber Abbasi committed
16
17
18
19
20
21
22
23
24
25
26
27

def try_parse_json(value: str) -> Union[str, dict, None]:
    """Decode ``value`` as JSON, falling back to the raw string.

    Used as an argparse ``type=`` callable for options that accept either a
    JSON document or a plain ``key=value,...`` string.

    Args:
        value: Raw command-line value, or ``None``.

    Returns:
        ``None`` when ``value`` is ``None``; the decoded JSON object when
        ``value`` is valid JSON; otherwise ``value`` itself, unchanged.

    Raises:
        argparse.ArgumentTypeError: If decoding fails and ``value`` contains
            a ``{``, i.e. it looks like an attempted JSON object.
    """
    if value is None:
        return None

    try:
        parsed = json.loads(value)
    except json.JSONDecodeError:
        # Not JSON. Only complain when the text looks like a JSON attempt;
        # anything else is passed through as an ordinary string.
        looks_like_json = "{" in value
        if not looks_like_json:
            return value
        raise argparse.ArgumentTypeError(
            f"Invalid JSON: {value}. Hint: Use double quotes for JSON strings."
        )
    return parsed
Fabrizio Milo's avatar
Fabrizio Milo committed
28

29

30
31
32
def _int_or_none_list_arg_type(
    min_len: int, max_len: int, defaults: str, value: str, split_char: str = ","
):
    """Parse a delimited string into a list of ``int``/``None`` values.

    Args:
        min_len: Smallest number of items accepted (a single item is always
            accepted and broadcast, regardless of this bound).
        max_len: Largest number of items accepted; also the length of the
            returned list.
        defaults: Delimited string of fallback values used to fill positions
            the caller left out.
        value: The raw string to parse.
        split_char: Separator between items in ``value`` and ``defaults``.

    Returns:
        A list of ``max_len`` entries, each an ``int`` or ``None``.

    Raises:
        argparse.ArgumentTypeError: If an item is neither an integer nor
            ``none`` (case-insensitive), or the item count is outside
            ``[min_len, max_len]`` (and is not exactly one).
    """

    def _convert(token):
        # Items are compared case-insensitively and ignoring surrounding spaces.
        token = token.strip().lower()
        if token == "none":
            return None
        try:
            return int(token)
        except ValueError:
            raise argparse.ArgumentTypeError(f"{token} is not an integer or None")

    parsed = [_convert(token) for token in value.split(split_char)]
    count = len(parsed)

    if count == 1:
        # Makes downstream handling the same for single and multiple values
        return parsed * max_len

    if not (min_len <= count <= max_len):
        raise argparse.ArgumentTypeError(
            f"Argument requires {max_len} integers or None, separated by '{split_char}'"
        )

    if count == max_len:
        return parsed

    # Fewer items than max_len: pad the tail from the defaults string.
    logging.warning(
        f"Argument requires {max_len} integers or None, separated by '{split_char}'. "
        "Missing values will be filled with defaults."
    )
    fallback = [_convert(token) for token in defaults.split(split_char)]
    return parsed + fallback[count:]


65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
def check_argument_types(parser: argparse.ArgumentParser):
    """
    Verify that every CLI argument registered on ``parser`` declares a type.

    The built-in ``help`` action and any action with a truthy ``const``
    (e.g. store-true style flags) are exempt from the check.

    Raises:
        ValueError: for the first non-exempt argument whose ``type`` is None.
    """
    for registered in parser._actions:
        exempt = registered.dest == "help" or registered.const
        if exempt:
            continue
        if registered.type is None:
            raise ValueError(
                f"Argument '{registered.dest}' doesn't have a type specified."
            )


def setup_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the `lm-eval` command line.

    Every option is registered with either ``TrackExplicitAction`` or
    ``TrackExplicitStoreTrue`` (both imported from ``lm_eval.api.eval_config``)
    as its action. Help text is rendered with ``RawTextHelpFormatter``, so
    newlines embedded in help strings are shown as written.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        "--config",
        "-C",
        default=None,
        type=str,
        metavar="DIR/file.yaml",
        action=TrackExplicitAction,
        help="Path to config with all arguments for `lm-eval`",
    )
    parser.add_argument(
        "--model",
        "-m",
        type=str,
        default="hf",
        action=TrackExplicitAction,
        help="Name of model e.g. `hf`",
    )
    parser.add_argument(
        "--tasks",
        "-t",
        default=None,
        type=str,
        action=TrackExplicitAction,
        metavar="task1,task2",
        # Plain (non f-) string: single braces, so the help shows `{a,b,c}`.
        help="Comma-separated list of task names or task groupings to evaluate on.\nTo get full list of tasks, use one of the commands `lm-eval --tasks {list_groups,list_subtasks,list_tags,list}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above",
    )
    parser.add_argument(
        "--model_args",
        "-a",
        default="",
        action=TrackExplicitAction,
        type=try_parse_json,
        help="""Comma separated string or JSON formatted arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32` or '{"pretrained":"EleutherAI/pythia-160m","dtype":"float32"}'""",
    )
    parser.add_argument(
        "--num_fewshot",
        "-f",
        type=int,
        default=None,
        action=TrackExplicitAction,
        metavar="N",
        help="Number of examples in few-shot context",
    )
    parser.add_argument(
        "--batch_size",
        "-b",
        type=str,
        action=TrackExplicitAction,
        # NOTE(review): the default is the int 1 while user-supplied values
        # arrive as str (argparse only applies `type` to string defaults).
        # Left as-is in case downstream code relies on it - confirm.
        default=1,
        metavar="auto|auto:N|N",
        help="Acceptable values are 'auto', 'auto:N' or N, where N is an integer. Default 1.",
    )
    parser.add_argument(
        "--max_batch_size",
        type=int,
        default=None,
        action=TrackExplicitAction,
        metavar="N",
        help="Maximal batch size to try with --batch_size auto.",
    )
    parser.add_argument(
        "--device",
        type=str,
        default=None,
        action=TrackExplicitAction,
        help="Device to use (e.g. cuda, cuda:0, cpu).",
    )
    parser.add_argument(
        "--output_path",
        "-o",
        default=None,
        type=str,
        action=TrackExplicitAction,
        metavar="DIR|DIR/file.json",
        help="Path where result metrics will be saved. Can be either a directory or a .json file. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.",
    )
    parser.add_argument(
        "--limit",
        "-L",
        type=float,
        default=None,
        action=TrackExplicitAction,
        metavar="N|0<N<1",
        help="Limit the number of examples per task. "
        "If <1, limit is a percentage of the total number of examples.",
    )
    parser.add_argument(
        "--samples",
        "-E",
        default=None,
        type=str,
        action=TrackExplicitAction,
        metavar="/path/to/json",
        help='JSON string or path to JSON file containing doc indices of selected examples to test. Format: {"task_name":[indices],...}',
    )
    parser.add_argument(
        "--use_cache",
        "-c",
        type=str,
        action=TrackExplicitAction,
        default=None,
        metavar="DIR",
        help="A path to a sqlite db file for caching model responses. `None` if not caching.",
    )
    parser.add_argument(
        "--cache_requests",
        type=str,
        default=None,
        action=TrackExplicitAction,
        choices=["true", "refresh", "delete"],
        help="Speed up evaluation by caching the building of dataset requests. `None` if not caching.",
    )
    parser.add_argument(
        "--check_integrity",
        action=TrackExplicitStoreTrue,
        help="Whether to run the relevant part of the test suite for the tasks.",
    )
    parser.add_argument(
        "--write_out",
        "-w",
        action=TrackExplicitStoreTrue,
        default=False,
        help="Prints the prompt for the first few documents.",
    )
    parser.add_argument(
        "--log_samples",
        "-s",
        action=TrackExplicitStoreTrue,
        default=False,
        help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis. Use with --output_path.",
    )
    parser.add_argument(
        "--system_instruction",
        type=str,
        default=None,
        action=TrackExplicitAction,
        help="System instruction to be used in the prompt",
    )
    parser.add_argument(
        "--apply_chat_template",
        type=str,
        nargs="?",
        action=TrackExplicitAction,
        # Bare flag -> True (default template); flag with a value -> that
        # template name; flag absent -> False.
        const=True,
        default=False,
        help=(
            "If True, apply chat template to the prompt. "
            "Providing `--apply_chat_template` without an argument will apply the default chat template to the prompt. "
            "To apply a specific template from the available list of templates, provide the template name as an argument. "
            "E.g. `--apply_chat_template template_name`"
        ),
    )
    parser.add_argument(
        "--fewshot_as_multiturn",
        action=TrackExplicitStoreTrue,
        default=False,
        help="If True, uses the fewshot as a multi-turn conversation",
    )
    parser.add_argument(
        "--show_config",
        action=TrackExplicitStoreTrue,
        default=False,
        help="If True, shows the full config of all tasks at the end of the evaluation.",
    )
    parser.add_argument(
        "--include_path",
        type=str,
        default=None,
        action=TrackExplicitAction,
        metavar="DIR",
        help="Additional path to include if there are external tasks to include.",
    )
    parser.add_argument(
        "--gen_kwargs",
        type=try_parse_json,
        default=None,
        action=TrackExplicitAction,
        help=(
            "Either comma delimited string or JSON formatted arguments for model generation on greedy_until tasks,"
            """ e.g. '{"temperature":0.7,"until":["hello"]}' or temperature=0,top_p=0.1."""
        ),
    )
    parser.add_argument(
        "--verbosity",
        "-v",
        type=str.upper,
        default=None,
        action=TrackExplicitAction,
        metavar="CRITICAL|ERROR|WARNING|INFO|DEBUG",
        help="(Deprecated) Controls logging verbosity level. Use the `LOGLEVEL` environment variable instead. Set to DEBUG for detailed output when testing or adding new task configurations.",
    )
    parser.add_argument(
        "--wandb_args",
        type=str,
        default="",
        action=TrackExplicitAction,
        help="Comma separated string arguments passed to wandb.init, e.g. `project=lm-eval,job_type=eval`",
    )
    parser.add_argument(
        "--wandb_config_args",
        type=str,
        default="",
        action=TrackExplicitAction,
        help="Comma separated string arguments passed to wandb.config.update. Use this to trace parameters that aren't already traced by default. e.g. `lr=0.01,repeats=3`",
    )
    parser.add_argument(
        "--hf_hub_log_args",
        type=str,
        default="",
        action=TrackExplicitAction,
        help="Comma separated string arguments passed to Hugging Face Hub's log function, e.g. `hub_results_org=EleutherAI,hub_repo_name=lm-eval-results`",
    )
    parser.add_argument(
        "--predict_only",
        "-x",
        action=TrackExplicitStoreTrue,
        default=False,
        help="Use with --log_samples. Only model outputs will be saved and metrics will not be evaluated.",
    )
    default_seed_string = "0,1234,1234,1234"
    parser.add_argument(
        "--seed",
        # Accepts 3 or 4 comma-separated values (or a single value that is
        # broadcast); a missing 4th value is filled from the default string.
        type=partial(_int_or_none_list_arg_type, 3, 4, default_seed_string),
        action=TrackExplicitAction,
        default=default_seed_string,  # for backward compatibility
        help=(
            "Set seed for python's random, numpy, torch, and fewshot sampling.\n"
            "Accepts a comma-separated list of 4 values for python's random, numpy, torch, and fewshot sampling seeds, "
            "respectively, or a single integer to set the same seed for all four.\n"
            f"The values are either an integer or 'None' to not set the seed. Default is `{default_seed_string}` "
            "(for backward compatibility).\n"
            "E.g. `--seed 0,None,8,52` sets `random.seed(0)`, `torch.manual_seed(8)`, and fewshot sampling seed to 52. "
            "Here numpy's seed is not set since the second value is `None`.\n"
            "E.g, `--seed 42` sets all four seeds to 42."
        ),
    )
    parser.add_argument(
        "--trust_remote_code",
        action=TrackExplicitStoreTrue,
        help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub",
    )
    parser.add_argument(
        "--confirm_run_unsafe_code",
        action=TrackExplicitStoreTrue,
        help="Confirm that you understand the risks of running unsafe code for tasks that require it",
    )
    parser.add_argument(
        "--metadata",
        type=json.loads,
        default=None,
        action=TrackExplicitAction,
        help="""JSON string metadata to pass to task configs, for example '{"max_seq_lengths":[4096,8192]}'. Will be merged with model_args. Can also be set in task config.""",
    )
    return parser


def parse_eval_args(parser: argparse.ArgumentParser) -> argparse.Namespace:
    """Validate ``parser`` and parse the process's command-line arguments.

    Runs ``check_argument_types`` first, so a parser containing an untyped
    argument raises ``ValueError`` before any parsing happens.
    """
    check_argument_types(parser)
    namespace = parser.parse_args()
    return namespace

Fabrizio Milo's avatar
Fabrizio Milo committed
341

haileyschoelkopf's avatar
haileyschoelkopf committed
342
343
344
def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
    """Run an evaluation as configured from the command line.

    Builds an ``EvaluationConfig`` from ``args``, resolves the requested
    tasks, calls ``evaluator.simple_evaluate`` and then saves / prints the
    results (optionally logging them to Weights & Biases and the tracker
    configured through ``hf_hub_log_args``).

    Args:
        args: Pre-parsed arguments. When falsy (e.g. ``None``), the arguments
            are parsed from the command line with ``setup_parser``.

    Raises:
        ValueError: If ``--log_samples``/``--predict_only`` is given without
            ``--output_path``, if ``--fewshot_as_multiturn`` is given without
            ``--apply_chat_template``, or if a requested task is not found.
        AssertionError: If both ``--samples`` and ``--limit`` are given.
        SystemExit: If no task is given, or after printing a task listing.
    """
    if not args:
        # we allow for args to be passed externally, else we parse them ourselves
        parser = setup_parser()
        args = parse_eval_args(parser)

    cfg = EvaluationConfig.from_cli(args)

    # defer loading `lm_eval` submodules for faster CLI load
    from lm_eval import evaluator, utils
    from lm_eval.evaluator import request_caching_arg_to_dict
    from lm_eval.loggers import EvaluationTracker, WandbLogger
    from lm_eval.tasks import TaskManager
    from lm_eval.utils import (
        handle_non_serializable,
        make_table,
    )

    # Decide on W&B from `cfg` (not `args`) so this agrees with every later
    # check; `None` means "W&B logging disabled".
    wandb_logger = (
        WandbLogger(cfg.wandb_args, cfg.wandb_config_args) if cfg.wandb_args else None
    )

    utils.setup_logging(cfg.verbosity)
    eval_logger = logging.getLogger(__name__)
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    # update the evaluation tracker args with the output path and the HF token
    if cfg.output_path:
        cfg.hf_hub_log_args["output_path"] = cfg.output_path

    if hf_token := os.environ.get("HF_TOKEN", None):
        cfg.hf_hub_log_args["token"] = hf_token

    evaluation_tracker_args = cfg.hf_hub_log_args
    evaluation_tracker = EvaluationTracker(**evaluation_tracker_args)

    if cfg.predict_only:
        cfg.log_samples = True

    if (cfg.log_samples or cfg.predict_only) and not cfg.output_path:
        raise ValueError(
            "Specify --output_path if providing --log_samples or --predict_only"
        )

    if cfg.fewshot_as_multiturn and cfg.apply_chat_template is False:
        raise ValueError(
            "When `fewshot_as_multiturn` is selected, `apply_chat_template` must be set (either to `True` or to the chosen template name)."
        )

    if cfg.include_path is not None:
        eval_logger.info(f"Including path: {cfg.include_path}")

    # Keys in `cfg.metadata` take precedence over the same keys in
    # `cfg.model_args`.
    # NOTE(review): assumes EvaluationConfig has normalised both to dicts -
    # confirm.
    metadata = (cfg.model_args) | (cfg.metadata)
    cfg.metadata = metadata

    task_manager = TaskManager(include_path=cfg.include_path, metadata=metadata)

    if "push_samples_to_hub" in evaluation_tracker_args and not cfg.log_samples:
        eval_logger.warning(
            "Pushing samples to the Hub requires --log_samples to be set. Samples will not be pushed to the Hub."
        )

    if cfg.limit:
        eval_logger.warning(
            " --limit SHOULD ONLY BE USED FOR TESTING. "
            "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    if cfg.samples:
        assert cfg.limit is None, "If --samples is not None, then --limit must be None."
        # `--samples` is either a path to a JSON file or an inline JSON string.
        if (samples_path := Path(cfg.samples)).is_file():
            cfg.samples = json.loads(samples_path.read_text())
        else:
            cfg.samples = json.loads(cfg.samples)

    if cfg.tasks is None:
        eval_logger.error("Need to specify task to evaluate.")
        sys.exit()
    elif cfg.tasks == "list":
        print(task_manager.list_all_tasks())
        sys.exit()
    elif cfg.tasks == "list_groups":
        print(task_manager.list_all_tasks(list_subtasks=False, list_tags=False))
        sys.exit()
    elif cfg.tasks == "list_tags":
        print(task_manager.list_all_tasks(list_groups=False, list_subtasks=False))
        sys.exit()
    elif cfg.tasks == "list_subtasks":
        print(task_manager.list_all_tasks(list_groups=False, list_tags=False))
        sys.exit()
    else:
        if os.path.isdir(cfg.tasks):
            import glob

            # A directory: every `*.yaml` directly inside it is a task config.
            # The loaded config gets its own name; it must not rebind `cfg`,
            # which is still needed below.
            yaml_path = os.path.join(cfg.tasks, "*.yaml")
            task_names = [
                utils.load_yaml_config(yaml_file) for yaml_file in glob.glob(yaml_path)
            ]
        else:
            task_list = cfg.tasks.split(",")
            task_names = task_manager.match_tasks(task_list)

            # Entries that did not match a registered task may be paths to
            # task config files; load those and remember which ones resolved.
            loaded_from_file = set()
            for task in task_list:
                if task not in task_names and os.path.isfile(task):
                    task_names.append(utils.load_yaml_config(task))
                    loaded_from_file.add(task)

            task_missing = [
                task
                for task in task_list
                if task not in task_names
                and task not in loaded_from_file
                and "*" not in task
            ]  # we don't want errors if a wildcard ("*") task name was used

            if task_missing:
                missing = ", ".join(task_missing)
                eval_logger.error(
                    f"Tasks were not found: {missing}\n"
                    f"{utils.SPACING}Try `lm-eval --tasks list` for list of available tasks",
                )
                raise ValueError(
                    f"Tasks not found: {missing}. Try `lm-eval --tasks {{list_groups,list_subtasks,list_tags,list}}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above, or pass '--verbosity DEBUG' to troubleshoot task registration issues."
                )
        cfg.tasks = task_names

    # When `--trust_remote_code` was passed, enable it for `datasets` and
    # forward it to the model via model_args.
    if cfg.trust_remote_code:
        eval_logger.info(
            "Passed `--trust_remote_code`, setting environment variable `HF_DATASETS_TRUST_REMOTE_CODE=true`"
        )
        # HACK: import datasets and override its HF_DATASETS_TRUST_REMOTE_CODE value internally,
        # because it's already been determined based on the prior env var before launching our
        # script--`datasets` gets imported by lm_eval internally before these lines can update the env.
        import datasets

        datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True

        cfg.model_args["trust_remote_code"] = True

    # NOTE(review): the comparison direction is kept from the original; it
    # logs when the effective level is INFO or higher and prints otherwise.
    if eval_logger.getEffectiveLevel() >= logging.INFO:
        eval_logger.info(f"Selected Tasks: {task_names}")
    else:
        print(f"Selected Tasks: {task_names}")

    request_caching_args = request_caching_arg_to_dict(
        cache_requests=cfg.cache_requests
    )
    cfg.request_caching_args = request_caching_args

    results = evaluator.simple_evaluate(
        model=cfg.model,
        model_args=cfg.model_args,
        tasks=cfg.tasks,
        num_fewshot=cfg.num_fewshot,
        batch_size=cfg.batch_size,
        max_batch_size=cfg.max_batch_size,
        device=cfg.device,
        use_cache=cfg.use_cache,
        cache_requests=cfg.request_caching_args.get("cache_requests", False),
        rewrite_requests_cache=cfg.request_caching_args.get(
            "rewrite_requests_cache", False
        ),
        delete_requests_cache=cfg.request_caching_args.get(
            "delete_requests_cache", False
        ),
        limit=cfg.limit,
        samples=cfg.samples,
        check_integrity=cfg.check_integrity,
        write_out=cfg.write_out,
        log_samples=cfg.log_samples,
        evaluation_tracker=evaluation_tracker,
        system_instruction=cfg.system_instruction,
        apply_chat_template=cfg.apply_chat_template,
        fewshot_as_multiturn=cfg.fewshot_as_multiturn,
        gen_kwargs=cfg.gen_kwargs,
        task_manager=task_manager,
        verbosity=cfg.verbosity,
        predict_only=cfg.predict_only,
        random_seed=cfg.seed[0] if cfg.seed else None,
        numpy_random_seed=cfg.seed[1] if cfg.seed else None,
        torch_random_seed=cfg.seed[2] if cfg.seed else None,
        fewshot_random_seed=cfg.seed[3] if cfg.seed else None,
        confirm_run_unsafe_code=cfg.confirm_run_unsafe_code,
        metadata=cfg.metadata,
    )

    if results is not None:
        # Per-sample outputs are split off so the aggregated dump stays small;
        # `cfg.log_samples` is the single source of truth from here on.
        samples = results.pop("samples") if cfg.log_samples else None
        dumped = json.dumps(
            results, indent=2, default=handle_non_serializable, ensure_ascii=False
        )
        if cfg.show_config:
            print(dumped)

        batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))

        # Add W&B logging
        if wandb_logger is not None:
            try:
                wandb_logger.post_init(results)
                wandb_logger.log_eval_result()
                if cfg.log_samples:
                    wandb_logger.log_eval_samples(samples)
            except Exception as e:
                # Best effort: a W&B failure must not lose the evaluation results.
                eval_logger.info(f"Logging to Weights and Biases failed due to {e}")

        evaluation_tracker.save_results_aggregated(results=results, samples=samples)

        if cfg.log_samples:
            for task_name in results["configs"]:
                evaluation_tracker.save_results_samples(
                    task_name=task_name, samples=samples[task_name]
                )

        if (
            evaluation_tracker.push_results_to_hub
            or evaluation_tracker.push_samples_to_hub
        ):
            evaluation_tracker.recreate_metadata_card()

        print(
            f"{cfg.model} ({cfg.model_args}), gen_kwargs: ({cfg.gen_kwargs}), limit: {cfg.limit}, num_fewshot: {cfg.num_fewshot}, "
            f"batch_size: {cfg.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
        )
        print(make_table(results))
        if "groups" in results:
            print(make_table(results, "groups"))

        if wandb_logger is not None:
            # Tear down wandb run once all the logging is done.
            wandb_logger.run.finish()

573

Jason Phang's avatar
Jason Phang committed
574
# Run the CLI when this module is executed directly.
if __name__ == "__main__":
    cli_evaluate()