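"""Command-line entry point for lm_eval (`python -m lm_eval` / the `lm-eval` script)."""
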
import argparse
import json
import logging
import os
import sys
from functools import partial
from pathlib import Path
from typing import Union


def try_parse_json(value: str) -> Union[str, dict, None]:
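    """Parse a CLI value as JSON when possible.

    Returns the decoded object for valid JSON, the raw string for plain
    comma-separated arguments such as `a=1,b=2`, or None when no value is
    given. Strings that contain `{` but fail to parse raise ArgumentTypeError.
    """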
    if value is None:
        return None
    try:
        return json.loads(value)
    except json.JSONDecodeError:
        if "{" in value:
            raise argparse.ArgumentTypeError(
                f"Invalid JSON: {value}. Hint: Use double quotes for JSON strings."
            )
        return value


def _int_or_none_list_arg_type(
    min_len: int, max_len: int, defaults: str, value: str, split_char: str = ","
):
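    """argparse `type` for a comma-separated list of ints or the string 'none'.

    Accepts between `min_len` and `max_len` values: a single value is broadcast
    to `max_len`, and lists shorter than `max_len` are padded from `defaults`.
    """
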
    def parse_value(item):
        item = item.strip().lower()
        if item == "none":
            return None
        try:
            return int(item)
        except ValueError:
            raise argparse.ArgumentTypeError(f"{item} is not an integer or None")

    items = [parse_value(v) for v in value.split(split_char)]
    num_items = len(items)

    if num_items == 1:
        # Makes downstream handling the same for single and multiple values
        items = items * max_len
    elif num_items < min_len or num_items > max_len:
        raise argparse.ArgumentTypeError(
            f"Argument requires between {min_len} and {max_len} integers or None, separated by '{split_char}'"
        )
    elif num_items != max_len:
        logging.warning(
            f"Argument requires {max_len} integers or None, separated by '{split_char}'. "
            "Missing values will be filled with defaults."
        )
        default_items = [parse_value(v) for v in defaults.split(split_char)]
        items.extend(
            default_items[num_items:]
        )  # extend items list with missing defaults

    return items


def check_argument_types(parser: argparse.ArgumentParser):
    """
    Check to make sure all CLI args are typed, raises error if not
    """
    for action in parser._actions:
        if action.dest != "help" and not action.const:
            if action.type is None:
                raise ValueError(
                    f"Argument '{action.dest}' doesn't have a type specified."
                )


def setup_parser() -> argparse.ArgumentParser:
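    """Build the argument parser for the lm-eval command-line interface."""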
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        "--model", "-m", type=str, default="hf", help="Name of model e.g. `hf`"
    )
    parser.add_argument(
        "--tasks",
        "-t",
        default=None,
        type=str,
        metavar="task1,task2",
        help="Comma-separated list of task names or task groupings to evaluate on.\nTo get full list of tasks, use one of the commands `lm-eval --tasks {{list_groups,list_subtasks,list_tags,list}}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above",
    )
    parser.add_argument(
        "--model_args",
        "-a",
        default="",
        type=try_parse_json,
        help="""Comma separated string or JSON formatted arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32` or '{"pretrained":"EleutherAI/pythia-160m","dtype":"float32"}'""",
    )
    parser.add_argument(
        "--num_fewshot",
        "-f",
        type=int,
        default=None,
        metavar="N",
        help="Number of examples in few-shot context",
    )
    parser.add_argument(
        "--batch_size",
        "-b",
        type=str,
        default=1,
        metavar="auto|auto:N|N",
        help="Acceptable values are 'auto', 'auto:N' or N, where N is an integer. Default 1.",
    )
    parser.add_argument(
        "--max_batch_size",
        type=int,
        default=None,
        metavar="N",
        help="Maximal batch size to try with --batch_size auto.",
    )
    parser.add_argument(
        "--device",
        type=str,
        default=None,
        help="Device to use (e.g. cuda, cuda:0, cpu).",
    )
    parser.add_argument(
        "--output_path",
        "-o",
        default=None,
        type=str,
        metavar="DIR|DIR/file.json",
        help="Path where result metrics will be saved. Can be either a directory or a .json file. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.",
    )
    parser.add_argument(
        "--limit",
        "-L",
        type=float,
        default=None,
        metavar="N|0<N<1",
        help="Limit the number of examples per task. "
        "If <1, limit is a percentage of the total number of examples.",
    )
    parser.add_argument(
        "--samples",
        "-E",
        default=None,
        type=str,
        metavar="/path/to/json",
        help='JSON string or path to JSON file containing doc indices of selected examples to test. Format: {"task_name":[indices],...}',
    )
    parser.add_argument(
        "--use_cache",
        "-c",
        type=str,
        default=None,
        metavar="DIR",
        help="A path to a sqlite db file for caching model responses. `None` if not caching.",
    )
    parser.add_argument(
        "--cache_requests",
        type=str,
        default=None,
        choices=["true", "refresh", "delete"],
        help="Speed up evaluation by caching the building of dataset requests. `None` if not caching.",
    )
    parser.add_argument(
        "--check_integrity",
        action="store_true",
        help="Whether to run the relevant part of the test suite for the tasks.",
    )
    parser.add_argument(
        "--write_out",
        "-w",
        action="store_true",
        default=False,
        help="Prints the prompt for the first few documents.",
    )
    parser.add_argument(
        "--log_samples",
        "-s",
        action="store_true",
        default=False,
        help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis. Use with --output_path.",
    )
    parser.add_argument(
        "--system_instruction",
        type=str,
        default=None,
        help="System instruction to be used in the prompt",
    )
    parser.add_argument(
        "--apply_chat_template",
        type=str,
        nargs="?",
        const=True,
        default=False,
        help=(
            "If True, apply chat template to the prompt. "
            "Providing `--apply_chat_template` without an argument will apply the default chat template to the prompt. "
            "To apply a specific template from the available list of templates, provide the template name as an argument. "
            "E.g. `--apply_chat_template template_name`"
        ),
    )
    parser.add_argument(
        "--fewshot_as_multiturn",
        action="store_true",
        default=False,
        help="If True, uses the fewshot as a multi-turn conversation",
    )
    parser.add_argument(
        "--show_config",
        action="store_true",
        default=False,
        help="If True, shows the full config of all tasks at the end of the evaluation.",
    )
    parser.add_argument(
        "--include_path",
        type=str,
        default=None,
        metavar="DIR",
        help="Additional path to include if there are external tasks to include.",
    )
    parser.add_argument(
        "--gen_kwargs",
        type=try_parse_json,
        default=None,
        help=(
            "Either comma delimited string or JSON formatted arguments for model generation on greedy_until tasks,"
            """ e.g. '{"temperature":0.7,"until":["hello"]}' or temperature=0,top_p=0.1."""
        ),
    )
    parser.add_argument(
        "--verbosity",
        "-v",
        type=str.upper,
        default=None,
        metavar="CRITICAL|ERROR|WARNING|INFO|DEBUG",
        help="(Deprecated) Controls logging verbosity level. Use the `LOGLEVEL` environment variable instead. Set to DEBUG for detailed output when testing or adding new task configurations.",
    )
    parser.add_argument(
        "--wandb_args",
        type=str,
        default="",
        help="Comma separated string arguments passed to wandb.init, e.g. `project=lm-eval,job_type=eval`",
    )
    parser.add_argument(
        "--wandb_config_args",
        type=str,
        default="",
        help="Comma separated string arguments passed to wandb.config.update. Use this to trace parameters that aren't already traced by default, e.g. `lr=0.01,repeats=3`",
    )
    parser.add_argument(
        "--hf_hub_log_args",
        type=str,
        default="",
        help="Comma separated string arguments passed to Hugging Face Hub's log function, e.g. `hub_results_org=EleutherAI,hub_repo_name=lm-eval-results`",
    )
    parser.add_argument(
        "--predict_only",
        "-x",
        action="store_true",
        default=False,
        help="Use with --log_samples. Only model outputs will be saved and metrics will not be evaluated.",
    )
    default_seed_string = "0,1234,1234,1234"
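    # `partial` below pre-binds min_len=3 and max_len=4: --seed accepts a single
    # value (broadcast to all four seeds) or 3-4 comma-separated values, with
    # missing trailing values filled in from `default_seed_string`.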
    parser.add_argument(
        "--seed",
        type=partial(_int_or_none_list_arg_type, 3, 4, default_seed_string),
        default=default_seed_string,  # for backward compatibility
        help=(
            "Set seed for python's random, numpy, torch, and fewshot sampling.\n"
            "Accepts a comma-separated list of 4 values for python's random, numpy, torch, and fewshot sampling seeds, "
            "respectively, or a single integer to set the same seed for all four.\n"
            f"The values are either an integer or 'None' to not set the seed. Default is `{default_seed_string}` "
            "(for backward compatibility).\n"
            "E.g. `--seed 0,None,8,52` sets `random.seed(0)`, `torch.manual_seed(8)`, and fewshot sampling seed to 52. "
            "Here numpy's seed is not set since the second value is `None`.\n"
            "E.g, `--seed 42` sets all four seeds to 42."
        ),
    )
    parser.add_argument(
        "--trust_remote_code",
        action="store_true",
        help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub",
    )
    parser.add_argument(
        "--confirm_run_unsafe_code",
        action="store_true",
        help="Confirm that you understand the risks of running unsafe code for tasks that require it",
    )
    parser.add_argument(
        "--metadata",
        type=json.loads,
        default=None,
        help="""JSON string metadata to pass to task configs, for example '{"max_seq_lengths":[4096,8192]}'. Will be merged with model_args. Can also be set in task config.""",
    )
    return parser


def parse_eval_args(parser: argparse.ArgumentParser) -> argparse.Namespace:
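    """Validate that every argument has a type, then parse the command line."""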
    check_argument_types(parser)
    return parser.parse_args()


def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
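    """Run the full evaluation from parsed (or externally supplied) CLI args."""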
    if not args:
        # we allow for args to be passed externally, else we parse them ourselves
        parser = setup_parser()
        args = parse_eval_args(parser)

    # defer loading `lm_eval` submodules for faster CLI load
    from lm_eval import evaluator, utils
    from lm_eval.evaluator import request_caching_arg_to_dict
    from lm_eval.loggers import EvaluationTracker, WandbLogger
    from lm_eval.tasks import TaskManager
    from lm_eval.utils import (
        handle_non_serializable,
        make_table,
        simple_parse_args_string,
    )

    if args.wandb_args:
        wandb_args_dict = simple_parse_args_string(args.wandb_args)
        wandb_config_args_dict = simple_parse_args_string(args.wandb_config_args)
        wandb_logger = WandbLogger(wandb_args_dict, wandb_config_args_dict)

    utils.setup_logging(args.verbosity)
    eval_logger = logging.getLogger(__name__)
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    # update the evaluation tracker args with the output path and the HF token
    if args.output_path:
        args.hf_hub_log_args += f",output_path={args.output_path}"
    if os.environ.get("HF_TOKEN", None):
        args.hf_hub_log_args += f",token={os.environ.get('HF_TOKEN')}"
    evaluation_tracker_args = simple_parse_args_string(args.hf_hub_log_args)
    evaluation_tracker = EvaluationTracker(**evaluation_tracker_args)

    if args.predict_only:
        args.log_samples = True
    if (args.log_samples or args.predict_only) and not args.output_path:
        raise ValueError(
            "Specify --output_path if providing --log_samples or --predict_only"
        )

    if args.fewshot_as_multiturn and args.apply_chat_template is False:
        raise ValueError(
            "When `fewshot_as_multiturn` is selected, `apply_chat_template` must be set (either to `True` or to the chosen template name)."
        )

    if args.include_path is not None:
        eval_logger.info(f"Including path: {args.include_path}")
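    # Build task metadata: parse model_args (comma-separated string or dict) and
    # merge in --metadata via dict union, so --metadata keys win on conflict.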
    metadata = (
        simple_parse_args_string(args.model_args)
        if isinstance(args.model_args, str)
        else args.model_args
        if isinstance(args.model_args, dict)
        else {}
    ) | (
        args.metadata
        if isinstance(args.metadata, dict)
        else simple_parse_args_string(args.metadata)
    )

    task_manager = TaskManager(include_path=args.include_path, metadata=metadata)

    if "push_samples_to_hub" in evaluation_tracker_args and not args.log_samples:
        eval_logger.warning(
            "Pushing samples to the Hub requires --log_samples to be set. Samples will not be pushed to the Hub."
        )

    if args.limit:
        eval_logger.warning(
            " --limit SHOULD ONLY BE USED FOR TESTING. "
            "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )
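    # --samples accepts either a path to a JSON file or an inline JSON string
    # mapping task names to lists of doc indices.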
    if args.samples:
        assert args.limit is None, (
            "If --samples is not None, then --limit must be None."
        )
        if (samples := Path(args.samples)).is_file():
            args.samples = json.loads(samples.read_text())
        else:
            args.samples = json.loads(args.samples)

    if args.tasks is None:
        eval_logger.error("Need to specify task to evaluate.")
        sys.exit()
    elif args.tasks == "list":
        print(task_manager.list_all_tasks())
        sys.exit()
    elif args.tasks == "list_groups":
        print(task_manager.list_all_tasks(list_subtasks=False, list_tags=False))
        sys.exit()
    elif args.tasks == "list_tags":
        print(task_manager.list_all_tasks(list_groups=False, list_subtasks=False))
        sys.exit()
    elif args.tasks == "list_subtasks":
        print(task_manager.list_all_tasks(list_groups=False, list_tags=False))
        sys.exit()
    else:
        if os.path.isdir(args.tasks):
            import glob

            task_names = []
            yaml_path = os.path.join(args.tasks, "*.yaml")
            for yaml_file in glob.glob(yaml_path):
                config = utils.load_yaml_config(yaml_file)
                task_names.append(config)
        else:
            task_list = args.tasks.split(",")
            task_names = task_manager.match_tasks(task_list)
            for task in [task for task in task_list if task not in task_names]:
                if os.path.isfile(task):
                    config = utils.load_yaml_config(task)
                    task_names.append(config)
            task_missing = [
                task for task in task_list if task not in task_names and "*" not in task
            ]  # we don't want errors if a wildcard ("*") task name was used

            if task_missing:
                missing = ", ".join(task_missing)
                eval_logger.error(
                    f"Tasks were not found: {missing}\n"
                    f"{utils.SPACING}Try `lm-eval --tasks list` for list of available tasks",
                )
                raise ValueError(
                    f"Tasks not found: {missing}. Try `lm-eval --tasks {{list_groups,list_subtasks,list_tags,list}}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above, or pass '--verbosity DEBUG' to troubleshoot task registration issues."
                )

    # Respect user's value passed in via CLI, otherwise default to True and add to comma-separated model args
    if args.trust_remote_code:
        eval_logger.info(
            "Passed `--trust_remote_code`, setting environment variable `HF_DATASETS_TRUST_REMOTE_CODE=true`"
        )
        # HACK: import datasets and override its HF_DATASETS_TRUST_REMOTE_CODE value internally,
        # because it's already been determined based on the prior env var before launching our
        # script--`datasets` gets imported by lm_eval internally before these lines can update the env.
        import datasets
        from packaging.version import parse as vparse

        if vparse(datasets.__version__) < vparse("4.0.0"):
            datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
        else:
            eval_logger.warning(
                "trust_remote_code and datasets scripts are no longer supported on datasets>=4.0.0. Skipping. If your task still requires this, please downgrade to datasets==3.6.0 or earlier."
            )

        if isinstance(args.model_args, dict):
            args.model_args["trust_remote_code"] = True
        else:
            args.model_args = args.model_args + ",trust_remote_code=True"
    (
        eval_logger.info(f"Selected Tasks: {task_names}")
        if eval_logger.getEffectiveLevel() >= logging.INFO
        else print(f"Selected Tasks: {task_names}")
    )

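    # Translate the --cache_requests choice into the evaluator's caching kwargs.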
    request_caching_args = request_caching_arg_to_dict(
        cache_requests=args.cache_requests
    )

    results = evaluator.simple_evaluate(
        model=args.model,
        model_args=args.model_args,
        tasks=task_names,
        num_fewshot=args.num_fewshot,
        batch_size=args.batch_size,
        max_batch_size=args.max_batch_size,
        device=args.device,
        use_cache=args.use_cache,
        limit=args.limit,
        samples=args.samples,
        check_integrity=args.check_integrity,
        write_out=args.write_out,
        log_samples=args.log_samples,
        evaluation_tracker=evaluation_tracker,
        system_instruction=args.system_instruction,
        apply_chat_template=args.apply_chat_template,
        fewshot_as_multiturn=args.fewshot_as_multiturn,
        gen_kwargs=args.gen_kwargs,
        task_manager=task_manager,
        predict_only=args.predict_only,
        random_seed=args.seed[0],
        numpy_random_seed=args.seed[1],
        torch_random_seed=args.seed[2],
        fewshot_random_seed=args.seed[3],
        confirm_run_unsafe_code=args.confirm_run_unsafe_code,
        metadata=metadata,
        **request_caching_args,
    )

    if results is not None:
        if args.log_samples:
            samples = results.pop("samples")
        dumped = json.dumps(
            results, indent=2, default=handle_non_serializable, ensure_ascii=False
        )
        if args.show_config:
            print(dumped)

        batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))

        # Add W&B logging
        if args.wandb_args:
            try:
                wandb_logger.post_init(results)
                wandb_logger.log_eval_result()
                if args.log_samples:
                    wandb_logger.log_eval_samples(samples)
            except Exception as e:
                eval_logger.info(f"Logging to Weights and Biases failed due to {e}")

        evaluation_tracker.save_results_aggregated(
            results=results, samples=samples if args.log_samples else None
        )

        if args.log_samples:
            for task_name in results["configs"]:
                evaluation_tracker.save_results_samples(
                    task_name=task_name, samples=samples[task_name]
                )

        if (
            evaluation_tracker.push_results_to_hub
            or evaluation_tracker.push_samples_to_hub
        ):
            evaluation_tracker.recreate_metadata_card()

        print(
            f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
            f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
        )
        print(make_table(results))
        if "groups" in results:
            print(make_table(results, "groups"))

        if args.wandb_args:
            # Tear down wandb run once all the logging is done.
            wandb_logger.run.finish()


if __name__ == "__main__":
    cli_evaluate()