__main__.py 19.7 KB
Newer Older
1
2
3
import argparse
import json
import logging
lintangsutawika's avatar
lintangsutawika committed
4
import os
5
import sys
6
from functools import partial
7
from pathlib import Path
haileyschoelkopf's avatar
haileyschoelkopf committed
8
from typing import Union
Leo Gao's avatar
Leo Gao committed
9

10
11
import lm_eval.tasks

Baber Abbasi's avatar
Baber Abbasi committed
12
13
14
15
16
17
18
19
20
21
22
23

def try_parse_json(value: str) -> Union[str, dict, None]:
    """Argparse ``type`` callable accepting either a JSON object or a plain string.

    Args:
        value: Raw command-line value (or ``None``).

    Returns:
        ``None`` if ``value`` is ``None``; the decoded ``dict`` if ``value`` is a
        JSON object; otherwise ``value`` itself, unchanged, so that callers can
        treat it as a comma-separated ``key=value`` string.

    Raises:
        argparse.ArgumentTypeError: If ``value`` is not valid JSON but contains
            ``{``, i.e. it looks like a malformed JSON object.
    """
    if value is None:
        return None
    try:
        parsed = json.loads(value)
    except json.JSONDecodeError as err:
        if "{" in value:
            raise argparse.ArgumentTypeError(
                f"Invalid JSON: {value}. Hint: Use double quotes for JSON strings."
            ) from err
        return value
    # `json.loads` also decodes scalars and arrays ("123", "true", "[1]").
    # Those are not argument mappings, so hand back the raw string rather than
    # leaking an int/bool/list that violates the declared return type.
    return parsed if isinstance(parsed, dict) else value
Fabrizio Milo's avatar
Fabrizio Milo committed
24

25

26
27
28
def _int_or_none_list_arg_type(
    min_len: int, max_len: int, defaults: str, value: str, split_char: str = ","
) -> list:
    """Parse a delimited string of integers / ``None`` into a list of ``max_len`` items.

    Args:
        min_len: Minimum number of items accepted when more than one is given.
        max_len: Maximum number of items accepted; also the length of the result.
        defaults: Delimited string of fallback values, used to fill the tail
            when fewer than ``max_len`` items are supplied.
        value: The raw argument string to parse.
        split_char: Delimiter separating the items in ``value`` and ``defaults``.

    Returns:
        A list of ``int`` or ``None`` entries. A single item is repeated
        ``max_len`` times; a short list is padded from ``defaults``.

    Raises:
        argparse.ArgumentTypeError: If an item is neither an integer nor
            ``none`` (case-insensitive), or if the item count is outside
            ``[min_len, max_len]`` (a single item is always accepted).
    """

    def parse_value(item):
        item = item.strip().lower()
        if item == "none":
            return None
        try:
            return int(item)
        except ValueError:
            # The int() traceback adds nothing to the user-facing message.
            raise argparse.ArgumentTypeError(
                f"{item} is not an integer or None"
            ) from None

    items = [parse_value(v) for v in value.split(split_char)]
    num_items = len(items)

    if num_items == 1:
        # Makes downstream handling the same for single and multiple values
        items = items * max_len
    elif num_items < min_len or num_items > max_len:
        raise argparse.ArgumentTypeError(
            f"Argument requires {max_len} integers or None, separated by '{split_char}'"
        )
    elif num_items != max_len:
        logging.warning(
            f"Argument requires {max_len} integers or None, separated by '{split_char}'. "
            "Missing values will be filled with defaults."
        )
        default_items = [parse_value(v) for v in defaults.split(split_char)]
        # extend items list with missing defaults
        items.extend(default_items[num_items:])

    return items


61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
def check_argument_types(parser: argparse.ArgumentParser) -> None:
    """Check that every value-taking CLI argument declares a ``type``.

    Actions that are exempt from the check: the ``help`` action, any action
    with a truthy ``const``, and any action that consumes no command-line
    value (``nargs == 0``, e.g. ``store_true`` / ``store_false`` / ``count``).

    Args:
        parser: The parser whose registered actions are inspected.

    Raises:
        ValueError: If a non-exempt action has ``type`` set to ``None``.
    """
    # NOTE: `_actions` is a private attribute of ArgumentParser; it is read here
    # because the actions are not reachable through a public accessor.
    for action in parser._actions:
        # `nargs == 0` covers flags whose const is falsy (e.g. `store_false`),
        # which the truthiness test on `const` alone would not exempt.
        if action.dest == "help" or action.const or action.nargs == 0:
            continue
        if action.type is None:
            raise ValueError(
                f"Argument '{action.dest}' doesn't have a type specified."
            )


def setup_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the evaluation command-line interface.

    Returns:
        An ``argparse.ArgumentParser`` (using ``RawTextHelpFormatter`` so that
        explicit newlines in help strings are preserved) with all evaluation
        options registered.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        "--model", "-m", type=str, default="hf", help="Name of model e.g. `hf`"
    )
    parser.add_argument(
        "--tasks",
        "-t",
        default=None,
        type=str,
        metavar="task1,task2",
        help="Comma-separated list of task names or task groupings to evaluate on.\nTo get full list of tasks, use one of the commands `lm-eval --tasks {list_groups,list_subtasks,list_tags,list}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above",
    )
    parser.add_argument(
        "--model_args",
        "-a",
        default="",
        type=try_parse_json,
        help="""Comma separated string or JSON formatted arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32` or '{"pretrained":"EleutherAI/pythia-160m","dtype":"float32"}'""",
    )
    parser.add_argument(
        "--num_fewshot",
        "-f",
        type=int,
        default=None,
        metavar="N",
        help="Number of examples in few-shot context",
    )
    parser.add_argument(
        "--batch_size",
        "-b",
        type=str,
        # NOTE: argparse only runs `type` over *string* defaults, so when the
        # flag is omitted the value is the int 1, not the string "1".
        default=1,
        metavar="auto|auto:N|N",
        help="Acceptable values are 'auto', 'auto:N' or N, where N is an integer. Default 1.",
    )
    parser.add_argument(
        "--max_batch_size",
        type=int,
        default=None,
        metavar="N",
        help="Maximal batch size to try with --batch_size auto.",
    )
    parser.add_argument(
        "--device",
        type=str,
        default=None,
        help="Device to use (e.g. cuda, cuda:0, cpu).",
    )
    parser.add_argument(
        "--output_path",
        "-o",
        default=None,
        type=str,
        metavar="DIR|DIR/file.json",
        help="Path where result metrics will be saved. Can be either a directory or a .json file. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.",
    )
    parser.add_argument(
        "--limit",
        "-L",
        type=float,
        default=None,
        metavar="N|0<N<1",
        help="Limit the number of examples per task. "
        "If <1, limit is a percentage of the total number of examples.",
    )
    parser.add_argument(
        "--samples",
        "-E",
        default=None,
        type=str,
        metavar="/path/to/json",
        help='JSON string or path to JSON file containing doc indices of selected examples to test. Format: {"task_name":[indices],...}',
    )
    parser.add_argument(
        "--use_cache",
        "-c",
        type=str,
        default=None,
        metavar="DIR",
        help="A path to a sqlite db file for caching model responses. `None` if not caching.",
    )
    parser.add_argument(
        "--cache_requests",
        type=str,
        default=None,
        choices=["true", "refresh", "delete"],
        help="Speed up evaluation by caching the building of dataset requests. `None` if not caching.",
    )
    parser.add_argument(
        "--check_integrity",
        action="store_true",
        help="Whether to run the relevant part of the test suite for the tasks.",
    )
    parser.add_argument(
        "--write_out",
        "-w",
        action="store_true",
        default=False,
        help="Prints the prompt for the first few documents.",
    )
    parser.add_argument(
        "--log_samples",
        "-s",
        action="store_true",
        default=False,
        help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis. Use with --output_path.",
    )
    parser.add_argument(
        "--system_instruction",
        type=str,
        default=None,
        help="System instruction to be used in the prompt",
    )
    parser.add_argument(
        "--apply_chat_template",
        type=str,
        # Optional value: bare flag -> const (True), flag absent -> default
        # (False), flag with a value -> that template name as a string.
        nargs="?",
        const=True,
        default=False,
        help=(
            "If True, apply chat template to the prompt. "
            "Providing `--apply_chat_template` without an argument will apply the default chat template to the prompt. "
            "To apply a specific template from the available list of templates, provide the template name as an argument. "
            "E.g. `--apply_chat_template template_name`"
        ),
    )
    parser.add_argument(
        "--fewshot_as_multiturn",
        action="store_true",
        default=False,
        help="If True, uses the fewshot as a multi-turn conversation",
    )
    parser.add_argument(
        "--show_config",
        action="store_true",
        default=False,
        help="If True, shows the full config of all tasks at the end of the evaluation.",
    )
    parser.add_argument(
        "--include_path",
        type=str,
        default=None,
        metavar="DIR",
        help="Additional path to include if there are external tasks to include.",
    )
    parser.add_argument(
        "--gen_kwargs",
        type=try_parse_json,
        default=None,
        help=(
            "Either comma delimited string or JSON formatted arguments for model generation on greedy_until tasks,"
            """ e.g. '{"temperature":0.7,"until":["hello"]}' or temperature=0,top_p=0.1."""
        ),
    )
    parser.add_argument(
        "--verbosity",
        "-v",
        type=str.upper,
        default=None,
        metavar="CRITICAL|ERROR|WARNING|INFO|DEBUG",
        help="(Deprecated) Controls logging verbosity level. Use the `LOGLEVEL` environment variable instead. Set to DEBUG for detailed output when testing or adding new task configurations.",
    )
    parser.add_argument(
        "--wandb_args",
        type=str,
        default="",
        help="Comma separated string arguments passed to wandb.init, e.g. `project=lm-eval,job_type=eval`",
    )
    parser.add_argument(
        "--wandb_config_args",
        type=str,
        default="",
        help="Comma separated string arguments passed to wandb.config.update. Use this to trace parameters that aren't already traced by default. e.g. `lr=0.01,repeats=3`",
    )
    parser.add_argument(
        "--hf_hub_log_args",
        type=str,
        default="",
        help="Comma separated string arguments passed to Hugging Face Hub's log function, e.g. `hub_results_org=EleutherAI,hub_repo_name=lm-eval-results`",
    )
    parser.add_argument(
        "--predict_only",
        "-x",
        action="store_true",
        default=False,
        help="Use with --log_samples. Only model outputs will be saved and metrics will not be evaluated.",
    )
    default_seed_string = "0,1234,1234,1234"
    parser.add_argument(
        "--seed",
        # Accept 3 or 4 comma-separated values (or a single one, which is
        # repeated); missing trailing values are taken from the default string.
        type=partial(_int_or_none_list_arg_type, 3, 4, default_seed_string),
        default=default_seed_string,  # for backward compatibility
        help=(
            "Set seed for python's random, numpy, torch, and fewshot sampling.\n"
            "Accepts a comma-separated list of 4 values for python's random, numpy, torch, and fewshot sampling seeds, "
            "respectively, or a single integer to set the same seed for all four.\n"
            f"The values are either an integer or 'None' to not set the seed. Default is `{default_seed_string}` "
            "(for backward compatibility).\n"
            "E.g. `--seed 0,None,8,52` sets `random.seed(0)`, `torch.manual_seed(8)`, and fewshot sampling seed to 52. "
            "Here numpy's seed is not set since the second value is `None`.\n"
            "E.g, `--seed 42` sets all four seeds to 42."
        ),
    )
    parser.add_argument(
        "--trust_remote_code",
        action="store_true",
        help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub",
    )
    parser.add_argument(
        "--confirm_run_unsafe_code",
        action="store_true",
        help="Confirm that you understand the risks of running unsafe code for tasks that require it",
    )
    parser.add_argument(
        "--metadata",
        type=json.loads,
        default=None,
        help="""JSON string metadata to pass to task configs, for example '{"max_seq_lengths":[4096,8192]}'. Will be merged with model_args. Can also be set in task config.""",
    )
    return parser


def parse_eval_args(parser: argparse.ArgumentParser) -> argparse.Namespace:
    """Validate the parser's argument definitions, then parse the command line.

    Args:
        parser: A fully configured argument parser.

    Returns:
        The namespace produced by ``parser.parse_args()``.

    Raises:
        ValueError: Propagated from ``check_argument_types`` when an argument
            is registered without a type.
    """
    check_argument_types(parser)
    return parser.parse_args()

Fabrizio Milo's avatar
Fabrizio Milo committed
302

haileyschoelkopf's avatar
haileyschoelkopf committed
303
304
305
def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
    """Entry point of the evaluation CLI: resolve tasks, run them, report results.

    The function validates option combinations, resolves ``--tasks`` into task
    names / YAML configs, calls ``evaluator.simple_evaluate`` and, when it
    returns results, saves them through the evaluation tracker, optionally
    logs them to Weights & Biases, and prints summary tables.

    Args:
        args: Pre-parsed arguments. When omitted, the arguments are parsed
            from the command line via ``setup_parser`` / ``parse_eval_args``.

    Raises:
        ValueError: If ``--log_samples`` / ``--predict_only`` is given without
            ``--output_path``; if ``--fewshot_as_multiturn`` is given without
            ``--apply_chat_template``; if ``--samples`` and ``--limit`` are
            combined; or if a requested task cannot be found.
        SystemExit: After printing a task listing (status 0), or when no task
            was specified (status 1).
    """
    if not args:
        # we allow for args to be passed externally, else we parse them ourselves
        parser = setup_parser()
        args = parse_eval_args(parser)

    # defer loading `lm_eval` submodules for faster CLI load
    from lm_eval import evaluator, utils
    from lm_eval.evaluator import request_caching_arg_to_dict
    from lm_eval.loggers import EvaluationTracker, WandbLogger
    from lm_eval.tasks import TaskManager
    from lm_eval.utils import (
        handle_non_serializable,
        make_table,
        simple_parse_args_string,
    )

    if args.wandb_args:
        wandb_args_dict = simple_parse_args_string(args.wandb_args)
        wandb_config_args_dict = simple_parse_args_string(args.wandb_config_args)
        wandb_logger = WandbLogger(wandb_args_dict, wandb_config_args_dict)

    utils.setup_logging(args.verbosity)
    eval_logger = logging.getLogger(__name__)
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    # update the evaluation tracker args with the output path and the HF token
    if args.output_path:
        args.hf_hub_log_args += f",output_path={args.output_path}"
    if os.environ.get("HF_TOKEN", None):
        args.hf_hub_log_args += f",token={os.environ.get('HF_TOKEN')}"
    evaluation_tracker_args = simple_parse_args_string(args.hf_hub_log_args)
    evaluation_tracker = EvaluationTracker(**evaluation_tracker_args)

    if args.predict_only:
        args.log_samples = True
    if (args.log_samples or args.predict_only) and not args.output_path:
        raise ValueError(
            "Specify --output_path if providing --log_samples or --predict_only"
        )

    if args.fewshot_as_multiturn and args.apply_chat_template is False:
        raise ValueError(
            "When `fewshot_as_multiturn` is selected, `apply_chat_template` must be set (either to `True` or to the chosen template name)."
        )

    if args.include_path is not None:
        eval_logger.info(f"Including path: {args.include_path}")

    # Task metadata is the model args (string or dict form) overlaid with
    # `--metadata`; on key collisions the `--metadata` value wins.
    metadata = (
        simple_parse_args_string(args.model_args)
        if isinstance(args.model_args, str)
        else args.model_args
        if isinstance(args.model_args, dict)
        else {}
    ) | (
        args.metadata
        if isinstance(args.metadata, dict)
        else simple_parse_args_string(args.metadata)
    )

    task_manager = TaskManager(include_path=args.include_path, metadata=metadata)

    if "push_samples_to_hub" in evaluation_tracker_args and not args.log_samples:
        eval_logger.warning(
            "Pushing samples to the Hub requires --log_samples to be set. Samples will not be pushed to the Hub."
        )

    if args.limit:
        eval_logger.warning(
            " --limit SHOULD ONLY BE USED FOR TESTING. "
            "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )
    if args.samples:
        # Explicit raise rather than `assert`, which is stripped under `python -O`.
        if args.limit is not None:
            raise ValueError("If --samples is not None, then --limit must be None.")
        samples_path = Path(args.samples)
        try:
            samples_is_file = samples_path.is_file()
        except OSError:
            # An inline JSON string can exceed the OS path-length limit;
            # in that case it is certainly not a file path.
            samples_is_file = False
        if samples_is_file:
            args.samples = json.loads(samples_path.read_text())
        else:
            args.samples = json.loads(args.samples)

    if args.tasks is None:
        eval_logger.error("Need to specify task to evaluate.")
        # Non-zero status so that shells / CI see this as a failure.
        sys.exit(1)
    elif args.tasks == "list":
        print(task_manager.list_all_tasks())
        sys.exit()
    elif args.tasks == "list_groups":
        print(task_manager.list_all_tasks(list_subtasks=False, list_tags=False))
        sys.exit()
    elif args.tasks == "list_tags":
        print(task_manager.list_all_tasks(list_groups=False, list_subtasks=False))
        sys.exit()
    elif args.tasks == "list_subtasks":
        print(task_manager.list_all_tasks(list_groups=False, list_tags=False))
        sys.exit()
    else:
        if os.path.isdir(args.tasks):
            import glob

            # A directory was given: load every YAML config directly inside it.
            task_names = []
            yaml_path = os.path.join(args.tasks, "*.yaml")
            for yaml_file in glob.glob(yaml_path):
                config = lm_eval.tasks.load_yaml_config(yaml_file)
                task_names.append(config)
        else:
            task_list = args.tasks.split(",")
            task_names = task_manager.match_tasks(task_list)
            # Entries that did not match a registered name may be paths to
            # YAML config files; load those in place.
            for task in [task for task in task_list if task not in task_names]:
                if os.path.isfile(task):
                    config = lm_eval.tasks.load_yaml_config(task)
                    task_names.append(config)
            task_missing = [
                task for task in task_list if task not in task_names and "*" not in task
            ]  # we don't want errors if a wildcard ("*") task name was used

            if task_missing:
                missing = ", ".join(task_missing)
                eval_logger.error(
                    f"Tasks were not found: {missing}\n"
                    f"{utils.SPACING}Try `lm-eval --tasks list` for list of available tasks",
                )
                raise ValueError(
                    f"Tasks not found: {missing}. Try `lm-eval --tasks {{list_groups,list_subtasks,list_tags,list}}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above, or pass '--verbosity DEBUG' to troubleshoot task registration issues."
                )

    # Respect user's value passed in via CLI, otherwise default to True and add to comma-separated model args
    if args.trust_remote_code:
        eval_logger.info(
            "Passed `--trust_remote_code`, setting environment variable `HF_DATASETS_TRUST_REMOTE_CODE=true`"
        )
        # HACK: import datasets and override its HF_DATASETS_TRUST_REMOTE_CODE value internally,
        # because it's already been determined based on the prior env var before launching our
        # script--`datasets` gets imported by lm_eval internally before these lines can update the env.
        import datasets
        from packaging.version import parse as vparse

        if vparse(datasets.__version__) < vparse("4.0.0"):
            datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True

        if isinstance(args.model_args, dict):
            args.model_args["trust_remote_code"] = True
        else:
            args.model_args = args.model_args + ",trust_remote_code=True"

    # Always surface the selected tasks: through the logger when INFO records
    # are emitted, otherwise directly on stdout.
    if eval_logger.isEnabledFor(logging.INFO):
        eval_logger.info(f"Selected Tasks: {task_names}")
    else:
        print(f"Selected Tasks: {task_names}")

    request_caching_args = request_caching_arg_to_dict(
        cache_requests=args.cache_requests
    )

    results = evaluator.simple_evaluate(
        model=args.model,
        model_args=args.model_args,
        tasks=task_names,
        num_fewshot=args.num_fewshot,
        batch_size=args.batch_size,
        max_batch_size=args.max_batch_size,
        device=args.device,
        use_cache=args.use_cache,
        limit=args.limit,
        samples=args.samples,
        check_integrity=args.check_integrity,
        write_out=args.write_out,
        log_samples=args.log_samples,
        evaluation_tracker=evaluation_tracker,
        system_instruction=args.system_instruction,
        apply_chat_template=args.apply_chat_template,
        fewshot_as_multiturn=args.fewshot_as_multiturn,
        gen_kwargs=args.gen_kwargs,
        task_manager=task_manager,
        predict_only=args.predict_only,
        random_seed=args.seed[0],
        numpy_random_seed=args.seed[1],
        torch_random_seed=args.seed[2],
        fewshot_random_seed=args.seed[3],
        confirm_run_unsafe_code=args.confirm_run_unsafe_code,
        metadata=metadata,
        **request_caching_args,
    )

    if results is not None:
        if args.log_samples:
            # Detach the per-sample records so they are not part of the
            # aggregated JSON dump below.
            samples = results.pop("samples")
        dumped = json.dumps(
            results, indent=2, default=handle_non_serializable, ensure_ascii=False
        )
        if args.show_config:
            print(dumped)

        batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))

        # Add W&B logging
        if args.wandb_args:
            try:
                wandb_logger.post_init(results)
                wandb_logger.log_eval_result()
                if args.log_samples:
                    wandb_logger.log_eval_samples(samples)
            except Exception as e:
                # Best effort: a W&B failure must not discard the evaluation results.
                eval_logger.info(f"Logging to Weights and Biases failed due to {e}")

        evaluation_tracker.save_results_aggregated(
            results=results, samples=samples if args.log_samples else None
        )

        if args.log_samples:
            for task_name in results["configs"]:
                evaluation_tracker.save_results_samples(
                    task_name=task_name, samples=samples[task_name]
                )

        if (
            evaluation_tracker.push_results_to_hub
            or evaluation_tracker.push_samples_to_hub
        ):
            evaluation_tracker.recreate_metadata_card()

        print(
            f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
            f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
        )
        print(make_table(results))
        if "groups" in results:
            print(make_table(results, "groups"))

        if args.wandb_args:
            # Tear down wandb run once all the logging is done.
            wandb_logger.run.finish()

536

Jason Phang's avatar
Jason Phang committed
537
# Run the evaluation CLI when this module is executed as a script.
if __name__ == "__main__":
    cli_evaluate()