"configs/datasets/CLUE_C3/CLUE_C3_gen.py" did not exist on "7d346000bb8f1f7611f88dc8e003bdf8c9ae3ece"
run.py 16.7 KB
Newer Older
Baber's avatar
Baber committed
1
2
3
4
import argparse
import json
import logging
import os
Baber's avatar
cleanup  
Baber committed
5
import textwrap
Baber's avatar
Baber committed
6
7
from functools import partial

Baber's avatar
Baber committed
8
9
10
11
12
13
from lm_eval._cli import SubCommand
from lm_eval._cli.utils import (
    _int_or_none_list_arg_type,
    request_caching_arg_to_dict,
    try_parse_json,
)
Baber's avatar
Baber committed
14
15


Baber's avatar
Baber committed
16
class Run(SubCommand):
    """Command for running language model evaluation."""

    def __init__(self, subparsers: argparse._SubParsersAction, *args, **kwargs):
        """Register the `run` subcommand on *subparsers* and attach its arguments."""
        super().__init__(*args, **kwargs)
        self._parser = subparsers.add_parser(
            "run",
            help="Run the evaluation harness on specified tasks",
            description="Evaluate language models on various benchmarks and tasks.",
            usage="lm-eval run --model <model> --tasks <task1,task2,...> [options]",
            epilog=textwrap.dedent("""
                examples:
                  # Basic evaluation with HuggingFace model
                  $ lm-eval run --model hf --model_args pretrained=gpt2 --tasks hellaswag

                  # Evaluate on multiple tasks with few-shot examples
                  $ lm-eval run --model vllm --model_args pretrained=EleutherAI/gpt-j-6B --tasks arc_easy,arc_challenge --num_fewshot 5

                  # Evaluation with custom generation parameters
                  $ lm-eval run --model hf --model_args pretrained=gpt2 --tasks lambada --gen_kwargs "temperature=0.8,top_p=0.95"

                  # Use configuration file
                  $ lm-eval run --config my_config.yaml --tasks mmlu

                For more information, see: https://github.com/EleutherAI/lm-evaluation-harness
            """),
            formatter_class=argparse.RawDescriptionHelpFormatter,
        )
        self._add_args()
        # Default action prints help; the real dispatch func is presumably
        # installed elsewhere (e.g. by the CLI entry point) -- TODO confirm.
        self._parser.set_defaults(func=lambda arg: self._parser.print_help())

    def _add_args(self) -> None:
        """Attach all `run` command-line options to the parser."""
        # (Removed a no-op `self._parser = self._parser` self-assignment.)
        self._parser.add_argument(
            "--config",
            "-C",
            default=None,
            type=str,
            metavar="DIR/file.yaml",
            help="Path to config with all arguments for `lm-eval`",
        )
        self._parser.add_argument(
            "--model",
            "-m",
            type=str,
            default="hf",
            help="Name of model. Default 'hf'",
        )
        self._parser.add_argument(
            "--tasks",
            "-t",
            default=None,
            type=str,
            metavar="task1,task2",
            help="Comma-separated list of task names or task groupings to evaluate on.\nTo get full list of tasks, use one of the commands `lm-eval --tasks {{list_groups,list_subtasks,list_tags,list}}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above",
        )
        self._parser.add_argument(
            "--model_args",
            "-a",
            default=None,
            type=try_parse_json,
            help="""Comma separated string or JSON formatted arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32` or '{"pretrained":"EleutherAI/pythia-160m","dtype":"float32"}'.""",
        )
        self._parser.add_argument(
            "--num_fewshot",
            "-f",
            type=int,
            default=None,
            metavar="N",
            help="Number of examples in few-shot context",
        )
        self._parser.add_argument(
            "--batch_size",
            "-b",
            type=str,
            default=argparse.SUPPRESS,
            metavar="auto|auto:N|N",
            help="Acceptable values are 'auto', 'auto:N' (recompute batchsize N times with time) or N, where N is an integer. Default 1.",
        )
        self._parser.add_argument(
            "--max_batch_size",
            type=int,
            default=None,
            metavar="N",
            help="Maximal batch size to try with --batch_size auto.",
        )
        self._parser.add_argument(
            "--device",
            type=str,
            default=None,
            help="Device to use (e.g. cuda, cuda:0, cpu). Model defaults. Default None.",
        )
        self._parser.add_argument(
            "--output_path",
            "-o",
            default=None,
            type=str,
            metavar="DIR|DIR/file.json",
            help="Path where result metrics will be saved. Can be either a directory or a .json file. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.",
        )
        self._parser.add_argument(
            "--limit",
            "-L",
            type=float,
            default=None,
            metavar="N|0<N<1",
            help="Limit the number of examples per task. "
            "If <1, limit is a percentage of the total number of examples.",
        )
        self._parser.add_argument(
            "--samples",
            "-E",
            default=None,
            type=try_parse_json,
            metavar="/path/to/json",
            help='JSON string or path to JSON file containing doc indices of selected examples to test. Format: {"task_name":[indices],...}',
        )
        self._parser.add_argument(
            "--use_cache",
            "-c",
            type=str,
            default=None,
            metavar="DIR",
            help="A path to a sqlite db file for caching model responses. `None` if not caching.",
        )
        # NOTE(review): argparse validates `choices` against the value *after*
        # `type` conversion; confirm request_caching_arg_to_dict returns one of
        # these strings (and conversion to a dict happens later in the config),
        # otherwise any use of this flag is rejected at parse time.
        self._parser.add_argument(
            "--cache_requests",
            type=request_caching_arg_to_dict,
            default=None,
            choices=["true", "refresh", "delete"],
            help="Speed up evaluation by caching the building of dataset requests. `None` if not caching.",
        )
        self._parser.add_argument(
            "--check_integrity",
            action="store_true",
            default=argparse.SUPPRESS,
            help="Whether to run the relevant part of the test suite for the tasks.",
        )
        self._parser.add_argument(
            "--write_out",
            "-w",
            action="store_true",
            default=argparse.SUPPRESS,
            help="Prints the prompt for the first few documents.",
        )
        self._parser.add_argument(
            "--log_samples",
            "-s",
            action="store_true",
            default=argparse.SUPPRESS,
            help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis. Use with --output_path.",
        )
        self._parser.add_argument(
            "--system_instruction",
            type=str,
            default=None,
            help="System instruction to be used in the prompt",
        )
        self._parser.add_argument(
            "--apply_chat_template",
            type=str,
            nargs="?",
            const=True,
            default=argparse.SUPPRESS,
            help=(
                "If True, apply chat template to the prompt. "
                "Providing `--apply_chat_template` without an argument will apply the default chat template to the prompt. "
                "To apply a specific template from the available list of templates, provide the template name as an argument. "
                "E.g. `--apply_chat_template template_name`"
            ),
        )
        self._parser.add_argument(
            "--fewshot_as_multiturn",
            action="store_true",
            default=argparse.SUPPRESS,
            help="If True, uses the fewshot as a multi-turn conversation",
        )
        self._parser.add_argument(
            "--show_config",
            action="store_true",
            default=argparse.SUPPRESS,
            # Fixed doubled word in help text ("the the").
            help="If True, shows the full config of all tasks at the end of the evaluation.",
        )
        self._parser.add_argument(
            "--include_path",
            type=str,
            default=None,
            metavar="DIR",
            help="Additional path to include if there are external tasks to include.",
        )
        self._parser.add_argument(
            "--gen_kwargs",
            type=try_parse_json,
            default=None,
            # Fixed the example in the help string: it previously showed
            # malformed JSON ('{"do_sample": True, temperature":0.7,...}').
            help=(
                "Either comma delimited string or JSON formatted arguments for model generation on greedy_until tasks,"
                """ e.g. '{"do_sample": true, "temperature": 0.7, "until": ["hello"]}' or temperature=0,top_p=0.1."""
            ),
        )
        self._parser.add_argument(
            "--verbosity",
            "-v",
            type=str.upper,
            default=None,
            metavar="CRITICAL|ERROR|WARNING|INFO|DEBUG",
            help="(Deprecated) Controls logging verbosity level. Use the `LOGLEVEL` environment variable instead. Set to DEBUG for detailed output when testing or adding new task configurations.",
        )
        self._parser.add_argument(
            "--wandb_args",
            type=str,
            default=argparse.SUPPRESS,
            help="Comma separated string arguments passed to wandb.init, e.g. `project=lm-eval,job_type=eval`",
        )
        self._parser.add_argument(
            "--wandb_config_args",
            type=str,
            default=argparse.SUPPRESS,
            help="Comma separated string arguments passed to wandb.config.update. Use this to trace parameters that aren't already traced by default. eg. `lr=0.01,repeats=3`",
        )
        self._parser.add_argument(
            "--hf_hub_log_args",
            type=str,
            default=argparse.SUPPRESS,
            help="Comma separated string arguments passed to Hugging Face Hub's log function, e.g. `hub_results_org=EleutherAI,hub_repo_name=lm-eval-results`",
        )
        self._parser.add_argument(
            "--predict_only",
            "-x",
            action="store_true",
            default=argparse.SUPPRESS,
            help="Use with --log_samples. Only model outputs will be saved and metrics will not be evaluated.",
        )
        default_seed_string = "0,1234,1234,1234"
        self._parser.add_argument(
            "--seed",
            type=partial(_int_or_none_list_arg_type, 3, 4, default_seed_string),
            default=default_seed_string,  # for backward compatibility
            help=(
                "Set seed for python's random, numpy, torch, and fewshot sampling.\n"
                "Accepts a comma-separated list of 4 values for python's random, numpy, torch, and fewshot sampling seeds, "
                "respectively, or a single integer to set the same seed for all four.\n"
                f"The values are either an integer or 'None' to not set the seed. Default is `{default_seed_string}` "
                "(for backward compatibility).\n"
                "E.g. `--seed 0,None,8,52` sets `random.seed(0)`, `torch.manual_seed(8)`, and fewshot sampling seed to 52. "
                "Here numpy's seed is not set since the second value is `None`.\n"
                "E.g, `--seed 42` sets all four seeds to 42."
            ),
        )
        self._parser.add_argument(
            "--trust_remote_code",
            action="store_true",
            default=argparse.SUPPRESS,
            help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub",
        )
        self._parser.add_argument(
            "--confirm_run_unsafe_code",
            action="store_true",
            default=argparse.SUPPRESS,
            help="Confirm that you understand the risks of running unsafe code for tasks that require it",
        )
        self._parser.add_argument(
            "--metadata",
            type=json.loads,
            default=None,
            help="""JSON string metadata to pass to task configs, for example '{"max_seq_lengths":[4096,8192]}'. Will be merged with model_args. Can also be set in task config.""",
        )

    def execute(self, args: argparse.Namespace) -> None:
        """Runs the evaluation harness with the provided arguments."""
        # Imports are deferred so `lm-eval run --help` stays fast.
        from lm_eval.config.evaluate_config import EvaluatorConfig

        # Create and validate config (most validation now happens in EvaluationConfig)
        cfg = EvaluatorConfig.from_cli(args)

        from lm_eval import simple_evaluate, utils
        from lm_eval.loggers import EvaluationTracker, WandbLogger
        from lm_eval.utils import handle_non_serializable, make_table

        # Set up W&B logging before anything else so early results can be traced.
        if cfg.wandb_args:
            wandb_logger = WandbLogger(cfg.wandb_args, cfg.wandb_config_args)

        utils.setup_logging(cfg.verbosity)
        eval_logger = logging.getLogger(__name__)
        os.environ["TOKENIZERS_PARALLELISM"] = "false"

        # Set up evaluation tracker; forward output path and HF token if present.
        if cfg.output_path:
            cfg.hf_hub_log_args["output_path"] = cfg.output_path

        if os.environ.get("HF_TOKEN", None):
            cfg.hf_hub_log_args["token"] = os.environ.get("HF_TOKEN")

        evaluation_tracker = EvaluationTracker(**cfg.hf_hub_log_args)

        # Create task manager (metadata already set up in config validation)
        task_manager = cfg.process_tasks()

        # Validation warnings (keep these in CLI as they're logging-specific)
        if "push_samples_to_hub" in cfg.hf_hub_log_args and not cfg.log_samples:
            eval_logger.warning(
                "Pushing samples to the Hub requires --log_samples to be set."
            )

        # Log task selection (tasks already processed in config)
        if cfg.include_path is not None:
            eval_logger.info(f"Including path: {cfg.include_path}")
        eval_logger.info(f"Selected Tasks: {cfg.tasks}")

        # BUGFIX: --cache_requests defaults to None, so guard the .get() calls
        # below against a missing dict instead of crashing with AttributeError.
        cache_cfg = cfg.cache_requests or {}

        # Run evaluation
        results = simple_evaluate(
            model=cfg.model,
            model_args=cfg.model_args,
            tasks=cfg.tasks,
            num_fewshot=cfg.num_fewshot,
            batch_size=cfg.batch_size,
            max_batch_size=cfg.max_batch_size,
            device=cfg.device,
            use_cache=cfg.use_cache,
            cache_requests=cache_cfg.get("cache_requests", False),
            rewrite_requests_cache=cache_cfg.get("rewrite_requests_cache", False),
            delete_requests_cache=cache_cfg.get("delete_requests_cache", False),
            limit=cfg.limit,
            samples=cfg.samples,
            check_integrity=cfg.check_integrity,
            write_out=cfg.write_out,
            log_samples=cfg.log_samples,
            evaluation_tracker=evaluation_tracker,
            system_instruction=cfg.system_instruction,
            apply_chat_template=cfg.apply_chat_template,
            fewshot_as_multiturn=cfg.fewshot_as_multiturn,
            gen_kwargs=cfg.gen_kwargs,
            task_manager=task_manager,
            verbosity=cfg.verbosity,
            predict_only=cfg.predict_only,
            random_seed=cfg.seed[0] if cfg.seed else None,
            numpy_random_seed=cfg.seed[1] if cfg.seed else None,
            torch_random_seed=cfg.seed[2] if cfg.seed else None,
            fewshot_random_seed=cfg.seed[3] if cfg.seed else None,
            confirm_run_unsafe_code=cfg.confirm_run_unsafe_code,
            metadata=cfg.metadata,
        )

        # Process results (simple_evaluate returns None on non-primary ranks)
        if results is not None:
            if cfg.log_samples:
                samples = results.pop("samples")

            dumped = json.dumps(
                results, indent=2, default=handle_non_serializable, ensure_ascii=False
            )
            if cfg.show_config:
                print(dumped)

            batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))

            # W&B logging is best-effort: failures are logged, never fatal.
            if cfg.wandb_args:
                try:
                    wandb_logger.post_init(results)
                    wandb_logger.log_eval_result()
                    if cfg.log_samples:
                        wandb_logger.log_eval_samples(samples)
                except Exception as e:
                    eval_logger.info(f"Logging to W&B failed: {e}")

            # Save results (`samples` is only bound when log_samples is set)
            evaluation_tracker.save_results_aggregated(
                results=results, samples=samples if cfg.log_samples else None
            )

            if cfg.log_samples:
                # Only the task names are needed here, not the configs.
                for task_name in results["configs"]:
                    evaluation_tracker.save_results_samples(
                        task_name=task_name, samples=samples[task_name]
                    )

            if (
                evaluation_tracker.push_results_to_hub
                or evaluation_tracker.push_samples_to_hub
            ):
                evaluation_tracker.recreate_metadata_card()

            # Print summary line and result tables
            print(
                f"{cfg.model} ({cfg.model_args}), gen_kwargs: ({cfg.gen_kwargs}), "
                f"limit: {cfg.limit}, num_fewshot: {cfg.num_fewshot}, "
                f"batch_size: {cfg.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
            )
            print(make_table(results))
            if "groups" in results:
                print(make_table(results, "groups"))

            if cfg.wandb_args:
                wandb_logger.run.finish()