eval.py 2.63 KB
Newer Older
Baber's avatar
Baber committed
1
2
import argparse
import sys
Baber's avatar
cleanup  
Baber committed
3
import textwrap
Baber's avatar
Baber committed
4

Baber's avatar
cleanup  
Baber committed
5
from lm_eval._cli.list import List
Baber's avatar
cleanup  
Baber committed
6
from lm_eval._cli.run import Run
Baber's avatar
cleanup  
Baber committed
7
from lm_eval._cli.validate import Validate
Baber's avatar
Baber committed
8
9


Baber's avatar
cleanup  
Baber committed
10
11
class Eval:
    """Main CLI parser that manages all subcommands.

    Builds the top-level ``lm-eval`` argument parser, lets ``Run``, ``List``
    and ``Validate`` register themselves on its subparser group, and keeps
    the legacy command-less invocation working by routing it to ``run``.
    """

    # Legacy ``--tasks`` values that are rejected in favour of the ``list``
    # subcommand.
    _LEGACY_LIST_TASKS = ("list", "list_groups", "list_subtasks", "list_tags")
    # A lone top-level help flag must keep showing the main help, not run's.
    _HELP_FLAGS = ("-h", "--help")

    def __init__(self) -> None:
        """Create the main parser and register the subcommands on it."""
        self._parser = argparse.ArgumentParser(
            prog="lm-eval",
            description="Language Model Evaluation Harness",
            epilog=textwrap.dedent("""
                quick start:
                  # Basic evaluation
                  lm-eval run --model hf --model_args pretrained=gpt2 --tasks hellaswag

                  # List available tasks
                  lm-eval list tasks

                  # Validate task configurations
                  lm-eval validate --tasks hellaswag,arc_easy

                legacy compatibility:
                  The harness maintains backward compatibility with the original interface.
                  If no command is specified, 'run' is automatically inserted:

                  lm-eval --model hf --tasks hellaswag  # Equivalent to 'lm-eval run --model hf --tasks hellaswag'

                For documentation, visit: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md
            """),
            # Keep the hand-formatted epilog above exactly as written.
            formatter_class=argparse.RawDescriptionHelpFormatter,
        )
        # Fallback action when no subcommand sets its own ``func``: show help.
        self._parser.set_defaults(func=lambda args: self._parser.print_help())
        self._subparsers = self._parser.add_subparsers(
            dest="command", help="Available commands", metavar="COMMAND"
        )
        Run.create(self._subparsers)
        List.create(self._subparsers)
        Validate.create(self._subparsers)

    def parse_args(self) -> argparse.Namespace:
        """Parse ``sys.argv`` using the main parser.

        For legacy compatibility, ``"run"`` is inserted into ``sys.argv``
        (in place) when the first argument is not a registered subcommand
        and either:

        * more than one argument was given, or
        * the only argument is an option other than ``-h``/``--help``
          (for example ``--tasks=hellaswag``).

        A lone help flag or a lone bare word is left for the main parser to
        handle unchanged.

        Returns:
            The namespace produced by the main parser.
        """
        cli_args = sys.argv[1:]
        if cli_args and cli_args[0] not in self._subparsers.choices:
            first = cli_args[0]
            lone_option = first.startswith("-") and first not in self._HELP_FLAGS
            if len(cli_args) > 1 or lone_option:
                # Arguments provided but no valid subcommand - insert 'run'
                sys.argv.insert(1, "run")
        return self._parser.parse_args()

    def execute(self, args: argparse.Namespace) -> None:
        """Run the action selected by the parsed arguments.

        Rejects the legacy ``--tasks list*`` values by printing a notice and
        the ``list`` subcommand help to stderr, then exiting with status 1.
        Otherwise calls ``args.func(args)``.

        Args:
            args: Parsed arguments; must carry a ``func`` callable.

        Raises:
            SystemExit: With code 1 when a legacy task-listing value is used.
        """
        # Handle legacy task listing
        if getattr(args, "tasks", None) in self._LEGACY_LIST_TASKS:
            print(
                f"'--tasks {args.tasks}' is no longer supported.\n"
                "Use the 'list' command instead:\n",
                file=sys.stderr,
            )

            # Show list command help on the same stream as the notice so the
            # two stay together (and in order) when output is redirected.
            list_parser = self._subparsers.choices["list"]
            list_parser.print_help(file=sys.stderr)
            sys.exit(1)

        args.func(args)