harness.py 2.33 KB
Newer Older
Baber's avatar
Baber committed
1
2
import argparse
import sys
Baber's avatar
cleanup  
Baber committed
3
import textwrap
Baber's avatar
Baber committed
4

Baber's avatar
nit  
Baber committed
5
from lm_eval._cli.ls import List
Baber's avatar
cleanup  
Baber committed
6
from lm_eval._cli.run import Run
Baber's avatar
cleanup  
Baber committed
7
from lm_eval._cli.validate import Validate
Baber's avatar
Baber committed
8
9


Baber's avatar
nit  
Baber committed
10
class HarnessCLI:
    """Main CLI parser that manages all subcommands.

    Builds the top-level ``lm-eval`` argument parser, lets each command class
    (``Run``, ``List``, ``Validate``) attach itself to the shared subparser
    group through its ``create`` hook, and dispatches the parsed namespace to
    the ``func`` callable stored on it.
    """

    def __init__(self) -> None:
        """Create the top-level parser and register the subcommands."""
        self._parser = argparse.ArgumentParser(
            prog="lm-eval",
            description="Language Model Evaluation Harness",
            epilog=textwrap.dedent("""
                quick start:
                  # Basic evaluation
                  lm-eval run --model hf --model_args pretrained=gpt2 --tasks hellaswag

                  # List available tasks
                  lm-eval ls tasks

                  # Validate task configurations
                  lm-eval validate --tasks hellaswag,arc_easy

                legacy compatibility:
                  The harness maintains backward compatibility with the original interface.
                  If no command is specified, 'run' is automatically inserted:

                  lm-eval --model hf --tasks hellaswag  # Equivalent to 'lm-eval run --model hf --tasks hellaswag'

                For documentation, visit: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md
            """),
            # Keep the hand-formatted epilog exactly as written above.
            formatter_class=argparse.RawDescriptionHelpFormatter,
        )
        # Fallback action for the top-level parser: print the general help.
        # NOTE(review): subcommands presumably install their own ``func``
        # default in ``create`` -- confirm against Run/List/Validate.
        self._parser.set_defaults(func=lambda args: self._parser.print_help())
        self._subparsers = self._parser.add_subparsers(
            dest="command", help="Available commands", metavar="COMMAND"
        )
        Run.create(self._subparsers)
        List.create(self._subparsers)
        Validate.create(self._subparsers)

    def parse_args(self) -> argparse.Namespace:
        """Parse ``sys.argv`` using the main parser.

        Two special cases are handled before delegating to argparse:

        * Legacy invocation: more than one argument was given and the first
          one is not a registered subcommand, so ``"run"`` is inserted into
          ``sys.argv`` (in place) as the subcommand.
        * Bare ``run``: the only argument is ``run``, so the help of the
          ``run`` subparser is printed and the process exits with status 0.

        Returns:
            The namespace produced by the top-level parser.

        Raises:
            SystemExit: For the bare ``run`` case, and whenever argparse
                itself exits (``--help``, invalid arguments).
        """
        if len(sys.argv) > 2 and sys.argv[1] not in self._subparsers.choices:
            # Backward compatibility: arguments provided but no valid
            # subcommand - insert 'run'.
            sys.argv.insert(1, "run")
        elif len(sys.argv) == 2 and sys.argv[1] == "run":
            # Only 'run' was given. Compare the argument itself rather than
            # searching all of sys.argv, so a program name that happens to be
            # "run" cannot trigger this branch for another subcommand.
            self._subparsers.choices["run"].print_help()
            sys.exit(0)
        return self._parser.parse_args()

    def execute(self, args: argparse.Namespace) -> None:
        """Run the handler stored on ``args.func``, passing it ``args``.

        Args:
            args: Parsed namespace; must carry a callable ``func`` attribute.
        """
        args.func(args)