validate.py 4.87 KB
Newer Older
Baber's avatar
Baber committed
1
2
import argparse
import sys
Baber's avatar
Baber committed
3
import textwrap
Baber's avatar
Baber committed
4

Baber's avatar
Baber committed
5
from lm_eval._cli.subcommand import SubCommand
Baber's avatar
Baber committed
6
7


Baber's avatar
Baber committed
8
class Validate(SubCommand):
    """Command for validating tasks.

    Registers a ``validate`` sub-parser on the supplied subparsers action.
    When run, it checks that every task name passed via ``--tasks`` is among
    the names returned by ``TaskManager.match_tasks``.
    """

    def __init__(self, subparsers: argparse._SubParsersAction, *args, **kwargs):
        """Create the ``validate`` sub-parser and bind it to ``_execute``.

        Args:
            subparsers: Sub-parsers action to which the ``validate`` parser
                is added.
            *args: Forwarded unchanged to ``SubCommand.__init__``.
            **kwargs: Forwarded unchanged to ``SubCommand.__init__``.
        """
        super().__init__(*args, **kwargs)
        # Create and configure self._parser. RawDescriptionHelpFormatter keeps
        # the hand-formatted epilog below exactly as written in --help output.
        self._parser = subparsers.add_parser(
            "validate",
            help="Validate task configurations",
            description="Validate task configurations and check for errors.",
            usage="lm-eval validate --tasks <task1,task2> [--include_path DIR]",
            epilog=textwrap.dedent("""
                examples:
                  # Validate a single task
                  lm-eval validate --tasks hellaswag

                  # Validate multiple tasks
                  lm-eval validate --tasks arc_easy,arc_challenge,hellaswag

                  # Validate a task group
                  lm-eval validate --tasks mmlu

                  # Validate tasks with external definitions
                  lm-eval validate --tasks my_custom_task --include_path ./custom_tasks

                  # Validate tasks from multiple external paths
                  lm-eval validate --tasks custom_task1,custom_task2 --include_path "/path/to/tasks1:/path/to/tasks2"

                validation check:
                  The validate command performs several checks:
                  • Task existence: Verifies all specified tasks are available
                  • Configuration syntax: Checks YAML/JSON configuration files
                  • Dataset access: Validates dataset paths and configurations
                  • Required fields: Ensures all mandatory task parameters are present
                  • Metric definitions: Verifies metric functions and aggregation methods
                  • Filter pipelines: Validates filter chains and their parameters
                  • Template rendering: Tests prompt templates with sample data

                task config files:
                  Tasks are defined using YAML configuration files with these key sections:
                  • task: Task name and metadata
                  • dataset_path: HuggingFace dataset identifier
                  • doc_to_text: Template for converting documents to prompts
                  • doc_to_target: Template for extracting target answers
                  • metric_list: List of evaluation metrics to compute
                  • output_type: Type of model output (loglikelihood, generate_until, etc.)
                  • filter_list: Post-processing filters for model outputs

                common errors:
                  • Missing required fields in YAML configuration
                  • Invalid dataset paths or missing dataset splits
                  • Malformed Jinja2 templates in doc_to_text/doc_to_target
                  • Undefined metrics or aggregation functions
                  • Invalid filter names or parameters
                  • Circular dependencies in task inheritance
                  • Missing external task files when using --include_path

                debugging tips:
                  • Use --include_path to test external task definitions
                  • Check task configuration files for syntax errors
                  • Verify dataset access and authentication if needed
                  • Use 'lm-eval list tasks' to see available tasks

                For task configuration guide, see: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/task_guide.md
            """),
            formatter_class=argparse.RawDescriptionHelpFormatter,
        )
        self._add_args()
        self._parser.set_defaults(func=self._execute)

    def _add_args(self) -> None:
        """Register the ``--tasks`` and ``--include_path`` options on the parser."""
        self._parser.add_argument(
            "--tasks",
            "-t",
            required=True,
            type=str,
            metavar="TASK1,TASK2",
            help="Comma-separated list of task names to validate",
        )
        self._parser.add_argument(
            "--include_path",
            type=str,
            default=None,
            metavar="DIR",
            help="Additional path to include if there are external tasks.",
        )

    def _execute(self, args: argparse.Namespace) -> None:
        """Execute the validate command.

        Args:
            args: Parsed command-line arguments; ``args.tasks`` and
                ``args.include_path`` are read.

        Raises:
            SystemExit: With status 1 when no usable task name was supplied
                or when at least one requested name was not found.
        """
        # Imported here rather than at module level, so it is only loaded
        # when the command actually runs.
        from lm_eval.tasks import TaskManager

        # Tolerate spaces around names ("a, b") and stray commas ("a,,b",
        # "a,"), which would otherwise be looked up verbatim and reported
        # as missing tasks.
        stripped = (name.strip() for name in args.tasks.split(","))
        task_list = [name for name in stripped if name]
        if not task_list:
            print("No tasks specified")
            sys.exit(1)

        task_manager = TaskManager(include_path=args.include_path)

        print(f"Validating tasks: {task_list}")
        # For now, just validate that tasks exist: a requested name counts as
        # found only if it appears verbatim in what match_tasks returns.
        known = set(task_manager.match_tasks(task_list))
        task_missing = [task for task in task_list if task not in known]

        if task_missing:
            missing = ", ".join(task_missing)
            print(f"Tasks not found: {missing}")
            sys.exit(1)

        print("All tasks found and valid")