validate.py 4.87 KB
Newer Older
Baber's avatar
Baber committed
1
2
import argparse
import sys
Baber's avatar
cleanup  
Baber committed
3
import textwrap
Baber's avatar
Baber committed
4

Baber's avatar
cleanup  
Baber committed
5
from lm_eval._cli.subcommand import SubCommand
Baber's avatar
Baber committed
6
7


Baber's avatar
cleanup  
Baber committed
8
class Validate(SubCommand):
    """Command for validating tasks.

    Registers the ``validate`` subcommand on an argparse sub-parser group and
    checks that every task name passed via ``--tasks`` is known to the task
    manager. Exits with status 1 when any requested task cannot be found.
    """

    def __init__(self, subparsers: argparse._SubParsersAction, *args, **kwargs):
        """Create the ``validate`` sub-parser and wire it to this command.

        Args:
            subparsers: The sub-parser group (as returned by
                ``ArgumentParser.add_subparsers()``) to register on.
            *args: Forwarded unchanged to ``SubCommand.__init__``.
            **kwargs: Forwarded unchanged to ``SubCommand.__init__``.
        """
        super().__init__(*args, **kwargs)
        # Create and configure the parser for this subcommand. The epilog is
        # pre-formatted text, hence RawDescriptionHelpFormatter below.
        self._parser = subparsers.add_parser(
            "validate",
            help="Validate task configurations",
            description="Validate task configurations and check for errors.",
            usage="lm-eval validate --tasks <task1,task2> [--include_path DIR]",
            epilog=textwrap.dedent("""
                examples:
                  # Validate a single task
                  lm-eval validate --tasks hellaswag

                  # Validate multiple tasks
                  lm-eval validate --tasks arc_easy,arc_challenge,hellaswag

                  # Validate a task group
                  lm-eval validate --tasks mmlu

                  # Validate tasks with external definitions
                  lm-eval validate --tasks my_custom_task --include_path ./custom_tasks

                  # Validate tasks from multiple external paths
                  lm-eval validate --tasks custom_task1,custom_task2 --include_path "/path/to/tasks1:/path/to/tasks2"

                validation check:
                  The validate command performs several checks:
                  • Task existence: Verifies all specified tasks are available
                  • Configuration syntax: Checks YAML/JSON configuration files
                  • Dataset access: Validates dataset paths and configurations
                  • Required fields: Ensures all mandatory task parameters are present
                  • Metric definitions: Verifies metric functions and aggregation methods
                  • Filter pipelines: Validates filter chains and their parameters
                  • Template rendering: Tests prompt templates with sample data

                task config files:
                  Tasks are defined using YAML configuration files with these key sections:
                  • task: Task name and metadata
                  • dataset_path: HuggingFace dataset identifier
                  • doc_to_text: Template for converting documents to prompts
                  • doc_to_target: Template for extracting target answers
                  • metric_list: List of evaluation metrics to compute
                  • output_type: Type of model output (loglikelihood, generate_until, etc.)
                  • filter_list: Post-processing filters for model outputs

                common errors:
                  • Missing required fields in YAML configuration
                  • Invalid dataset paths or missing dataset splits
                  • Malformed Jinja2 templates in doc_to_text/doc_to_target
                  • Undefined metrics or aggregation functions
                  • Invalid filter names or parameters
                  • Circular dependencies in task inheritance
                  • Missing external task files when using --include_path

                debugging tips:
                  • Use --include_path to test external task definitions
                  • Check task configuration files for syntax errors
                  • Verify dataset access and authentication if needed
                  • Use 'lm-eval list tasks' to see available tasks

                For task configuration guide, see: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/task_guide.md
            """),
            formatter_class=argparse.RawDescriptionHelpFormatter,
        )
        self._add_args()
        # Store the handler on the parsed namespace as ``args.func`` so the
        # caller can dispatch to this command after parsing.
        self._parser.set_defaults(func=self._execute)

    def _add_args(self) -> None:
        """Declare the command-line options accepted by ``validate``."""
        self._parser.add_argument(
            "--tasks",
            "-t",
            required=True,
            type=str,
            metavar="TASK1,TASK2",
            help="Comma-separated list of task names to validate",
        )
        self._parser.add_argument(
            "--include_path",
            type=str,
            default=None,
            metavar="DIR",
            help="Additional path to include if there are external tasks.",
        )

    @staticmethod
    def _split_task_names(raw: str) -> list:
        """Split a comma-separated task string into clean task names.

        Surrounding whitespace is stripped from each entry and empty entries
        (from doubled or trailing commas) are dropped, so ``"a, b,"`` yields
        ``["a", "b"]``. Order and duplicates are preserved.
        """
        stripped = (entry.strip() for entry in raw.split(","))
        return [entry for entry in stripped if entry]

    def _execute(self, args: argparse.Namespace) -> None:
        """Execute the validate command.

        Args:
            args: Parsed arguments; reads ``args.tasks`` (comma-separated task
                names) and ``args.include_path`` (passed to ``TaskManager``).

        Raises:
            SystemExit: With status 1 when no task names were supplied or when
                at least one requested task is not found.
        """
        # Imported lazily so that merely registering the subcommand does not
        # import the tasks package.
        from lm_eval.tasks import TaskManager

        task_list = self._split_task_names(args.tasks)
        if not task_list:
            # e.g. ``--tasks ","``: nothing left to look up after cleaning.
            print("No task names provided")
            sys.exit(1)

        task_manager = TaskManager(include_path=args.include_path)

        print(f"Validating tasks: {task_list}")
        # For now, just validate that tasks exist.
        # NOTE(review): the check below is an exact-name comparison; if
        # match_tasks expands wildcard patterns, a pattern such as "mmlu_*"
        # would be reported as missing even when it matched — confirm.
        task_names = task_manager.match_tasks(task_list)
        task_missing = [task for task in task_list if task not in task_names]

        if task_missing:
            missing = ", ".join(task_missing)
            print(f"Tasks not found: {missing}")
            sys.exit(1)
        else:
            print("All tasks found and valid")