"LM-Evaluation-Harness-240310/lm_eval/api/instance.py" did not exist on "deb8370cf3b7ed7d3401bab887b214f130135594"
test_include_path.py 6.57 KB
Newer Older
1
2
3
4
5
import os

from lm_eval import tasks


6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
def test_include_path_precedence():
    """Check that a same-named task from ``include_path`` shadows the default.

    A replacement ``arc_easy.yaml`` (f1 metric, "Custom Question:" prompt,
    version 2.0) is written to a temporary directory. Loading ``arc_easy``
    through a manager that was given this directory must return the
    replacement; loading it through a manager without ``include_path`` must
    return a definition that has neither the f1 metric nor the custom prompt.
    """
    import tempfile

    # Replacement definition for arc_easy; the metric and prompt are chosen so
    # the assertions below can tell it apart from the default definition.
    override_yaml = """task: arc_easy
dataset_path: allenai/ai2_arc
dataset_name: ARC-Easy
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
doc_to_text: "Custom Question: {{question}}\\nAnswer:"
doc_to_target: "{{choices.label.index(answerKey)}}"
doc_to_choice: "{{choices.text}}"
metric_list:
  - metric: f1
    aggregation: mean
    higher_is_better: true
metadata:
  version: 2.0
  custom: true
"""

    def _uses_f1(metric_entries):
        # True as soon as one metric entry is named "f1".
        return any(entry["metric"] == "f1" for entry in metric_entries)

    with tempfile.TemporaryDirectory() as custom_dir:
        yaml_file = os.path.join(custom_dir, "arc_easy.yaml")
        with open(yaml_file, "w") as handle:
            handle.write(override_yaml)

        # Part 1: with defaults enabled, the user-supplied file must win.
        overriding_manager = tasks.TaskManager(
            include_defaults=True, include_path=custom_dir
        )
        loaded = overriding_manager.load_task_or_group(["arc_easy"])
        custom_cfg = loaded["arc_easy"].config

        assert _uses_f1(custom_cfg["metric_list"]), "Custom task should have f1 metric"
        assert "Custom Question:" in custom_cfg["doc_to_text"], (
            "Custom task should have custom doc_to_text"
        )
        assert custom_cfg["metadata"]["version"] == 2.0, (
            "Custom task should have version 2.0"
        )

        # Part 2: without include_path, the default definition is loaded.
        plain_manager = tasks.TaskManager(include_defaults=True)
        plain_loaded = plain_manager.load_task_or_group(["arc_easy"])
        default_cfg = plain_loaded["arc_easy"].config

        assert not _uses_f1(default_cfg.get("metric_list", [])), (
            "Default task should not have f1 metric"
        )
        assert "Custom Question:" not in default_cfg["doc_to_text"], (
            "Default task should not have custom doc_to_text"
        )


def test_include_defaults_false_with_custom_path():
    """Test that when include_defaults=False, only custom tasks are available.

    A single task YAML is written to a temporary directory and passed as
    ``include_path``. The test checks that this task is the only entry in
    ``all_tasks``, that the default ARC tasks are absent, and that the
    ``task_index`` record has type ``"task"`` with a ``yaml_path`` that
    contains the temporary directory.
    """
    import tempfile

    with tempfile.TemporaryDirectory() as custom_dir:
        # Create a custom task using a real dataset.
        # The newline in doc_to_text is written as a doubled backslash so the
        # YAML file contains the two-character escape sequence; a single
        # backslash here would embed a real line break, which YAML folds into
        # a space inside a double-quoted scalar.
        custom_task_content = """task: custom_arc_task
dataset_path: allenai/ai2_arc
dataset_name: ARC-Challenge
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
doc_to_text: "Q: {{question}}\\nA:"
doc_to_target: "{{choices.label.index(answerKey)}}"
doc_to_choice: "{{choices.text}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
  custom: true
"""

        # Write the custom task file
        custom_task_path = os.path.join(custom_dir, "custom_arc_task.yaml")
        with open(custom_task_path, "w") as f:
            f.write(custom_task_content)

        # Initialize with include_defaults=False
        task_manager = tasks.TaskManager(
            include_defaults=False, include_path=custom_dir
        )

        # Custom task should be available
        assert "custom_arc_task" in task_manager.all_tasks, (
            "Custom task should be available when include_defaults=False"
        )

        # Default tasks should NOT be available
        assert "arc_easy" not in task_manager.all_tasks, (
            "Default arc_easy should not be available when include_defaults=False"
        )
        assert "arc_challenge" not in task_manager.all_tasks, (
            "Default arc_challenge should not be available when include_defaults=False"
        )

        # Check that only our custom task is present
        assert len(task_manager.all_tasks) == 1, (
            f"Should only have 1 task, but found {len(task_manager.all_tasks)}"
        )

        # Check task metadata is correctly loaded
        task_info = task_manager.task_index["custom_arc_task"]
        assert task_info["type"] == "task"
        assert custom_dir in task_info["yaml_path"]


def test_include_defaults_true_with_new_tasks():
    """Test that new tasks from include_path are added alongside default tasks.

    A task whose name does not collide with any default is written to a
    temporary directory and passed as ``include_path`` with
    ``include_defaults=True``. The test checks that the new task and the
    default ARC tasks are all listed, that the new task's ``task_index``
    record has type ``"task"`` with a ``yaml_path`` containing the temporary
    directory, and that the manager lists more tasks than a defaults-only
    manager.
    """
    import tempfile

    with tempfile.TemporaryDirectory() as custom_dir:
        # Create a completely new task (not overriding any default).
        # Newlines in doc_to_text and in the `until` stop sequence are written
        # with a doubled backslash so the YAML file contains the two-character
        # escape sequence. A single backslash would embed a real line break,
        # which YAML folds into a space inside a double-quoted scalar, turning
        # the stop sequence into " " instead of a newline.
        new_task_content = """task: arc_custom_generation
dataset_path: allenai/ai2_arc
dataset_name: ARC-Easy
output_type: generate_until
training_split: train
validation_split: validation
test_split: test
doc_to_text: "Question: {{question}}\\nGenerate answer:"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
generation_kwargs:
  max_gen_toks: 50
  temperature: 0.1
  until:
    - "\\n"
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
  custom_benchmark: true
"""

        # Write the new task file
        new_task_path = os.path.join(custom_dir, "arc_custom_generation.yaml")
        with open(new_task_path, "w") as f:
            f.write(new_task_content)

        # Initialize with include_defaults=True (default behavior)
        task_manager = tasks.TaskManager(include_defaults=True, include_path=custom_dir)

        # Both custom and default tasks should be available
        assert "arc_custom_generation" in task_manager.all_tasks, (
            "New custom task should be available"
        )
        assert "arc_easy" in task_manager.all_tasks, (
            "Default arc_easy should still be available"
        )
        assert "arc_challenge" in task_manager.all_tasks, (
            "Default arc_challenge should still be available"
        )

        # Check task metadata
        custom_task_info = task_manager.task_index["arc_custom_generation"]
        assert custom_task_info["type"] == "task"
        assert custom_dir in custom_task_info["yaml_path"]

        # Verify the counts - should have more tasks than just defaults
        default_only_manager = tasks.TaskManager(include_defaults=True)
        assert len(task_manager.all_tasks) > len(default_only_manager.all_tasks), (
            "Should have more tasks when including custom path"
        )