# test_include_path.py

import os

from lm_eval import tasks


def test_include_path_precedence():
    """A task YAML supplied via ``include_path`` must shadow a default task of the same name.

    The test writes its own ``arc_easy.yaml`` into a temporary directory,
    loads ``arc_easy`` through a manager that includes that directory, and
    checks the loaded config carries the custom markers. It then loads
    ``arc_easy`` through a defaults-only manager and checks the markers are
    absent.
    """
    import tempfile

    # Replacement definition for ``arc_easy``. The f1 metric, the
    # "Custom Question:" prompt prefix and version 2.0 are the markers the
    # assertions below look for.
    override_yaml = """task: arc_easy
dataset_path: allenai/ai2_arc
dataset_name: ARC-Easy
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
doc_to_text: "Custom Question: {{question}}\\nAnswer:"
doc_to_target: "{{choices.label.index(answerKey)}}"
doc_to_choice: "{{choices.text}}"
metric_list:
  - metric: f1
    aggregation: mean
    higher_is_better: true
metadata:
  version: 2.0
  custom: true
"""

    with tempfile.TemporaryDirectory() as custom_dir:
        # Drop the override into the temporary include directory.
        yaml_file = os.path.join(custom_dir, "arc_easy.yaml")
        with open(yaml_file, "w") as handle:
            handle.write(override_yaml)

        # Part 1: with defaults enabled, the include_path copy should win.
        manager = tasks.TaskManager(include_defaults=True, include_path=custom_dir)
        loaded = manager.load_task_or_group(["arc_easy"])
        override_cfg = loaded["arc_easy"].config

        found_f1 = any(
            entry["metric"] == "f1" for entry in override_cfg["metric_list"]
        )
        assert found_f1, "Custom task should have f1 metric"

        prompt_template = override_cfg["doc_to_text"]
        assert (
            "Custom Question:" in prompt_template
        ), "Custom task should have custom doc_to_text"

        loaded_version = override_cfg["metadata"]["version"]
        assert loaded_version == 2.0, "Custom task should have version 2.0"

        # Part 2: a manager without the include path should load a config
        # that carries none of the custom markers.
        stock_manager = tasks.TaskManager(include_defaults=True)
        stock_loaded = stock_manager.load_task_or_group(["arc_easy"])
        stock_cfg = stock_loaded["arc_easy"].config

        stock_metrics = stock_cfg.get("metric_list", [])
        assert all(
            entry["metric"] != "f1" for entry in stock_metrics
        ), "Default task should not have f1 metric"

        stock_prompt = stock_cfg["doc_to_text"]
        assert (
            "Custom Question:" not in stock_prompt
        ), "Default task should not have custom doc_to_text"


def test_include_defaults_false_with_custom_path():
    """Test that when include_defaults=False, only custom tasks are available.

    A single task YAML is written to a temporary directory and handed to
    ``TaskManager`` via ``include_path`` with defaults disabled. The test then
    checks that the custom task is the only entry in ``all_tasks`` and that
    its index entry points back into the temporary directory.
    """
    import tempfile

    with tempfile.TemporaryDirectory() as custom_dir:
        # Create a custom task using a real dataset.
        # NOTE: the newline in doc_to_text is written as ``\\n`` so that the
        # YAML file contains the two-character escape ``\n``. A bare ``\n``
        # here would put a real line break inside the double-quoted YAML
        # scalar, which YAML folds into a single space.
        custom_task_content = """task: custom_arc_task
dataset_path: allenai/ai2_arc
dataset_name: ARC-Challenge
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
doc_to_text: "Q: {{question}}\\nA:"
doc_to_target: "{{choices.label.index(answerKey)}}"
doc_to_choice: "{{choices.text}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
  custom: true
"""

        # Write the custom task file (explicit encoding so the result does
        # not depend on the platform's default locale).
        custom_task_path = os.path.join(custom_dir, "custom_arc_task.yaml")
        with open(custom_task_path, "w", encoding="utf-8") as f:
            f.write(custom_task_content)

        # Initialize with include_defaults=False
        task_manager = tasks.TaskManager(
            include_defaults=False, include_path=custom_dir
        )

        # Custom task should be available
        assert "custom_arc_task" in task_manager.all_tasks, (
            "Custom task should be available when include_defaults=False"
        )

        # Default tasks should NOT be available
        assert "arc_easy" not in task_manager.all_tasks, (
            "Default arc_easy should not be available when include_defaults=False"
        )
        assert "arc_challenge" not in task_manager.all_tasks, (
            "Default arc_challenge should not be available when include_defaults=False"
        )

        # Check that only our custom task is present
        assert len(task_manager.all_tasks) == 1, (
            f"Should only have 1 task, but found {len(task_manager.all_tasks)}"
        )

        # Check task metadata is correctly loaded: the index entry must be
        # typed as a plain task and must reference the file we just wrote.
        task_info = task_manager.task_index["custom_arc_task"]
        assert task_info["type"] == "task"
        assert custom_dir in task_info["yaml_path"]


def test_include_defaults_true_with_new_tasks():
    """Test that new tasks from include_path are added alongside default tasks.

    A task whose name does not collide with any default is written to a
    temporary directory. With ``include_defaults=True`` the manager is
    expected to list both the new task and the default ARC tasks, and to
    list strictly more tasks than a defaults-only manager.
    """
    import tempfile

    with tempfile.TemporaryDirectory() as custom_dir:
        # Create a completely new task (not overriding any default).
        # NOTE: newlines inside the quoted YAML values are written as ``\\n``
        # so the file contains the YAML escape ``\n``. A bare ``\n`` would put
        # a real line break inside the double-quoted scalar, which YAML folds
        # to a space, turning the ``until`` stop sequence into " " instead of
        # a newline.
        new_task_content = """task: arc_custom_generation
dataset_path: allenai/ai2_arc
dataset_name: ARC-Easy
output_type: generate_until
training_split: train
validation_split: validation
test_split: test
doc_to_text: "Question: {{question}}\\nGenerate answer:"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
generation_kwargs:
  max_gen_toks: 50
  temperature: 0.1
  until:
    - "\\n"
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
  custom_benchmark: true
"""

        # Write the new task file (explicit encoding so the result does not
        # depend on the platform's default locale).
        new_task_path = os.path.join(custom_dir, "arc_custom_generation.yaml")
        with open(new_task_path, "w", encoding="utf-8") as f:
            f.write(new_task_content)

        # Initialize with include_defaults=True (default behavior)
        task_manager = tasks.TaskManager(include_defaults=True, include_path=custom_dir)

        # Both custom and default tasks should be available
        assert "arc_custom_generation" in task_manager.all_tasks, (
            "New custom task should be available"
        )
        assert "arc_easy" in task_manager.all_tasks, (
            "Default arc_easy should still be available"
        )
        assert "arc_challenge" in task_manager.all_tasks, (
            "Default arc_challenge should still be available"
        )

        # Check task metadata: the index entry must be typed as a plain task
        # and must reference the file we just wrote.
        custom_task_info = task_manager.task_index["arc_custom_generation"]
        assert custom_task_info["type"] == "task"
        assert custom_dir in custom_task_info["yaml_path"]

        # Verify the counts - should have more tasks than just defaults
        default_only_manager = tasks.TaskManager(include_defaults=True)
        assert len(task_manager.all_tasks) > len(default_only_manager.all_tasks), (
            "Should have more tasks when including custom path"
        )