# test_taskmanager.py

#!/usr/bin/env python3
"""
Walkthrough tests using real dataset configurations.

These tests use YAML configs with existing datasets (hellaswag) to enable
complete code walkthrough of the task loading system, including:
- Basic task loading
- Task list functionality
- Group functionality
- Include inheritance
- Issue #2158 fix (include processing preserving task names)
"""

import os

import pytest

from lm_eval.tasks import TaskManager, get_task_dict


class TestWalkthroughConfigs:
    """Walkthrough tests driven by the YAML configs in ``test_configs``.

    Each test exercises one feature of the task loading system (plain tasks,
    ``task_list`` expansion, groups, ``include`` inheritance) through the
    ``TaskManager`` instance that the autouse fixture stores on ``self.tm``.
    """

    @pytest.fixture(autouse=True)
    def setup_task_manager(self):
        """Create ``self.tm`` pointing at the ``test_configs`` directory.

        The directory is resolved relative to this file, so the tests do not
        depend on the current working directory.
        """
        test_configs_dir = os.path.join(os.path.dirname(__file__), "test_configs")
        # NOTE(review): include_defaults=False presumably keeps the built-in
        # task registry out of the index so only test configs are visible.
        self.tm = TaskManager(include_path=test_configs_dir, include_defaults=False)

    def test_simple_task_loading(self):
        """Load a single plain task - the walkthrough starting point."""
        # The task must be indexed and recognized as a task by name.
        assert "simple_task" in self.tm.all_tasks
        assert self.tm._name_is_task("simple_task")

        # Loading by name yields a dict keyed by that same name.
        task_dict = get_task_dict(["simple_task"], task_manager=self.tm)
        assert "simple_task" in task_dict

        # The loaded object carries a config whose task name matches.
        task_obj = task_dict["simple_task"]
        assert hasattr(task_obj, "config")
        assert task_obj.config.task == "simple_task"

    def test_task_list_functionality(self):
        """Check the ``task_list`` feature: several tasks sharing one config."""
        # Every task_list entry should be indexed as an individual task.
        expected_tasks = ["task_list_fs0", "task_list_fs1", "task_list_fs3"]

        for task_name in expected_tasks:
            assert task_name in self.tm.all_tasks, f"Task {task_name} not indexed"
            assert self.tm._name_is_task(task_name), (
                f"Task {task_name} not recognized as task"
            )

        # Load all tasks from the task_list in one call.
        task_dict = get_task_dict(expected_tasks, task_manager=self.tm)

        # Each name must map to its own task object with a matching config.
        assert len(task_dict) == 3
        for task_name in expected_tasks:
            assert task_name in task_dict
            assert task_dict[task_name].config.task == task_name

        # The per-task num_fewshot overrides must have been applied.
        assert task_dict["task_list_fs0"].config.num_fewshot == 0
        assert task_dict["task_list_fs1"].config.num_fewshot == 1
        assert task_dict["task_list_fs3"].config.num_fewshot == 3

    def test_group_functionality(self):
        """Load a group and verify its task-specific overrides."""
        # The group must be indexed and recognized as a group by name.
        assert "test_group" in self.tm.all_groups
        assert self.tm._name_is_group("test_group")

        task_dict = get_task_dict(["test_group"], task_manager=self.tm)

        # The result has a single key (the group) mapping to its subtasks.
        assert len(task_dict) == 1
        group_obj = next(iter(task_dict))
        subtasks = task_dict[group_obj]

        for subtask_name in ("group_task_fs0", "group_task_fs2"):
            assert subtask_name in subtasks

        # Each subtask keeps its own num_fewshot setting.
        assert subtasks["group_task_fs0"].config.num_fewshot == 0
        assert subtasks["group_task_fs2"].config.num_fewshot == 2

    def test_include_inheritance(self):
        """Check that ``include`` inherits base fields but keeps the task name."""
        # These tasks are defined in separate files that use include.
        include_tasks = ["include_task_fs0", "include_task_fs1", "include_task_fs5"]

        for task_name in include_tasks:
            assert task_name in self.tm.all_tasks

        # Loading only the first task is enough to check inheritance.
        task_dict = get_task_dict(include_tasks[:1], task_manager=self.tm)
        task_obj = task_dict["include_task_fs0"]

        # Fields expected to come from the included base config.
        assert task_obj.config.dataset_path == "json"
        assert task_obj.config.output_type == "multiple_choice"
        # The specific task name must survive (not be replaced by the base's).
        assert task_obj.config.task == "include_task_fs0"
        # The task's own num_fewshot override must win.
        assert task_obj.config.num_fewshot == 0

    def test_issue_2158_fix_demo(self):
        """
        Test issue #2158 fix - multiple tasks with same include in group.

        This demonstrates the specific scenario that was failing before the fix.
        """
        # A group whose members all use the same include should be indexed.
        assert "include_group" in self.tm.all_groups

        # This should NOT raise a duplicate detection error
        # Before the fix, this would fail with:
        # "Please call groups which overlap their constituent tasks in separate evaluation runs"
        task_dict = get_task_dict(["include_group"], task_manager=self.tm)

        # The result has a single key (the group) mapping to its subtasks.
        assert len(task_dict) == 1
        group_obj = next(iter(task_dict))
        subtasks = task_dict[group_obj]

        expected_tasks = ["include_task_fs0", "include_task_fs1", "include_task_fs5"]

        for task_name in expected_tasks:
            assert task_name in subtasks, f"Task {task_name} missing from group"
            task_obj = subtasks[task_name]

            # CRITICAL: Task name should be preserved, not overwritten by include
            assert task_obj.config.task == task_name

            # Base fields expected to come from the shared include.
            assert task_obj.config.dataset_path == "json"
            assert task_obj.config.output_type == "multiple_choice"

        # Each member keeps its own num_fewshot value.
        assert subtasks["include_task_fs0"].config.num_fewshot == 0
        assert subtasks["include_task_fs1"].config.num_fewshot == 1
        assert subtasks["include_task_fs5"].config.num_fewshot == 5

    def test_config_types_detection(self):
        """Check that each config shape is classified by exactly one predicate.

        Every ``_config_is_*`` helper is called with the config under test;
        the group predicate takes the config argument just like the task and
        task_list predicates do.
        """
        # In-memory configs, one per shape under test.
        configs = [
            # Simple task config
            {"task": "walkthrough_simple_task"},
            # Group config
            {"group": "test_group", "task": ["task1", "task2"]},
            # Task list config (would need to be loaded from file)
        ]

        # A plain task config is a task and nothing else.
        assert self.tm._config_is_task(configs[0])
        assert not self.tm._config_is_group(configs[0])
        assert not self.tm._config_is_task_list(configs[0])

        # A group config is a group and nothing else.
        assert not self.tm._config_is_task(configs[1])
        assert self.tm._config_is_group(configs[1])
        assert not self.tm._config_is_task_list(configs[1])

        # A task_list config is a task list and nothing else.
        task_list_config = {"task_list": [{"task": "task1"}, {"task": "task2"}]}
        assert self.tm._config_is_task_list(task_list_config)
        assert not self.tm._config_is_task(task_list_config)
        assert not self.tm._config_is_group(task_list_config)


if __name__ == "__main__":
    # Direct execution: hand this file to pytest with verbose reporting.
    pytest_args = [__file__, "-v"]
    pytest.main(pytest_args)