add: create new YAML configurations for task and group setups

45c11c31 · Baber · 0aca6958 · 45c11c31
Commit 45c11c31 authored Jul 14, 2025 by Baber
Hide whitespace changes
Inline Side-by-side

Showing with 191 additions and 0 deletions

tests/test_taskmanager.py tests/test_taskmanager.py +191 -0

No files found.
--- a/tests/test_taskmanager.py
+++ b/tests/test_taskmanager.py
+#!/usr/bin/env python3
+"""
+Walkthrough tests using real dataset configurations.
+
+These tests use YAML configs with existing datasets (hellaswag) to enable
+complete code walkthrough of the task loading system, including:
+- Basic task loading
+- Task list functionality
+- Group functionality
+- Include inheritance
+- Issue #2158 fix (include processing preserving task names)
+"""
+
+import os
+
+import pytest
+
+from lm_eval.tasks import TaskManager, get_task_dict
+
+
+class TestWalkthroughConfigs:
+    """Test walkthrough configurations for easier code demonstration"""
+
+    @pytest.fixture(autouse=True)
+    def setup_task_manager(self):
+        """Set up TaskManager with test configs directory"""
+        test_configs_dir = os.path.join(os.path.dirname(__file__), "test_configs")
+        self.tm = TaskManager(include_path=test_configs_dir, include_defaults=False)
+
+    def test_simple_task_loading(self):
+        """Test basic task loading - walkthrough starting point"""
+        # Simple task should be indexed
+        assert "simple_task" in self.tm.all_tasks
+        assert self.tm._name_is_task("simple_task")
+
+        # Load the task
+        task_dict = get_task_dict(["simple_task"], task_manager=self.tm)
+        assert "simple_task" in task_dict
+
+        # Verify task configuration
+        task_obj = task_dict["simple_task"]
+        assert hasattr(task_obj, "config")
+        assert task_obj.config.task == "simple_task"
+
+    def test_task_list_functionality(self):
+        """Test task_list feature - multiple tasks sharing config"""
+
+        # All task_list tasks should be indexed as individual tasks
+        expected_tasks = ["task_list_fs0", "task_list_fs1", "task_list_fs3"]
+
+        for task_name in expected_tasks:
+            assert task_name in self.tm.all_tasks, f"Task {task_name} not indexed"
+            assert self.tm._name_is_task(task_name), (
+                f"Task {task_name} not recognized as task"
+            )
+
+        # Load all tasks from the task_list
+        task_dict = get_task_dict(expected_tasks, task_manager=self.tm)
+
+        # Each should be a separate task object
+        assert len(task_dict) == 3
+        for task_name in expected_tasks:
+            assert task_name in task_dict
+            task_obj = task_dict[task_name]
+            assert task_obj.config.task == task_name
+
+        # Verify different num_fewshot values were applied
+        assert task_dict["task_list_fs0"].config.num_fewshot == 0
+        assert task_dict["task_list_fs1"].config.num_fewshot == 1
+        assert task_dict["task_list_fs3"].config.num_fewshot == 3
+
+    def test_group_functionality(self):
+        """Test group loading with task-specific overrides"""
+
+        # Group should be indexed
+        assert "test_group" in self.tm.all_groups
+        assert self.tm._name_is_group("test_group")
+
+        # Load the group
+        task_dict = get_task_dict(["test_group"], task_manager=self.tm)
+
+        # Should contain the group object and its subtasks
+        assert len(task_dict) == 1
+        group_obj = list(task_dict.keys())[0]
+        subtasks = task_dict[group_obj]
+
+        # Check expected subtasks
+        expected_subtasks = ["group_task_fs0", "group_task_fs2"]
+        for subtask_name in expected_subtasks:
+            assert subtask_name in subtasks
+
+        # Verify different configurations were applied
+        fs0_task = subtasks["group_task_fs0"]
+        fs2_task = subtasks["group_task_fs2"]
+        assert fs0_task.config.num_fewshot == 0
+        assert fs2_task.config.num_fewshot == 2
+
+    def test_include_inheritance(self):
+        """Test include functionality and inheritance"""
+
+        # Test direct include tasks (these were created as separate files)
+        include_tasks = ["include_task_fs0", "include_task_fs1", "include_task_fs5"]
+
+        for task_name in include_tasks:
+            assert task_name in self.tm.all_tasks
+
+        # Load tasks that use include
+        task_dict = get_task_dict(
+            include_tasks[:1], task_manager=self.tm
+        )  # Just test first one
+
+        # Should inherit from base config
+        task_obj = task_dict["include_task_fs0"]
+        # Should inherit dataset_path from include
+        assert task_obj.config.dataset_path == "json"
+        # Should inherit output_type from include
+        assert task_obj.config.output_type == "multiple_choice"
+        # Should preserve specific task name (not base_task_name)
+        assert task_obj.config.task == "include_task_fs0"
+        # Should have overridden num_fewshot
+        assert task_obj.config.num_fewshot == 0
+
+    def test_issue_2158_fix_demo(self):
+        """
+        Test issue #2158 fix - multiple tasks with same include in group.
+
+        This demonstrates the specific scenario that was failing before the fix.
+        """
+
+        # Group with multiple tasks using same include should work
+        assert "include_group" in self.tm.all_groups
+
+        # This should NOT raise a duplicate detection error
+        # Before the fix, this would fail with:
+        # "Please call groups which overlap their constituent tasks in separate evaluation runs"
+        task_dict = get_task_dict(["include_group"], task_manager=self.tm)
+
+        # Should successfully load the group
+        assert len(task_dict) == 1
+        group_obj = list(task_dict.keys())[0]
+        subtasks = task_dict[group_obj]
+
+        # Check all expected tasks are present with correct names
+        expected_tasks = ["include_task_fs0", "include_task_fs1", "include_task_fs5"]
+
+        for task_name in expected_tasks:
+            assert task_name in subtasks, f"Task {task_name} missing from group"
+            task_obj = subtasks[task_name]
+
+            # CRITICAL: Task name should be preserved, not overwritten by include
+            assert task_obj.config.task == task_name
+
+            # Should inherit base config from include
+            assert task_obj.config.dataset_path == "json"
+            assert task_obj.config.output_type == "multiple_choice"
+
+        # Verify different num_fewshot values
+        assert subtasks["include_task_fs0"].config.num_fewshot == 0
+        assert subtasks["include_task_fs1"].config.num_fewshot == 1
+        assert subtasks["include_task_fs5"].config.num_fewshot == 5
+
+    def test_config_types_detection(self):
+        """Test that different config types are correctly detected"""
+
+        # Load various config types to test detection methods
+        configs = [
+            # Simple task config
+            {"task": "walkthrough_simple_task"},
+            # Group config
+            {"group": "test_group", "task": ["task1", "task2"]},
+            # Task list config (would need to be loaded from file)
+        ]
+
+        # Test config detection methods
+        assert self.tm._config_is_task(configs[0])
+        assert not self.tm._config_is_group(configs[0])
+        assert not self.tm._config_is_task_list(configs[0])
+
+        assert not self.tm._config_is_task(configs[1])
+        assert self.tm._config_is_group(configs[1])
+        assert not self.tm._config_is_task_list(configs[1])
+
+        # Test task_list detection with actual config
+        task_list_config = {"task_list": [{"task": "task1"}, {"task": "task2"}]}
+        assert self.tm._config_is_task_list(task_list_config)
+        assert not self.tm._config_is_task(task_list_config)
+        assert not self.tm._config_is_group(task_list_config)
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])