Commit 45c11c31 authored by Baber's avatar Baber
Browse files

add: create new YAML configurations for task and group setups

parent 0aca6958
#!/usr/bin/env python3
"""
Walkthrough tests using real dataset configurations.
These tests use YAML configs with existing datasets (hellaswag) to enable
complete code walkthrough of the task loading system, including:
- Basic task loading
- Task list functionality
- Group functionality
- Include inheritance
- Issue #2158 fix (include processing preserving task names)
"""
import os
import pytest
from lm_eval.tasks import TaskManager, get_task_dict
class TestWalkthroughConfigs:
"""Test walkthrough configurations for easier code demonstration"""
@pytest.fixture(autouse=True)
def setup_task_manager(self):
"""Set up TaskManager with test configs directory"""
test_configs_dir = os.path.join(os.path.dirname(__file__), "test_configs")
self.tm = TaskManager(include_path=test_configs_dir, include_defaults=False)
def test_simple_task_loading(self):
"""Test basic task loading - walkthrough starting point"""
# Simple task should be indexed
assert "simple_task" in self.tm.all_tasks
assert self.tm._name_is_task("simple_task")
# Load the task
task_dict = get_task_dict(["simple_task"], task_manager=self.tm)
assert "simple_task" in task_dict
# Verify task configuration
task_obj = task_dict["simple_task"]
assert hasattr(task_obj, "config")
assert task_obj.config.task == "simple_task"
def test_task_list_functionality(self):
"""Test task_list feature - multiple tasks sharing config"""
# All task_list tasks should be indexed as individual tasks
expected_tasks = ["task_list_fs0", "task_list_fs1", "task_list_fs3"]
for task_name in expected_tasks:
assert task_name in self.tm.all_tasks, f"Task {task_name} not indexed"
assert self.tm._name_is_task(task_name), (
f"Task {task_name} not recognized as task"
)
# Load all tasks from the task_list
task_dict = get_task_dict(expected_tasks, task_manager=self.tm)
# Each should be a separate task object
assert len(task_dict) == 3
for task_name in expected_tasks:
assert task_name in task_dict
task_obj = task_dict[task_name]
assert task_obj.config.task == task_name
# Verify different num_fewshot values were applied
assert task_dict["task_list_fs0"].config.num_fewshot == 0
assert task_dict["task_list_fs1"].config.num_fewshot == 1
assert task_dict["task_list_fs3"].config.num_fewshot == 3
def test_group_functionality(self):
"""Test group loading with task-specific overrides"""
# Group should be indexed
assert "test_group" in self.tm.all_groups
assert self.tm._name_is_group("test_group")
# Load the group
task_dict = get_task_dict(["test_group"], task_manager=self.tm)
# Should contain the group object and its subtasks
assert len(task_dict) == 1
group_obj = list(task_dict.keys())[0]
subtasks = task_dict[group_obj]
# Check expected subtasks
expected_subtasks = ["group_task_fs0", "group_task_fs2"]
for subtask_name in expected_subtasks:
assert subtask_name in subtasks
# Verify different configurations were applied
fs0_task = subtasks["group_task_fs0"]
fs2_task = subtasks["group_task_fs2"]
assert fs0_task.config.num_fewshot == 0
assert fs2_task.config.num_fewshot == 2
def test_include_inheritance(self):
"""Test include functionality and inheritance"""
# Test direct include tasks (these were created as separate files)
include_tasks = ["include_task_fs0", "include_task_fs1", "include_task_fs5"]
for task_name in include_tasks:
assert task_name in self.tm.all_tasks
# Load tasks that use include
task_dict = get_task_dict(
include_tasks[:1], task_manager=self.tm
) # Just test first one
# Should inherit from base config
task_obj = task_dict["include_task_fs0"]
# Should inherit dataset_path from include
assert task_obj.config.dataset_path == "json"
# Should inherit output_type from include
assert task_obj.config.output_type == "multiple_choice"
# Should preserve specific task name (not base_task_name)
assert task_obj.config.task == "include_task_fs0"
# Should have overridden num_fewshot
assert task_obj.config.num_fewshot == 0
def test_issue_2158_fix_demo(self):
"""
Test issue #2158 fix - multiple tasks with same include in group.
This demonstrates the specific scenario that was failing before the fix.
"""
# Group with multiple tasks using same include should work
assert "include_group" in self.tm.all_groups
# This should NOT raise a duplicate detection error
# Before the fix, this would fail with:
# "Please call groups which overlap their constituent tasks in separate evaluation runs"
task_dict = get_task_dict(["include_group"], task_manager=self.tm)
# Should successfully load the group
assert len(task_dict) == 1
group_obj = list(task_dict.keys())[0]
subtasks = task_dict[group_obj]
# Check all expected tasks are present with correct names
expected_tasks = ["include_task_fs0", "include_task_fs1", "include_task_fs5"]
for task_name in expected_tasks:
assert task_name in subtasks, f"Task {task_name} missing from group"
task_obj = subtasks[task_name]
# CRITICAL: Task name should be preserved, not overwritten by include
assert task_obj.config.task == task_name
# Should inherit base config from include
assert task_obj.config.dataset_path == "json"
assert task_obj.config.output_type == "multiple_choice"
# Verify different num_fewshot values
assert subtasks["include_task_fs0"].config.num_fewshot == 0
assert subtasks["include_task_fs1"].config.num_fewshot == 1
assert subtasks["include_task_fs5"].config.num_fewshot == 5
def test_config_types_detection(self):
"""Test that different config types are correctly detected"""
# Load various config types to test detection methods
configs = [
# Simple task config
{"task": "walkthrough_simple_task"},
# Group config
{"group": "test_group", "task": ["task1", "task2"]},
# Task list config (would need to be loaded from file)
]
# Test config detection methods
assert self.tm._config_is_task(configs[0])
assert not self.tm._config_is_group(configs[0])
assert not self.tm._config_is_task_list(configs[0])
assert not self.tm._config_is_task(configs[1])
assert self.tm._config_is_group(configs[1])
assert not self.tm._config_is_task_list(configs[1])
# Test task_list detection with actual config
task_list_config = {"task_list": [{"task": "task1"}, {"task": "task2"}]}
assert self.tm._config_is_task_list(task_list_config)
assert not self.tm._config_is_task(task_list_config)
assert not self.tm._config_is_group(task_list_config)
if __name__ == "__main__":
pytest.main([__file__, "-v"])
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment