Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
45c11c31
Commit
45c11c31
authored
Jul 14, 2025
by
Baber
Browse files
add: create new YAML configurations for task and group setups
parent
0aca6958
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
191 additions
and
0 deletions
+191
-0
tests/test_taskmanager.py
tests/test_taskmanager.py
+191
-0
No files found.
tests/test_taskmanager.py
0 → 100644
View file @
45c11c31
#!/usr/bin/env python3
"""
Walkthrough tests using real dataset configurations.
These tests use YAML configs with existing datasets (hellaswag) to enable
a complete code walkthrough of the task loading system, including:
- Basic task loading
- Task list functionality
- Group functionality
- Include inheritance
- Issue #2158 fix (include processing preserving task names)
"""
import
os
import
pytest
from
lm_eval.tasks
import
TaskManager
,
get_task_dict
class TestWalkthroughConfigs:
    """Walkthrough checks for task, task_list, group and include loading.

    Every test uses the ``TaskManager`` stored on ``self.tm`` by the autouse
    fixture, which is built from the ``test_configs`` directory located next
    to this file with ``include_defaults=False``.
    """

    @pytest.fixture(autouse=True)
    def setup_task_manager(self):
        """Build ``self.tm`` from the ``test_configs`` directory beside this file."""
        this_dir = os.path.dirname(__file__)
        configs_path = os.path.join(this_dir, "test_configs")
        self.tm = TaskManager(include_path=configs_path, include_defaults=False)

    def test_simple_task_loading(self):
        """Index and load ``simple_task``, then check its configured name."""
        name = "simple_task"

        # The manager must know the task before we try to load it.
        assert name in self.tm.all_tasks
        assert self.tm._name_is_task(name)

        loaded = get_task_dict([name], task_manager=self.tm)
        assert name in loaded

        # The loaded object exposes a config whose task field matches.
        simple = loaded[name]
        assert hasattr(simple, "config")
        assert simple.config.task == name

    def test_task_list_functionality(self):
        """Check that each ``task_list`` entry loads as its own task."""
        # Task name -> num_fewshot expected on the loaded task's config.
        fewshot_by_task = {
            "task_list_fs0": 0,
            "task_list_fs1": 1,
            "task_list_fs3": 3,
        }
        expected_tasks = list(fewshot_by_task)

        # Every entry is indexed and recognised individually.
        for task_name in expected_tasks:
            assert task_name in self.tm.all_tasks, f"Task {task_name} not indexed"
            assert self.tm._name_is_task(task_name), (
                f"Task {task_name} not recognized as task"
            )

        loaded = get_task_dict(expected_tasks, task_manager=self.tm)

        # One separate task object per requested name.
        assert len(loaded) == 3
        for task_name in expected_tasks:
            assert task_name in loaded
            assert loaded[task_name].config.task == task_name

        # Each task carries its own num_fewshot value.
        for task_name, shots in fewshot_by_task.items():
            assert loaded[task_name].config.num_fewshot == shots

    def test_group_functionality(self):
        """Load ``test_group`` and check the ``num_fewshot`` of its subtasks."""
        group_name = "test_group"
        assert group_name in self.tm.all_groups
        assert self.tm._name_is_group(group_name)

        loaded = get_task_dict([group_name], task_manager=self.tm)

        # A single top-level entry: the group key mapping to its subtasks.
        assert len(loaded) == 1
        group_key = next(iter(loaded.keys()))
        members = loaded[group_key]

        # Subtask name -> expected num_fewshot.
        fewshot_by_subtask = {"group_task_fs0": 0, "group_task_fs2": 2}
        for subtask_name in fewshot_by_subtask:
            assert subtask_name in members

        for subtask_name, shots in fewshot_by_subtask.items():
            assert members[subtask_name].config.num_fewshot == shots

    def test_include_inheritance(self):
        """Check that a task using ``include`` inherits and overrides fields."""
        include_tasks = ["include_task_fs0", "include_task_fs1", "include_task_fs5"]
        for task_name in include_tasks:
            assert task_name in self.tm.all_tasks

        # Only the first include task is actually loaded and inspected.
        loaded = get_task_dict([include_tasks[0]], task_manager=self.tm)
        config = loaded["include_task_fs0"].config

        # Values expected to come from the included base config.
        assert config.dataset_path == "json"
        assert config.output_type == "multiple_choice"

        # The task keeps its own name rather than the base config's name.
        assert config.task == "include_task_fs0"

        # The task's own num_fewshot setting is in effect.
        assert config.num_fewshot == 0

    def test_issue_2158_fix_demo(self):
        """Regression demo for issue #2158.

        Loads a group whose tasks all share the same include and checks that
        every task keeps its own name and ``num_fewshot`` value.
        """
        assert "include_group" in self.tm.all_groups

        # Loading must not raise a duplicate detection error. Before the fix,
        # this would fail with:
        # "Please call groups which overlap their constituent tasks in separate evaluation runs"
        loaded = get_task_dict(["include_group"], task_manager=self.tm)

        assert len(loaded) == 1
        group_key = next(iter(loaded.keys()))
        members = loaded[group_key]

        # Task name -> expected num_fewshot.
        fewshot_by_task = {
            "include_task_fs0": 0,
            "include_task_fs1": 1,
            "include_task_fs5": 5,
        }
        for task_name in fewshot_by_task:
            assert task_name in members, f"Task {task_name} missing from group"
            config = members[task_name].config

            # CRITICAL: the include must not overwrite the task's own name.
            assert config.task == task_name

            # Base settings expected to come from the include.
            assert config.dataset_path == "json"
            assert config.output_type == "multiple_choice"

        for task_name, shots in fewshot_by_task.items():
            assert members[task_name].config.num_fewshot == shots

    def test_config_types_detection(self):
        """Check the ``_config_is_*`` predicates on hand-built config dicts."""
        task_config = {"task": "walkthrough_simple_task"}
        group_config = {"group": "test_group", "task": ["task1", "task2"]}
        task_list_config = {"task_list": [{"task": "task1"}, {"task": "task2"}]}

        # A plain task config is a task and nothing else.
        assert self.tm._config_is_task(task_config)
        assert not self.tm._config_is_group(task_config)
        assert not self.tm._config_is_task_list(task_config)

        # A group config is a group and nothing else.
        assert not self.tm._config_is_task(group_config)
        assert self.tm._config_is_group(group_config)
        assert not self.tm._config_is_task_list(group_config)

        # A task_list config is a task list and nothing else.
        assert self.tm._config_is_task_list(task_list_config)
        assert not self.tm._config_is_task(task_list_config)
        assert not self.tm._config_is_group(task_list_config)
if __name__ == "__main__":
    # pytest.main() returns the exit code of the test run. Propagate it so
    # that running this file directly reports failures to the calling shell
    # instead of always exiting with status 0.
    raise SystemExit(pytest.main([__file__, "-v"]))
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment