gaoqiong / lm-evaluation-harness · Commit 4bb77e82

add include_defaults kwarg to taskmanager, add tests for include_path (#1856)

Unverified commit, authored Jun 11, 2024 by Hailey Schoelkopf; committed by GitHub on Jun 11, 2024.
Parent: d0f6e011
Showing 3 changed files with 137 additions and 7 deletions.
lm_eval/tasks/__init__.py          +23  -7
tests/test_include_path.py         +93  -0
tests/testconfigs/arc_test.yaml    +21  -0
lm_eval/tasks/__init__.py

@@ -14,27 +14,43 @@ class TaskManager:
     """

-    def __init__(self, verbosity="INFO", include_path: Optional[str] = None) -> None:
+    def __init__(
+        self,
+        verbosity="INFO",
+        include_path: Optional[Union[str, List]] = None,
+        include_defaults: bool = True,
+    ) -> None:
         self.verbosity = verbosity
         self.include_path = include_path
         self.logger = utils.eval_logger
         self.logger.setLevel(getattr(logging, f"{verbosity}"))

-        self._task_index = self.initialize_tasks(include_path=include_path)
+        self._task_index = self.initialize_tasks(
+            include_path=include_path, include_defaults=include_defaults
+        )
         self._all_tasks = sorted(list(self._task_index.keys()))

         self.task_group_map = collections.defaultdict(list)

-    def initialize_tasks(self, include_path: Optional[str] = None):
+    def initialize_tasks(
+        self,
+        include_path: Optional[Union[str, List]] = None,
+        include_defaults: bool = True,
+    ):
         """Creates a dictionary of tasks index.

-        :param include_path: str = None
-            An additional path to be searched for tasks
+        :param include_path: Union[str, List] = None
+            An additional path to be searched for tasks recursively.
+            Can provide more than one such path as a list.
+        :param include_defaults: bool = True
+            If set to false, default tasks (those in lm_eval/tasks/) are not indexed.
         :return
             Dictionary of task names as key and task metadata
         """
-        all_paths = [os.path.dirname(os.path.abspath(__file__)) + "/"]
+        if include_defaults:
+            all_paths = [os.path.dirname(os.path.abspath(__file__)) + "/"]
+        else:
+            all_paths = []
         if include_path is not None:
             if isinstance(include_path, str):
                 include_path = [include_path]
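For reference, a minimal usage sketch of the new constructor arguments (not part of the commit; the directory name ./my_configs is purely illustrative):

from lm_eval.tasks import TaskManager

# Default behaviour: index the built-in task configs shipped in lm_eval/tasks/.
tm_default = TaskManager()

# Index only tasks found under one or more custom directories and skip the
# built-in configs entirely (include_defaults=False). include_path now also
# accepts a list of paths, each searched recursively.
tm_custom = TaskManager(
    include_path=["./my_configs"],  # illustrative path, not part of the repo
    include_defaults=False,
)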
tests/test_include_path.py (new file, mode 100644)

import os

import pytest

import lm_eval.api as api
import lm_eval.evaluator as evaluator
from lm_eval import tasks


@pytest.mark.parametrize(
    "limit,model,model_args",
    [
        (
            10,
            "hf",
            "pretrained=EleutherAI/pythia-160m,dtype=float32,device=cpu",
        ),
    ],
)
def test_include_correctness(limit: int, model: str, model_args: str):
    task_name = ["arc_easy"]
    task_manager = tasks.TaskManager()
    task_dict = tasks.get_task_dict(task_name, task_manager)

    e1 = evaluator.simple_evaluate(
        model=model,
        tasks=task_name,
        limit=limit,
        model_args=model_args,
    )

    assert e1 is not None

    # run with evaluate() and "arc_easy" test config (included from ./testconfigs path)
    lm = api.registry.get_model(model).create_from_arg_string(
        model_args,
        {
            "batch_size": None,
            "max_batch_size": None,
            "device": None,
        },
    )

    task_name = ["arc_easy"]
    task_manager = tasks.TaskManager(
        include_path=os.path.dirname(os.path.abspath(__file__)) + "/testconfigs",
        include_defaults=False,
    )
    task_dict = tasks.get_task_dict(task_name, task_manager)
    e2 = evaluator.evaluate(
        lm=lm,
        task_dict=task_dict,
        limit=limit,
    )

    assert e2 is not None

    # check that caching is working
    def r(x):
        return x["results"]["arc_easy"]

    assert all(
        x == y
        for x, y in zip([y for _, y in r(e1).items()], [y for _, y in r(e2).items()])
    )


# test that setting include_defaults = False works as expected and that include_path works
def test_no_include_defaults():
    task_name = ["arc_easy"]

    task_manager = tasks.TaskManager(
        include_path=os.path.dirname(os.path.abspath(__file__)) + "/testconfigs",
        include_defaults=False,
    )

    # should succeed, because we've included an 'arc_easy' task from this dir
    task_dict = tasks.get_task_dict(task_name, task_manager)

    # should fail, since ./testconfigs has no arc_challenge task
    task_name = ["arc_challenge"]

    with pytest.raises(KeyError):
        task_dict = tasks.get_task_dict(task_name, task_manager)  # noqa: F841


# test that include_path containing a task shadowing another task's name fails
# def test_shadowed_name_fails():
#     task_name = ["arc_easy"]
#     task_manager = tasks.TaskManager(include_path=os.path.dirname(os.path.abspath(__file__)) + "/testconfigs")
#     task_dict = tasks.get_task_dict(task_name, task_manager)
tests/testconfigs/arc_test.yaml (new file, mode 100644)

task: arc_easy
dataset_path: allenai/ai2_arc
dataset_name: ARC-Easy
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: "{{choices.label.index(answerKey)}}"
doc_to_choice: "{{choices.text}}"
should_decontaminate: true
doc_to_decontamination_query: "Question: {{question}}\nAnswer:"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
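As the tests above rely on, a config included this way is registered under its `task:` key ("arc_easy"), not under its filename. A brief sketch of that lookup (not part of the commit; it assumes the task manager's public `all_tasks` listing):

import os

from lm_eval import tasks

# Directory holding arc_test.yaml, relative to the repository root.
config_dir = os.path.join("tests", "testconfigs")

tm = tasks.TaskManager(include_path=config_dir, include_defaults=False)
assert "arc_easy" in tm.all_tasks      # registered under the `task:` key
assert "arc_test" not in tm.all_tasks  # the filename itself is not a task name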