Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
93cf95d0
Unverified
Commit
93cf95d0
authored
Feb 24, 2022
by
Leo Gao
Committed by
GitHub
Feb 24, 2022
Browse files
Merge pull request #290 from StephenHogg/check_integrity
Add integrity check
parents
3c37ea9c
40c1e05c
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
44 additions
and
3 deletions
+44
-3
lm_eval/evaluator.py
lm_eval/evaluator.py
+8
-2
lm_eval/utils.py
lm_eval/utils.py
+33
-0
main.py
main.py
+3
-1
No files found.
lm_eval/evaluator.py
View file @
93cf95d0
import
collections
import
itertools
import
pathlib
import
random
import
lm_eval.metrics
import
lm_eval.models
import
lm_eval.tasks
import
lm_eval.base
import
numpy
as
np
from
lm_eval.utils
import
positional_deprecated
from
lm_eval.utils
import
positional_deprecated
,
run_task_tests
@
positional_deprecated
def
simple_evaluate
(
model
,
model_args
=
None
,
tasks
=
[],
num_fewshot
=
0
,
batch_size
=
None
,
device
=
None
,
no_cache
=
False
,
limit
=
None
,
bootstrap_iters
=
100000
,
description_dict
=
None
):
description_dict
=
None
,
check_integrity
=
False
):
"""Instantiate and evaluate a model on a list of tasks.
:param model: Union[str, LM]
...
...
@@ -37,6 +38,8 @@ def simple_evaluate(model, model_args=None, tasks=[],
Number of iterations for bootstrap statistics
:param description_dict: dict[str, str]
Dictionary of custom task descriptions of the form: `task_name: description`
:param check_integrity: bool
Whether to run the relevant part of the test suite for the tasks
:return
Dictionary of results
"""
...
...
@@ -61,6 +64,9 @@ def simple_evaluate(model, model_args=None, tasks=[],
task_dict
=
lm_eval
.
tasks
.
get_task_dict
(
tasks
)
if
check_integrity
:
run_task_tests
(
task_list
=
tasks
)
results
=
evaluate
(
lm
=
lm
,
task_dict
=
task_dict
,
...
...
lm_eval/utils.py
View file @
93cf95d0
import
os
import
pathlib
import
re
import
collections
import
functools
import
inspect
import
sys
import
pytest
from
typing
import
List
class
ExitCodeError
(
Exception
):
...
...
@@ -155,3 +159,32 @@ def positional_deprecated(fn):
"lm-evaluation-harness!"
)
return
fn
(
*
args
,
**
kwargs
)
return
_wrapper
@positional_deprecated
def find_test_root(start_path: pathlib.Path) -> pathlib.Path:
    """
    Search upward in the directory tree to a maximum of three layers
    to find and return the package root (containing the 'tests' folder)

    :param start_path: pathlib.Path
        Path at which the upward search starts; it is resolved first and is
        itself the first candidate that gets checked.
    :return
        The first candidate directory that contains `tests/test_version_stable.py`
    :raises FileNotFoundError
        If none of the checked candidates contains that file
    """
    cur_path = start_path.resolve()
    max_layers = 3
    for _ in range(max_layers):
        if (cur_path / 'tests' / 'test_version_stable.py').exists():
            return cur_path
        # Not found at this level: step one directory up and try again.
        cur_path = cur_path.parent.resolve()
    # A single f-string: the previous two-part concatenation was missing the
    # separating space and rendered as "...upwardsof <path>".
    raise FileNotFoundError(
        f"Unable to find package root within {max_layers} upwards of {start_path}"
    )
@positional_deprecated
def run_task_tests(task_list: List[str]):
    """
    Locate the package root and run the version-stability tests for the given tasks

    :param task_list: List[str]
        Task names; they are joined with ' or ' and passed to pytest's `-k` option
    :raises ValueError
        If `pytest.main` returns a truthy (non-zero) exit code
    """
    repo_root = find_test_root(start_path=pathlib.Path(__file__))
    keyword_expr = ' or '.join(task_list)
    pytest_argv = [
        f'{repo_root}/tests/test_version_stable.py',
        f'--rootdir={repo_root}',
        '-k',
        keyword_expr,
    ]
    # Put the package root on the import path before pytest is invoked.
    sys.path.append(str(repo_root))
    exit_code = pytest.main(pytest_argv)
    if not exit_code:
        return
    raise ValueError(
        f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {exit_code}"
    )
\ No newline at end of file
main.py
View file @
93cf95d0
...
...
@@ -20,6 +20,7 @@ def parse_args():
parser
.
add_argument
(
'--limit'
,
type
=
int
,
default
=
None
)
parser
.
add_argument
(
'--no_cache'
,
action
=
"store_true"
)
parser
.
add_argument
(
'--description_dict_path'
,
default
=
None
)
parser
.
add_argument
(
'--check_integrity'
,
action
=
"store_true"
)
return
parser
.
parse_args
()
...
...
@@ -49,7 +50,8 @@ def main():
device
=
args
.
device
,
no_cache
=
args
.
no_cache
,
limit
=
args
.
limit
,
description_dict
=
description_dict
description_dict
=
description_dict
,
check_integrity
=
args
.
check_integrity
)
dumped
=
json
.
dumps
(
results
,
indent
=
2
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment