gaoqiong / lm-evaluation-harness / Commits

Commit bd028848
Authored Jul 18, 2025 by Baber

Merge branch 'main' into metrics

# Conflicts:
#   tests/test_tasks.py

Parents: 6e48110e 56def33d
Showing 8 changed files with 73 additions and 56 deletions
lm_eval/tasks/medtext/utils.py        +3  -1
lm_eval/tasks/meqsum/utils.py         +3  -1
lm_eval/tasks/mimic_repsum/utils.py   +3  -1
lm_eval/tasks/mts_dialog/utils.py     +3  -1
lm_eval/tasks/olaph/utils.py          +3  -1
lm_eval/utils.py                      +7  -2
tests/test_tasks.py                   +2  -49
tests/test_unitxt_tasks.py            +49 -0
lm_eval/tasks/medtext/utils.py

@@ -11,7 +11,9 @@ try:
 except (ModuleNotFoundError, ImportError):
     raise ModuleNotFoundError(
-        "Please install evaluation metrics via pip install evaluate and pip install bert-score",
+        "Please install evaluation metrics via pip install evaluate bert-score "
+        "rouge_score>=0.1.2 nltk absl-py "
+        "git+https://github.com/google-research/bleurt.git"
     )
 except Exception as e:
     raise RuntimeError(
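
For context, the guard these five task utils share looks roughly like the sketch below. Only the ModuleNotFoundError message comes from this commit; the contents of the try block and the RuntimeError text are assumptions for illustration.

    # Sketch of the optional-dependency guard, assuming the try block imports
    # the Hugging Face `evaluate` library; the RuntimeError text is invented.
    try:
        import evaluate  # noqa: F401
    except (ModuleNotFoundError, ImportError):
        raise ModuleNotFoundError(
            "Please install evaluation metrics via pip install evaluate bert-score "
            "rouge_score>=0.1.2 nltk absl-py "
            "git+https://github.com/google-research/bleurt.git"
        )
    except Exception as e:
        raise RuntimeError(f"Importing evaluation metrics failed: {e}")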

lm_eval/tasks/meqsum/utils.py

@@ -11,7 +11,9 @@ try:
 except (ModuleNotFoundError, ImportError):
     raise ModuleNotFoundError(
-        "Please install evaluation metrics via pip install evaluate and pip install bert-score",
+        "Please install evaluation metrics via pip install evaluate bert-score "
+        "rouge_score>=0.1.2 nltk absl-py "
+        "git+https://github.com/google-research/bleurt.git"
     )
 except Exception as e:
     raise RuntimeError(

lm_eval/tasks/mimic_repsum/utils.py

@@ -15,7 +15,9 @@ try:
 except (ModuleNotFoundError, ImportError):
     raise ModuleNotFoundError(
-        "Please install evaluation metrics via pip install evaluate and pip install bert-score",
+        "Please install evaluation metrics via pip install evaluate bert-score "
+        "rouge_score>=0.1.2 nltk absl-py radgraph"
+        "git+https://github.com/google-research/bleurt.git"
     )
 except Exception as e:
     raise RuntimeError(

lm_eval/tasks/mts_dialog/utils.py

@@ -11,7 +11,9 @@ try:
 except (ModuleNotFoundError, ImportError):
     raise ModuleNotFoundError(
-        "Please install evaluation metrics via pip install evaluate and pip install bert-score",
+        "Please install evaluation metrics via pip install evaluate bert-score "
+        "rouge_score>=0.1.2 nltk absl-py "
+        "git+https://github.com/google-research/bleurt.git"
     )
 except Exception as e:
     raise RuntimeError(

lm_eval/tasks/olaph/utils.py

@@ -12,7 +12,9 @@ try:
 except (ModuleNotFoundError, ImportError):
     raise ModuleNotFoundError(
-        "Please install evaluation metrics via pip install evaluate and pip install bert-score",
+        "Please install evaluation metrics via pip install evaluate bert-score "
+        "rouge_score>=0.1.2 nltk absl-py "
+        "git+https://github.com/google-research/bleurt.git"
     )
 except Exception as e:
     raise RuntimeError(
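
Judging by the error message, these tasks load their metrics through the Hugging Face evaluate library, and each loader pulls in its own backend, which is what the longer pip install line covers. A hedged sketch of that loading step; which metrics each task actually loads is not shown in this diff:

    # Illustrative only: the loaders below explain the packages the new
    # message asks for; the per-task metric set is an assumption.
    import evaluate

    rouge = evaluate.load("rouge")          # needs rouge_score, nltk, absl-py
    bertscore = evaluate.load("bertscore")  # needs bert-score
    bleurt = evaluate.load("bleurt")        # needs BLEURT installed from the Google repo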

lm_eval/utils.py

@@ -579,10 +579,11 @@ def hash_dict_images(data_dict):
         dict: A new dictionary with the same structure as `data_dict`, but with all
         bytes and PIL.Image.Image objects replaced by their hashes.
     """
-    from PIL import Image
 
     def _process_value(value):
         # Bytes -> hash
+        from PIL import Image
+
         if isinstance(value, (bytes, bytearray)):
             return convert_bytes_to_hash(value)
         # PIL Image -> hash

@@ -603,4 +604,8 @@ def hash_dict_images(data_dict):
     if not isinstance(data_dict, dict):
         raise TypeError("Input must be a dictionary")
-    return {key: _process_value(val) for key, val in data_dict.items()}
+    return (
+        {key: _process_value(val) for key, val in data_dict.items()}
+        if importlib.util.find_spec("PIL")
+        else data_dict
+    )
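
Together, the two hunks make Pillow optional for hash_dict_images: the PIL import now happens lazily inside _process_value, and the comprehension only runs when importlib.util.find_spec("PIL") finds the package; otherwise the input dictionary is returned unchanged. A minimal usage sketch (the payload dict is invented for illustration):

    # Assumes lm_eval is installed; the document dict below is made up.
    from lm_eval.utils import hash_dict_images

    doc = {"question": "What is shown in the image?", "image": b"\x89PNG\r\n..."}

    # With Pillow available, bytes and PIL.Image values are replaced by hashes;
    # without Pillow, the same dict comes back untouched instead of raising.
    print(hash_dict_images(doc))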

tests/test_tasks.py

@@ -46,12 +46,7 @@ def limit() -> int:
     return 10
 
 
-@pytest.mark.parametrize(
-    "task_class",
-    task_class(get_new_tasks_else_default()),
-    ids=lambda x: f"{x.config.task}",
-)
-class TestBaseTasks:
+class BaseTasks:
     """
     Base class for testing tasks
     """

@@ -165,50 +160,8 @@ class TestBaseTasks:
     task_class(get_new_tasks_else_default()),
     ids=lambda x: f"{x.config.task}",
 )
-class TestNewTasksElseDefault(TestBaseTasks):
+class TestNewTasksElseDefault(BaseTasks):
     """
     Test class parameterized with a list of new/modified tasks
     (or a set of default tasks if none have been modified)
     """
-
-
-@pytest.mark.parametrize(
-    "task_class",
-    task_class(
-        ["arc_easy_unitxt"], tasks.TaskManager(include_path="./tests/testconfigs")
-    ),
-    ids=lambda x: f"{x.config.task}",
-)
-class TestUnitxtTasks(TestBaseTasks):
-    """
-    Test class for Unitxt tasks parameterized with a small custom
-    task as described here:
-    https://www.unitxt.ai/en/latest/docs/lm_eval.html
-    """
-
-    def test_check_training_docs(self, task_class: ConfigurableTask):
-        if task_class.has_training_docs():
-            assert task_class.dataset["train"] is not None
-
-    def test_check_validation_docs(self, task_class):
-        if task_class.has_validation_docs():
-            assert task_class.dataset["validation"] is not None
-
-    def test_check_test_docs(self, task_class):
-        task = task_class
-        if task.has_test_docs():
-            assert task.dataset["test"] is not None
-
-    def test_doc_to_text(self, task_class, limit: int):
-        task = task_class
-        arr = (
-            list(islice(task.test_docs(), limit))
-            if task.has_test_docs()
-            else list(islice(task.validation_docs(), limit))
-        )
-        _array = [task.doc_to_text(doc) for doc in arr]
-        if not task.multiple_input:
-            for x in _array:
-                assert isinstance(x, str)
-        else:
-            pass
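
The rename from TestBaseTasks to BaseTasks matters because pytest only collects classes whose names start with "Test": dropping the prefix (and the parametrize decorator) turns the shared checks into a plain base class that runs only through its parameterized subclasses. A self-contained sketch of that pattern; TaskStub and TestDemoTasks are invented here for illustration, while the real subclasses parameterize over ConfigurableTask objects:

    import pytest


    class TaskStub:
        # Invented stand-in for a ConfigurableTask.
        def __init__(self, name: str):
            self.name = name

        def doc_to_text(self, doc: dict) -> str:
            return doc["question"]


    class BaseTasks:
        # No "Test" prefix and no decorator: pytest does not collect this class
        # directly, so its checks run only via parameterized subclasses.
        def test_doc_to_text_returns_str(self, task_class):
            assert isinstance(task_class.doc_to_text({"question": "2 + 2?"}), str)


    @pytest.mark.parametrize("task_class", [TaskStub("demo")], ids=lambda x: x.name)
    class TestDemoTasks(BaseTasks):
        """Runs every check defined on BaseTasks against each parameterized task."""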

tests/test_unitxt_tasks.py  (new file, 0 → 100644)

+from itertools import islice
+
+import pytest
+
+from lm_eval import tasks as tasks
+from lm_eval.api.task import ConfigurableTask
+from tests.test_tasks import BaseTasks, task_class
+
+
+@pytest.mark.parametrize(
+    "task_class",
+    task_class(
+        ["arc_easy_unitxt"], tasks.TaskManager(include_path="./tests/testconfigs")
+    ),
+    ids=lambda x: f"{x.config.task}",
+)
+class TestUnitxtTasks(BaseTasks):
+    """
+    Test class for Unitxt tasks parameterized with a small custom
+    task as described here:
+    https://www.unitxt.ai/en/latest/docs/lm_eval.html
+    """
+
+    def test_check_training_docs(self, task_class: ConfigurableTask):
+        if task_class.has_training_docs():
+            assert task_class.dataset["train"] is not None
+
+    def test_check_validation_docs(self, task_class):
+        if task_class.has_validation_docs():
+            assert task_class.dataset["validation"] is not None
+
+    def test_check_test_docs(self, task_class):
+        task = task_class
+        if task.has_test_docs():
+            assert task.dataset["test"] is not None
+
+    def test_doc_to_text(self, task_class, limit: int):
+        task = task_class
+        arr = (
+            list(islice(task.test_docs(), limit))
+            if task.has_test_docs()
+            else list(islice(task.validation_docs(), limit))
+        )
+        _array = [task.doc_to_text(doc) for doc in arr]
+        if not task.multiple_input:
+            for x in _array:
+                assert isinstance(x, str)
+        else:
+            pass
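
The "arc_easy_unitxt" task referenced above is a small custom config under ./tests/testconfigs, following the Unitxt integration described at the linked docs page, so running this file also needs the optional unitxt dependency. Outside of pytest, the parameterization roughly corresponds to resolving the task by hand, along these lines (a sketch, assuming lm_eval and unitxt are installed):

    from lm_eval import tasks

    # Extra YAML configs under ./tests/testconfigs are added to the registry.
    task_manager = tasks.TaskManager(include_path="./tests/testconfigs")

    # Resolve the custom Unitxt task name into ConfigurableTask objects.
    task_dict = tasks.get_task_dict(["arc_easy_unitxt"], task_manager)
    print(list(task_dict))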