gaoqiong / lm-evaluation-harness · Commits

Commit eb9f6788, authored Jul 04, 2024 by JessicaOjo

pr review changes

parent 0dcdfb80
Showing 3 changed files with 6 additions and 77 deletions (+6 / -77):

    lm_eval/api/metrics.py           +0  -22
    lm_eval/api/task.py              +6  -18
    lm_eval/filters/extraction.py    +0  -37
lm_eval/api/metrics.py
@@ -58,19 +58,6 @@ def f1_score(items):
     return np.max(fscore)
 
 
-@register_aggregation("squad_f1")
-def squad_f1_score(items):
-    gold_squad, pred_squad = [], []
-    for index, (ref, pred) in enumerate(items):
-        pred_dict = {'prediction_text': str(pred), 'id': str(index)}
-        ref_dict = {'answers': {'answer_start': [0], 'text': [str(ref)]}, 'id': str(index)}
-        gold_squad.append(ref_dict)
-        pred_squad.append(pred_dict)
-    squad_metric = hf_evaluate.load("squad")
-    results_squad = squad_metric.compute(predictions=pred_squad, references=gold_squad)
-    return results_squad['f1'] / 100
-
-
 @register_aggregation("matthews_corrcoef")
 def matthews_corrcoef(items):
     unzipped_list = list(zip(*items))
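For context, the deleted squad_f1 aggregation was a thin wrapper around the Hugging Face evaluate library's "squad" metric, which consumes id-keyed prediction/reference dicts and reports F1 on a 0-100 scale. A minimal standalone sketch of the same computation, using toy (gold, prediction) pairs:

import evaluate  # presumably imported as hf_evaluate in metrics.py; pip install evaluate

# (gold, prediction) pairs, shaped like the `items` the harness aggregates
items = [("Paris", "Paris"), ("1945", "in 1945")]

squad_metric = evaluate.load("squad")
results = squad_metric.compute(
    predictions=[
        {'prediction_text': str(pred), 'id': str(i)}
        for i, (_, pred) in enumerate(items)
    ],
    references=[
        {'answers': {'answer_start': [0], 'text': [str(ref)]}, 'id': str(i)}
        for i, (ref, _) in enumerate(items)
    ],
)
print(results['f1'] / 100)  # the deleted code rescaled F1 to [0, 1]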
@@ -192,15 +179,6 @@ def exact_match_fn(**kwargs):
     return exact_match.compute(**kwargs)
 
 
-@register_metric(
-    metric="squad",
-    higher_is_better=True,
-    output_type="generate_until",
-    aggregation="squad_f1",
-)
-def squad_fn(items):
-    return items
-
 @register_metric(
     metric="perplexity",
     higher_is_better=False,
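register_metric and register_aggregation are the harness's registration decorators: they record a function plus its metadata under a string key so that task configs can refer to metrics by name. A simplified stand-in illustrating the pattern (illustrative only, not the harness's own registry code):

METRIC_REGISTRY = {}

def register_metric(metric, **metadata):
    # store the decorated function and its metadata under the metric name
    def decorate(fn):
        METRIC_REGISTRY[metric] = (fn, metadata)
        return fn
    return decorate

@register_metric(metric="toy_exact", higher_is_better=True)
def toy_exact(gold, pred):
    return float(gold == pred)

fn, meta = METRIC_REGISTRY["toy_exact"]
print(fn("a", "a"), meta)  # 1.0 {'higher_is_better': True}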
lm_eval/api/task.py
@@ -1417,7 +1417,6 @@ class ConfigurableTask(Task):
                 **({"acc": acc} if "acc" in use_metric else {}),
                 **({"f1": (gold, pred)} if "f1" in use_metric else {}),
                 **({"mcc": (gold, pred)} if "mcc" in use_metric else {}),
-                **({"squad": (gold, pred)} if "squad" in use_metric else {}),
                 **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}),
                 **({"exact_match": exact_match} if "exact_match" in use_metric else {}),
                 **(
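The kept lines rely on Python's conditional dict-unpacking idiom: each **({...} if ... else {}) splice adds its key only when that metric was requested, so the result dict carries exactly the metrics in use. A toy illustration with made-up values:

use_metric = {"acc", "f1"}
acc, gold, pred = 1.0, "yes", "yes"

result_dict = {
    **({"acc": acc} if "acc" in use_metric else {}),
    **({"f1": (gold, pred)} if "f1" in use_metric else {}),
    **({"mcc": (gold, pred)} if "mcc" in use_metric else {}),  # skipped: not requested
}
print(result_dict)  # {'acc': 1.0, 'f1': ('yes', 'yes')}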
@@ -1437,13 +1436,10 @@ class ConfigurableTask(Task):
         gold = self.doc_to_target(doc)
         result = results[0]
         if self.config.doc_to_choice is not None:
-            try:
-                # If you set doc_to_choice,
-                # it assumes that doc_to_target returns a number.
-                choices = self.doc_to_choice(doc)
-                gold = choices[gold]
-            except TypeError:
-                gold = gold
+            # If you set doc_to_choice,
+            # it assumes that doc_to_target returns a number.
+            choices = self.doc_to_choice(doc)
+            gold = choices[gold]
         # we expect multiple_targets to be a list.
         elif self.multiple_target:
             gold = list(gold)
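Removing the try/except makes the contract explicit: when doc_to_choice is configured, doc_to_target must return an integer index into the choice list (the deleted except TypeError silently kept a non-integer gold unchanged). A toy sketch of that contract, with hypothetical doc fields:

doc = {"question": "Is the sky blue?", "label": 1}  # hypothetical document

def doc_to_choice(doc):
    return ["no", "yes"]

def doc_to_target(doc):
    return doc["label"]  # must be an index once doc_to_choice is set

gold = doc_to_choice(doc)[doc_to_target(doc)]
print(gold)  # 'yes'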
@@ -1492,20 +1488,12 @@ class ConfigurableTask(Task):
                     result_score = 0.0
             else:
                 try:
-                    # adds exact match logic
-                    if metric == "exact_match":
-                        result_score = self._metric_fn_list[metric](
-                            references=[str(gold)],
-                            predictions=[str(result)],
-                            **self._metric_fn_kwargs[metric],
-                        )
-                    else:
-                        result_score = self._metric_fn_list[metric](
-                            references=[gold],
-                            predictions=[result],
-                            **self._metric_fn_kwargs[metric],
-                        )
-                except TypeError as error:
-                    # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics
+                    result_score = self._metric_fn_list[metric](
+                        references=[gold],
+                        predictions=[result],
+                        **self._metric_fn_kwargs[metric],
+                    )
+                except TypeError:
+                    # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics
                     result_score = self._metric_fn_list[metric]([gold, result])
                 if isinstance(result_score, dict):
                     # TODO: this handles the case where HF evaluate returns a dict.
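As the retained comment notes, the try/except TypeError bridges two calling conventions: HF Evaluate metrics take references=/predictions= keyword lists, while the harness's own metric functions take a single [gold, result] pair. A minimal sketch of that dispatch using stand-in metric functions:

def hf_style_metric(*, references, predictions):
    # HF Evaluate-style interface: keyword-only lists
    return float(references == predictions)

def harness_style_metric(pair):
    # harness-internal interface: one [gold, result] pair
    gold, result = pair
    return float(gold == result)

def score(metric_fn, gold, result):
    try:
        return metric_fn(references=[gold], predictions=[result])
    except TypeError:  # metric_fn rejects the keyword interface
        return metric_fn([gold, result])

print(score(hf_style_metric, "a", "a"))       # 1.0
print(score(harness_style_metric, "a", "b"))  # 0.0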
lm_eval/filters/extraction.py
@@ -49,43 +49,6 @@ class RegexFilter(Filter):
         return filtered_resps
 
 
-@register_filter("verbalizer")
-class VerbalizerFilter(Filter):
-    """ """
-
-    def __init__(
-        self,
-        verbalizer_dict: dict,
-    ) -> None:
-        """
-        pass a string `regex` to run `re.compile(r"regex")` on.
-        `fallback` defines the output returned if no matches for the regex are located.
-        """
-        self.verbalizer_dict = verbalizer_dict
-
-    def apply(self, resps, docs):
-        # here, we assume we have a list, in which each element is
-        # a list of model responses for some particular input/target pair.
-        # so we process each of these (same input/target response sets)
-        # independently (and keep them a list.)
-        def verbalize(value):
-            for key, values in self.verbalizer_dict.items():
-                for v in values:
-                    if v in value:
-                        return key
-            return value
-
-        def filter_value(inst):
-            filtered = []
-            for resp in inst:
-                match = verbalize(resp.lower())
-                filtered.append(match)
-            return filtered
-
-        filtered_resps = map(lambda x: filter_value(x), resps)
-
-        return filtered_resps
-
-
 @register_filter("remove_whitespace")
 class WhitespaceFilter(Filter):
     """ """