gaoqiong / lm-evaluation-harness · Commit 1fb90b91 (parent f7992789), authored Feb 12, 2021

Commit message: metrics file

This commit extracts the metric helper functions from lm_eval/base.py into a new lm_eval/metrics.py module and updates the task files to import them from there.

Showing 20 changed files with 147 additions and 75 deletions (+147 / -75).
Changed files:
  .gitignore                   +1   -0
  lm_eval/base.py              +13  -58
  lm_eval/metrics.py           +97  -0
  lm_eval/tasks/anli.py        +2   -1
  lm_eval/tasks/arc.py         +2   -1
  lm_eval/tasks/arithmetic.py  +2   -1
  lm_eval/tasks/common.py      +3   -1
  lm_eval/tasks/glue.py        +2   -1
  lm_eval/tasks/lambada.py     +2   -1
  lm_eval/tasks/piqa.py        +2   -1
  lm_eval/tasks/pubmedqa.py    +2   -1
  lm_eval/tasks/qa4mre.py      +2   -1
  lm_eval/tasks/race.py        +2   -1
  lm_eval/tasks/sat.py         +2   -1
  lm_eval/tasks/sciq.py        +2   -1
  lm_eval/tasks/superglue.py   +2   -1
  lm_eval/tasks/triviaqa.py    +2   -1
  lm_eval/tasks/webqs.py       +3   -1
  lm_eval/tasks/winogrande.py  +2   -1
  lm_eval/tasks/wsc273.py      +2   -1
.gitignore (+1, -0)

 env
 *.pyc
 data/
 .idea
\ No newline at end of file
lm_eval/base.py (+13, -58)

 import abc
 import random
 import numpy as np
 import sklearn
-import math
+from lm_eval.metrics import mean


 class LM(abc.ABC):
     ...
@@ -30,6 +30,7 @@ class LM(abc.ABC):
         """
         pass

+    # TODO: Add an optional max length
     @abc.abstractmethod
     def greedy_until(self, requests):
         """Generate greedily until a stopping sequence
         ...
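The greedy_until docstring is truncated in this view. For orientation, elsewhere in this codebase the method receives a list of (context, stop sequences) pairs and returns one generated continuation per request; a small illustrative sketch of that shape (the prompt and stop sequence are made up, not part of this commit):

    # each request pairs a prompt with the stop sequence(s) to halt generation on
    requests = [
        ("Question: What is the capital of France?\nAnswer:", ["\n"]),
    ]
    # an implementation is expected to return one string per request, e.g. [" Paris"]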
@@ -61,6 +62,14 @@ class LM(abc.ABC):

 class Task(abc.ABC):
+    """A task represents an entire benchmark including its dataset, problems,
+    answers, and evaluation methods. See BoolQ for a simple example implementation
+
+    A `doc` can be any python object which represents one instance of evaluation.
+    This is usually a dictionary e.g.
+        {"question": ..., "answer": ...} or
+        {"question": ..., question, answer)
+    """

     def __init__(self):
         self.download()
         self._training_docs = None
         ...
@@ -148,9 +157,9 @@ class Task(abc.ABC):
     @abc.abstractmethod
     def aggregation(self):
         """
-        :returns: {str: [float] -> float}
+        :returns: {str: [metric_score] -> float}
             A dictionary where keys are the names of submetrics and values are
-            functions that aggregate a list of metrics
+            functions that aggregate a list of metric scores
         """
         pass
         ...
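For context, a concrete aggregation() simply maps submetric names to the aggregation functions that now live in lm_eval/metrics.py. A minimal sketch, assuming a hypothetical task and illustrative metric names not taken from this commit:

    from lm_eval.base import Task
    from lm_eval.metrics import mean, perplexity

    class MyTask(Task):               # hypothetical task, illustration only
        def aggregation(self):
            # each value reduces the list of per-document metric scores to one float
            return {
                "acc": mean,          # average of 0/1 correctness values
                "ppl": perplexity,    # math.exp(-mean(loglikelihoods))
            }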
@@ -213,60 +222,6 @@ class MultipleChoiceTask(Task):
         }

-def mean(arr):
-    return sum(arr) / len(arr)
-
-
-def median(arr):
-    return arr[len(arr) // 2]
-
-
-def matthews_corrcoef(items):
-    unzipped_list = list(zip(*items))
-    golds = unzipped_list[0]
-    preds = unzipped_list[1]
-    return sklearn.metrics.matthews_corrcoef(golds, preds)
-
-
-def f1_score(items):
-    unzipped_list = list(zip(*items))
-    golds = unzipped_list[0]
-    preds = unzipped_list[1]
-    fscore = sklearn.metrics.f1_score(golds, preds)
-    return np.max(fscore)
-
-
-def acc_all(items):
-    # Only count as correct if all answers are labeled correctly for each question
-    question_scoring_dict = {}
-    preds = list(zip(*items))[0]
-    docs = list(zip(*items))[1]
-    for doc, pred in zip(docs, preds):
-        question_id = doc["idx"]["question"]
-        if question_id not in question_scoring_dict:
-            question_scoring_dict[question_id] = []
-        gold_label = doc["label"] == 1
-        question_scoring_dict[question_id].append(gold_label == pred)
-    acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
-    return acc
-
-
-def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
-    """Compute max metric between prediction and each ground truth."""
-    scores_for_ground_truths = []
-    for ground_truth in ground_truths:
-        score = metric_fn(prediction, ground_truth)
-        scores_for_ground_truths.append(score)
-    return max(scores_for_ground_truths)
-
-
-def perplexity(items):
-    return math.exp(-mean(items))
-
-
 req_ret_lens = {
     'loglikelihood': 2,
     'greedy_until': None,
     ...
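The helpers removed here reappear unchanged in the new lm_eval/metrics.py below. They share one convention: items is the list of per-document values a task accumulates during evaluation, e.g. (gold, pred) pairs for the classification metrics and loglikelihoods for perplexity. A small usage sketch with made-up values, for illustration only:

    from lm_eval.metrics import mean, matthews_corrcoef, f1_score, perplexity

    # (gold, pred) pairs collected across evaluation documents
    items = [(1, 1), (0, 1), (1, 1), (0, 0)]
    print(matthews_corrcoef(items))   # sklearn MCC over the unzipped golds/preds
    print(f1_score(items))            # sklearn F1 over the unzipped golds/preds

    # perplexity() expects per-document loglikelihoods and returns exp(-mean(...))
    print(perplexity([-2.3, -1.9, -2.7]))
    print(mean([1, 0, 1, 1]))         # plain average, used for accuracy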
lm_eval/metrics.py (new file, mode 100644, +97)

import math
import numpy as np
import sacrebleu
import sklearn


def mean(arr):
    return sum(arr) / len(arr)


def median(arr):
    return arr[len(arr) // 2]


def matthews_corrcoef(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    return sklearn.metrics.matthews_corrcoef(golds, preds)


def f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = sklearn.metrics.f1_score(golds, preds)
    return np.max(fscore)


def acc_all(items):
    # Only count as correct if all answers are labeled correctly for each question
    question_scoring_dict = {}
    preds = list(zip(*items))[0]
    docs = list(zip(*items))[1]
    for doc, pred in zip(docs, preds):
        question_id = doc["idx"]["question"]
        if question_id not in question_scoring_dict:
            question_scoring_dict[question_id] = []
        gold_label = doc["label"] == 1
        question_scoring_dict[question_id].append(gold_label == pred)
    acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
    return acc


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Compute max metric between prediction and each ground truth."""
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
    return max(scores_for_ground_truths)


def perplexity(items):
    return math.exp(-mean(items))


def bleu(items):
    """The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
    for evaluating a generated sentence against a reference sentence. It counts matching
    n-grams in the candidate translation to n-grams in the reference text, where
    1-gram or unigram would be each token and a bigram comparison would be each
    word pair. The comparison is made regardless of word order.

    Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
    Paper: https://www.aclweb.org/anthology/P02-1040/

    Higher is better
    """
    preds = list(zip(*items))[0]
    docs = list(zip(*items))[1]
    pass


def chrf(items):
    """chrF++ is a tool for automatic evaluation of machine translation output
    based on character n-gram precision and recall enhanced with word n-grams.

    Source: https://github.com/m-popovic/chrF
    Paper: https://www.aclweb.org/anthology/W15-3049.pdf

    Higher is better  # TODO I think
    """
    pass


def ter(items):
    """Translation Error Rate is an error metric for machine translation that
    measures the number of edits required to change a system output into one
    of the references.

    Source: http://www.cs.umd.edu/~snover/tercom/
    Paper: http://mt-archive.info/AMTA-2006-Snover.pdf

    Lower is better
    """
    pass
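bleu, chrf, and ter are added only as documented stubs here; sacrebleu is imported but not yet used. One way the bleu stub could eventually be completed with sacrebleu, assuming each item carries the model prediction first and its reference text second (an assumption, since this commit leaves the pairing unspecified):

    import sacrebleu

    def bleu(items):
        # hypothetical completion, not part of this commit
        preds = [pred for pred, ref in items]      # system outputs, one per document
        refs = [[ref for pred, ref in items]]      # sacrebleu expects a list of reference streams
        return sacrebleu.corpus_bleu(preds, refs).score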
lm_eval/tasks/anli.py (+2, -1)

 import numpy as np
-from lm_eval.base import rf, mean
+from lm_eval.base import rf
+from ..metrics import mean
 from .common import HFTask


 class ANLIBase(HFTask):
     ...
lm_eval/tasks/arc.py (+2, -1)

 import numpy as np
-from lm_eval.base import rf, mean
+from lm_eval.base import rf
+from ..metrics import mean
 from .common import HFTask
 ...
lm_eval/tasks/arithmetic.py (+2, -1)

@@ -2,7 +2,8 @@ import abc
 import json
 import os
 from collections import namedtuple
-from lm_eval.base import Task, mean, rf
+from lm_eval.base import Task, rf
+from lm_eval.metrics import mean
 from best_download import download_file

 ArithmeticDoc = namedtuple('ArithmeticDoc', ['context', 'completion'])
 ...
lm_eval/tasks/common.py (+3, -1)

 import datasets
 import numpy as np
+import lm_eval.metrics

 from ..base import Task
 ...

@@ -44,7 +46,7 @@ class HFTask(Task):

 def simple_accuracy_metric(preds, golds):
-    acc = float((np.array(preds) == np.array(golds)).mean())
+    acc = float(lm_eval.metrics.mean())
     return {
         "major": acc,
         "minor": {"acc": acc},
         ...
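As rendered in this view, the new line passes no arguments to lm_eval.metrics.mean(). An equivalent of the original accuracy computation expressed with the new helper would presumably look something like this sketch (not taken from the commit; the function name is hypothetical):

    import lm_eval.metrics

    def simple_accuracy_sketch(preds, golds):
        # same result as the original (np.array(preds) == np.array(golds)).mean()
        return float(lm_eval.metrics.mean([float(p == g) for p, g in zip(preds, golds)]))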
lm_eval/tasks/glue.py (+2, -1)

 import numpy as np
-from lm_eval.base import rf, mean, f1_score, matthews_corrcoef
+from lm_eval.base import rf
+from ..metrics import mean, matthews_corrcoef, f1_score
 from scipy.stats import pearsonr, spearmanr
 from tqdm import auto as tqdm_lib

 from .common import HFTask, yesno
 ...
lm_eval/tasks/lambada.py (+2, -1)

-from lm_eval.base import Task, rf, mean, perplexity
+from lm_eval.base import Task, rf
+from lm_eval.metrics import mean, perplexity
 from lm_eval.utils import sh
 import json
 import math
 ...
lm_eval/tasks/piqa.py (+2, -1)

 import numpy as np
-from lm_eval.base import rf, mean
+from lm_eval.base import rf
+from ..metrics import mean
 from .common import HFTask
 ...
lm_eval/tasks/pubmedqa.py (+2, -1)

@@ -2,7 +2,8 @@ import numpy as np
 import json
 import random
 from .common import HFTask
-from lm_eval.base import rf, mean
+from lm_eval.base import rf
+from ..metrics import mean


 class Pubmed_QA(HFTask):
     ...
lm_eval/tasks/qa4mre.py (+2, -1)

 import os
 import numpy as np
 from best_download import download_file
-from lm_eval.base import MultipleChoiceTask, rf, mean
+from lm_eval.base import MultipleChoiceTask, rf
+from lm_eval.metrics import mean
 import xml.etree.ElementTree as ET
 import random
 ...
lm_eval/tasks/race.py (+2, -1)

 import collections
 import datasets
 import numpy as np
-from lm_eval.base import rf, mean
+from lm_eval.base import rf
+from ..metrics import mean
 from .common import HFTask
 import os
 ...
lm_eval/tasks/sat.py (+2, -1)

 import json
 import random
 import os
-from lm_eval.base import MultipleChoiceTask, rf, mean
+from lm_eval.base import MultipleChoiceTask, rf
+from ..metrics import mean
 from tqdm import auto as tqdm_lib
 from .common import simple_accuracy_metric
 import numpy as np
 ...
lm_eval/tasks/sciq.py (+2, -1)

 import os
 import json
 from ..utils import sh
-from lm_eval.base import MultipleChoiceTask, rf, mean
+from lm_eval.base import MultipleChoiceTask, rf
+from ..metrics import mean
 import zipfile
 from best_download import download_file
 ...
lm_eval/tasks/superglue.py (+2, -1)

@@ -5,7 +5,8 @@ To-do:
 """
 import numpy as np
 from .common import HFTask, yesno
-from lm_eval.base import rf, mean, acc_all, metric_max_over_ground_truths
+from lm_eval.base import rf
+from ..metrics import mean, acc_all, metric_max_over_ground_truths
 import sklearn
 import transformers.data.metrics.squad_metrics as squad_metrics
 from ..utils import general_detokenize
 ...
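superglue.py pulls metric_max_over_ground_truths together with the HuggingFace squad_metrics helpers. The surrounding task code is not shown in this diff, but a usage consistent with these imports would look roughly like the following sketch (variable values are made up for illustration):

    import transformers.data.metrics.squad_metrics as squad_metrics
    from lm_eval.metrics import metric_max_over_ground_truths

    prediction = "Paris"                           # model output (made up)
    gold_answers = ["Paris", "the city of Paris"]  # acceptable answers (made up)

    # take the best exact-match / F1 score of the prediction against any gold answer
    em = metric_max_over_ground_truths(squad_metrics.compute_exact, prediction, gold_answers)
    f1 = metric_max_over_ground_truths(squad_metrics.compute_f1, prediction, gold_answers)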
lm_eval/tasks/triviaqa.py (+2, -1)

 import os
 import json
 import random
-from lm_eval.base import Task, mean, rf
+from lm_eval.base import Task, rf
+from ..metrics import mean
 from ..utils import sh


 class TriviaQA(Task):
     ...
lm_eval/tasks/webqs.py (+3, -1)

 from .common import HFTask
-from lm_eval.base import mean, rf
+from lm_eval.base import rf
+from ..metrics import mean


 class WebQs(HFTask):
     DATASET_PATH = "web_questions"
     ...
lm_eval/tasks/winogrande.py (+2, -1)

 import numpy as np
 from .common import HFTask
-from lm_eval.base import rf, mean
+from lm_eval.base import rf
+from ..metrics import mean

 """
 This evaluation of Winogrande uses partial evaluation as described by
 ...
lm_eval/tasks/wsc273.py (+2, -1)

 import numpy as np
 import random
-from lm_eval.base import rf, mean
+from lm_eval.base import rf
+from ..metrics import mean
 from .common import HFTask

 """
 ...