gaoqiong / lm-evaluation-harness / Commits / 1fb90b91
"include/vscode:/vscode.git/clone" did not exist on "43c92cf3689fc7e78198cf156364c5520c6a6eb6"
Commit 1fb90b91
Authored Feb 12, 2021
Commit message: metrics file
Parent: f7992789

Showing 20 changed files with 147 additions and 75 deletions (+147, -75)
Changed files:
.gitignore (+1, -0)
lm_eval/base.py (+13, -58)
lm_eval/metrics.py (+97, -0)
lm_eval/tasks/anli.py (+2, -1)
lm_eval/tasks/arc.py (+2, -1)
lm_eval/tasks/arithmetic.py (+2, -1)
lm_eval/tasks/common.py (+3, -1)
lm_eval/tasks/glue.py (+2, -1)
lm_eval/tasks/lambada.py (+2, -1)
lm_eval/tasks/piqa.py (+2, -1)
lm_eval/tasks/pubmedqa.py (+2, -1)
lm_eval/tasks/qa4mre.py (+2, -1)
lm_eval/tasks/race.py (+2, -1)
lm_eval/tasks/sat.py (+2, -1)
lm_eval/tasks/sciq.py (+2, -1)
lm_eval/tasks/superglue.py (+2, -1)
lm_eval/tasks/triviaqa.py (+2, -1)
lm_eval/tasks/webqs.py (+3, -1)
lm_eval/tasks/winogrande.py (+2, -1)
lm_eval/tasks/wsc273.py (+2, -1)
.gitignore

 env
 *.pyc
 data/
+.idea
\ No newline at end of file
lm_eval/base.py

 import abc
 import random
 import numpy as np
-import sklearn
-import math
+from lm_eval.metrics import mean

 class LM(abc.ABC):
...
@@ -30,6 +30,7 @@ class LM(abc.ABC):
         """
         pass

+    # TODO: Add an optional max length
     @abc.abstractmethod
     def greedy_until(self, requests):
         """Generate greedily until a stopping sequence
...
@@ -61,6 +62,14 @@ class LM(abc.ABC):

 class Task(abc.ABC):
+    """A task represents an entire benchmark including its dataset, problems,
+    answers, and evaluation methods. See BoolQ for a simple example implementation
+
+    A `doc` can be any python object which represents one instance of evaluation.
+    This is usually a dictionary e.g.
+        {"question": ..., "answer": ...} or
+        {"question": ..., question, answer)
+    """
     def __init__(self):
         self.download()
         self._training_docs = None
...
@@ -148,9 +157,9 @@ class Task(abc.ABC):
     @abc.abstractmethod
     def aggregation(self):
         """
-        :returns: {str: [float] -> float}
+        :returns: {str: [metric_score] -> float}
             A dictionary where keys are the names of submetrics and values are
-            functions that aggregate a list of metrics
+            functions that aggregate a list of metric scores
         """
         pass
...
@@ -213,60 +222,6 @@ class MultipleChoiceTask(Task):
         }

-def mean(arr):
-    return sum(arr) / len(arr)
-
-def median(arr):
-    return arr[len(arr) // 2]
-
-def matthews_corrcoef(items):
-    unzipped_list = list(zip(*items))
-    golds = unzipped_list[0]
-    preds = unzipped_list[1]
-    return sklearn.metrics.matthews_corrcoef(golds, preds)
-
-def f1_score(items):
-    unzipped_list = list(zip(*items))
-    golds = unzipped_list[0]
-    preds = unzipped_list[1]
-    fscore = sklearn.metrics.f1_score(golds, preds)
-    return np.max(fscore)
-
-def acc_all(items):
-    # Only count as correct if all answers are labeled correctly for each question
-    question_scoring_dict = {}
-    preds = list(zip(*items))[0]
-    docs = list(zip(*items))[1]
-    for doc, pred in zip(docs, preds):
-        question_id = doc["idx"]["question"]
-        if question_id not in question_scoring_dict:
-            question_scoring_dict[question_id] = []
-        gold_label = doc["label"] == 1
-        question_scoring_dict[question_id].append(gold_label == pred)
-    acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
-    return acc
-
-def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
-    """Compute max metric between prediction and each ground truth."""
-    scores_for_ground_truths = []
-    for ground_truth in ground_truths:
-        score = metric_fn(prediction, ground_truth)
-        scores_for_ground_truths.append(score)
-    return max(scores_for_ground_truths)
-
-def perplexity(items):
-    return math.exp(-mean(items))

 req_ret_lens = {
     'loglikelihood': 2,
     'greedy_until': None,
...
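The revised aggregation docstring now asks each task to map submetric names to functions that aggregate a list of per-example metric scores. As a purely illustrative sketch (not part of this commit), a task built on the new lm_eval.metrics module might satisfy that contract as follows; ExampleTask and the metric names are hypothetical, and only aggregation() is shown:

    # Hypothetical, partial task definition -- illustration only.
    from lm_eval.base import Task
    from lm_eval.metrics import mean, perplexity

    class ExampleTask(Task):
        def aggregation(self):
            # Maps each submetric name to a function that reduces a list of
            # per-example scores to a single float.
            return {
                "acc": mean,        # averages a list of 0/1 correctness scores
                "ppl": perplexity,  # math.exp(-mean(list of log-likelihoods))
            }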
lm_eval/metrics.py (new file, mode 100644)

import math
import numpy as np
import sacrebleu
import sklearn


def mean(arr):
    return sum(arr) / len(arr)


def median(arr):
    return arr[len(arr) // 2]


def matthews_corrcoef(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    return sklearn.metrics.matthews_corrcoef(golds, preds)


def f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = sklearn.metrics.f1_score(golds, preds)
    return np.max(fscore)


def acc_all(items):
    # Only count as correct if all answers are labeled correctly for each question
    question_scoring_dict = {}
    preds = list(zip(*items))[0]
    docs = list(zip(*items))[1]
    for doc, pred in zip(docs, preds):
        question_id = doc["idx"]["question"]
        if question_id not in question_scoring_dict:
            question_scoring_dict[question_id] = []
        gold_label = doc["label"] == 1
        question_scoring_dict[question_id].append(gold_label == pred)
    acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
    return acc


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Compute max metric between prediction and each ground truth."""
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
    return max(scores_for_ground_truths)


def perplexity(items):
    return math.exp(-mean(items))


def bleu(items):
    """The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
    for evaluating a generated sentence to a reference sentence. It counts matching
    n-grams in the candidate translation to n-grams in the reference text, where
    1-gram or unigram would be each token and a bigram comparison would be each
    word pair. The comparison is made regardless of word order
    Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
    Paper: https://www.aclweb.org/anthology/P02-1040/

    Higher is better
    """
    preds = list(zip(*items))[0]
    docs = list(zip(*items))[1]
    pass


def chrf(items):
    """chrF++ is a tool for automatic evaluation of machine translation output
    based on character n-gram precision and recall enhanced with word n-grams.
    Source: https://github.com/m-popovic/chrF
    Paper: https://www.aclweb.org/anthology/W15-3049.pdf

    Higher is better  # TODO I think
    """
    pass


def ter(items):
    """Translation Error Rate is an error metric for machine translation that
    measures the number of edits required to change a system output into one
    of the references
    Source: http://www.cs.umd.edu/~snover/tercom/
    Paper: http://mt-archive.info/AMTA-2006-Snover.pdf

    Lower is better
    """
    pass
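In this commit bleu, chrf, and ter are left as documented stubs, even though the module already imports sacrebleu. A hedged sketch of how they could eventually delegate to sacrebleu's corpus-level scorers is shown below; it assumes sacrebleu 2.x-style signatures (corpus_bleu, corpus_chrf, corpus_ter) and assumes each item is a (prediction, reference) pair of strings, neither of which is established by the commit itself:

    # Sketch only -- not what the commit implements.
    # Assumptions: items is a list of (prediction, reference) string pairs,
    # and the installed sacrebleu exposes corpus_bleu/corpus_chrf/corpus_ter.
    import sacrebleu

    def _preds_and_refs(items):
        preds = [pred for pred, _ in items]
        refs = [[ref for _, ref in items]]  # a single reference stream
        return preds, refs

    def bleu_sketch(items):
        preds, refs = _preds_and_refs(items)
        return sacrebleu.corpus_bleu(preds, refs).score

    def chrf_sketch(items):
        preds, refs = _preds_and_refs(items)
        return sacrebleu.corpus_chrf(preds, refs).score

    def ter_sketch(items):
        preds, refs = _preds_and_refs(items)
        return sacrebleu.corpus_ter(preds, refs).score

Of the metrics that are implemented, perplexity expects a list of log-likelihoods: for example, perplexity([-1.0, -2.0, -3.0]) is math.exp(2.0), roughly 7.39.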
lm_eval/tasks/anli.py

 import numpy as np
-from lm_eval.base import rf, mean
+from lm_eval.base import rf
+from ..metrics import mean
 from .common import HFTask

 class ANLIBase(HFTask):
...
lm_eval/tasks/arc.py

 import numpy as np
-from lm_eval.base import rf, mean
+from lm_eval.base import rf
+from ..metrics import mean
 from .common import HFTask
...
lm_eval/tasks/arithmetic.py

@@ -2,7 +2,8 @@ import abc
 import json
 import os
 from collections import namedtuple
-from lm_eval.base import Task, mean, rf
+from lm_eval.base import Task, rf
+from lm_eval.metrics import mean
 from best_download import download_file

 ArithmeticDoc = namedtuple('ArithmeticDoc', ['context', 'completion'])
...
lm_eval/tasks/common.py

 import datasets
 import numpy as np
+import lm_eval.metrics
 from ..base import Task
...
@@ -44,7 +46,7 @@ class HFTask(Task):
 def simple_accuracy_metric(preds, golds):
-    acc = float((np.array(preds) == np.array(golds)).mean())
+    acc = float(lm_eval.metrics.mean())
     return {
         "major": acc,
         "minor": {"acc": acc},
...
lm_eval/tasks/glue.py

 import numpy as np
-from lm_eval.base import rf, mean, f1_score, matthews_corrcoef
+from lm_eval.base import rf
+from ..metrics import mean, matthews_corrcoef, f1_score
 from scipy.stats import pearsonr, spearmanr
 from tqdm import auto as tqdm_lib
 from .common import HFTask, yesno
...
lm_eval/tasks/lambada.py

-from lm_eval.base import Task, rf, mean, perplexity
+from lm_eval.base import Task, rf
+from lm_eval.metrics import mean, perplexity
 from lm_eval.utils import sh
 import json
 import math
...
lm_eval/tasks/piqa.py

 import numpy as np
-from lm_eval.base import rf, mean
+from lm_eval.base import rf
+from ..metrics import mean
 from .common import HFTask
...
lm_eval/tasks/pubmedqa.py

@@ -2,7 +2,8 @@ import numpy as np
 import json
 import random
 from .common import HFTask
-from lm_eval.base import rf, mean
+from lm_eval.base import rf
+from ..metrics import mean

 class Pubmed_QA(HFTask):
...
lm_eval/tasks/qa4mre.py

 import os
 import numpy as np
 from best_download import download_file
-from lm_eval.base import MultipleChoiceTask, rf, mean
+from lm_eval.base import MultipleChoiceTask, rf
+from lm_eval.metrics import mean
 import xml.etree.ElementTree as ET
 import random
...
lm_eval/tasks/race.py

 import collections
 import datasets
 import numpy as np
-from lm_eval.base import rf, mean
+from lm_eval.base import rf
+from ..metrics import mean
 from .common import HFTask
 import os
...
lm_eval/tasks/sat.py

 import json
 import random
 import os
-from lm_eval.base import MultipleChoiceTask, rf, mean
+from lm_eval.base import MultipleChoiceTask, rf
+from ..metrics import mean
 from tqdm import auto as tqdm_lib
 from .common import simple_accuracy_metric
 import numpy as np
...
lm_eval/tasks/sciq.py

 import os
 import json
 from ..utils import sh
-from lm_eval.base import MultipleChoiceTask, rf, mean
+from lm_eval.base import MultipleChoiceTask, rf
+from ..metrics import mean
 import zipfile
 from best_download import download_file
...
lm_eval/tasks/superglue.py

@@ -5,7 +5,8 @@ To-do:
 """
 import numpy as np
 from .common import HFTask, yesno
-from lm_eval.base import rf, mean, acc_all, metric_max_over_ground_truths
+from lm_eval.base import rf
+from ..metrics import mean, acc_all, metric_max_over_ground_truths
 import sklearn
 import transformers.data.metrics.squad_metrics as squad_metrics
 from ..utils import general_detokenize
...
lm_eval/tasks/triviaqa.py

 import os
 import json
 import random
-from lm_eval.base import Task, mean, rf
+from lm_eval.base import Task, rf
+from ..metrics import mean
 from ..utils import sh

 class TriviaQA(Task):
...
lm_eval/tasks/webqs.py

 from .common import HFTask
-from lm_eval.base import mean, rf
+from lm_eval.base import rf
+from ..metrics import mean

 class WebQs(HFTask):
     DATASET_PATH = "web_questions"
...
lm_eval/tasks/winogrande.py

 import numpy as np
 from .common import HFTask
-from lm_eval.base import rf, mean
+from lm_eval.base import rf
+from ..metrics import mean

 """
 This evaluation of Winogrande uses partial evaluation as described by
...
lm_eval/tasks/wsc273.py

 import numpy as np
 import random
-from lm_eval.base import rf, mean
+from lm_eval.base import rf
+from ..metrics import mean
 from .common import HFTask

 """
...