gaoqiong / lm-evaluation-harness

Commit fc7cd630, authored Feb 20, 2021 by Jon Tow
Parent: f3bf1c07

Implement `DROP` evaluation

Showing 2 changed files with 196 additions and 65 deletions:

    lm_eval/tasks/__init__.py    +2    -0
    lm_eval/tasks/drop.py        +194  -65
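The commit wires DROP into the harness as a generation-style `Task`: `doc_to_text` renders the prompt, `doc_to_target` the reference string, `construct_requests` issues one `greedy_until` request per gold answer, and `process_results` scores the generations with DROP-style exact match and bag-of-words F1 (all shown in `lm_eval/tasks/drop.py` below). As a rough sketch of that flow, with a hypothetical `lm_generate(prompt, stop)` standing in for the harness's request evaluator:

    # Illustrative only -- not part of this commit. `lm_generate` is a hypothetical
    # stand-in for a model call; the real harness batches the rf.greedy_until
    # requests produced by construct_requests through its evaluator instead.
    def score_doc(task, doc, lm_generate):
        prompt = task.doc_to_text(doc)   # "Passage: ...\nQuestion: ...\nAnswer:"
        # One greedy continuation per reference answer, stopping at "\n" or ".".
        results = [lm_generate(prompt, stop=["\n", "."]) for _ in doc["answers"]]
        return task.process_results(doc, results)   # -> {"em": 0 or 1, "f1": float}

Per-document scores are then averaged over the split by `aggregation` (`mean` for both keys).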
lm_eval/tasks/__init__.py

@@ -29,6 +29,7 @@ from . import qa4mre
 from . import translation
 from . import headqa
 from . import mathqa
+from . import drop

 ########################################
 # Translation tasks
@@ -83,6 +84,7 @@ TASK_REGISTRY = {
     # Order by benchmark/genre?
     "coqa": coqa.CoQA,
+    "drop": drop.DROP,
     "lambada": lambada.LAMBADA,
     "piqa": piqa.PiQA,
     ...
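Registering the class in `TASK_REGISTRY` is what makes the task addressable by name. A minimal sketch of the lookup, using a made-up document in the shape that `DROP._load_docs` (below) yields; note that instantiating the task may trigger its `download()`:

    # Sketch, not part of the commit; the doc contents are invented for illustration.
    from lm_eval import tasks

    drop_task = tasks.TASK_REGISTRY["drop"]()   # lm_eval.tasks.drop.DROP; may fetch the dataset

    doc = {
        "passage": "The Pacific Ocean is the largest ocean on Earth.",
        "question": "Which ocean is the largest?",
        "answers": ["Pacific Ocean", "Pacific"],
    }
    print(drop_task.doc_to_text(doc))
    # Passage: The Pacific Ocean is the largest ocean on Earth.
    # Question: Which ocean is the largest?
    # Answer:
    print(repr(drop_task.doc_to_target(doc)))   # ' Pacific Ocean, Pacific'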
lm_eval/tasks/drop.py

-import numpy as np
 import json
-from scipy.stats import pearsonr, spearmanr
-from sklearn.metrics import f1_score, matthews_corrcoef
-from tqdm import auto as tqdm_lib
-from .common import HFTask, simple_accuracy_metric, yesno
+import numpy as np
+import re
+import transformers.data.metrics.squad_metrics as squad_metrics
+from best_download import download_file
+from scipy.optimize import linear_sum_assignment
+from lm_eval.base import Task, rf
+from lm_eval.metrics import mean
 from pathlib import Path
 from ..base import Task
 from zipfile import ZipFile


 class DROP(Task):
-    DATAFOLDER = Path(__file__).parent / "../../data/drop"
-
-    def __init__(self):
-        super().__init__()
+    DATAFOLDER = Path("data/drop")
+    URL = "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip"
+
+    def download(self):
+        if self.DATAFOLDER.exists():
+            return
+        Path.mkdir(self.DATAFOLDER)
+        download_file(self.URL, to=str(self.DATAFOLDER / "drop_dataset.zip"))
+        with ZipFile(self.DATAFOLDER / "drop_dataset.zip", "r") as zip:
+            zip.extractall(self.DATAFOLDER)

     def has_training_docs(self):
         """Whether the task has a training set"""
         return True

     def has_validation_docs(self):
         """Whether the task has a validation set"""
         return True

     def has_test_docs(self):
         """Whether the task has a test set"""
         return False

-    def training_docs(self):
-        docs = json.load(open(self.DATAFOLDER / 'drop_dataset_train.json'))
-        return [docs[k] for k in docs.keys()]
-
-    def fewshot_description(self):
-        # TODO: figure out description
-        return ""
+    def _load_docs(self, docs):
+        for doc in docs:
+            for qa in doc["qa_pairs"]:
+                yield {
+                    "passage": doc["passage"],
+                    "question": qa["question"],
+                    "answers": self.get_answers(qa["answer"]),
+                }
+
+    @classmethod
+    def get_answers(cls, answers):
+        # NOTE: We wrap every non-`list` answer into a list for uniformity.
+        if answers["number"] != "":
+            return [answers["number"]]
+        if answers["spans"] != []:
+            return answers["spans"]
+        return [" ".join([answers["date"]["day"],
+                          answers["date"]["month"],
+                          answers["date"]["year"]]).strip()]
+
+    def training_docs(self):
+        docs = json.load(open(self.DATAFOLDER / "drop_dataset" / "drop_dataset_train.json"))
+        return self._load_docs([docs[k] for k in docs.keys()])

     def validation_docs(self):
-        docs = json.load(open(self.DATAFOLDER / 'drop_dataset_dev.json'))
-        return [docs[k] for k in docs.keys()]
+        docs = json.load(open(self.DATAFOLDER / "drop_dataset" / "drop_dataset_dev.json"))
+        return self._load_docs(
+            [docs[k] for k in docs.keys()]
+        )

     def test_docs(self):
         pass

-    def doc_to_text(self, doc, include_target=True):
-        doctext = "Passage: {}\n".format(doc["passage"])
-        qa_texts = []
-        for pair in doc["qa_pairs"]:
-            text = ''.join(['Question: ', pair['question'], '\nAnswer: '])
-            if include_target:
-                def get_answer(ans_dict):
-                    if ans_dict['number'] != '':
-                        return ans_dict['number']
-                    if ans_dict['spans'] != []:
-                        if len(ans_dict['spans']) > 0:
-                            return ', '.join(ans_dict['spans'])
-                        return ans_dict['spans'][0]
-                    return ' '.join([ans_dict['date']['day'],
-                                     ans_dict['date']['month'],
-                                     ans_dict['date']['year']]).strip()
-                text = ''.join([text, get_answer(pair['answer'])])
-            qa_texts.append(text)
-        return ''.join([doctext, '\n'.join(qa_texts)])
-
+    def fewshot_description(self):
+        # TODO: figure out description
+        return ""
+
+    def doc_to_text(self, doc):
+        return f"Passage: {doc['passage']}\nQuestion: {doc['question']}\nAnswer:"
+
+    def doc_to_target(self, doc):
+        return " " + ", ".join(doc["answers"])

     def construct_requests(self, doc, ctx):
         """Uses RequestFactory to construct Requests and returns an iterable of
         Requests which will be sent to the LM.

         :param doc:
             The document as returned from training_docs, validation_docs, or test_docs.
         :param ctx: str
             The context string, generated by fewshot_context. This includes the natural
             language description, as well as the few shot examples, and the question
             part of the document for `doc`.
         """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        conts = []
+        for _ in doc["answers"]:
+            conts.append(rf.greedy_until(ctx, ["\n", "."]))
+        return conts

     def process_results(self, doc, results):
         """Take a single document and the LM results and evaluates, returning a
         dict where keys are the names of submetrics and values are the values of
         the metric for that one document

         :param doc:
             The document as returned from training_docs, validation_docs, or test_docs.
         :param results:
             The results of the requests created in construct_requests.
         """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        gold, pred = doc["answers"], results
+        print(gold)
+        print(pred)
+        exact_match = self._exact_match(gold, pred)
+        f1_score = self._f1_score(gold, pred)
+        return {"em": exact_match, "f1": f1_score}
+
+    def _exact_match(self, golds, preds):
+        """ Returns the exact match of normalized gold answers and predictions. """
+        normalized_golds = set([self._normalize(gold) for gold in golds])
+        normalized_preds = set([self._normalize(pred) for pred in preds])
+        return int(normalized_golds == normalized_preds)
+
+    def _f1_score(self, golds, preds):
+        """Returns the average F1-score over normalized `gold` and `pred`
+        answer lists.
+        """
+        gold_bags = self._answer_to_bags(golds)
+        print("GOLD BAGS: " + str(gold_bags))
+        pred_bags = self._answer_to_bags(preds)
+        print("PRED BAGS: " + str(pred_bags))
+        f1_per_bag = self._align_bags(gold_bags, pred_bags)
+        return np.mean(f1_per_bag)
+
+    def _answer_to_bags(self, answers):
+        return [set(self._normalize(answer).split()) for answer in answers]
+
+    def _align_bags(self, gold_bags, pred_bags):
+        """ Returns the max metric value over all the answers. """
+        scores = np.zeros([len(gold_bags), len(pred_bags)])
+        for gold_index, gold_bag in enumerate(gold_bags):
+            for pred_index, pred_bag in enumerate(pred_bags):
+                print(self._is_number_match(gold_bag, pred_bag))
+                if self._is_number_match(gold_bag, pred_bag):
+                    scores[gold_index, pred_index] = self._bag_f1(pred_bag, gold_bag)
+        print(scores)
+        row_ind, col_ind = linear_sum_assignment(-scores)
+        max_scores = np.zeros([max(len(gold_bags), len(pred_bags))])
+        for row, column in zip(row_ind, col_ind):
+            max_scores[row] = max(max_scores[row], scores[row, column])
+        return max_scores
+
+    def _bag_f1(self, gold_bag, pred_bag):
+        intersection = len(gold_bag.intersection(pred_bag))
+        if intersection == 0:
+            return 0.0
+        precision = intersection / float(len(pred_bag)) if pred_bag else 1.0
+        recall = intersection / float(len(gold_bag)) if gold_bag else 1.0
+        f1 = (2 * precision * recall) / (precision + recall)
+        return f1
+
+    def _is_number_match(self, gold_bag, pred_bag):
+        gold_numbers = set(filter(lambda s: s.isnumeric(), list(gold_bag)))
+        pred_numbers = set(filter(lambda s: s.isnumeric(), list(pred_bag)))
+        return (not gold_numbers) or gold_numbers.intersection(pred_numbers)
+
+    def _normalize(self, answer):
+        def tokenize(text):
+            return re.split(" |-", text)
+        tokens = [squad_metrics.normalize_answer(token) for token in tokenize(answer)]
+        normalized = " ".join(tokens).strip()
+        return normalized

     def aggregation(self):
         """
         :returns: {str: [float] -> float}
             A dictionary where keys are the names of submetrics and values are
             functions that aggregate a list of metrics
         """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        return {"em": mean, "f1": mean}

     def higher_is_better(self):
         """
         :returns: {str: bool}
             A dictionary where keys are the names of submetrics and values are
             whether a higher value of the submetric is better
         """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        return {"em": True, "f1": True}


+# Temporary sanity-checks
+def main():
+    drop = DROP()
+
+    def test_bags():
+        multiple_answers = ["Pacific Ocean", "Pacific"]
+        ma_bags = drop._answer_to_bags(multiple_answers)
+        print(f"Multiple Choice Answer Bags: {multiple_answers} => {ma_bags}")
+        assert len(ma_bags) == 2
+        number_answer = ["1974"]
+        number_bags = drop._answer_to_bags(number_answer)
+        print(f"Number Bags: {number_answer} => {number_bags}")
+        print()
+    test_bags()
+
+    def test_is_number_match():
+        gold = ["10 29 1999"]
+        pred = ["4 29 1990"]
+        gb = drop._answer_to_bags(gold)
+        pb = drop._answer_to_bags(pred)
+        print(gb)
+        print(pb)
+        for g in gb:
+            for p in pb:
+                match = drop._is_number_match(g, p)
+                print(match)
+        print()
+    #test_is_number_match()
+
+    def test_exact_match():
+        gold = ["Bob Ross"]
+        pred = ["Bob Ross"]
+        em = drop._exact_match(gold, pred)
+        print(em)
+    #test_exact_match()
+
+    def test_f1_score():
+        gold = ["25 to 44"]
+        pred = ["25 to 44 or 45 to 64"]
+        f1 = drop._f1_score(gold, pred)
+        print(gold)
+        print(pred)
+        print(f1)
+        gold = ["300", "1992"]
+        pred = ["300", "1992"]
+        f1 = drop._f1_score(gold, pred)
+        print(f1)
+    #test_f1_score()
+
+
+if __name__ == "__main__":
+    main()
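To make the new scoring concrete, here is a small self-contained re-derivation of the bag-of-words F1 for the same pair used by the commit's `test_f1_score` sanity check. The `normalize` helper below is a simplified stand-in for the class's `_normalize` (which additionally applies `squad_metrics.normalize_answer` per token), so the exact bags may differ slightly on punctuation-heavy answers:

    import re

    def normalize(answer):
        # Simplified stand-in for DROP._normalize: split on spaces and hyphens,
        # lowercase, and rejoin (no SQuAD-style punctuation/article stripping).
        return " ".join(t.lower() for t in re.split(" |-", answer)).strip()

    def bag_f1(gold_bag, pred_bag):
        # Same arithmetic as DROP._bag_f1 above.
        intersection = len(gold_bag & pred_bag)
        if intersection == 0:
            return 0.0
        precision = intersection / len(pred_bag)
        recall = intersection / len(gold_bag)
        return 2 * precision * recall / (precision + recall)

    gold = set(normalize("25 to 44").split())              # {'25', 'to', '44'}
    pred = set(normalize("25 to 44 or 45 to 64").split())  # {'25', 'to', '44', 'or', '45', '64'}
    print(bag_f1(gold, pred))  # 3 shared tokens: P = 3/6, R = 3/3, F1 = 2/3 ≈ 0.667

When a document has several gold answers and several predictions, `_align_bags` fills a score matrix with these pairwise F1 values (leaving pairs that fail the `_is_number_match` gate at zero) and runs `linear_sum_assignment` on the negated matrix to pick the best one-to-one matching, which `_f1_score` then averages.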