Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
e4d852a0
Commit
e4d852a0
authored
Feb 21, 2021
by
Jon Tow
Browse files
Clean up
parent
f4f7618a
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
23 additions
and
77 deletions
+23
-77
lm_eval/tasks/drop.py
lm_eval/tasks/drop.py
+23
-77
No files found.
lm_eval/tasks/drop.py
View file @
e4d852a0
...
...
@@ -15,8 +15,7 @@ class DROP(Task):
URL
=
"https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip"
def
download
(
self
):
if
self
.
DATAFOLDER
.
exists
():
return
if
self
.
DATAFOLDER
.
exists
():
return
Path
.
mkdir
(
self
.
DATAFOLDER
)
download_file
(
self
.
URL
,
to
=
str
(
self
.
DATAFOLDER
/
"drop_dataset.zip"
))
with
ZipFile
(
self
.
DATAFOLDER
/
"drop_dataset.zip"
,
"r"
)
as
zip
:
...
...
@@ -39,6 +38,7 @@ class DROP(Task):
for
doc
in
docs
:
for
qa
in
doc
[
"qa_pairs"
]:
yield
{
"id"
:
qa
[
"query_id"
],
"passage"
:
doc
[
"passage"
],
"question"
:
qa
[
"question"
],
"answers"
:
self
.
get_answers
(
qa
[
"answer"
]),
...
...
@@ -48,7 +48,7 @@ class DROP(Task):
def
get_answers
(
cls
,
answers
):
# NOTE: We wrap every non-`list` answer into a list for uniformity.
if
answers
[
"number"
]
!=
""
:
return
[
answers
[
"number"
]]
return
[
str
(
answers
[
"number"
]
)
]
if
answers
[
"spans"
]
!=
[]:
return
answers
[
"spans"
]
return
[
" "
.
join
([
answers
[
"date"
][
"day"
],
...
...
@@ -85,7 +85,7 @@ class DROP(Task):
"""
conts
=
[]
for
_
in
doc
[
"answers"
]:
conts
.
append
(
rf
.
greedy_until
(
ctx
,
[
"
\n
"
,
"."
]))
conts
.
append
(
rf
.
greedy_until
(
ctx
,
[
"."
]))
return
conts
def
process_results
(
self
,
doc
,
results
):
...
...
@@ -98,12 +98,13 @@ class DROP(Task):
:param results:
The results of the requests created in construct_requests.
"""
gold
,
pred
=
doc
[
"answers"
],
results
print
(
gold
)
print
(
pred
)
exact_match
=
self
.
_exact_match
(
gold
,
pred
)
f1_score
=
self
.
_f1_score
(
gold
,
pred
)
return
{
"em"
:
exact_match
,
"f1"
:
f1_score
}
golds
,
preds
=
doc
[
"answers"
],
results
exact_match
=
self
.
_exact_match
(
golds
,
preds
)
f1_score
=
self
.
_f1_score
(
golds
,
preds
)
return
{
"em"
:
exact_match
,
"f1"
:
f1_score
}
def
_exact_match
(
self
,
golds
,
preds
):
""" Returns the exact match of normalized gold answers and predictions. """
...
...
@@ -112,13 +113,9 @@ class DROP(Task):
return
int
(
normalized_golds
==
normalized_preds
)
def
_f1_score
(
self
,
golds
,
preds
):
"""Returns the average F1-score over normalized `gold` and `pred`
answer lists.
"""
"""Returns the average F1-score over normalized gold answers and predictions. """
gold_bags
=
self
.
_answer_to_bags
(
golds
)
print
(
"GOLD BAGS: "
+
str
(
gold_bags
))
pred_bags
=
self
.
_answer_to_bags
(
preds
)
print
(
"PRED BAGS: "
+
str
(
pred_bags
))
f1_per_bag
=
self
.
_align_bags
(
gold_bags
,
pred_bags
)
return
np
.
mean
(
f1_per_bag
)
...
...
@@ -133,7 +130,6 @@ class DROP(Task):
print
(
self
.
_is_number_match
(
gold_bag
,
pred_bag
))
if
self
.
_is_number_match
(
gold_bag
,
pred_bag
):
scores
[
gold_index
,
pred_index
]
=
self
.
_bag_f1
(
gold_bag
,
pred_bag
)
print
(
scores
)
row_ind
,
col_ind
=
linear_sum_assignment
(
-
scores
)
max_scores
=
np
.
zeros
([
max
(
len
(
gold_bags
),
len
(
pred_bags
))])
for
row
,
column
in
zip
(
row_ind
,
col_ind
):
...
...
@@ -169,7 +165,10 @@ class DROP(Task):
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return
{
"em"
:
mean
,
"f1"
:
mean
}
return
{
"em"
:
mean
,
"f1"
:
mean
}
def
higher_is_better
(
self
):
"""
...
...
@@ -178,60 +177,7 @@ class DROP(Task):
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return
{
"em"
:
True
,
"f1"
:
True
}
# Temporary sanity-checks
def
main
():
drop
=
DROP
()
def
test_bags
():
multiple_answers
=
[
"Pacific Ocean"
,
"Pacific"
]
ma_bags
=
drop
.
_answer_to_bags
(
multiple_answers
)
print
(
f
"Multiple Choice Answer Bags:
{
multiple_answers
}
=>
{
ma_bags
}
"
)
assert
len
(
ma_bags
)
==
2
number_answer
=
[
"1974"
]
number_bags
=
drop
.
_answer_to_bags
(
number_answer
)
print
(
f
"Number Bags:
{
number_answer
}
=>
{
number_bags
}
"
)
print
()
test_bags
()
def
test_is_number_match
():
gold
=
[
"10 29 1999"
]
pred
=
[
"4 29 1990"
]
gb
=
drop
.
_answer_to_bags
(
gold
)
pb
=
drop
.
_answer_to_bags
(
pred
)
print
(
gb
)
print
(
pb
)
for
g
in
gb
:
for
p
in
pb
:
match
=
drop
.
_is_number_match
(
g
,
p
)
print
(
match
)
print
()
#test_is_number_match()
def
test_exact_match
():
gold
=
[
"Bob Ross"
]
pred
=
[
"Bob Ross"
]
em
=
drop
.
_exact_match
(
gold
,
pred
)
print
(
em
)
#test_exact_match()
def
test_f1_score
():
gold
=
[
"25 to 44"
]
pred
=
[
"25 to 44 or 45 to 64"
]
f1
=
drop
.
_f1_score
(
gold
,
pred
)
print
(
gold
)
print
(
pred
)
print
(
f1
)
gold
=
[
"300"
,
"1992"
]
pred
=
[
"300"
,
"1992"
]
f1
=
drop
.
_f1_score
(
gold
,
pred
)
print
(
f1
)
#test_f1_score()
if
__name__
==
"__main__"
:
main
()
return
{
"em"
:
True
,
"f1"
:
True
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment