gaoqiong / lm-evaluation-harness / Commits / 3c37ea9c

Unverified commit 3c37ea9c, authored Feb 21, 2022 by Leo Gao, committed by GitHub on Feb 21, 2022.
Merge pull request #264 from StephenHogg/qasper

Add QASPER task

Parents: 89905174, 815f165c
Showing 2 changed files with 220 additions and 0 deletions (+220 -0):

lm_eval/tasks/__init__.py    +3   -0
lm_eval/tasks/qasper.py      +217 -0
lm_eval/tasks/__init__.py (view file @ 3c37ea9c)

...
@@ -29,6 +29,7 @@ from . import triviaqa
from . import pubmedqa
from . import sciq
from . import webqs
from . import qasper
from . import qa4mre
from . import translation
from . import headqa
...
@@ -122,6 +123,8 @@ TASK_REGISTRY = {
    "pubmedqa": pubmedqa.Pubmed_QA,
    "sciq": sciq.SciQ,
    "qasper": qasper.QASPER,
    "qa4mre_2011": qa4mre.QA4MRE_2011,
    "qa4mre_2012": qa4mre.QA4MRE_2012,
    "qa4mre_2013": qa4mre.QA4MRE_2013,
...
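The single registry entry above is what makes the new task addressable by name. As a quick sanity check, a minimal sketch (assuming this checkout is importable and that the HuggingFace "qasper" dataset can be downloaded by the HFTask machinery) would look the class up from TASK_REGISTRY and render one prompt/target pair:

# Minimal sketch, not part of this commit: assumes lm-evaluation-harness at this
# revision is on the Python path and the "qasper" dataset can be downloaded.
from lm_eval.tasks import TASK_REGISTRY

task_cls = TASK_REGISTRY["qasper"]        # qasper.QASPER, registered in the hunk above
task = task_cls()                         # HFTask subclass; downloads/loads the dataset
doc = next(iter(task.validation_docs()))  # one flattened question/answer record
print(task.doc_to_text(doc))              # "TITLE: ...\nABSTRACT: ...\n\nQ: ...\n\nA:"
print(task.doc_to_target(doc))            # " <gold answer>"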
lm_eval/tasks/qasper.py (new file, 0 → 100644, view file @ 3c37ea9c)
"""
A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers
https://arxiv.org/abs/2105.03011
@article{DBLP:journals/corr/abs-2105-03011,
author = {Pradeep Dasigi and
Kyle Lo and
Iz Beltagy and
Arman Cohan and
Noah A. Smith and
Matt Gardner},
title = {A Dataset of Information-Seeking Questions and Answers Anchored in
Research Papers},
journal = {CoRR},
volume = {abs/2105.03011},
year = {2021},
url = {https://arxiv.org/abs/2105.03011},
eprinttype = {arXiv},
eprint = {2105.03011},
timestamp = {Fri, 14 May 2021 12:13:30 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2105-03011.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
"""
from collections import Counter
from math import exp
import random
import re
import string

from lm_eval.base import rf
from lm_eval.metrics import f1_score, mean
from .common import HFTask
def normalize_answer(s):
    """
    Taken from the official evaluation script for v1.1 of the SQuAD dataset.
    Lower text and remove punctuation, articles and extra whitespace.
    """

    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))
def categorise_answer(answer_blob):
    if answer_blob["unanswerable"]:
        answer = "unanswerable"
        answer_type = "unanswerable"
        return answer, answer_type
    elif answer_blob["yes_no"]:
        answer = "yes"
        answer_type = "bool"
        return answer, answer_type
    elif answer_blob["free_form_answer"]:
        answer = answer_blob["free_form_answer"]
        answer_type = "free form answer"
        return answer, answer_type
    elif answer_blob["extractive_spans"]:
        answer = answer_blob["extractive_spans"]
        answer_type = "extractive_spans"
        return answer, answer_type
    elif answer_blob["yes_no"] is False:
        answer = "no"
        answer_type = "bool"
        return answer, answer_type
def token_f1_score(prediction, ground_truth):
    """
    Taken from the official evaluation script for v1.1 of the SQuAD dataset.
    """
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1
class QASPER(HFTask):
    VERSION = 0
    DATASET_PATH = "qasper"
    DATASET_NAME = None

    def doc_to_text(self, doc):
        return (
            "TITLE: "
            + doc["title"]
            + "\n"
            + "ABSTRACT: "
            + doc["abstract"]
            + "\n\n"
            + "Q: "
            + doc["question"]
            + "\n\n"
            + "A:"
        )
    def doc_to_target(self, doc):
        answer = doc["answer"]
        if isinstance(answer, list):
            answer = ", ".join(answer)
        return " " + answer
    def training_docs(self):
        for doc in self.data["train"]:
            yield from self.process_doc(doc)

    def validation_docs(self):
        for doc in self.data["train"]:
            yield from self.process_doc(doc)
    def process_doc(self, doc):
        """Given a `doc`, flatten it out so that each JSON blob
        contains exactly one question and one answer. Logic taken from
        the reference implementation available at
        https://github.com/allenai/qasper-led-baseline/blob/main/scripts/evaluator.py
        """
        obs_list = []
        for question, answer_list in zip(doc["qas"]["question"], doc["qas"]["answers"]):
            for answer_blob in answer_list["answer"]:
                answer, answer_type = categorise_answer(answer_blob)
                obs_list.append(
                    {
                        "title": doc["title"],
                        "abstract": doc["abstract"],
                        "question": question,
                        "answer": answer,
                        "answer_type": answer_type,
                    }
                )
        return obs_list
    def process_results(self, doc, results):
        # TODO: Calculate a score for extractive spans once a request type for generating
        # extractive spans is available
        if not results:
            return {}
        elif len(results) == 1:
            [res] = results
        elif len(results) == 2:
            [ll_yes, ll_no] = results

        # TODO: Handle unanswerability first
        # unanswerable_gold = doc["answer_type"] == "unanswerable"
        # unanswerable_pred = exp(logprob_unanswerable)
        # res_dict["f1_unanswerable"] = (unanswerable_gold, unanswerable_pred)

        res_dict = {}

        # Handle yes/no questions
        if doc["answer_type"] == "bool":
            gold = 1 if doc["answer"] == "yes" else 0
            pred = ll_yes > ll_no
            res_dict["f1_yesno"] = (gold, pred)

        # Handle completions
        if doc["answer_type"] == "free form answer":
            res_dict["f1_abstractive"] = token_f1_score(res, doc["answer"])

        # TODO: Handle extraction
        # if doc["answer_type"] == "extractive_spans":
        # res_dict["f1_extractive"] = 0
        return res_dict
    def aggregation(self):
        return {
            "f1_yesno": f1_score,
            "f1_abstractive": mean,
        }
    def construct_requests(self, doc, ctx):
        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        """
        # unanswerable = rf.loglikelihood(ctx, " " + "unanswerable")
        if doc["answer_type"] in ("free form answer"):
            return [rf.greedy_until(ctx, ["\n"])]
        elif doc["answer_type"] in ("bool"):
            ll_yes, _ = rf.loglikelihood(ctx, " yes")
            ll_no, _ = rf.loglikelihood(ctx, " no")
            return [ll_yes, ll_no]
        else:
            return []
    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        return {
            "f1_yesno": True,
            "f1_abstractive": True,
        }
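For reference, the abstractive metric wired up above is the standard SQuAD-style token F1. A short illustration using only the helpers defined in this file (the example strings are invented):

# Illustration only, not part of this commit: exercises normalize_answer and
# token_f1_score as defined above.
from lm_eval.tasks.qasper import normalize_answer, token_f1_score

print(normalize_answer("The Answer, is: A cat!"))  # "answer is cat"
# Only "cat" overlaps after normalization: precision = 1/2, recall = 1/1,
# so F1 = 2 * (0.5 * 1.0) / (0.5 + 1.0) ≈ 0.667
print(token_f1_score("a black cat", "the cat"))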