gaoqiong / lm-evaluation-harness · Commits

Commit f48b119d, authored Feb 09, 2021 by Charles Foster
Skeleton of SQuADv2. Not yet tested.
parent 3d432b1a
Showing 1 changed file with 55 additions and 18 deletions.

lm_eval/tasks/squad.py  (+55, -18)
-import numpy as np
-from scipy.stats import pearsonr, spearmanr
-from sklearn.metrics import f1_score, matthews_corrcoef
 import datasets
-from tqdm import auto as tqdm_lib
-from . common import HFTask, simple_accuracy_metric, yesno
+from lm_eval.base import rf, f1_score, mean
+from . common import HFTask


 class SQuAD(HFTask):
     DATASET_PATH = "squad_v2"
@@ -18,16 +18,14 @@ class SQuAD(HFTask):
         return False

     def training_docs(self):
-        if self.has_training_docs():
-            return self.data["train"]
+        return self.data["train"]

     def validation_docs(self):
-        if self.has_validation_docs():
-            return self.data["validation"]
+        return self.data["validation"]

     def fewshot_description(self):
-        # TODO: redo description
-        return "Title: The_Title_of_It\n\nBackground: A text passage as background to answer the question with.\n\nQ: Question about the passage.\n\nA: Answer."
+        # TODO: figure out description
+        return ""

     def doc_to_text(self, doc):
         return 'Title: ' + doc['title'] + '\n\n' + 'Background: ' + doc['context'] + '\n\n' + 'Q: ' + doc['question'] + '\n\n' + 'A: '
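
For concreteness, here is the prompt that the unchanged doc_to_text above builds; the field names come from SQuAD v2, while the record values are invented for illustration:

    # Hypothetical SQuAD v2 record; field names match the dataset, values are made up.
    doc = {
        'title': 'Normans',
        'context': 'The Normans were the people who gave their name to Normandy in the 10th century.',
        'question': 'In what country is Normandy located?',
    }
    prompt = 'Title: ' + doc['title'] + '\n\n' + 'Background: ' + doc['context'] + '\n\n' + 'Q: ' + doc['question'] + '\n\n' + 'A: '
    print(prompt)
    # Title: Normans
    #
    # Background: The Normans were the people who gave their name to Normandy in the 10th century.
    #
    # Q: In what country is Normandy located?
    #
    # A: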
@@ -51,8 +49,8 @@ class SQuAD(HFTask):
         language description, as well as the few shot examples, and the question
         part of the document for `doc`.
         """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        continuation = rf.greedy_until(ctx, ['\n'])
+        return continuation

     def process_results(self, doc, results):
         """Take a single document and the LM results and evaluates, returning a
@@ -64,8 +62,21 @@ class SQuAD(HFTask):
         :param results:
             The results of the requests created in construct_requests.
         """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        squad_metric = datasets.load_metric("squad_v2")
+        predictions = {
+            'id': doc['id'],
+            'prediction_text': results[0],
+        }
+        references = {
+            'id': doc['id'],
+            'answers': doc['answers'],
+        }
+        metrics = squad_metric.compute(predictions=predictions, references=references)
+        return metrics
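
As a reference point for the compute() call above: in the datasets library of this period, the squad_v2 metric expects predictions and references as lists of per-example dicts, and each prediction additionally carries a no_answer_probability field. A minimal standalone sketch under that assumption (ids and values are hypothetical):

    # Standalone sketch of the Hugging Face "squad_v2" metric interface (not part of this diff).
    import datasets

    squad_metric = datasets.load_metric("squad_v2")
    predictions = [{
        'id': 'example-0',                    # hypothetical example id
        'prediction_text': 'France',
        'no_answer_probability': 0.0,         # expected by the v2 metric
    }]
    references = [{
        'id': 'example-0',
        'answers': {'text': ['France'], 'answer_start': [159]},
    }]
    metrics = squad_metric.compute(predictions=predictions, references=references)
    print(sorted(metrics))
    # output keys include 'exact', 'f1', 'total', 'best_exact', 'best_f1', ...,
    # matching the submetrics listed in aggregation() and higher_is_better() below.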

     def aggregation(self):
         """
@@ -73,8 +84,21 @@ class SQuAD(HFTask):
             A dictionary where keys are the names of submetrics and values are
             functions that aggregate a list of metrics
         """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        return {
+            'exact': mean,  # Exact match (the normalized answer exactly match the gold answer)
+            'f1': mean,  # The F-score of predicted tokens versus the gold answer
+            'total': mean,  # Number of score considered
+            'HasAns_exact': mean,  # Exact match (the normalized answer exactly match the gold answer)
+            'HasAns_f1': mean,  # The F-score of predicted tokens versus the gold answer
+            'HasAns_total': mean,  # Number of score considered
+            'NoAns_exact': mean,  # Exact match (the normalized answer exactly match the gold answer)
+            'NoAns_f1': mean,  # The F-score of predicted tokens versus the gold answer
+            'NoAns_total': mean,  # Number of score considered
+            'best_exact': mean,  # Best exact match (with varying threshold)
+            'best_exact_thresh': mean,  # No-answer probability threshold associated to the best exact match
+            'best_f1': mean,  # Best F1 (with varying threshold)
+            'best_f1_thresh': mean,  # No-answer probability threshold associated to the best F1
+        }
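
On how these aggregation functions get used: the harness collects the per-document dicts returned by process_results and, for each key, applies the mapped function to the list of per-document values; mean, imported from lm_eval.base above, is presumably a plain arithmetic average. A small sketch of that assumption, not harness code:

    # Assumed aggregation behaviour (sketch): average each submetric over documents.
    def mean(arr):
        return sum(arr) / len(arr)

    per_doc = [{'exact': 100.0, 'f1': 100.0}, {'exact': 0.0, 'f1': 50.0}]
    aggregated = {key: mean([d[key] for d in per_doc]) for key in per_doc[0]}
    print(aggregated)  # {'exact': 50.0, 'f1': 75.0}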

     def higher_is_better(self):
         """
@@ -82,5 +106,18 @@ class SQuAD(HFTask):
             A dictionary where keys are the names of submetrics and values are
             whether a higher value of the submetric is better
         """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        return {
+            'exact': True,  # Exact match (the normalized answer exactly match the gold answer)
+            'f1': True,  # The F-score of predicted tokens versus the gold answer
+            'total': None,  # Number of score considered
+            'HasAns_exact': True,  # Exact match (the normalized answer exactly match the gold answer)
+            'HasAns_f1': True,  # The F-score of predicted tokens versus the gold answer
+            'HasAns_total': None,  # Number of score considered
+            'NoAns_exact': True,  # Exact match (the normalized answer exactly match the gold answer)
+            'NoAns_f1': True,  # The F-score of predicted tokens versus the gold answer
+            'NoAns_total': None,  # Number of score considered
+            'best_exact': True,  # Best exact match (with varying threshold)
+            'best_exact_thresh': None,  # No-answer probability threshold associated to the best exact match
+            'best_f1': True,  # Best F1 (with varying threshold)
+            'best_f1_thresh': None,  # No-answer probability threshold associated to the best F1
+        }