gaoqiong / lm-evaluation-harness / Commits

Commit 5aa601f3
Authored Mar 28, 2021 by Leo Gao

Merge branch 'master' of github.com:EleutherAI/lm_evaluation_harness

Parents: 42659c34, f984c88e
Showing 4 changed files with 73 additions and 20 deletions (+73, -20):
lm_eval/evaluator.py       +1  -2
lm_eval/tasks/__init__.py  +1  -1
lm_eval/tasks/squad.py     +70 -16
tests/test_evaluator.py    +1  -1
lm_eval/evaluator.py
@@ -48,7 +48,6 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit):
             reqs = task.construct_requests(doc, ctx)
             if not isinstance(reqs, (list, tuple)):
                 reqs = [reqs]
             for i, req in enumerate(reqs):
                 requests[req.type].append(req)
                 # i: index in requests for a single task instance
@@ -90,4 +89,4 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit):
         task = task_dict[task_name]
         results[task_name][metric] = task.aggregation()[metric](items)
 
-    return results
\ No newline at end of file
+    return results
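For orientation, the loop in the first hunk is the request-dispatch step of evaluate(): each task turns a document into one or more request objects, and the evaluator buckets them by request type so that all calls of the same kind can be sent to the model together, keeping enough provenance to route results back to the originating document. A minimal sketch of that bookkeeping, assuming a simplified Request shape and a hypothetical requests_origin structure (both named here for illustration, not quoted from the repository):

from collections import defaultdict

class Request:
    # Simplified stand-in for the harness's request objects.
    def __init__(self, type, args):
        self.type = type  # e.g. "loglikelihood" or "greedy_until"
        self.args = args

def collect_requests(docs_with_ctx, construct_requests):
    requests = defaultdict(list)         # request type -> list of requests
    requests_origin = defaultdict(list)  # request type -> provenance tuples
    for doc_id, (doc, ctx) in enumerate(docs_with_ctx):
        reqs = construct_requests(doc, ctx)
        if not isinstance(reqs, (list, tuple)):
            # A task may return a single request; normalize to a list.
            reqs = [reqs]
        for i, req in enumerate(reqs):
            requests[req.type].append(req)
            # i: index in requests for a single task instance
            requests_origin[req.type].append((i, doc_id, doc))
    return requests, requests_origin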
lm_eval/tasks/__init__.py
@@ -109,7 +109,7 @@ TASK_REGISTRY = {
"hellaswag"
:
hellaswag
.
HellaSwag
,
# not implemented yet
"openbookqa"
:
openbookqa
.
OpenBookQA
,
# "sat": sat.SATAnalogies, # not implemented yet
#
"squad": squad.SQuAD,
# not implemented yet
"squad
2
"
:
squad
.
SQuAD
2
,
"race"
:
race
.
RACE
,
# "naturalqs": naturalqs.NaturalQs, # not implemented yet
"headqa"
:
headqa
.
HeadQA
,
...
...
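With "squad2" registered, the task resolves by name like any other entry in TASK_REGISTRY. A small sketch of a direct lookup (the registry dict is imported as-is; instantiating an HFTask subclass may trigger a Hugging Face datasets download):

from lm_eval.tasks import TASK_REGISTRY

task_class = TASK_REGISTRY["squad2"]   # -> lm_eval.tasks.squad.SQuAD2
task = task_class()                    # loads the squad_v2 dataset on construction
print(task.has_training_docs(), task.has_validation_docs())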
lm_eval/tasks/squad.py
 import datasets
 from math import exp
 from lm_eval.base import rf
 from lm_eval.metrics import f1_score, mean
 from .common import HFTask
 from functools import partial
 
-class SQuAD(HFTask):
+def _squad_metric(predictions, references):
+    squad_metric = datasets.load_metric("squad_v2")
+    return squad_metric.compute(predictions=predictions, references=references)
+
+
+def _squad_agg(key, items):
+    predictions, references = zip(*items)
+    return _squad_metric(predictions=predictions, references=references)[key]
+
+
+class SQuAD2(HFTask):
     DATASET_PATH = "squad_v2"
     DATASET_NAME = None
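The new module-level helpers wrap the Hugging Face squad_v2 metric: _squad_agg receives the list of (prediction, reference) pairs accumulated by the evaluator, splits it with zip(*items), and reads one submetric out of a single compute() call. A hedged sketch of the metric's input format with one invented record (the field names follow the squad_v2 metric; the id and text values are made up):

import datasets

squad_metric = datasets.load_metric("squad_v2")

prediction = {
    "id": "example-0",                 # invented id for illustration
    "prediction_text": "France",
    "no_answer_probability": 0.02,
}
reference = {
    "id": "example-0",
    "answers": {"text": ["France"], "answer_start": [104]},
}

result = squad_metric.compute(predictions=[prediction], references=[reference])
print(result["exact"], result["f1"])   # submetric keys match those used in process_results/aggregation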
@@ -15,16 +31,14 @@ class SQuAD(HFTask):
         return False
 
     def training_docs(self):
-        if self.has_training_docs():
-            return self.data["train"]
+        return self.data["train"]
 
     def validation_docs(self):
-        if self.has_validation_docs():
-            return self.data["validation"]
+        return self.data["validation"]
 
     def fewshot_description(self):
-        # TODO: redo description
-        return "Title: The_Title_of_It\n\nBackground: A text passage as background to answer the question with.\n\nQ: Question about the passage.\n\nA: Answer."
+        # TODO: figure out description
+        return ""
 
     def doc_to_text(self, doc):
         return 'Title: ' + doc['title'] + '\n\n' + 'Background: ' + doc['context'] + '\n\n' + 'Question: ' + doc['question'] + '\n\n' + 'Answer:'
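doc_to_text turns one SQuAD v2 record into the prompt the model completes, and the target visible in the next hunk is the gold answer (or "unanswerable") prefixed with a space. A small illustration with an invented record (field values are made up; the field names follow the squad_v2 dataset schema):

doc = {
    "title": "Normans",
    "context": "The Normans were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France.",
    "question": "In what country is Normandy located?",
    "answers": {"text": ["France"], "answer_start": [104]},
}

prompt = ('Title: ' + doc['title'] + '\n\n'
          + 'Background: ' + doc['context'] + '\n\n'
          + 'Question: ' + doc['question'] + '\n\n'
          + 'Answer:')
print(prompt)   # the model continues after "Answer:"; the gold target here would be " France"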
@@ -35,7 +49,7 @@ class SQuAD(HFTask):
             answer = answer_list[0]
         else:
             answer = 'unanswerable'
-        return answer
+        return " " + answer
 
     def construct_requests(self, doc, ctx):
         """ Uses RequestFactory to construct Requests and returns an iterable of
@@ -48,8 +62,9 @@ class SQuAD(HFTask):
         language description, as well as the few shot examples, and the question
         part of the document for `doc`.
         """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        continuation = rf.greedy_until(ctx, ['\n'])
+        is_unanswerable = rf.loglikelihood(ctx, " " + "unanswerable")
+        return continuation, is_unanswerable
 
     def process_results(self, doc, results):
         """Take a single document and the LM results and evaluates, returning a
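construct_requests returns lazy request objects rather than answers: rf.greedy_until asks for a free-form completion of the prompt up to a newline, and rf.loglikelihood asks for the log-probability of the literal continuation " unanswerable" after the same prompt. The evaluator runs both against the model and passes the results to process_results in the same order. A hedged sketch of that round trip using invented stand-ins for the model calls:

from math import exp

def fake_greedy_until(ctx, stop_sequences):
    # Stand-in for the LM's generation call; pretend it produced this continuation.
    return " France"

def fake_loglikelihood(ctx, continuation):
    # Stand-in for the LM's scoring call; a (logprob, is_greedy)-style pair,
    # matching the two-element unpacking in process_results.
    return (-4.2, False)

ctx = "Title: ...\n\nBackground: ...\n\nQuestion: ...\n\nAnswer:"
results = (fake_greedy_until(ctx, ["\n"]),
           fake_loglikelihood(ctx, " " + "unanswerable"))

# process_results unpacks them in the order construct_requests returned them:
continuation, (logprob_unanswerable, _) = results
print(continuation, exp(logprob_unanswerable))   # " France" and roughly 0.015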
@@ -61,8 +76,31 @@ class SQuAD(HFTask):
         :param results:
             The results of the requests created in construct_requests.
         """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        continuation, (logprob_unanswerable, _) = results
+
+        no_answer_probability = exp(logprob_unanswerable)
+
+        predictions = {
+            'id': doc['id'],
+            'prediction_text': continuation,
+            'no_answer_probability': no_answer_probability,
+        }
+
+        references = {
+            'id': doc['id'],
+            'answers': doc['answers'],
+        }
+
+        return {
+            'exact': (predictions, references),  # Exact match (the normalized answer exactly match the gold answer)
+            'f1': (predictions, references),  # The F-score of predicted tokens versus the gold answer
+            'HasAns_exact': (predictions, references),  # Exact match (the normalized answer exactly match the gold answer)
+            'HasAns_f1': (predictions, references),  # The F-score of predicted tokens versus the gold answer
+            'NoAns_exact': (predictions, references),  # Exact match (the normalized answer exactly match the gold answer)
+            'NoAns_f1': (predictions, references),  # The F-score of predicted tokens versus the gold answer
+            'best_exact': (predictions, references),  # Best exact match (with varying threshold)
+            'best_f1': (predictions, references),  # Best F1 (with varying threshold)
+        }
 
     def aggregation(self):
         """
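Between process_results and aggregation, the evaluator (see the lm_eval/evaluator.py hunk above) appends each document's value to a per-metric list, so each aggregator in the next hunk receives every (predictions, references) pair at once. A rough sketch of that bookkeeping with hypothetical variable names:

from collections import defaultdict

vals = defaultdict(list)   # (task_name, metric) -> list of per-document items

# Shape of one process_results output (values invented for illustration):
doc_metrics = {
    "exact": ({"id": "example-0", "prediction_text": "France", "no_answer_probability": 0.02},
              {"id": "example-0", "answers": {"text": ["France"], "answer_start": [104]}}),
}

for metric, value in doc_metrics.items():
    vals[("squad2", metric)].append(value)

# Later, once per metric, mirroring the evaluator hunk:
#   results["squad2"][metric] = task.aggregation()[metric](vals[("squad2", metric)])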
@@ -70,8 +108,16 @@ class SQuAD(HFTask):
             A dictionary where keys are the names of submetrics and values are
             functions that aggregate a list of metrics
         """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        return {
+            'exact': partial(_squad_agg, 'exact'),  # Exact match (the normalized answer exactly match the gold answer)
+            'f1': partial(_squad_agg, 'f1'),  # The F-score of predicted tokens versus the gold answer
+            'HasAns_exact': partial(_squad_agg, 'HasAns_exact'),  # Exact match (the normalized answer exactly match the gold answer)
+            'HasAns_f1': partial(_squad_agg, 'HasAns_f1'),  # The F-score of predicted tokens versus the gold answer
+            'NoAns_exact': partial(_squad_agg, 'NoAns_exact'),  # Exact match (the normalized answer exactly match the gold answer)
+            'NoAns_f1': partial(_squad_agg, 'NoAns_f1'),  # The F-score of predicted tokens versus the gold answer
+            'best_exact': partial(_squad_agg, 'best_exact'),  # Best exact match (with varying threshold)
+            'best_f1': partial(_squad_agg, 'best_f1'),  # Best F1 (with varying threshold)
+        }
 
     def higher_is_better(self):
         """
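Each value in aggregation() is a callable that takes the full item list; partial(_squad_agg, key) simply fixes which submetric is read out of one squad_v2 compute() call. A toy equivalence sketch (the _toy_agg function below is an invented stand-in, not the task's _squad_agg):

from functools import partial

def _toy_agg(key, items):
    # Stand-in aggregator: the real _squad_agg calls the squad_v2 metric here.
    predictions, references = zip(*items)
    return {"exact": 100.0, "f1": 100.0}[key]

f1_aggregator = partial(_toy_agg, "f1")           # callable of one argument: the item list
same_thing = lambda items: _toy_agg("f1", items)  # the same binding spelled out

items = [({"id": "example-0"}, {"id": "example-0"})]
print(f1_aggregator(items), same_thing(items))    # 100.0 100.0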
@@ -79,5 +125,13 @@ class SQuAD(HFTask):
             A dictionary where keys are the names of submetrics and values are
             whether a higher value of the submetric is better
         """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        return {
+            'exact': True,  # Exact match (the normalized answer exactly match the gold answer)
+            'f1': True,  # The F-score of predicted tokens versus the gold answer
+            'HasAns_exact': True,  # Exact match (the normalized answer exactly match the gold answer)
+            'HasAns_f1': True,  # The F-score of predicted tokens versus the gold answer
+            'NoAns_exact': True,  # Exact match (the normalized answer exactly match the gold answer)
+            'NoAns_f1': True,  # The F-score of predicted tokens versus the gold answer
+            'best_exact': True,  # Best exact match (with varying threshold)
+            'best_f1': True,  # Best F1 (with varying threshold)
+        }
tests/test_evaluator.py
@@ -29,4 +29,4 @@ def test_evaluator(taskname, Task):
     lm.loglikelihood = ll_fn
 
-    evaluator.evaluate(lm, task_dict, False, 0, 10)
\ No newline at end of file
+    evaluator.evaluate(lm, task_dict, False, 0, 10)
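The positional arguments in that call line up with the signature shown in the evaluator hunk headers, def evaluate(lm, task_dict, provide_description, num_fewshot, limit). The same call spelled with keywords, for readability (the mocked lm and task_dict come from the test's setup, which is outside this hunk):

evaluator.evaluate(
    lm,                          # model stub with ll_fn patched in as lm.loglikelihood
    task_dict,                   # {task_name: Task()} mapping built by the test
    provide_description=False,   # no natural-language task description in the prompt
    num_fewshot=0,               # zero-shot evaluation
    limit=10,                    # cap the number of documents evaluated per task
)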