gaoqiong / lm-evaluation-harness · Commit b5e86d3f

Unverified commit b5e86d3f, authored Feb 02, 2021 by Jonathan Tow, committed by GitHub on Feb 02, 2021.

Merge branch 'master' into wsc273-evaluation

Parents: c32a13e8, a1a4a32e
Showing 13 changed files with 177 additions and 172 deletions (+177, -172):

  lm_eval/base.py                +19   -13
  lm_eval/tasks/arithmetic.py     +2    -2
  lm_eval/tasks/common.py         +4    -5
  lm_eval/tasks/coqa.py           +2    -2
  lm_eval/tasks/drop.py           +2    -2
  lm_eval/tasks/lambada.py        +2    -2
  lm_eval/tasks/naturalqs.py      +3    -3
  lm_eval/tasks/piqa.py           +2    -2
  lm_eval/tasks/quac.py           +2    -2
  lm_eval/tasks/sat.py            +2    -2
  lm_eval/tasks/storycloze.py     +2    -2
  lm_eval/tasks/triviaqa.py       +2    -2
  lm_eval/tasks/wsc273.py       +133  -133
lm_eval/base.py  (view file @ b5e86d3f)

@@ -58,10 +58,10 @@ class LM(abc.ABC):
         return cls()


-class Dataset(abc.ABC):
+class Task(abc.ABC):
     def __init__(self):
         self.download()
-        self._traindocs = None
+        self._training_docs = None

     def download(self):
         """Downloads the task dataset if necessary"""
@@ -71,7 +71,7 @@ class Dataset(abc.ABC):
     def has_training_docs(self):
         """Whether the task has a training set"""
         pass

     @abc.abstractmethod
     def has_validation_docs(self):
         """Whether the task has a validation set"""
@@ -84,23 +84,29 @@ class Dataset(abc.ABC):
     def training_docs(self):
         """
         :return: Iterable[obj]
             A iterable of any object, that doc_to_text can handle
         """
         return []

     def validation_docs(self):
         """
         :return: Iterable[obj]
             A iterable of any object, that doc_to_text can handle
         """
         return []

     def test_docs(self):
         """
         :return: Iterable[obj]
             A iterable of any object, that doc_to_text can handle
         """
         return []

-    def fewshot_examples(self, k):
-        if self._traindocs is None:
-            self._traindocs = list(self.training_docs())
-        return random.sample(self._traindocs, k)
+    def fewshot_examples(self, k):
+        if self._training_docs is None:
+            self._training_docs = list(self.training_docs())
+        return random.sample(self._training_docs, k)

     @abc.abstractmethod
     def doc_to_text(self, doc):
@@ -123,7 +129,7 @@ class Dataset(abc.ABC):
         part of the document for `doc`.
         """
         pass

     @abc.abstractmethod
     def process_results(self, doc, results):
         """Take a single document and the LM results and evaluates, returning a
@@ -161,7 +167,7 @@ class Dataset(abc.ABC):
     def fewshot_context(self, doc, num_fewshot, provide_description):
         raw_description = self.fewshot_description()
         description = (raw_description + "\n===\n\n") if provide_description and raw_description else ""

         if num_fewshot == 0:
             labeled_examples = ""
         else:
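The renamed `Task.fewshot_examples` above materializes `training_docs()` once, caches the list on `self._training_docs`, and then samples without replacement. A self-contained sketch of that caching pattern (`TinyTask` and its toy docs are illustrative stand-ins, not code from this commit):

    import random

    class TinyTask:
        # Standalone sketch of the Task.fewshot_examples caching pattern,
        # not the harness class itself.
        def __init__(self):
            self._training_docs = None

        def training_docs(self):
            # Toy stand-in for a real dataset iterator.
            return iter([{"q": "2+2?", "a": "4"},
                         {"q": "3+3?", "a": "6"},
                         {"q": "1+5?", "a": "6"}])

        def fewshot_examples(self, k):
            # Materialize the iterable once, then sample k docs without replacement.
            if self._training_docs is None:
                self._training_docs = list(self.training_docs())
            return random.sample(self._training_docs, k)

    print(TinyTask().fewshot_examples(k=2))

The caching matters because `training_docs()` may return a one-shot iterator; listing it once makes repeated sampling calls cheap and safe.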
lm_eval/tasks/arithmetic.py  (view file @ b5e86d3f)

@@ -2,12 +2,12 @@ import abc
 import json
 import os
 from collections import namedtuple
-from lm_eval.base import Dataset, mean, rf
+from lm_eval.base import Task, mean, rf
 from best_download import download_file

 ArithmeticDoc = namedtuple('ArithmeticDoc', ['context', 'completion'])

-class Arithmetic(Dataset):
+class Arithmetic(Task):
     directory = 'data/arithmetic/'

     def __init__(self):
lm_eval/tasks/common.py  (view file @ b5e86d3f)

 import datasets
 import numpy as np
 import random

-from ..base import Dataset
+from ..base import Task

-class HFTask(Dataset):
+class HFTask(Task):
     DATASET_PATH = None
     DATASET_NAME = None

     def __init__(self):
         self.data = None
         super().__init__()
         self._training_docs = None

     def download(self):
         self.data = datasets.load_dataset(path=self.DATASET_PATH, name=self.DATASET_NAME)
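`HFTask.download` is a thin wrapper around `datasets.load_dataset`, which returns a `DatasetDict` keyed by split name. A minimal sketch of the same pattern outside the harness (the `super_glue`/`boolq` pair is an assumed example for illustration; any HF path/name pair works the same way):

    import datasets

    class TinyHFTask:
        # Standalone sketch of the HFTask download pattern, not the harness class.
        DATASET_PATH = "super_glue"  # assumed example dataset
        DATASET_NAME = "boolq"

        def __init__(self):
            self.data = None
            self.download()

        def download(self):
            # load_dataset returns a DatasetDict keyed by split name ("train", ...).
            self.data = datasets.load_dataset(path=self.DATASET_PATH,
                                              name=self.DATASET_NAME)

    task = TinyHFTask()
    print(task.data["train"][0])  # first training example as a dict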
lm_eval/tasks/coqa.py  (view file @ b5e86d3f)

@@ -2,11 +2,11 @@
 import json
 import random
-from lm_eval.base import Dataset
+from lm_eval.base import Task
 from ..utils import sh

-class CoQA(Dataset):
+class CoQA(Task):
     def __init__(self):
         self.download()

     def download(self):
lm_eval/tasks/drop.py  (view file @ b5e86d3f)

@@ -5,9 +5,9 @@ from sklearn.metrics import f1_score, matthews_corrcoef
 from tqdm import auto as tqdm_lib
 from .common import HFTask, simple_accuracy_metric, yesno
 from pathlib import Path
-from ..base import Dataset
+from ..base import Task

-class DROP(Dataset):
+class DROP(Task):
     DATAFOLDER = Path(__file__).parent / "../../data/drop"

     def __init__(self):
lm_eval/tasks/lambada.py  (view file @ b5e86d3f)

-from lm_eval.base import Dataset, rf, mean
+from lm_eval.base import Task, rf, mean
 from lm_eval.utils import sh
 import json
 import math
 from best_download import download_file

-class LAMBADA(Dataset):
+class LAMBADA(Task):
     def download(self):
         sh("mkdir -p data/lambada")
         download_file(
lm_eval/tasks/naturalqs.py  (view file @ b5e86d3f)

@@ -30,10 +30,10 @@ class NaturalQs(HFTask):
     def fewshot_examples(self, k):
         # Data is too large to fit in memory. We just sample from the first bit.
-        if self._traindocs is None:
-            self._traindocs = list(islice(self.training_docs(), 0, 100000))
-        return random.sample(self._traindocs, k)
+        if self._training_docs is None:
+            self._training_docs = list(islice(self.training_docs(), 0, 100000))
+        return random.sample(self._training_docs, k)

     def doc_to_text(self, doc):
         return 'Q: ' + doc['question']['text'] + '\n\n' + 'A: '
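The `islice` cap in `NaturalQs.fewshot_examples` bounds memory: only the first 100,000 training docs are materialized, and few-shot examples are sampled from that prefix. The same idea in isolation (the generator below is a toy stand-in for the streamed Natural Questions split):

    import random
    from itertools import islice

    def training_docs():
        # Toy stand-in for a dataset too large to hold in memory.
        for i in range(10_000_000):
            yield {"question": {"text": f"question {i}"}}

    pool = list(islice(training_docs(), 0, 100000))  # materialize a bounded prefix only
    print(random.sample(pool, 3))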
lm_eval/tasks/piqa.py  (view file @ b5e86d3f)

 import json
 import random
-from lm_eval.base import Dataset, rf, mean
+from lm_eval.base import Task, rf, mean
 from ..utils import sh
 import os

-class PiQA(Dataset):
+class PiQA(Task):
     def download(self):
         if not os.path.exists('data/piqa'):
             #TODO: use best_download
lm_eval/tasks/quac.py  (view file @ b5e86d3f)

 import json
 import random
 import os
-from lm_eval.base import Dataset
+from lm_eval.base import Task
 from ..utils import sh

-class QuAC(Dataset):
+class QuAC(Task):
     def __init__(self):
         super().__init__()
lm_eval/tasks/sat.py  (view file @ b5e86d3f)

 import json
 import random
 import os
-from lm_eval.base import Dataset, rf, mean
+from lm_eval.base import Task, rf, mean
 from tqdm import auto as tqdm_lib
 from .common import simple_accuracy_metric
 import numpy as np
 from ..utils import sh

-class SATAnalogies(Dataset):
+class SATAnalogies(Task):
     NEEDS_MANUAL_DL = True

     def __init__(self):
lm_eval/tasks/storycloze.py  (view file @ b5e86d3f)

 import json
 import random
-from lm_eval.base import Dataset
+from lm_eval.base import Task
 from ..utils import sh
 import csv

-class StoryCloze(Dataset):
+class StoryCloze(Task):
     NEEDS_MANUAL_DL = True

     def download(self):
lm_eval/tasks/triviaqa.py  (view file @ b5e86d3f)

 import os
 import json
 import random
-from lm_eval.base import Dataset, mean, rf
+from lm_eval.base import Task, mean, rf
 from ..utils import sh

-class TriviaQA(Dataset):
+class TriviaQA(Task):
     def download(self):
         if not os.path.exists('data/triviaqa'):
             sh("""
lm_eval/tasks/wsc273.py  (view file @ b5e86d3f)

import numpy as np
import random
from lm_eval.base import rf, mean
from .common import HFTask

"""
NOTE: This evaluation of Winograd Schema Challenge is based on `partial evaluation`
as described by Trinh & Le in Simple Method for Commonsense Reasoning (2018).
See: https://arxiv.org/abs/1806.02847
"""

class WinogradSchemaChallenge273(HFTask):
    DATASET_PATH = "winograd_wsc"
    DATASET_NAME = "wsc273"

    upper_pronouns = ["A", "An", "The", "She", "He", "It", "They", "My", "His", "Her", "Their"]

    def __init__(self):
        super().__init__()
        self.data = self.__clean_data()

    def __clean_data(self):
        # The HF implementation of `wsc273` is not `partial evaluation` friendly.
        data = []
        for doc in self.data["test"]:
            doc["text"] = doc["text"].replace("  ", " ")
            doc["options"][0] = self.__normalize_option(doc["options"][0], doc)
            doc["options"][1] = self.__normalize_option(doc["options"][1], doc)
            data.append(doc)
        return {"test": data}

    def __normalize_option(self, option, doc):
        # Append `'s` to possessive determiner based options.
        if doc["pronoun"].lower() in ["my", "his", "her", "our", "their"]:
            option += "'s"
        # Appropriately lowercase the pronoun in the option.
        pronoun = option.split()[0]
        start_of_sentence = doc["text"][doc['pronoun_loc'] - 2] == '.'
        if not start_of_sentence and pronoun in self.upper_pronouns:
            return option.replace(pronoun, pronoun.lower())
        return option

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return False

    def has_test_docs(self):
        return True

    def fewshot_examples(self, k):
        # NOTE: `super().fewshot_examples` samples from training docs which are
        # not available for this test-set-only dataset.
        return random.sample(list(self.test_docs()), k)

    def fewshot_description(self):
        # TODO: redo description
        return "Winograd schema sentence with correct continuation. True. Winograd schema sentence with incorrect continuation. False."

    @classmethod
    def partial_context(cls, doc):
        # Substitute the pronoun in the original text with each candidate
        # choice and ignore everything after.
        context1 = doc["text"][:doc["pronoun_loc"]] + doc["options"][0]
        context2 = doc["text"][:doc["pronoun_loc"]] + doc["options"][1]
        return context1, context2

    @classmethod
    def partial_target(cls, doc):
        # The target is everything after the document specified pronoun.
        start_index = doc["pronoun_loc"] + len(doc["pronoun"])
        return doc["text"][start_index:].strip()

    def doc_to_text(self, doc):
        context1, context2 = self.partial_context(doc)
        return context1 + '\n' + context2 + '\n'

    def doc_to_target(self, doc):
        return self.partial_target(doc)

    def construct_requests(self, doc, ctx):
        """ Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        """
        target = self.partial_target(doc)
        context1, context2 = self.partial_context(doc)
        ll_context1, _ = rf.loglikelihood(context1, " " + target)
        ll_context2, _ = rf.loglikelihood(context2, " " + target)
        return ll_context1, ll_context2

    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a
        dict where keys are the names of submetrics and values are the values of
        the metric for that one document

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests.
        """
        return {
            "acc": np.argmax(results) == doc["label"]
        }

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        return {
            "acc": mean
        }

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        return {
            "acc": True
        }
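The heart of this task is `partial_context` / `partial_target` plus the two `rf.loglikelihood` calls in `construct_requests`: substitute each candidate for the pronoun, keep the text before the substitution point as context, and compare the model's log-likelihood of the shared continuation under each context (Trinh & Le, 2018). A self-contained sketch of that scoring recipe, where `loglikelihood(context, continuation)` is an assumed scorer standing in for the harness's `rf.loglikelihood`, and the placeholder scorer at the end is not a real language model:

    import numpy as np

    def partial_eval_predict(doc, loglikelihood):
        # Partial evaluation: substitute each candidate for the pronoun, keep
        # the text before the substitution as context, and compare
        # log P(shared continuation | context) across candidates.
        prefix = doc["text"][:doc["pronoun_loc"]]
        target = doc["text"][doc["pronoun_loc"] + len(doc["pronoun"]):].strip()
        scores = [loglikelihood(prefix + option, " " + target)
                  for option in doc["options"]]
        return int(np.argmax(scores))

    # Toy document in the wsc273 schema; pronoun_loc indexes the pronoun "it".
    doc = {
        "text": "The trophy doesn't fit in the suitcase because it is too big.",
        "pronoun": "it",
        "pronoun_loc": 47,
        "options": ["the trophy", "the suitcase"],
        "label": 0,
    }
    fake_ll = lambda context, continuation: -len(context)  # placeholder, not a real LM
    print(partial_eval_predict(doc, fake_ll) == doc["label"])

Because both candidates share the same continuation (" is too big."), differences in its likelihood isolate how well each substitution resolves the pronoun.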