gaoqiong / lm-evaluation-harness / Commits

Commit b5e86d3f (unverified)
Authored Feb 02, 2021 by Jonathan Tow; committed via GitHub on Feb 02, 2021
Parents: c32a13e8, a1a4a32e

    Merge branch 'master' into wsc273-evaluation
Showing 13 changed files with 177 additions and 172 deletions.
lm_eval/base.py               +19  -13
lm_eval/tasks/arithmetic.py    +2   -2
lm_eval/tasks/common.py        +4   -5
lm_eval/tasks/coqa.py          +2   -2
lm_eval/tasks/drop.py          +2   -2
lm_eval/tasks/lambada.py       +2   -2
lm_eval/tasks/naturalqs.py     +3   -3
lm_eval/tasks/piqa.py          +2   -2
lm_eval/tasks/quac.py          +2   -2
lm_eval/tasks/sat.py           +2   -2
lm_eval/tasks/storycloze.py    +2   -2
lm_eval/tasks/triviaqa.py      +2   -2
lm_eval/tasks/wsc273.py      +133 -133
lm_eval/base.py  (+19 -13)

@@ -58,10 +58,10 @@ class LM(abc.ABC):
         return cls()
 
 
-class Dataset(abc.ABC):
+class Task(abc.ABC):
     def __init__(self):
         self.download()
-        self._traindocs = None
+        self._training_docs = None
 
     def download(self):
         """Downloads the task dataset if necessary"""
@@ -71,7 +71,7 @@ class Dataset(abc.ABC):
     def has_training_docs(self):
         """Whether the task has a training set"""
         pass
 
     @abc.abstractmethod
     def has_validation_docs(self):
         """Whether the task has a validation set"""
@@ -84,23 +84,29 @@ class Dataset(abc.ABC):
     def training_docs(self):
         """
         :return: Iterable[obj]
             A iterable of any object, that doc_to_text can handle
         """
         return []
 
     def validation_docs(self):
+        """
+        :return: Iterable[obj]
+            A iterable of any object, that doc_to_text can handle
+        """
         return []
 
     def test_docs(self):
+        """
+        :return: Iterable[obj]
+            A iterable of any object, that doc_to_text can handle
+        """
         return []
 
     def fewshot_examples(self, k):
-        if self._traindocs is None:
-            self._traindocs = list(self.training_docs())
-
-        return random.sample(self._traindocs, k)
+        if self._training_docs is None:
+            self._training_docs = list(self.training_docs())
+
+        return random.sample(self._training_docs, k)
 
     @abc.abstractmethod
     def doc_to_text(self, doc):
@@ -123,7 +129,7 @@ class Dataset(abc.ABC):
             part of the document for `doc`.
         """
         pass
 
     @abc.abstractmethod
     def process_results(self, doc, results):
         """Take a single document and the LM results and evaluates, returning a
@@ -161,7 +167,7 @@ class Dataset(abc.ABC):
     def fewshot_context(self, doc, num_fewshot, provide_description):
         raw_description = self.fewshot_description()
         description = (raw_description + "\n===\n\n") if provide_description and raw_description else ""
 
         if num_fewshot == 0:
             labeled_examples = ""
         else:
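Aside (commentary, not part of the commit): after this rename, a task subclasses Task rather than Dataset, and the few-shot cache lives in self._training_docs. Below is a partial sketch of a hypothetical task against the interface shown in the hunks above; the TinyAddition name and its hard-coded data are made up, and the request/metric hooks the full interface requires are omitted for brevity.

from lm_eval.base import Task


class TinyAddition(Task):
    # Hypothetical task: three hard-coded docs instead of a real download.
    def download(self):
        self._docs = [
            {"context": "%d + %d = " % (n, n), "completion": str(n + n)}
            for n in range(3)
        ]

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return False

    def has_test_docs(self):
        return False

    def training_docs(self):
        # Anything doc_to_text can handle; fewshot_examples() caches this
        # list in self._training_docs on first use.
        return self._docs

    def doc_to_text(self, doc):
        return doc["context"]

    def doc_to_target(self, doc):
        return doc["completion"]

Note that Task.__init__ calls download() before initializing the cache, so anything training_docs() needs must be in place by the end of download().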
lm_eval/tasks/arithmetic.py  (+2 -2)

@@ -2,12 +2,12 @@ import abc
 import json
 import os
 from collections import namedtuple
-from lm_eval.base import Dataset, mean, rf
+from lm_eval.base import Task, mean, rf
 from best_download import download_file
 
 ArithmeticDoc = namedtuple('ArithmeticDoc', ['context', 'completion'])
 
-class Arithmetic(Dataset):
+class Arithmetic(Task):
     directory = 'data/arithmetic/'
 
     def __init__(self):
lm_eval/tasks/common.py
View file @
b5e86d3f
import
datasets
import
datasets
import
numpy
as
np
import
numpy
as
np
import
random
from
..base
import
Task
from
..base
import
Dataset
class
HFTask
(
Dataset
):
class
HFTask
(
Task
):
DATASET_PATH
=
None
DATASET_PATH
=
None
DATASET_NAME
=
None
DATASET_NAME
=
None
def
__init__
(
self
):
def
__init__
(
self
):
self
.
data
=
None
super
().
__init__
()
super
().
__init__
()
self
.
_training_docs
=
None
def
download
(
self
):
def
download
(
self
):
self
.
data
=
datasets
.
load_dataset
(
path
=
self
.
DATASET_PATH
,
name
=
self
.
DATASET_NAME
)
self
.
data
=
datasets
.
load_dataset
(
path
=
self
.
DATASET_PATH
,
name
=
self
.
DATASET_NAME
)
...
...
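Aside (commentary, not part of the commit): with the ordering above, self.data exists before HFTask.download() fills it via datasets.load_dataset. A sketch of a hypothetical subclass under that assumption; "super_glue"/"boolq" are just an illustrative Hugging Face dataset, and the remaining Task hooks are omitted.

from lm_eval.tasks.common import HFTask


class BoolQ(HFTask):
    # HFTask.download() passes these to datasets.load_dataset(...).
    DATASET_PATH = "super_glue"
    DATASET_NAME = "boolq"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        # `self.data` is the DatasetDict loaded by download().
        return self.data["train"]

    def doc_to_text(self, doc):
        return doc["passage"] + "\nquestion: " + doc["question"] + "?\nanswer:"

    def doc_to_target(self, doc):
        return " yes" if doc["label"] else " no"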
lm_eval/tasks/coqa.py  (+2 -2)

@@ -2,11 +2,11 @@
 import json
 import random
-from lm_eval.base import Dataset
+from lm_eval.base import Task
 from ..utils import sh
 
-class CoQA(Dataset):
+class CoQA(Task):
     def __init__(self):
         self.download()
 
     def download(self):
lm_eval/tasks/drop.py  (+2 -2)

@@ -5,9 +5,9 @@ from sklearn.metrics import f1_score, matthews_corrcoef
 from tqdm import auto as tqdm_lib
 from .common import HFTask, simple_accuracy_metric, yesno
 from pathlib import Path
-from ..base import Dataset
+from ..base import Task
 
-class DROP(Dataset):
+class DROP(Task):
     DATAFOLDER = Path(__file__).parent / "../../data/drop"
 
     def __init__(self):
lm_eval/tasks/lambada.py
View file @
b5e86d3f
from
lm_eval.base
import
Dataset
,
rf
,
mean
from
lm_eval.base
import
Task
,
rf
,
mean
from
lm_eval.utils
import
sh
from
lm_eval.utils
import
sh
import
json
import
json
import
math
import
math
from
best_download
import
download_file
from
best_download
import
download_file
class
LAMBADA
(
Dataset
):
class
LAMBADA
(
Task
):
def
download
(
self
):
def
download
(
self
):
sh
(
"mkdir -p data/lambada"
)
sh
(
"mkdir -p data/lambada"
)
download_file
(
download_file
(
...
...
lm_eval/tasks/naturalqs.py  (+3 -3)

@@ -30,10 +30,10 @@ class NaturalQs(HFTask):
     def fewshot_examples(self, k):
         # Data is too large to fit in memory. We just sample from the first bit.
-        if self._traindocs is None:
-            self._traindocs = list(islice(self.training_docs(), 0, 100000))
+        if self._training_docs is None:
+            self._training_docs = list(islice(self.training_docs(), 0, 100000))
 
-        return random.sample(self._traindocs, k)
+        return random.sample(self._training_docs, k)
 
     def doc_to_text(self, doc):
         return 'Q: ' + doc['question']['text'] + '\n\n' + 'A: '
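Aside (commentary, not part of the commit): the islice cap above keeps only the first 100000 training docs in memory while still allowing random few-shot sampling. A self-contained sketch of the same caching pattern, with a toy generator standing in for training_docs():

import random
from itertools import islice


def training_docs():
    # Toy stand-in for a split too large to materialize fully.
    for i in range(10**9):
        yield {"question": "q%d" % i}


_training_docs = None

def fewshot_examples(k):
    global _training_docs
    if _training_docs is None:
        # Cache only a bounded prefix of the stream.
        _training_docs = list(islice(training_docs(), 0, 100000))
    return random.sample(_training_docs, k)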
lm_eval/tasks/piqa.py
View file @
b5e86d3f
import
json
import
json
import
random
import
random
from
lm_eval.base
import
Dataset
,
rf
,
mean
from
lm_eval.base
import
Task
,
rf
,
mean
from
..utils
import
sh
from
..utils
import
sh
import
os
import
os
class
PiQA
(
Dataset
):
class
PiQA
(
Task
):
def
download
(
self
):
def
download
(
self
):
if
not
os
.
path
.
exists
(
'data/piqa'
):
if
not
os
.
path
.
exists
(
'data/piqa'
):
#TODO: use best_download
#TODO: use best_download
...
...
lm_eval/tasks/quac.py  (+2 -2)

 import json
 import random
 import os
-from lm_eval.base import Dataset
+from lm_eval.base import Task
 from ..utils import sh
 
-class QuAC(Dataset):
+class QuAC(Task):
     def __init__(self):
         super().__init__()
lm_eval/tasks/sat.py  (+2 -2)

 import json
 import random
 import os
-from lm_eval.base import Dataset, rf, mean
+from lm_eval.base import Task, rf, mean
 from tqdm import auto as tqdm_lib
 from .common import simple_accuracy_metric
 import numpy as np
 from ..utils import sh
 
-class SATAnalogies(Dataset):
+class SATAnalogies(Task):
     NEEDS_MANUAL_DL = True
 
     def __init__(self):
lm_eval/tasks/storycloze.py  (+2 -2)

 import json
 import random
-from lm_eval.base import Dataset
+from lm_eval.base import Task
 from ..utils import sh
 import csv
 
-class StoryCloze(Dataset):
+class StoryCloze(Task):
     NEEDS_MANUAL_DL = True
 
     def download(self):
lm_eval/tasks/triviaqa.py  (+2 -2)

 import os
 import json
 import random
-from lm_eval.base import Dataset, mean, rf
+from lm_eval.base import Task, mean, rf
 from ..utils import sh
 
-class TriviaQA(Dataset):
+class TriviaQA(Task):
     def download(self):
         if not os.path.exists('data/triviaqa'):
             sh("""
lm_eval/tasks/wsc273.py  (+133 -133)

(The side-by-side view shows the file rewritten wholesale with both sides identical line for line; it is reproduced once below.)

import numpy as np
import random
from lm_eval.base import rf, mean
from .common import HFTask

"""
NOTE: This evaluation of Winograd Schema Challenge is based on `partial evaluation`
as described by Trinh & Le in Simple Method for Commonsense Reasoning (2018).
See: https://arxiv.org/abs/1806.02847
"""


class WinogradSchemaChallenge273(HFTask):
    DATASET_PATH = "winograd_wsc"
    DATASET_NAME = "wsc273"

    upper_pronouns = ["A", "An", "The", "She", "He",
                      "It", "They", "My", "His", "Her", "Their"]

    def __init__(self):
        super().__init__()
        self.data = self.__clean_data()

    def __clean_data(self):
        # The HF implementation of `wsc273` is not `partial evaluation` friendly.
        data = []
        for doc in self.data["test"]:
            doc["text"] = doc["text"].replace("  ", " ")
            doc["options"][0] = self.__normalize_option(doc["options"][0], doc)
            doc["options"][1] = self.__normalize_option(doc["options"][1], doc)
            data.append(doc)
        return {"test": data}

    def __normalize_option(self, option, doc):
        # Append `'s` to possessive determiner based options.
        if doc["pronoun"].lower() in ["my", "his", "her", "our", "their"]:
            option += "'s"
        # Appropriately lowercase the pronoun in the option.
        pronoun = option.split()[0]
        start_of_sentence = doc["text"][doc['pronoun_loc'] - 2] == '.'
        if not start_of_sentence and pronoun in self.upper_pronouns:
            return option.replace(pronoun, pronoun.lower())
        return option

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return False

    def has_test_docs(self):
        return True

    def fewshot_examples(self, k):
        # NOTE: `super().fewshot_examples` samples from training docs which are
        # not available for this test-set-only dataset.
        return random.sample(list(self.test_docs()), k)

    def fewshot_description(self):
        # TODO: redo description
        return "Winograd schema sentence with correct continuation. True. Winograd schema sentence with incorrect continuation. False."

    @classmethod
    def partial_context(cls, doc):
        # Substitute the pronoun in the original text with each candidate
        # choice and ignore everything after.
        context1 = doc["text"][:doc["pronoun_loc"]] + doc["options"][0]
        context2 = doc["text"][:doc["pronoun_loc"]] + doc["options"][1]
        return context1, context2

    @classmethod
    def partial_target(cls, doc):
        # The target is everything after the document specified pronoun.
        start_index = doc["pronoun_loc"] + len(doc["pronoun"])
        return doc["text"][start_index:].strip()

    def doc_to_text(self, doc):
        context1, context2 = self.partial_context(doc)
        return context1 + '\n' + context2 + '\n'

    def doc_to_target(self, doc):
        return self.partial_target(doc)

    def construct_requests(self, doc, ctx):
        """ Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        """
        target = self.partial_target(doc)
        context1, context2 = self.partial_context(doc)
        ll_context1, _ = rf.loglikelihood(context1, " " + target)
        ll_context2, _ = rf.loglikelihood(context2, " " + target)
        return ll_context1, ll_context2

    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a
        dict where keys are the names of submetrics and values are the values of
        the metric for that one document

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests.
        """
        return {
            "acc": np.argmax(results) == doc["label"]
        }

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        return {
            "acc": mean
        }

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        return {
            "acc": True
        }
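Aside (commentary, not part of the commit): a standalone sketch of the `partial evaluation` scoring that partial_context/partial_target implement above (Trinh & Le, 2018). The doc below is a hypothetical example in the winograd_wsc field layout; an actual run would compare the two rf.loglikelihood results as in construct_requests.

def partial_contexts(doc):
    # Substitute each candidate for the pronoun and keep only the prefix.
    prefix = doc["text"][:doc["pronoun_loc"]]
    return [prefix + option for option in doc["options"]]


def partial_target(doc):
    # The shared continuation to score: everything after the pronoun.
    return doc["text"][doc["pronoun_loc"] + len(doc["pronoun"]):].strip()


doc = {
    "text": "The trophy doesn't fit in the suitcase because it is too big.",
    "pronoun": "it",
    "pronoun_loc": 47,
    "options": ["the trophy", "the suitcase"],
    "label": 0,
}

print(partial_target(doc))    # "is too big."
print(partial_contexts(doc))  # the shared prefix ending in each candidate
# An LM then scores log P(" is too big." | context) for both contexts and
# predicts the option whose context yields the higher likelihood.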