gaoqiong / lm-evaluation-harness

Commit 0b3b7251, authored Jan 08, 2021 by Leo Gao

    Merge branch 'bmk_refactor2' of github.com:EleutherAI/lm_evaluation_harness into bmk_refactor2

Parents: a18104a4, 5ce42fc0

Showing 19 changed files with 164 additions and 61 deletions
Changed files:

    lm_eval/tasks/anli.py        +5   -3
    lm_eval/tasks/drop.py        +5   -16
    lm_eval/tasks/glue.py        +46  -0
    lm_eval/tasks/hellaswag.py   +5   -3
    lm_eval/tasks/lambada.py     +5   -2
    lm_eval/tasks/naturalqs.py   +5   -3
    lm_eval/tasks/openbookqa.py  +5   -3
    lm_eval/tasks/piqa.py        +4   -2
    lm_eval/tasks/quac.py        +5   -2
    lm_eval/tasks/race.py        +5   -3
    lm_eval/tasks/sat.py         +1   -1
    lm_eval/tasks/squad.py       +5   -3
    lm_eval/tasks/storycloze.py  +5   -2
    lm_eval/tasks/superglue.py   +31  -3
    lm_eval/tasks/triviaqa.py    +5   -2
    lm_eval/tasks/webqs.py       +5   -3
    lm_eval/tasks/wikitext.py    +12  -4
    lm_eval/tasks/winogrande.py  +5   -3
    lm_eval/tasks/wsc273.py      +5   -3
lm_eval/tasks/anli.py
@@ -45,9 +45,11 @@ class ANLIBase(HFTask):
        a = "True, False, or Neither?" + ((" " + ["True", "Neither", "False"][doc['label']]) if include_target else '')
        return q + a

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: implement
        raise NotImplementedError()
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.


class ANLIRound1(ANLIBase):
    SPLIT = 1
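The comment block added in this hunk (and repeated across the files below) points at the interface in base.py and the BoolQ implementation in superglue.py as the model to follow. As a rough sketch only (the `lm.loglikelihood` call and the exact `fewshot_context` signature are assumptions, not code from this commit), an ANLI evaluate written against that interface might rank the three target strings this hunk already defines; the return shape follows the contract documented in drop.py below:

    # Hypothetical sketch, not part of the commit. Assumes the new framework
    # exposes fewshot_context() on tasks and loglikelihood() on LM objects.
    def evaluate(self, docs, lm, provide_description, num_fewshot):
        golds, preds = [], []
        for doc in docs:
            ctx = self.fewshot_context(doc, provide_description, num_fewshot)
            # Score each candidate target and predict the highest-likelihood one.
            scores = [lm.loglikelihood(ctx, " " + t) for t in ["True", "Neither", "False"]]
            preds.append(scores.index(max(scores)))
            golds.append(doc['label'])
        acc = sum(p == g for p, g in zip(preds, golds)) / len(golds)
        return {"major": acc, "minor": {"acc": acc}, "higher_is_better": True}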
lm_eval/tasks/drop.py
@@ -58,23 +58,12 @@ class DROP(Dataset):
            text = ''.join([text, get_answer(pair['answer'])])
            qa_texts.append(text)
        return ''.join([doctext, '\n'.join(qa_texts)])

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        """Take iterable of docs and evaluates, returning a dict with the following format:

        {
            "major": float,
            "minor": dict,
            "higher_is_better": bool,
        }

        * `major` should be a single, representative number, for programmatic comparison
        * `minor` should be a dictionary containing all relevant sub-metrics
        * `higher_is_better` determines whether a higher metric is better
        """
        pass

    def fewshot_description(self):
        return "Read the passage and answer the questions "
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
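The docstring above spells out the contract every evaluate is expected to satisfy. A minimal hedged illustration of packaging metrics into that shape (`compute_em_f1` is a hypothetical helper standing in for real DROP answer matching, not this commit's code):

    # Illustration of the documented return contract only.
    def evaluate(self, docs, lm, provide_description, num_fewshot):
        em, f1 = compute_em_f1(docs, lm)  # hypothetical: exact match and F1
        return {
            "major": f1,                    # single representative number
            "minor": {"em": em, "f1": f1},  # all relevant sub-metrics
            "higher_is_better": True,       # higher EM/F1 is better
        }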
lm_eval/tasks/glue.py
@@ -46,6 +46,12 @@ class CoLA(HFTask):
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):

@@ -99,6 +105,11 @@ class MNLI(HFTask):
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):

@@ -153,6 +164,11 @@ class MRPC(HFTask):
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):

@@ -190,6 +206,11 @@ class RTE(HFTask):
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):

@@ -227,6 +248,11 @@ class QNLI(HFTask):
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):

@@ -265,6 +291,11 @@ class QQP(HFTask):
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):

@@ -304,6 +335,11 @@ class STSB(HFTask):
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):

@@ -359,6 +395,11 @@ class SST(HFTask):
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):

@@ -397,6 +438,11 @@ class WNLI(HFTask):
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
lm_eval/tasks/hellaswag.py
@@ -51,6 +51,8 @@ class HellaSwag(HFTask):
            text += doc['endings'][index]
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Write evaluation function
        raise NotImplementedError()
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/lambada.py
@@ -46,5 +46,8 @@ class Lambada(Dataset):
        #label = doc[]
        return doc

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        pass
\ No newline at end of file
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/naturalqs.py
@@ -50,6 +50,8 @@ class NaturalQs(HFTask):
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: implement
        raise NotImplementedError()
\ No newline at end of file
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/openbookqa.py
@@ -53,6 +53,8 @@ class OpenBookQA(HFTask):
            text += doc['choices']['text'][index] + '.'
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Write evaluation function
        raise NotImplementedError()
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/piqa.py
@@ -54,6 +54,8 @@ class PiQA(Dataset):
        #TODO: check if oa uses newline
        return doc['goal'] + ' '

    def evaluate(self, docs, lm):
        pass
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/quac.py
@@ -61,5 +61,8 @@ class QuAC(Dataset):
        text += doc['answer']
        return text

    def evaluate(self, docs, lm):
        pass
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/race.py
@@ -67,6 +67,8 @@ class RACE(HFTask):
        return r

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: implement
        raise NotImplementedError()
\ No newline at end of file
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/sat.py
@@ -18,6 +18,7 @@ class SATAnalogies(Dataset):
        # We should be using a checksum here.
        # The canonical sha256 hash is below:
        # 9dece377d8d57253ef8c78370ff15de0bb1d9e90a82c815a67ba1e621e921bfc
        if not os.path.exists('data/sat/SAT-package-V3.txt'):
            raise NotImplementedError('SAT Analogies dataset is not provided. Follow instructions on https://aclweb.org/aclwiki/SAT_Analogy_Questions_(State_of_the_art) to locate.')

@@ -32,7 +33,6 @@ class SATAnalogies(Dataset):
    def training_docs(self):
        return []

    def test_docs(self):
        return []
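The sat.py comment asks for a checksum rather than just an existence check. A hedged sketch of sha256 verification against the hash quoted in that comment (the helper name is invented, not from this commit):

    import hashlib
    import os

    SAT_PATH = 'data/sat/SAT-package-V3.txt'
    SAT_SHA256 = '9dece377d8d57253ef8c78370ff15de0bb1d9e90a82c815a67ba1e621e921bfc'

    def verify_sat_file(path=SAT_PATH):
        # Fail early if the manually downloaded file is missing or corrupted.
        if not os.path.exists(path):
            raise FileNotFoundError(f'{path} not found; see the SAT Analogies download instructions.')
        with open(path, 'rb') as f:
            digest = hashlib.sha256(f.read()).hexdigest()
        if digest != SAT_SHA256:
            raise ValueError(f'checksum mismatch for {path}: got {digest}')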
lm_eval/tasks/squad.py
@@ -42,6 +42,8 @@ class SQuAD(HFTask):
        text += answer
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Write evaluation function
        raise NotImplementedError()
\ No newline at end of file
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/storycloze.py
@@ -47,6 +47,9 @@ class StoryCloze(Dataset):
        else:
            return ' '.join([*doc[1:5]])

    def evaluate(self, docs, lm):
        pass
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
lm_eval/tasks/superglue.py
@@ -82,6 +82,11 @@ class CommitmentBank(HFTask):
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):

@@ -126,6 +131,11 @@ class Copa(HFTask):
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):

@@ -171,6 +181,11 @@ class MultiRC(HFTask):
        return f"[{label_str}] {answer}"

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        preds = []
        for doc in docs:
            ctx = self.fewshot_context(

@@ -226,6 +241,11 @@ class WordsInContext(HFTask):
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):

@@ -286,6 +306,11 @@ class SGWinogradSchemaChallenge(HFTask):
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):

@@ -320,7 +345,10 @@ class RTE(HFTask):
            return ''.join([doc['premise'], '\nquestion: ', doc['hypothesis'], ' True or False?\nanswer: ', answer])
        else:
            return ''.join([doc['premise'], '\nquestion: ', doc['hypothesis'], ' True or False?\nanswer: '])

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        #TODO:
        pass
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
lm_eval/tasks/triviaqa.py
@@ -44,6 +44,9 @@ class TriviaQA(Dataset):
            return ''.join(['Q: ', doc['Question'], '\n\n', 'A: ', doc['Answer']['Aliases'][0]])
        else:
            return ''.join(['Q: ', doc['Question'], '\n\n', 'A: '])

    def evaluate(self, docs, lm):
        pass
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/webqs.py
@@ -29,6 +29,8 @@ class WebQs(HFTask):
        a = "A:" + ((" " + doc['answers'][0]) if include_target else '')
        return q + a

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: implement
        raise NotImplementedError()
\ No newline at end of file
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/wikitext.py
@@ -15,8 +15,12 @@ class WikiText103(NLP_TASK):
    def doc_to_text(self, doc, include_target=True):
        return doc['text']

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        pass
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.


class WikiText2(NLP_TASK):

@@ -28,5 +32,9 @@ class WikiText2(NLP_TASK):
    def doc_to_text(self, doc, include_target=True):
        return doc['text']

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        pass
\ No newline at end of file
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/winogrande.py
@@ -47,6 +47,8 @@ class Winogrande(HFTask):
        text = text.replace("_", answer)
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Write evaluation function
        raise NotImplementedError()
\ No newline at end of file
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/wsc273.py
@@ -80,6 +80,8 @@ class WinogradSchemaChallenge273(Dataset):
        text = doc['completions']['T'] + ' True. ' + doc['completions']['F'] + ' False.'
        return text

    def evaluate(self, docs, lm):
        # TODO: Write evaluation function
        raise NotImplementedError()
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.