gaoqiong / lm-evaluation-harness · Commits

Commit 2d2dbf96, authored Jan 05, 2021 by Leo Gao
Parent: 6803e647

    Add comments to remind that evaluation needs to be written for the new framework
Showing 19 changed files with 168 additions and 63 deletions (+168 -63).
lm_eval/tasks/anli.py        +5  -3
lm_eval/tasks/drop.py        +5  -16
lm_eval/tasks/glue.py        +46 -0
lm_eval/tasks/hellaswag.py   +5  -3
lm_eval/tasks/lambada.py     +5  -2
lm_eval/tasks/naturalqs.py   +5  -3
lm_eval/tasks/openbookqa.py  +5  -3
lm_eval/tasks/piqa.py        +4  -2
lm_eval/tasks/quac.py        +5  -2
lm_eval/tasks/race.py        +5  -3
lm_eval/tasks/sat.py         +5  -3
lm_eval/tasks/squad.py       +5  -3
lm_eval/tasks/storycloze.py  +5  -2
lm_eval/tasks/superglue.py   +31 -3
lm_eval/tasks/triviaqa.py    +5  -2
lm_eval/tasks/webqs.py       +5  -3
lm_eval/tasks/wikitext.py    +12 -4
lm_eval/tasks/winogrande.py  +5  -3
lm_eval/tasks/wsc273.py      +5  -3
lm_eval/tasks/anli.py

@@ -45,9 +45,11 @@ class ANLIBase(HFTask):
        a = "True, False, or Neither?" + ((" " + ["True", "Neither", "False"][doc['label']]) if include_target else '')
        return q + a

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: implement
        raise NotImplementedError()
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.


class ANLIRound1(ANLIBase):
    SPLIT = 1
...
lm_eval/tasks/drop.py

@@ -58,23 +58,12 @@ class DROP(Dataset):
            text = ''.join([text, get_answer(pair['answer'])])
            qa_texts.append(text)
        return ''.join([doctext, '\n'.join(qa_texts)])

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        """Take iterable of docs and evaluates, returning a dict with the following format:

        {
            "major": float,
            "minor": dict,
            "higher_is_better": bool,
        }

        * `major` should be a single, representative number, for programmatic comparison
        * `minor` should be a dictionary containing all relevant sub-metrics
        * `higher_is_better` determines whether a higher metric is better
        """
        pass

    def fewshot_description(self):
        return "Read the passage and answer the questions "
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
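The docstring removed above spells out the contract every task's evaluate() must satisfy. As a rough, hypothetical sketch (not part of this commit; the function name and inputs are stand-ins), a result conforming to that format could be packaged like this:

# Hypothetical sketch, not part of this commit: packaging an accuracy score
# in the {major, minor, higher_is_better} format described in the docstring.
def result_dict(golds, preds):
    acc = sum(int(g == p) for g, p in zip(golds, preds)) / len(golds)
    return {
        "major": acc,              # single, representative number
        "minor": {"acc": acc},     # all relevant sub-metrics
        "higher_is_better": True,  # for accuracy, higher is better
    }

# Example: result_dict([1, 0, 1], [1, 1, 1])["major"] == 2/3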
lm_eval/tasks/glue.py

@@ -46,6 +46,12 @@ class CoLA(HFTask):
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
...

@@ -99,6 +105,11 @@ class MNLI(HFTask):
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
...

@@ -153,6 +164,11 @@ class MRPC(HFTask):
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
...

@@ -190,6 +206,11 @@ class RTE(HFTask):
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
...

@@ -227,6 +248,11 @@ class QNLI(HFTask):
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
...

@@ -265,6 +291,11 @@ class QQP(HFTask):
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
...

@@ -304,6 +335,11 @@ class STSB(HFTask):
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
...

@@ -359,6 +395,11 @@ class SST(HFTask):
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
...

@@ -397,6 +438,11 @@ class WNLI(HFTask):
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
...
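Every glue.py hunk is cut off by the diff view at the prediction loop. Purely as a hedged sketch of the old-framework pattern these truncated hunks suggest (build golds, predict per doc, compare), and not the file's verbatim contents; the alias import is one plausible match for the tqdm_lib.tqdm(...) usage shown, and predict is a stand-in for however each task queries the lm:

# Hedged sketch only: the real loop bodies are collapsed out of this diff.
from tqdm import auto as tqdm_lib  # assumption matching tqdm_lib.tqdm(...) above

def sketch_evaluate(docs, predict):
    golds = [doc["label"] for doc in docs]
    preds = []
    for doc in tqdm_lib.tqdm(docs):
        preds.append(predict(doc))  # stand-in for the per-doc LM query
    # Simple accuracy over the collected predictions.
    return sum(int(g == p) for g, p in zip(golds, preds)) / len(golds)

# Example: sketch_evaluate([{"label": 1}, {"label": 0}], lambda d: 1) == 0.5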
lm_eval/tasks/hellaswag.py

@@ -51,6 +51,8 @@ class HellaSwag(HFTask):
        text += doc['endings'][index]
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Write evaluation function
        raise NotImplementedError()
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/lambada.py

@@ -46,5 +46,8 @@ class Lambada(Dataset):
        #label = doc[]
        return doc

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        pass
\ No newline at end of file
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/naturalqs.py

@@ -50,6 +50,8 @@ class NaturalQs(HFTask):
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: implement
        raise NotImplementedError()
\ No newline at end of file
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/openbookqa.py

@@ -53,6 +53,8 @@ class OpenBookQA(HFTask):
        text += doc['choices']['text'][index] + '.'
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Write evaluation function
        raise NotImplementedError()
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/piqa.py

@@ -54,6 +54,8 @@ class PiQA(Dataset):
        #TODO: check if oa uses newline
        return doc['goal'] + ' '

    def evaluate(self, docs, lm):
        pass
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/quac.py

@@ -61,5 +61,8 @@ class QuAC(Dataset):
        text += doc['answer']
        return text

    def evaluate(self, docs, lm):
        pass
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/race.py

@@ -67,6 +67,8 @@ class RACE(HFTask):
        return r

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: implement
        raise NotImplementedError()
\ No newline at end of file
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/sat.py

@@ -93,6 +93,8 @@ class SATAnalogies(Dataset):
        return text

    def evaluate(self, docs, lm):
        # TODO: Write evaluation function
        raise NotImplementedError()
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/squad.py

@@ -42,6 +42,8 @@ class SQuAD(HFTask):
        text += answer
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Write evaluation function
        raise NotImplementedError()
\ No newline at end of file
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/storycloze.py

@@ -47,6 +47,9 @@ class StoryCloze(Dataset):
        else:
            return ' '.join([*doc[1:5]])

    def evaluate(self, docs, lm):
        pass
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
lm_eval/tasks/superglue.py

@@ -76,6 +76,11 @@ class CommitmentBank(HFTask):
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
...

@@ -120,6 +125,11 @@ class Copa(HFTask):
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
...

@@ -165,6 +175,11 @@ class MultiRC(HFTask):
        return f"[{label_str}] {answer}"

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        preds = []
        for doc in docs:
            ctx = self.fewshot_context(
...

@@ -220,6 +235,11 @@ class WordsInContext(HFTask):
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
...

@@ -280,6 +300,11 @@ class SGWinogradSchemaChallenge(HFTask):
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
...

@@ -314,7 +339,10 @@ class RTE(HFTask):
            return ''.join([doc['premise'], '\nquestion: ', doc['hypothesis'], ' True or False?\nanswer: ', answer])
        else:
            return ''.join([doc['premise'], '\nquestion: ', doc['hypothesis'], ' True or False?\nanswer: '])

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        #TODO:
        pass
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
lm_eval/tasks/triviaqa.py

@@ -44,6 +44,9 @@ class TriviaQA(Dataset):
            return ''.join(['Q: ', doc['Question'], '\n\n', 'A: ', doc['Answer']['Aliases'][0]])
        else:
            return ''.join(['Q: ', doc['Question'], '\n\n', 'A: '])

    def evaluate(self, docs, lm):
        pass
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/webqs.py

@@ -29,6 +29,8 @@ class WebQs(HFTask):
        a = "A:" + ((" " + doc['answers'][0]) if include_target else '')
        return q + a

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: implement
        raise NotImplementedError()
\ No newline at end of file
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/wikitext.py

@@ -15,8 +15,12 @@ class WikiText103(NLP_TASK):
    def doc_to_text(self, doc, include_target=True):
        return doc['text']

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        pass
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.


class WikiText2(NLP_TASK):
...

@@ -28,5 +32,9 @@ class WikiText2(NLP_TASK):
    def doc_to_text(self, doc, include_target=True):
        return doc['text']

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        pass
\ No newline at end of file
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/winogrande.py

@@ -47,6 +47,8 @@ class Winogrande(HFTask):
        text = text.replace("_", answer)
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Write evaluation function
        raise NotImplementedError()
\ No newline at end of file
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/wsc273.py

@@ -80,6 +80,8 @@ class WinogradSchemaChallenge273(Dataset):
        text = doc['completions']['T'] + ' True. ' + doc['completions']['F'] + ' False.'
        return text

    def evaluate(self, docs, lm):
        # TODO: Write evaluation function
        raise NotImplementedError()
        # TODO: Implement evaluation code
        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.