Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
2d2dbf96
Commit
2d2dbf96
authored
Jan 05, 2021
by
Leo Gao
Browse files
Add comments to remind that evaluation needs to be written for the new framework
parent
6803e647
Changes
19
Show whitespace changes
Inline
Side-by-side
Showing
19 changed files
with
168 additions
and
63 deletions
+168
-63
lm_eval/tasks/anli.py
lm_eval/tasks/anli.py
+5
-3
lm_eval/tasks/drop.py
lm_eval/tasks/drop.py
+5
-16
lm_eval/tasks/glue.py
lm_eval/tasks/glue.py
+46
-0
lm_eval/tasks/hellaswag.py
lm_eval/tasks/hellaswag.py
+5
-3
lm_eval/tasks/lambada.py
lm_eval/tasks/lambada.py
+5
-2
lm_eval/tasks/naturalqs.py
lm_eval/tasks/naturalqs.py
+5
-3
lm_eval/tasks/openbookqa.py
lm_eval/tasks/openbookqa.py
+5
-3
lm_eval/tasks/piqa.py
lm_eval/tasks/piqa.py
+4
-2
lm_eval/tasks/quac.py
lm_eval/tasks/quac.py
+5
-2
lm_eval/tasks/race.py
lm_eval/tasks/race.py
+5
-3
lm_eval/tasks/sat.py
lm_eval/tasks/sat.py
+5
-3
lm_eval/tasks/squad.py
lm_eval/tasks/squad.py
+5
-3
lm_eval/tasks/storycloze.py
lm_eval/tasks/storycloze.py
+5
-2
lm_eval/tasks/superglue.py
lm_eval/tasks/superglue.py
+31
-3
lm_eval/tasks/triviaqa.py
lm_eval/tasks/triviaqa.py
+5
-2
lm_eval/tasks/webqs.py
lm_eval/tasks/webqs.py
+5
-3
lm_eval/tasks/wikitext.py
lm_eval/tasks/wikitext.py
+12
-4
lm_eval/tasks/winogrande.py
lm_eval/tasks/winogrande.py
+5
-3
lm_eval/tasks/wsc273.py
lm_eval/tasks/wsc273.py
+5
-3
No files found.
lm_eval/tasks/anli.py
View file @
2d2dbf96
...
@@ -45,9 +45,11 @@ class ANLIBase(HFTask):
...
@@ -45,9 +45,11 @@ class ANLIBase(HFTask):
a
=
"True, False, or Neither?"
+
((
" "
+
[
"True"
,
"Neither"
,
"False"
][
doc
[
'label'
]])
if
include_target
else
''
)
a
=
"True, False, or Neither?"
+
((
" "
+
[
"True"
,
"Neither"
,
"False"
][
doc
[
'label'
]])
if
include_target
else
''
)
return
q
+
a
return
q
+
a
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
# TODO: Implement evaluation code
# TODO: implement
raise
NotImplementedError
()
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
class
ANLIRound1
(
ANLIBase
):
class
ANLIRound1
(
ANLIBase
):
SPLIT
=
1
SPLIT
=
1
...
...
lm_eval/tasks/drop.py
View file @
2d2dbf96
...
@@ -59,22 +59,11 @@ class DROP(Dataset):
...
@@ -59,22 +59,11 @@ class DROP(Dataset):
qa_texts
.
append
(
text
)
qa_texts
.
append
(
text
)
return
''
.
join
([
doctext
,
'
\n
'
.
join
(
qa_texts
)])
return
''
.
join
([
doctext
,
'
\n
'
.
join
(
qa_texts
)])
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
"""Take iterable of docs and evaluates, returning a dict with the following format:
{
"major": float,
"minor": dict,
"higher_is_better": bool,
}
* `major` should be a single, representative number, for programmatic comparison
* `minor` should be a dictionary containing all relevant sub-metrics
* `higher_is_better` determines whether a higher metric is better
"""
pass
def
fewshot_description
(
self
):
def
fewshot_description
(
self
):
return
"Read the passage and answer the questions "
return
"Read the passage and answer the questions "
# TODO: Implement evaluation code
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
lm_eval/tasks/glue.py
View file @
2d2dbf96
...
@@ -46,6 +46,12 @@ class CoLA(HFTask):
...
@@ -46,6 +46,12 @@ class CoLA(HFTask):
return
text
return
text
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds
=
[
doc
[
"label"
]
for
doc
in
docs
]
golds
=
[
doc
[
"label"
]
for
doc
in
docs
]
preds
=
[]
preds
=
[]
for
doc
in
tqdm_lib
.
tqdm
(
docs
):
for
doc
in
tqdm_lib
.
tqdm
(
docs
):
...
@@ -99,6 +105,11 @@ class MNLI(HFTask):
...
@@ -99,6 +105,11 @@ class MNLI(HFTask):
return
text
return
text
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds
=
[
doc
[
"label"
]
for
doc
in
docs
]
golds
=
[
doc
[
"label"
]
for
doc
in
docs
]
preds
=
[]
preds
=
[]
for
doc
in
tqdm_lib
.
tqdm
(
docs
):
for
doc
in
tqdm_lib
.
tqdm
(
docs
):
...
@@ -153,6 +164,11 @@ class MRPC(HFTask):
...
@@ -153,6 +164,11 @@ class MRPC(HFTask):
return
text
return
text
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds
=
[
doc
[
"label"
]
for
doc
in
docs
]
golds
=
[
doc
[
"label"
]
for
doc
in
docs
]
preds
=
[]
preds
=
[]
for
doc
in
tqdm_lib
.
tqdm
(
docs
):
for
doc
in
tqdm_lib
.
tqdm
(
docs
):
...
@@ -190,6 +206,11 @@ class RTE(HFTask):
...
@@ -190,6 +206,11 @@ class RTE(HFTask):
return
text
return
text
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds
=
[
doc
[
"label"
]
for
doc
in
docs
]
golds
=
[
doc
[
"label"
]
for
doc
in
docs
]
preds
=
[]
preds
=
[]
for
doc
in
tqdm_lib
.
tqdm
(
docs
):
for
doc
in
tqdm_lib
.
tqdm
(
docs
):
...
@@ -227,6 +248,11 @@ class QNLI(HFTask):
...
@@ -227,6 +248,11 @@ class QNLI(HFTask):
return
text
return
text
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds
=
[
doc
[
"label"
]
for
doc
in
docs
]
golds
=
[
doc
[
"label"
]
for
doc
in
docs
]
preds
=
[]
preds
=
[]
for
doc
in
tqdm_lib
.
tqdm
(
docs
):
for
doc
in
tqdm_lib
.
tqdm
(
docs
):
...
@@ -265,6 +291,11 @@ class QQP(HFTask):
...
@@ -265,6 +291,11 @@ class QQP(HFTask):
return
text
return
text
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds
=
[
doc
[
"label"
]
for
doc
in
docs
]
golds
=
[
doc
[
"label"
]
for
doc
in
docs
]
preds
=
[]
preds
=
[]
for
doc
in
tqdm_lib
.
tqdm
(
docs
):
for
doc
in
tqdm_lib
.
tqdm
(
docs
):
...
@@ -304,6 +335,11 @@ class STSB(HFTask):
...
@@ -304,6 +335,11 @@ class STSB(HFTask):
return
text
return
text
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds
=
[
doc
[
"label"
]
for
doc
in
docs
]
golds
=
[
doc
[
"label"
]
for
doc
in
docs
]
preds
=
[]
preds
=
[]
for
doc
in
tqdm_lib
.
tqdm
(
docs
):
for
doc
in
tqdm_lib
.
tqdm
(
docs
):
...
@@ -359,6 +395,11 @@ class SST(HFTask):
...
@@ -359,6 +395,11 @@ class SST(HFTask):
return
text
return
text
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds
=
[
doc
[
"label"
]
for
doc
in
docs
]
golds
=
[
doc
[
"label"
]
for
doc
in
docs
]
preds
=
[]
preds
=
[]
for
doc
in
tqdm_lib
.
tqdm
(
docs
):
for
doc
in
tqdm_lib
.
tqdm
(
docs
):
...
@@ -397,6 +438,11 @@ class WNLI(HFTask):
...
@@ -397,6 +438,11 @@ class WNLI(HFTask):
return
text
return
text
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds
=
[
doc
[
"label"
]
for
doc
in
docs
]
golds
=
[
doc
[
"label"
]
for
doc
in
docs
]
preds
=
[]
preds
=
[]
for
doc
in
tqdm_lib
.
tqdm
(
docs
):
for
doc
in
tqdm_lib
.
tqdm
(
docs
):
...
...
lm_eval/tasks/hellaswag.py
View file @
2d2dbf96
...
@@ -51,6 +51,8 @@ class HellaSwag(HFTask):
...
@@ -51,6 +51,8 @@ class HellaSwag(HFTask):
text
+=
doc
[
'endings'
][
index
]
text
+=
doc
[
'endings'
][
index
]
return
text
return
text
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
# TODO: Implement evaluation code
# TODO: Write evaluation function
raise
NotImplementedError
()
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/lambada.py
View file @
2d2dbf96
...
@@ -46,5 +46,8 @@ class Lambada(Dataset):
...
@@ -46,5 +46,8 @@ class Lambada(Dataset):
#label = doc[]
#label = doc[]
return
doc
return
doc
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
# TODO: Implement evaluation code
pass
\ No newline at end of file
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/naturalqs.py
View file @
2d2dbf96
...
@@ -50,6 +50,8 @@ class NaturalQs(HFTask):
...
@@ -50,6 +50,8 @@ class NaturalQs(HFTask):
return
text
return
text
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
# TODO: Implement evaluation code
# TODO: implement
raise
NotImplementedError
()
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
\ No newline at end of file
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/openbookqa.py
View file @
2d2dbf96
...
@@ -53,6 +53,8 @@ class OpenBookQA(HFTask):
...
@@ -53,6 +53,8 @@ class OpenBookQA(HFTask):
text
+=
doc
[
'choices'
][
'text'
][
index
]
+
'.'
text
+=
doc
[
'choices'
][
'text'
][
index
]
+
'.'
return
text
return
text
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
# TODO: Implement evaluation code
# TODO: Write evaluation function
raise
NotImplementedError
()
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/piqa.py
View file @
2d2dbf96
...
@@ -54,6 +54,8 @@ class PiQA(Dataset):
...
@@ -54,6 +54,8 @@ class PiQA(Dataset):
#TODO: check if oa uses newline
#TODO: check if oa uses newline
return
doc
[
'goal'
]
+
' '
return
doc
[
'goal'
]
+
' '
def
evaluate
(
self
,
docs
,
lm
):
# TODO: Implement evaluation code
pass
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/quac.py
View file @
2d2dbf96
...
@@ -61,5 +61,8 @@ class QuAC(Dataset):
...
@@ -61,5 +61,8 @@ class QuAC(Dataset):
text
+=
doc
[
'answer'
]
text
+=
doc
[
'answer'
]
return
text
return
text
def
evaluate
(
self
,
docs
,
lm
):
# TODO: Implement evaluation code
pass
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/race.py
View file @
2d2dbf96
...
@@ -67,6 +67,8 @@ class RACE(HFTask):
...
@@ -67,6 +67,8 @@ class RACE(HFTask):
return
r
return
r
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
# TODO: Implement evaluation code
# TODO: implement
raise
NotImplementedError
()
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
\ No newline at end of file
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/sat.py
View file @
2d2dbf96
...
@@ -93,6 +93,8 @@ class SATAnalogies(Dataset):
...
@@ -93,6 +93,8 @@ class SATAnalogies(Dataset):
return
text
return
text
def
evaluate
(
self
,
docs
,
lm
):
# TODO: Implement evaluation code
# TODO: Write evaluation function
raise
NotImplementedError
()
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/squad.py
View file @
2d2dbf96
...
@@ -42,6 +42,8 @@ class SQuAD(HFTask):
...
@@ -42,6 +42,8 @@ class SQuAD(HFTask):
text
+=
answer
text
+=
answer
return
text
return
text
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
# TODO: Implement evaluation code
# TODO: Write evaluation function
raise
NotImplementedError
()
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
\ No newline at end of file
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/storycloze.py
View file @
2d2dbf96
...
@@ -47,6 +47,9 @@ class StoryCloze(Dataset):
...
@@ -47,6 +47,9 @@ class StoryCloze(Dataset):
else
:
else
:
return
' '
.
join
([
*
doc
[
1
:
5
]])
return
' '
.
join
([
*
doc
[
1
:
5
]])
def
evaluate
(
self
,
docs
,
lm
):
# TODO: Implement evaluation code
pass
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
lm_eval/tasks/superglue.py
View file @
2d2dbf96
...
@@ -76,6 +76,11 @@ class CommitmentBank(HFTask):
...
@@ -76,6 +76,11 @@ class CommitmentBank(HFTask):
return
text
return
text
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds
=
[
doc
[
"label"
]
for
doc
in
docs
]
golds
=
[
doc
[
"label"
]
for
doc
in
docs
]
preds
=
[]
preds
=
[]
for
doc
in
tqdm_lib
.
tqdm
(
docs
):
for
doc
in
tqdm_lib
.
tqdm
(
docs
):
...
@@ -120,6 +125,11 @@ class Copa(HFTask):
...
@@ -120,6 +125,11 @@ class Copa(HFTask):
return
text
return
text
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds
=
[
doc
[
"label"
]
for
doc
in
docs
]
golds
=
[
doc
[
"label"
]
for
doc
in
docs
]
preds
=
[]
preds
=
[]
for
doc
in
tqdm_lib
.
tqdm
(
docs
):
for
doc
in
tqdm_lib
.
tqdm
(
docs
):
...
@@ -165,6 +175,11 @@ class MultiRC(HFTask):
...
@@ -165,6 +175,11 @@ class MultiRC(HFTask):
return
f
"[
{
label_str
}
]
{
answer
}
"
return
f
"[
{
label_str
}
]
{
answer
}
"
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
preds
=
[]
preds
=
[]
for
doc
in
docs
:
for
doc
in
docs
:
ctx
=
self
.
fewshot_context
(
ctx
=
self
.
fewshot_context
(
...
@@ -220,6 +235,11 @@ class WordsInContext(HFTask):
...
@@ -220,6 +235,11 @@ class WordsInContext(HFTask):
return
text
return
text
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds
=
[
doc
[
"label"
]
for
doc
in
docs
]
golds
=
[
doc
[
"label"
]
for
doc
in
docs
]
preds
=
[]
preds
=
[]
for
doc
in
tqdm_lib
.
tqdm
(
docs
):
for
doc
in
tqdm_lib
.
tqdm
(
docs
):
...
@@ -280,6 +300,11 @@ class SGWinogradSchemaChallenge(HFTask):
...
@@ -280,6 +300,11 @@ class SGWinogradSchemaChallenge(HFTask):
return
text
return
text
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds
=
[
doc
[
"label"
]
for
doc
in
docs
]
golds
=
[
doc
[
"label"
]
for
doc
in
docs
]
preds
=
[]
preds
=
[]
for
doc
in
tqdm_lib
.
tqdm
(
docs
):
for
doc
in
tqdm_lib
.
tqdm
(
docs
):
...
@@ -314,7 +339,10 @@ class RTE(HFTask):
...
@@ -314,7 +339,10 @@ class RTE(HFTask):
return
''
.
join
([
doc
[
'premise'
],
'
\n
question: '
,
doc
[
'hypothesis'
],
' True or False?
\n
answer: '
,
answer
])
return
''
.
join
([
doc
[
'premise'
],
'
\n
question: '
,
doc
[
'hypothesis'
],
' True or False?
\n
answer: '
,
answer
])
else
:
else
:
return
''
.
join
([
doc
[
'premise'
],
'
\n
question: '
,
doc
[
'hypothesis'
],
' True or False?
\n
answer: '
])
return
''
.
join
([
doc
[
'premise'
],
'
\n
question: '
,
doc
[
'hypothesis'
],
' True or False?
\n
answer: '
])
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
#TODO:
# TODO: Implement evaluation code
pass
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
lm_eval/tasks/triviaqa.py
View file @
2d2dbf96
...
@@ -44,6 +44,9 @@ class TriviaQA(Dataset):
...
@@ -44,6 +44,9 @@ class TriviaQA(Dataset):
return
''
.
join
([
'Q: '
,
doc
[
'Question'
],
'
\n\n
'
,
'A: '
,
doc
[
'Answer'
][
'Aliases'
][
0
]])
return
''
.
join
([
'Q: '
,
doc
[
'Question'
],
'
\n\n
'
,
'A: '
,
doc
[
'Answer'
][
'Aliases'
][
0
]])
else
:
else
:
return
''
.
join
([
'Q: '
,
doc
[
'Question'
],
'
\n\n
'
,
'A: '
])
return
''
.
join
([
'Q: '
,
doc
[
'Question'
],
'
\n\n
'
,
'A: '
])
def
evaluate
(
self
,
docs
,
lm
):
pass
# TODO: Implement evaluation code
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/webqs.py
View file @
2d2dbf96
...
@@ -29,6 +29,8 @@ class WebQs(HFTask):
...
@@ -29,6 +29,8 @@ class WebQs(HFTask):
a
=
"A:"
+
((
" "
+
doc
[
'answers'
][
0
])
if
include_target
else
''
)
a
=
"A:"
+
((
" "
+
doc
[
'answers'
][
0
])
if
include_target
else
''
)
return
q
+
a
return
q
+
a
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
# TODO: Implement evaluation code
# TODO: implement
raise
NotImplementedError
()
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
\ No newline at end of file
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/wikitext.py
View file @
2d2dbf96
...
@@ -15,8 +15,12 @@ class WikiText103(NLP_TASK):
...
@@ -15,8 +15,12 @@ class WikiText103(NLP_TASK):
def
doc_to_text
(
self
,
doc
,
include_target
=
True
):
def
doc_to_text
(
self
,
doc
,
include_target
=
True
):
return
doc
[
'text'
]
return
doc
[
'text'
]
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
pass
# TODO: Implement evaluation code
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
class
WikiText2
(
NLP_TASK
):
class
WikiText2
(
NLP_TASK
):
...
@@ -28,5 +32,9 @@ class WikiText2(NLP_TASK):
...
@@ -28,5 +32,9 @@ class WikiText2(NLP_TASK):
def
doc_to_text
(
self
,
doc
,
include_target
=
True
):
def
doc_to_text
(
self
,
doc
,
include_target
=
True
):
return
doc
[
'text'
]
return
doc
[
'text'
]
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
pass
# TODO: Implement evaluation code
\ No newline at end of file
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/winogrande.py
View file @
2d2dbf96
...
@@ -47,6 +47,8 @@ class Winogrande(HFTask):
...
@@ -47,6 +47,8 @@ class Winogrande(HFTask):
text
=
text
.
replace
(
"_"
,
answer
)
text
=
text
.
replace
(
"_"
,
answer
)
return
text
return
text
def
evaluate
(
self
,
docs
,
lm
,
provide_description
,
num_fewshot
):
# TODO: Implement evaluation code
# TODO: Write evaluation function
raise
NotImplementedError
()
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
\ No newline at end of file
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
lm_eval/tasks/wsc273.py
View file @
2d2dbf96
...
@@ -80,6 +80,8 @@ class WinogradSchemaChallenge273(Dataset):
...
@@ -80,6 +80,8 @@ class WinogradSchemaChallenge273(Dataset):
text
=
doc
[
'completions'
][
'T'
]
+
' True. '
+
doc
[
'completions'
][
'F'
]
+
' False.'
text
=
doc
[
'completions'
][
'T'
]
+
' True. '
+
doc
[
'completions'
][
'F'
]
+
' False.'
return
text
return
text
def
evaluate
(
self
,
docs
,
lm
):
# TODO: Implement evaluation code
# TODO: Write evaluation function
raise
NotImplementedError
()
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment