Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
105fa974
You need to sign in or sign up before continuing.
Commit
105fa974
authored
Jun 04, 2021
by
Leo Gao
Browse files
Add task versioning
parent
f76e6367
Changes
38
Show whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
47 additions
and
7 deletions
+47
-7
lm_eval/tasks/piqa.py
lm_eval/tasks/piqa.py
+1
-0
lm_eval/tasks/pubmedqa.py
lm_eval/tasks/pubmedqa.py
+1
-0
lm_eval/tasks/qa4mre.py
lm_eval/tasks/qa4mre.py
+1
-0
lm_eval/tasks/race.py
lm_eval/tasks/race.py
+1
-0
lm_eval/tasks/sat.py
lm_eval/tasks/sat.py
+1
-0
lm_eval/tasks/sciq.py
lm_eval/tasks/sciq.py
+1
-0
lm_eval/tasks/squad.py
lm_eval/tasks/squad.py
+1
-0
lm_eval/tasks/storycloze.py
lm_eval/tasks/storycloze.py
+1
-0
lm_eval/tasks/superglue.py
lm_eval/tasks/superglue.py
+7
-0
lm_eval/tasks/translation.py
lm_eval/tasks/translation.py
+1
-0
lm_eval/tasks/triviaqa.py
lm_eval/tasks/triviaqa.py
+1
-0
lm_eval/tasks/unscramble.py
lm_eval/tasks/unscramble.py
+1
-0
lm_eval/tasks/webqs.py
lm_eval/tasks/webqs.py
+1
-0
lm_eval/tasks/wikitext.py
lm_eval/tasks/wikitext.py
+2
-0
lm_eval/tasks/winogrande.py
lm_eval/tasks/winogrande.py
+1
-0
lm_eval/tasks/wsc273.py
lm_eval/tasks/wsc273.py
+1
-0
main.py
main.py
+22
-7
tests/test_tasks.py
tests/test_tasks.py
+2
-0
No files found.
lm_eval/tasks/piqa.py
View file @
105fa974
...
@@ -5,6 +5,7 @@ from . common import HFTask
...
@@ -5,6 +5,7 @@ from . common import HFTask
class
PiQA
(
HFTask
,
MultipleChoiceTask
):
class
PiQA
(
HFTask
,
MultipleChoiceTask
):
VERSION
=
0
DATASET_PATH
=
"piqa"
DATASET_PATH
=
"piqa"
DATASET_NAME
=
None
DATASET_NAME
=
None
...
...
lm_eval/tasks/pubmedqa.py
View file @
105fa974
...
@@ -5,6 +5,7 @@ from ..metrics import mean
...
@@ -5,6 +5,7 @@ from ..metrics import mean
class
Pubmed_QA
(
HFTask
):
class
Pubmed_QA
(
HFTask
):
VERSION
=
0
DATASET_PATH
=
"pubmed_qa"
DATASET_PATH
=
"pubmed_qa"
DATASET_NAME
=
"pqa_labeled"
DATASET_NAME
=
"pqa_labeled"
...
...
lm_eval/tasks/qa4mre.py
View file @
105fa974
...
@@ -5,6 +5,7 @@ from lm_eval.base import MultipleChoiceTask
...
@@ -5,6 +5,7 @@ from lm_eval.base import MultipleChoiceTask
class
QA4MRE
(
MultipleChoiceTask
):
class
QA4MRE
(
MultipleChoiceTask
):
VERSION
=
0
YEAR
=
None
YEAR
=
None
def
download
(
self
):
def
download
(
self
):
year
=
self
.
YEAR
year
=
self
.
YEAR
...
...
lm_eval/tasks/race.py
View file @
105fa974
...
@@ -15,6 +15,7 @@ class each:
...
@@ -15,6 +15,7 @@ class each:
class
RACE
(
HFTask
):
class
RACE
(
HFTask
):
VERSION
=
0
DATASET_PATH
=
"race"
DATASET_PATH
=
"race"
DATASET_NAME
=
"high"
DATASET_NAME
=
"high"
...
...
lm_eval/tasks/sat.py
View file @
105fa974
...
@@ -3,6 +3,7 @@ from lm_eval.base import MultipleChoiceTask
...
@@ -3,6 +3,7 @@ from lm_eval.base import MultipleChoiceTask
class
SATAnalogies
(
MultipleChoiceTask
):
class
SATAnalogies
(
MultipleChoiceTask
):
VERSION
=
0
NEEDS_MANUAL_DL
=
True
NEEDS_MANUAL_DL
=
True
def
__init__
(
self
):
def
__init__
(
self
):
...
...
lm_eval/tasks/sciq.py
View file @
105fa974
...
@@ -6,6 +6,7 @@ from best_download import download_file
...
@@ -6,6 +6,7 @@ from best_download import download_file
class
SciQ
(
MultipleChoiceTask
):
class
SciQ
(
MultipleChoiceTask
):
VERSION
=
0
# Multiple languages and multiple years
# Multiple languages and multiple years
def
download
(
self
):
def
download
(
self
):
if
not
os
.
path
.
exists
(
'data/sciq'
):
if
not
os
.
path
.
exists
(
'data/sciq'
):
...
...
lm_eval/tasks/squad.py
View file @
105fa974
...
@@ -18,6 +18,7 @@ def _squad_agg(key, items):
...
@@ -18,6 +18,7 @@ def _squad_agg(key, items):
class
SQuAD2
(
HFTask
):
class
SQuAD2
(
HFTask
):
VERSION
=
0
DATASET_PATH
=
"squad_v2"
DATASET_PATH
=
"squad_v2"
DATASET_NAME
=
None
DATASET_NAME
=
None
...
...
lm_eval/tasks/storycloze.py
View file @
105fa974
...
@@ -3,6 +3,7 @@ from lm_eval.base import Task
...
@@ -3,6 +3,7 @@ from lm_eval.base import Task
class
StoryCloze
(
Task
):
class
StoryCloze
(
Task
):
VERSION
=
0
NEEDS_MANUAL_DL
=
True
NEEDS_MANUAL_DL
=
True
def
download
(
self
):
def
download
(
self
):
...
...
lm_eval/tasks/superglue.py
View file @
105fa974
...
@@ -13,6 +13,7 @@ from ..utils import general_detokenize
...
@@ -13,6 +13,7 @@ from ..utils import general_detokenize
class
BoolQ
(
HFTask
):
class
BoolQ
(
HFTask
):
VERSION
=
0
DATASET_PATH
=
"super_glue"
DATASET_PATH
=
"super_glue"
DATASET_NAME
=
"boolq"
DATASET_NAME
=
"boolq"
...
@@ -64,6 +65,7 @@ class BoolQ(HFTask):
...
@@ -64,6 +65,7 @@ class BoolQ(HFTask):
class
CommitmentBank
(
HFTask
):
class
CommitmentBank
(
HFTask
):
VERSION
=
0
DATASET_PATH
=
"super_glue"
DATASET_PATH
=
"super_glue"
DATASET_NAME
=
"cb"
DATASET_NAME
=
"cb"
...
@@ -135,6 +137,7 @@ class CommitmentBank(HFTask):
...
@@ -135,6 +137,7 @@ class CommitmentBank(HFTask):
class
Copa
(
HFTask
):
class
Copa
(
HFTask
):
VERSION
=
0
DATASET_PATH
=
"super_glue"
DATASET_PATH
=
"super_glue"
DATASET_NAME
=
"copa"
DATASET_NAME
=
"copa"
...
@@ -199,6 +202,7 @@ class Copa(HFTask):
...
@@ -199,6 +202,7 @@ class Copa(HFTask):
class
MultiRC
(
HFTask
):
class
MultiRC
(
HFTask
):
VERSION
=
0
DATASET_PATH
=
"super_glue"
DATASET_PATH
=
"super_glue"
DATASET_NAME
=
"multirc"
DATASET_NAME
=
"multirc"
...
@@ -253,6 +257,7 @@ class MultiRC(HFTask):
...
@@ -253,6 +257,7 @@ class MultiRC(HFTask):
class
ReCoRD
(
HFTask
):
class
ReCoRD
(
HFTask
):
VERSION
=
0
DATASET_PATH
=
"super_glue"
DATASET_PATH
=
"super_glue"
DATASET_NAME
=
"record"
DATASET_NAME
=
"record"
...
@@ -345,6 +350,7 @@ class ReCoRD(HFTask):
...
@@ -345,6 +350,7 @@ class ReCoRD(HFTask):
class
WordsInContext
(
HFTask
):
class
WordsInContext
(
HFTask
):
VERSION
=
0
DATASET_PATH
=
"super_glue"
DATASET_PATH
=
"super_glue"
DATASET_NAME
=
"wic"
DATASET_NAME
=
"wic"
...
@@ -400,6 +406,7 @@ class WordsInContext(HFTask):
...
@@ -400,6 +406,7 @@ class WordsInContext(HFTask):
class
SGWinogradSchemaChallenge
(
HFTask
):
class
SGWinogradSchemaChallenge
(
HFTask
):
VERSION
=
0
# Note: This implementation differs from Fig G.32 because this is the SuperGLUE,
# Note: This implementation differs from Fig G.32 because this is the SuperGLUE,
# binary version of the task.
# binary version of the task.
DATASET_PATH
=
"super_glue"
DATASET_PATH
=
"super_glue"
...
...
lm_eval/tasks/translation.py
View file @
105fa974
...
@@ -36,6 +36,7 @@ def create_translation_task(dataset, language_pair):
...
@@ -36,6 +36,7 @@ def create_translation_task(dataset, language_pair):
return
TranslationTask
return
TranslationTask
class
GeneralTranslationTask
(
Task
):
class
GeneralTranslationTask
(
Task
):
VERSION
=
0
# e.g. ("wmt14", "fr-en")
# e.g. ("wmt14", "fr-en")
def
__init__
(
self
,
sacrebleu_dataset
,
sacrebleu_language_pair
=
None
):
def
__init__
(
self
,
sacrebleu_dataset
,
sacrebleu_language_pair
=
None
):
...
...
lm_eval/tasks/triviaqa.py
View file @
105fa974
...
@@ -6,6 +6,7 @@ from ..utils import sh
...
@@ -6,6 +6,7 @@ from ..utils import sh
class
TriviaQA
(
Task
):
class
TriviaQA
(
Task
):
VERSION
=
0
def
download
(
self
):
def
download
(
self
):
if
not
os
.
path
.
exists
(
'data/triviaqa'
):
if
not
os
.
path
.
exists
(
'data/triviaqa'
):
sh
(
"""
sh
(
"""
...
...
lm_eval/tasks/unscramble.py
View file @
105fa974
...
@@ -14,6 +14,7 @@ def extract_gzip(gz, to):
...
@@ -14,6 +14,7 @@ def extract_gzip(gz, to):
class
WordUnscrambleTask
(
Task
):
class
WordUnscrambleTask
(
Task
):
VERSION
=
0
BASE_PATH
=
Path
(
"data/unscramble"
)
BASE_PATH
=
Path
(
"data/unscramble"
)
FILENAME
=
None
FILENAME
=
None
CHECKSUM
=
None
# SHA256 Checksum.
CHECKSUM
=
None
# SHA256 Checksum.
...
...
lm_eval/tasks/webqs.py
View file @
105fa974
...
@@ -4,6 +4,7 @@ from ..metrics import mean
...
@@ -4,6 +4,7 @@ from ..metrics import mean
class
WebQs
(
HFTask
):
class
WebQs
(
HFTask
):
VERSION
=
0
DATASET_PATH
=
"web_questions"
DATASET_PATH
=
"web_questions"
DATASET_NAME
=
None
DATASET_NAME
=
None
...
...
lm_eval/tasks/wikitext.py
View file @
105fa974
...
@@ -2,6 +2,7 @@ from . common import HFTask
...
@@ -2,6 +2,7 @@ from . common import HFTask
class
WikiText103
(
HFTask
):
class
WikiText103
(
HFTask
):
VERSION
=
0
NLP_PATH
=
"wikitext"
NLP_PATH
=
"wikitext"
NLP_NAME
=
"wikitext-103-raw-v1"
NLP_NAME
=
"wikitext-103-raw-v1"
...
@@ -64,6 +65,7 @@ class WikiText103(HFTask):
...
@@ -64,6 +65,7 @@ class WikiText103(HFTask):
class
WikiText2
(
HFTask
):
class
WikiText2
(
HFTask
):
VERSION
=
0
NLP_PATH
=
"wikitext"
NLP_PATH
=
"wikitext"
NLP_NAME
=
"wikitext-2-raw-v1"
NLP_NAME
=
"wikitext-2-raw-v1"
...
...
lm_eval/tasks/winogrande.py
View file @
105fa974
...
@@ -11,6 +11,7 @@ Reference: https://arxiv.org/abs/1806.02847
...
@@ -11,6 +11,7 @@ Reference: https://arxiv.org/abs/1806.02847
class
Winogrande
(
HFTask
):
class
Winogrande
(
HFTask
):
VERSION
=
0
DATASET_PATH
=
"winogrande"
DATASET_PATH
=
"winogrande"
DATASET_NAME
=
"winogrande_xl"
DATASET_NAME
=
"winogrande_xl"
...
...
lm_eval/tasks/wsc273.py
View file @
105fa974
...
@@ -12,6 +12,7 @@ See: https://arxiv.org/abs/1806.02847
...
@@ -12,6 +12,7 @@ See: https://arxiv.org/abs/1806.02847
class
WinogradSchemaChallenge273
(
HFTask
):
class
WinogradSchemaChallenge273
(
HFTask
):
VERSION
=
0
DATASET_PATH
=
"winograd_wsc"
DATASET_PATH
=
"winograd_wsc"
DATASET_NAME
=
"wsc273"
DATASET_NAME
=
"wsc273"
...
...
main.py
View file @
105fa974
...
@@ -53,20 +53,35 @@ def main():
...
@@ -53,20 +53,35 @@ def main():
f
.
write
(
dumped
)
f
.
write
(
dumped
)
# MAKE TABLE
# MAKE TABLE
from
pytablewriter
import
MarkdownTableWriter
from
pytablewriter
import
MarkdownTableWriter
,
LatexTableWriter
writer
=
MarkdownTableWriter
()
md_writer
=
MarkdownTableWriter
()
writer
.
headers
=
[
"Task"
,
"Metric"
,
"Value"
]
latex_writer
=
LatexTableWriter
()
md_writer
.
headers
=
[
"Task"
,
"Version"
,
"Metric"
,
"Value"
,
""
,
"Stderr"
]
latex_writer
.
headers
=
[
"Task"
,
"Version"
,
"Metric"
,
"Value"
,
""
,
"Stderr"
]
values
=
[]
values
=
[]
for
k
,
dic
in
results
.
items
():
for
k
,
dic
in
results
[
"results"
].
items
():
version
=
results
[
"versions"
][
k
]
for
m
,
v
in
dic
.
items
():
for
m
,
v
in
dic
.
items
():
values
.
append
([
k
,
m
,
'%.4f'
%
v
])
if
m
.
endswith
(
"_stderr"
):
continue
if
m
+
"_stderr"
in
dic
:
se
=
dic
[
m
+
"_stderr"
]
values
.
append
([
k
,
version
,
m
,
'%.4f'
%
v
,
'±'
,
'%.4f'
%
se
])
else
:
values
.
append
([
k
,
version
,
m
,
'%.4f'
%
v
,
''
,
''
])
k
=
""
k
=
""
writer
.
value_matrix
=
values
version
=
""
md_writer
.
value_matrix
=
values
latex_writer
.
value_matrix
=
values
# todo: make latex table look good
# print(latex_writer.dumps())
print
(
writer
.
dumps
())
print
(
md_
writer
.
dumps
())
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
main
()
main
()
tests/test_tasks.py
View file @
105fa974
...
@@ -22,6 +22,8 @@ def test_basic_interface(taskname, Task):
...
@@ -22,6 +22,8 @@ def test_basic_interface(taskname, Task):
for
v
in
task
.
higher_is_better
().
values
():
assert
v
in
[
True
,
False
]
for
v
in
task
.
higher_is_better
().
values
():
assert
v
in
[
True
,
False
]
assert
isinstance
(
task
.
VERSION
,
int
)
# test deterministic docs
# test deterministic docs
# (don't test train because it's slow)
# (don't test train because it's slow)
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment