gaoqiong / lm-evaluation-harness / Commits

Commit 105fa974, authored Jun 04, 2021 by Leo Gao
Parent: f76e6367

Add task versioning
Changes: 38 files in this commit; the first 20 changed files are shown on this page, with 46 additions and 1 deletion (+46, -1).
lm_eval/evaluator.py                 +6   -1
lm_eval/tasks/anli.py                +1   -0
lm_eval/tasks/arc.py                 +1   -0
lm_eval/tasks/arithmetic.py          +1   -0
lm_eval/tasks/cbt.py                 +2   -0
lm_eval/tasks/coqa.py                +1   -0
lm_eval/tasks/drop.py                +1   -0
lm_eval/tasks/glue.py                +10  -0
lm_eval/tasks/headqa.py              +1   -0
lm_eval/tasks/hellaswag.py           +1   -0
lm_eval/tasks/hendrycks_ethics.py    +6   -0
lm_eval/tasks/hendrycks_math.py      +7   -0
lm_eval/tasks/hendrycks_test.py      +1   -0
lm_eval/tasks/lambada.py             +1   -0
lm_eval/tasks/lambada_cloze.py       +1   -0
lm_eval/tasks/logiqa.py              +1   -0
lm_eval/tasks/mathqa.py              +1   -0
lm_eval/tasks/naturalqs.py           +1   -0
lm_eval/tasks/openbookqa.py          +1   -0
lm_eval/tasks/pile.py                +1   -0
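Apart from lm_eval/evaluator.py, every file above receives the same mechanical change: a VERSION class attribute, set to 0, is added to each task class, giving every task a starting version that can be bumped whenever the task definition changes. A minimal sketch of what declaring a new task under this scheme might look like (the class name and comments are illustrative, not part of this commit; it assumes Task is importable from lm_eval.base, the same module MultipleChoiceTask is imported from in headqa.py):

    from lm_eval.base import Task

    class MyNewTask(Task):  # hypothetical task, not one of the classes changed here
        # Start at 0; bump whenever prompts, data handling, or metrics change
        # in a way that makes previously reported scores incomparable.
        VERSION = 0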
lm_eval/evaluator.py

@@ -10,6 +10,7 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
     task_dict_items = [(name, task) for name, task in task_dict.items() if (task.has_validation_docs() or task.has_test_docs())]

     results = collections.defaultdict(dict)
+    versions = collections.defaultdict(dict)
     requests = collections.defaultdict(list)
     requests_origin = collections.defaultdict(list)

@@ -24,6 +25,7 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
     # get lists of each type of request
     for task_name, task in task_dict_items:
+        versions[task_name] = task.VERSION
         # default to test doc, fall back to val doc if validation unavailable
         # TODO: the test-fallback-to-val system isn't final, we should revisit it at some point
         if task.has_test_docs():

@@ -95,4 +97,7 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
         if stderr is not None:
             results[task_name][metric + "_stderr"] = stderr(items)

-    return results
+    return {
+        "results": results,
+        "versions": versions
+    }
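With this commit, evaluate() no longer returns the bare results dict: it returns a dict with two keys, "results" and "versions", both keyed by task name. Recording the version next to each score lets downstream tooling detect when a number was produced by an older task definition instead of silently comparing across revisions. A sketch of how a caller might consume the new shape (the task name and metric values are invented, and the dict literal stands in for a real evaluate() call):

    # Stand-in for: output = evaluator.evaluate(lm, task_dict, ...)
    # The task name and numbers below are illustrative only.
    output = {
        "results": {"lambada": {"ppl": 3.5, "ppl_stderr": 0.1, "acc": 0.75}},
        "versions": {"lambada": 0},
    }

    for task_name, metrics in output["results"].items():
        version = output["versions"][task_name]
        for metric, value in metrics.items():
            print(f"{task_name} (v{version}) {metric}: {value}")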
lm_eval/tasks/anli.py

@@ -5,6 +5,7 @@ from . common import HFTask

 class ANLIBase(HFTask):
+    VERSION = 0
     DATASET_PATH = "anli"
     DATASET_NAME = None
     SPLIT = None
lm_eval/tasks/arc.py

@@ -3,6 +3,7 @@ from . common import HFTask

 class ARCEasy(HFTask, MultipleChoiceTask):
+    VERSION = 0
     DATASET_PATH = "ai2_arc"
     DATASET_NAME = "ARC-Easy"
lm_eval/tasks/arithmetic.py

@@ -10,6 +10,7 @@ ArithmeticDoc = namedtuple('ArithmeticDoc', ['context', 'completion'])

 class Arithmetic(Task):
+    VERSION = 0
     directory = 'data/arithmetic/'

     def __init__(self):
lm_eval/tasks/cbt.py

@@ -15,6 +15,8 @@ class CBTBase(HFTask):
     DATASET_PATH = "cbt"
     DATASET_NAME = None
+    VERSION = 0
+
     def fewshot_description(self):
         # TODO: Figure out description.
         return ""
lm_eval/tasks/coqa.py

@@ -7,6 +7,7 @@ from itertools import zip_longest

 class CoQA(Task):
+    VERSION = 0
     def download(self):
         coqa_train_filepath = 'data/coqa/coqa-train-v1.0.json'
lm_eval/tasks/drop.py

@@ -16,6 +16,7 @@ https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_r

 class DROP(Task):
+    VERSION = 0
     DATASET_PATH = Path("data/drop")

     def download(self):
lm_eval/tasks/glue.py

@@ -8,6 +8,7 @@ from ..utils import general_detokenize

 class CoLA(HFTask):
+    VERSION = 0
     DATASET_PATH = "glue"
     DATASET_NAME = "cola"

@@ -55,6 +56,7 @@ class CoLA(HFTask):

 class SST(HFTask):
+    VERSION = 0
     DATASET_PATH = "glue"
     DATASET_NAME = "sst2"

@@ -106,6 +108,7 @@ class SST(HFTask):

 class MNLI(HFTask):
+    VERSION = 0
     DATASET_PATH = "glue"
     DATASET_NAME = "mnli"

@@ -163,6 +166,7 @@ class MNLI(HFTask):

 class MNLIMismatched(MNLI):
+    VERSION = 0
     def validation_docs(self):
         if self.has_validation_docs():

@@ -174,6 +178,7 @@ class MNLIMismatched(MNLI):

 class QNLI(HFTask):
+    VERSION = 0
     DATASET_PATH = "glue"
     DATASET_NAME = "qnli"

@@ -222,6 +227,7 @@ class QNLI(HFTask):

 class WNLI(HFTask):
+    VERSION = 0
     DATASET_PATH = "glue"
     DATASET_NAME = "wnli"

@@ -271,6 +277,7 @@ class WNLI(HFTask):

 class RTE(HFTask):
+    VERSION = 0
     DATASET_PATH = "glue"
     DATASET_NAME = "rte"

@@ -322,6 +329,7 @@ class RTE(HFTask):

 class MRPC(HFTask):
+    VERSION = 0
     DATASET_PATH = "glue"
     DATASET_NAME = "mrpc"

@@ -374,6 +382,7 @@ class MRPC(HFTask):

 class QQP(HFTask):
+    VERSION = 0
     DATASET_PATH = "glue"
     DATASET_NAME = "qqp"

@@ -426,6 +435,7 @@ class QQP(HFTask):

 class STSB(HFTask):
+    VERSION = 0
     DATASET_PATH = "glue"
     DATASET_NAME = "stsb"
lm_eval/tasks/headqa.py

@@ -3,6 +3,7 @@ from lm_eval.base import MultipleChoiceTask

 class HeadQA(HFTask, MultipleChoiceTask):
+    VERSION = 0
     DATASET_PATH = "head_qa"
     DATASET_NAME = None
lm_eval/tasks/hellaswag.py

@@ -4,6 +4,7 @@ from . common import HFTask

 class HellaSwag(HFTask, MultipleChoiceTask):
+    VERSION = 0
     DATASET_PATH = "hellaswag"
     DATASET_NAME = None
lm_eval/tasks/hendrycks_ethics.py

@@ -85,6 +85,7 @@ class Ethics(Task):

 class EthicsCM(Ethics):
+    VERSION = 0
     # Ignoring "ambiguous" extra dataset for now
     def get_prefix(self):
         return "commonsense/cm"

@@ -123,6 +124,7 @@ class EthicsCM(Ethics):

 class EthicsDeontology(Ethics):
+    VERSION = 0
     def get_prefix(self):
         return "deontology/deontology"

@@ -172,6 +174,7 @@ class EthicsDeontology(Ethics):

 class EthicsJustice(Ethics):
+    VERSION = 0
     def get_prefix(self):
         return "justice/justice"

@@ -220,6 +223,7 @@ class EthicsJustice(Ethics):

 class EthicsUtilitarianismOriginal(Ethics):
+    VERSION = 0
     def get_prefix(self):
         return "utilitarianism/util"

@@ -287,6 +291,7 @@ class EthicsUtilitarianismOriginal(Ethics):

 class EthicsUtilitarianism(Ethics):
+    VERSION = 0
     """
     This is a variation of the original Utilitarianism task used in the paper, where the situations are directly compared.
     This allows scaling to >5 shots.

@@ -339,6 +344,7 @@ class EthicsUtilitarianism(Ethics):

 class EthicsVirtue(Ethics):
+    VERSION = 0
     def get_prefix(self):
         return "virtue/virtue"
lm_eval/tasks/hendrycks_math.py

@@ -287,35 +287,42 @@ class Math(Task):

 class MathAlgebra(Math):
+    VERSION = 0
     def get_file_info(self):
         return 'algebra'


 class MathCountingAndProbability(Math):
+    VERSION = 0
     def get_file_info(self):
         return 'counting_and_probability'


 class MathGeometry(Math):
+    VERSION = 0
     def get_file_info(self):
         return 'geometry'


 class MathIntermediateAlgebra(Math):
+    VERSION = 0
     def get_file_info(self):
         return 'intermediate_algebra'


 class MathNumberTheory(Math):
+    VERSION = 0
     def get_file_info(self):
         return 'number_theory'


 class MathPrealgebra(Math):
+    VERSION = 0
     def get_file_info(self):
         return 'prealgebra'


 class MathPrecalculus(Math):
+    VERSION = 0
     def get_file_info(self):
         return 'precalculus'
lm_eval/tasks/hendrycks_test.py

@@ -34,6 +34,7 @@ def create_task(subject):

 class GeneralHendrycksTest(MultipleChoiceTask):
+    VERSION = 0
     DATASET_PATH = Path("data/hendrycksTest/")

     def __init__(self, subject):
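The hunk header shows that hendrycks_test.py generates its per-subject tasks through a create_task(subject) factory, so VERSION is declared once on the shared GeneralHendrycksTest base rather than on each generated class. A self-contained sketch of that pattern (the base class here is a stub and the factory body is an assumption; the diff shows only the factory's signature):

    # Stub standing in for the real GeneralHendrycksTest(MultipleChoiceTask).
    class GeneralHendrycksTest:
        VERSION = 0  # the attribute added by this commit

        def __init__(self, subject):
            self.subject = subject

    def create_task(subject):
        # Assumed body, modeled on the signature shown in the hunk header.
        class HendrycksTest(GeneralHendrycksTest):
            def __init__(self):
                super().__init__(subject)
        return HendrycksTest

    # Every generated class inherits VERSION from the shared base:
    print(create_task("abstract_algebra").VERSION)  # -> 0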
lm_eval/tasks/lambada.py

@@ -6,6 +6,7 @@ from best_download import download_file

 class LAMBADA(Task):
+    VERSION = 0
     def download(self):
         sh("mkdir -p data/lambada")
         sh("wget http://eaidata.bmk.sh/data/lambada_test.jsonl -O data/lambada/lambada_test.jsonl")
lm_eval/tasks/lambada_cloze.py

@@ -7,6 +7,7 @@ from best_download import download_file

 class LAMBADA_cloze(LAMBADA):
+    VERSION = 0
     def doc_to_text(self, doc):
         return doc['text'].rsplit(' ', 1)[0] + " ____. ->"
lm_eval/tasks/logiqa.py

@@ -4,6 +4,7 @@ from pathlib import Path

 class LogiQA(MultipleChoiceTask):
+    VERSION = 0
     DATASET_PATH = Path("data/logiqa")

     def download(self):
lm_eval/tasks/mathqa.py

@@ -4,6 +4,7 @@ from . common import HFTask

 class MathQA(HFTask, MultipleChoiceTask):
+    VERSION = 0
     DATASET_PATH = "math_qa"
     DATASET_NAME = None
lm_eval/tasks/naturalqs.py

@@ -4,6 +4,7 @@ from itertools import islice

 class NaturalQs(HFTask):
+    VERSION = 0
     # TODO: naturalqs has a *really* large train set that huggingface just
     # automatically downloads even if you don't use it. we should try and only
     # download the val set and not even bother with the train set.
lm_eval/tasks/openbookqa.py

@@ -3,6 +3,7 @@ from .common import HFTask

 class OpenBookQA(HFTask, MultipleChoiceTask):
+    VERSION = 0
     DATASET_PATH = "openbookqa"
     DATASET_NAME = "main"
lm_eval/tasks/pile.py

@@ -10,6 +10,7 @@ from best_download import download_file

 class PilePerplexityTask(PerplexityTask, abc.ABC):
+    VERSION = 0
     PILE_SET_NAME = None
     VAL_PATH = 'data/pile/val.jsonl.zst'
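PilePerplexityTask mixes in abc.ABC, so it is an abstract base: the VERSION set here is inherited by every concrete per-subset perplexity task, and none of them needs to repeat it. A rough sketch of the inheritance (the base is stubbed and the subset class is hypothetical; the concrete Pile subclasses are not shown on this page):

    import abc

    # Stub standing in for the real PilePerplexityTask(PerplexityTask, abc.ABC).
    class PilePerplexityTask(abc.ABC):
        VERSION = 0           # added by this commit, shared by all subsets
        PILE_SET_NAME = None

    class PileArxivPerplexity(PilePerplexityTask):  # hypothetical subclass
        PILE_SET_NAME = "ArXiv"

    print(PileArxivPerplexity.VERSION)  # -> 0, inherited from the base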