gaoqiong / lm-evaluation-harness / Commits / 8ac99269

Commit 8ac99269, authored Oct 30, 2021 by Jonathan Tow

Replace the `fewshot_description` API with a `description_dict` based interface

Parent: b67aec37
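In practice the new interface is driven by a JSON file that maps task names to description strings and is handed to `simple_evaluate` through the new `description_path` argument (see lm_eval/evaluator.py below). A minimal sketch of that flow, assuming a GPT-2 model; the file path, few-shot count, and model arguments are illustrative, and the description string is the one removed from coqa.py in this commit:

import json

import lm_eval.evaluator

# Per-task descriptions, keyed by registered task name.
descriptions = {
    "coqa": "Given a passage and a conversation so far, answer the next "
            "question in the conversation.",
}
with open("descriptions.json", "w") as f:
    json.dump(descriptions, f)

results = lm_eval.evaluator.simple_evaluate(
    model="gpt2",                            # illustrative model name
    model_args="",                           # illustrative (empty) model arguments
    task_names=["coqa"],
    description_path="descriptions.json",    # new in this commit
    num_fewshot=2,
)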
Changes: 48 files in total. Showing 20 changed files with 27 additions and 101 deletions (+27 / -101).
lm_eval/base.py                    +10  -8
lm_eval/evaluator.py               +16  -10
lm_eval/tasks/anli.py               +0  -4
lm_eval/tasks/arc.py                +0  -4
lm_eval/tasks/cbt.py                +0  -4
lm_eval/tasks/coqa.py               +1  -4
lm_eval/tasks/drop.py               +0  -4
lm_eval/tasks/glue.py               +0  -17
lm_eval/tasks/headqa.py             +0  -4
lm_eval/tasks/hellaswag.py          +0  -5
lm_eval/tasks/hendrycks_math.py     +0  -3
lm_eval/tasks/lambada.py            +0  -4
lm_eval/tasks/lambada_cloze.py      +0  -3
lm_eval/tasks/logiqa.py             +0  -4
lm_eval/tasks/mathqa.py             +0  -4
lm_eval/tasks/mc_taco.py            +0  -3
lm_eval/tasks/mutual.py             +0  -4
lm_eval/tasks/naturalqs.py          +0  -4
lm_eval/tasks/openbookqa.py         +0  -4
lm_eval/tasks/piqa.py               +0  -4
lm_eval/base.py  (View file @ 8ac99269)

@@ -2,6 +2,7 @@ import abc
 import random
 import numpy as np
 import re
 
 from lm_eval import tasks
 from lm_eval.metrics import mean, perplexity, weighted_perplexity, weighted_mean
@@ -224,11 +225,15 @@ class Task(abc.ABC):
         pass
 
     def fewshot_description(self):
+        import warnings
+        warnings.warn(
+            "`fewshot_description` will be removed in coming versions. Pass " \
+            "any custom descriptions to the `evaluate` function instead.",
+            DeprecationWarning)
         return ""
 
-    def fewshot_context(self, doc, num_fewshot, provide_description, rnd):
-        raw_description = self.fewshot_description()
-        description = (raw_description + "\n===\n\n") if provide_description and raw_description else ""
+    def fewshot_context(self, doc, num_fewshot, rnd, description=None):
+        description = description + "\n\n" if description else ""
 
         if num_fewshot == 0:
             labeled_examples = ""
@@ -295,16 +300,13 @@ class PerplexityTask(Task, abc.ABC):
     def has_training_docs(self):
         return False
 
-    def fewshot_description(self):
-        return ""
-
     def fewshot_examples(self, k, rnd):
         assert k == 0
         return []
 
-    def fewshot_context(self, doc, num_fewshot, provide_description, rnd):
+    def fewshot_context(self, doc, num_fewshot, rnd, description=None):
         assert num_fewshot == 0
-        assert not provide_description
+        assert description is None
         return ""
 
     def higher_is_better(self):
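The diff above changes the prompt-prefix contract of `Task.fewshot_context`: the old path combined the task's own `fewshot_description()` with a "===" separator when `provide_description` was set, while the new path simply prepends a caller-supplied `description` followed by a blank line. A small self-contained illustration (not library code; the sample description string is made up):

def old_prefix(raw_description, provide_description):
    # Pre-commit behaviour: the description came from the task itself and was
    # joined with a "===" separator, gated by the provide_description flag.
    return (raw_description + "\n===\n\n") if provide_description and raw_description else ""

def new_prefix(description=None):
    # Post-commit behaviour: the description is supplied by the caller (via
    # description_dict in the evaluator) and is followed by a blank line.
    return description + "\n\n" if description else ""

assert old_prefix("Answer the question.", True) == "Answer the question.\n===\n\n"
assert new_prefix("Answer the question.") == "Answer the question.\n\n"
assert new_prefix() == ""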
lm_eval/evaluator.py  (View file @ 8ac99269)

 import collections
 import itertools
+import json
 import random
 
 import lm_eval.metrics
 import lm_eval.models
@@ -7,7 +8,7 @@ import lm_eval.tasks
 import lm_eval.base
 
 import numpy as np
 
-def simple_evaluate(model, model_args, task_names, num_fewshot=0, batch_size=None, device=None, no_cache=False, limit=None, bootstrap_iters=100000):
+def simple_evaluate(model, model_args, task_names, description_path=None, num_fewshot=0, batch_size=None, device=None, no_cache=False, limit=None, bootstrap_iters=100000):
     random.seed(1234)
     np.random.seed(1234)
@@ -19,7 +20,12 @@ def simple_evaluate(model, model_args, task_names, num_fewshot=0, batch_size=Non
         lm = lm_eval.base.CachingLM(lm, 'lm_cache/' + model + '_' + model_args.replace('=', '-').replace(',', '_').replace('/', '-') + '.db')
 
     task_dict = lm_eval.tasks.get_task_dict(task_names)
-    results = evaluate(lm, task_dict, False, num_fewshot, limit)
+
+    description_dict = {}
+    if description_path:
+        with open(description_path, 'r') as f:
+            description_dict = json.load(f)
+
+    results = evaluate(lm, task_dict, num_fewshot, limit, description_dict)
 
     # add info about the model and few shot config
     results["config"] = {
@@ -28,6 +34,8 @@ def simple_evaluate(model, model_args, task_names, num_fewshot=0, batch_size=Non
         "num_fewshot": num_fewshot,
         "batch_size": batch_size,
         "device": device,
+        # TODO (jon-tow): Should we add the description info to `results["config"]`?
+        # "description_dict": description_dict,
         "no_cache": no_cache,
         "limit": limit,
         "bootstrap_iters": bootstrap_iters
@@ -36,9 +44,7 @@ def simple_evaluate(model, model_args, task_names, num_fewshot=0, batch_size=Non
     return results
 
 
-def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_iters=100000):
-    assert not provide_description  # not implemented. todo: implement proper description-providing system
-
+def evaluate(lm, task_dict, num_fewshot, limit, description_dict=None, bootstrap_iters=100000):
     # TODO: completely refactor this entire function to not be a huge mess, ideally breaking it down into smaller pieces
     task_dict_items = [(name, task) for name, task in task_dict.items() if (task.has_validation_docs() or task.has_test_docs())]
@@ -73,16 +79,16 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
         rnd.seed(42)
         rnd.shuffle(task_docs)
 
+        description = description_dict[task_name] if description_dict and task_name in description_dict else ""
+
         for doc_id, doc in enumerate(itertools.islice(task_docs, 0, limit)):
             docs[(task_name, doc_id)] = doc
             ctx = task.fewshot_context(
                 doc=doc,
-                provide_description=provide_description,
                 num_fewshot=num_fewshot,
-                rnd=rnd
+                rnd=rnd,
+                description=description
             )
             reqs = task.construct_requests(doc, ctx)
             if not isinstance(reqs, (list, tuple)):
                 reqs = [reqs]
             for i, req in enumerate(reqs):
@@ -168,4 +174,4 @@ def make_table(result_dict):
     # todo: make latex table look good
     # print(latex_writer.dumps())
 
-    return md_writer.dumps()
\ No newline at end of file
+    return md_writer.dumps()
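Note the per-task lookup added to `evaluate` above: any task missing from `description_dict` (or a `None`/empty dict) silently falls back to an empty description rather than raising. A short runnable sketch of that fallback, using "coqa" and "lambada" as example task names and the coqa string removed in this commit:

description_dict = {"coqa": "Given a passage and a conversation so far, answer "
                            "the next question in the conversation."}

for task_name in ["coqa", "lambada"]:
    # Same expression as in lm_eval/evaluator.py above.
    description = description_dict[task_name] if description_dict and task_name in description_dict else ""
    print(task_name, "->", repr(description))
# "coqa" gets its description; "lambada" falls back to "".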
lm_eval/tasks/anli.py  (View file @ 8ac99269)

@@ -33,10 +33,6 @@ class ANLIBase(HFTask):
         if self.has_test_docs():
             return self.data["test_r" + str(self.SPLIT)]
 
-    def fewshot_description(self):
-        # TODO: figure out description
-        return ""
-
     def doc_to_text(self, doc):
         # OA does this a bit weirdly: they prepend "anli 1: anli 1: " to the beginning
         # of the prompt (yes, repeating it!). also, " True, False, or Neither?" is directly
lm_eval/tasks/arc.py  (View file @ 8ac99269)

@@ -29,10 +29,6 @@ class ARCEasy(HFTask, MultipleChoiceTask):
         }
         return out_doc
 
-    def fewshot_description(self):
-        # TODO: figure out description
-        return ""
-
     def doc_to_text(self, doc):
         return doc["query"]
lm_eval/tasks/cbt.py  (View file @ 8ac99269)

@@ -17,10 +17,6 @@ class CBTBase(HFTask):
     VERSION = 0
 
-    def fewshot_description(self):
-        # TODO: Figure out description.
-        return ""
-
     def detokenize(self, text):
         text = text.replace(" '", "'")
         text = text.replace("\n ", "\n")
lm_eval/tasks/coqa.py  (View file @ 8ac99269)

@@ -36,10 +36,7 @@ class CoQA(Task):
     def test_docs(self):
         pass
 
-    def fewshot_description(self):
-        return "Given a passage and a conversation so far, answer the next question in the conversation."
-
     def doc_to_text(self, doc):
         # Given a passage p, the conversation history {q1, a1, . . . qi−1, ai−1}
         # and a question qi, the task is to predict the answer ai
lm_eval/tasks/drop.py  (View file @ 8ac99269)

@@ -40,10 +40,6 @@ class DROP(Task):
     def has_test_docs(self):
         return False
 
-    def fewshot_description(self):
-        # TODO: figure out description
-        return ""
-
     def _load_docs(self, docs):
         for doc in docs:
             for qa in doc["qa_pairs"]:
lm_eval/tasks/glue.py  (View file @ 8ac99269)

@@ -21,10 +21,6 @@ class CoLA(HFTask):
     def has_test_docs(self):
         return False
 
-    def fewshot_description(self):
-        # TODO
-        return ""
-
     def doc_to_text(self, doc):
         return "{}\nQuestion: Does this sentence make sense?\nAnswer:".format(doc["sentence"])
@@ -69,9 +65,6 @@ class SST(HFTask):
     def has_test_docs(self):
         return False
 
-    def fewshot_description(self):
-        return "Indicate if the sentiment of each sentence is positive or negative."
-
     def doc_to_text(self, doc):
         return "{}\nQuestion: Is this sentence positive or negative?\nAnswer:".format(
             general_detokenize(doc["sentence"]),
@@ -342,9 +335,6 @@ class MRPC(HFTask):
     def has_test_docs(self):
         return False
 
-    def fewshot_description(self):
-        return "Indicate if both sentences mean the same thing."
-
     def doc_to_text(self, doc):
         return "Sentence 1: {}\nSentence 2: {}\nQuestion: Do both sentences mean the same thing?\nAnswer:".format(
             general_detokenize(doc["sentence1"]),
@@ -395,9 +385,6 @@ class QQP(HFTask):
     def has_test_docs(self):
         return False
 
-    def fewshot_description(self):
-        return "Indicate if both questions ask the same thing."
-
     def doc_to_text(self, doc):
         return "Question 1: {}\nQuestion 2: {}\nQuestion: Do both questions ask the same thing?\nAnswer:".format(
             doc["question1"],
@@ -448,10 +435,6 @@ class STSB(HFTask):
     def has_test_docs(self):
         return True
 
-    def fewshot_description(self):
-        return "Indicate if both sentences mean the same thing from a scale of 0-5, " \
-            "where 5 means identical and 0 means unrelated."
-
     def doc_to_text(self, doc):
         return "sentence 1: {}\nsentence 2: {}\nAnswer:".format(
             doc["sentence1"],
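The hard-coded GLUE prompts deleted above are not lost: the same strings can now be supplied per task at call time. A sketch, assuming the registry names "sst", "mrpc", and "qqp" and a model instance `lm` loaded elsewhere (e.g. via lm_eval.models); this uses the in-memory `description_dict` argument of `evaluate` rather than the JSON file path accepted by `simple_evaluate`:

import lm_eval.evaluator
import lm_eval.tasks

# Strings copied from the fewshot_description() methods removed above;
# the registry names are assumed.
GLUE_DESCRIPTIONS = {
    "sst": "Indicate if the sentiment of each sentence is positive or negative.",
    "mrpc": "Indicate if both sentences mean the same thing.",
    "qqp": "Indicate if both questions ask the same thing.",
}

def evaluate_glue_with_descriptions(lm, num_fewshot=0, limit=None):
    # `lm` is a loaded model instance (see lm_eval.models); description_dict is
    # the in-memory counterpart of the description_path JSON file.
    task_dict = lm_eval.tasks.get_task_dict(list(GLUE_DESCRIPTIONS))
    return lm_eval.evaluator.evaluate(
        lm, task_dict, num_fewshot, limit, description_dict=GLUE_DESCRIPTIONS
    )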
lm_eval/tasks/headqa.py  (View file @ 8ac99269)

@@ -25,9 +25,5 @@ class HeadQA(HFTask, MultipleChoiceTask):
         }
         return out_doc
 
-    def fewshot_description(self):
-        # TODO: figure out description
-        return ""
-
     def doc_to_text(self, doc):
         return doc["query"]
lm_eval/tasks/hellaswag.py  (View file @ 8ac99269)

@@ -35,10 +35,5 @@ class HellaSwag(HFTask, MultipleChoiceTask):
         }
         return out_doc
 
-    def fewshot_description(self):
-        return "Label for the relevant action: Sentences describing the " \
-            "context, with an incomplete sentence trailing\nanswer that " \
-            "plausibly completes the situation."
-
     def doc_to_text(self, doc):
         return doc["query"]
lm_eval/tasks/hendrycks_math.py  (View file @ 8ac99269)

@@ -55,9 +55,6 @@ class Math(Task):
     def test_docs(self):
         return self._load_docs(self.DATASET_PATH / "test" / self.get_file_info())
 
-    def fewshot_description(self):
-        return "Given a mathematics problem, determine the answer. Simplify your answer as much as possible."
-
     def doc_to_text(self, doc):
         return "Problem: " + doc["problem"] + "\nAnswer:"
lm_eval/tasks/lambada.py  (View file @ 8ac99269)

@@ -47,10 +47,6 @@ class LAMBADA(Task):
     def doc_to_target(self, doc):
         return " " + doc['text'].rsplit(' ', 1)[1]
 
-    def fewshot_description(self):
-        # TODO: figure out description
-        return ""
-
     def construct_requests(self, doc, ctx):
         ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc))
lm_eval/tasks/lambada_cloze.py  (View file @ 8ac99269)

@@ -13,6 +13,3 @@ class LAMBADA_cloze(LAMBADA):
     def doc_to_target(self, doc):
         return " " + doc['text'].rsplit(' ', 1)[1]
 
-    def fewshot_description(self):
-        return "Fill in blank:\n"
-
lm_eval/tasks/logiqa.py  (View file @ 8ac99269)

@@ -80,9 +80,5 @@ class LogiQA(MultipleChoiceTask):
     def test_docs(self):
         return self._load_docs(self.DATASET_PATH / "Test.txt")
 
-    def fewshot_description(self):
-        # TODO: figure out actual description
-        return ""
-
     def doc_to_text(self, doc):
         return doc["query"]
lm_eval/tasks/mathqa.py  (View file @ 8ac99269)

@@ -29,9 +29,5 @@ class MathQA(HFTask, MultipleChoiceTask):
         }
         return out_doc
 
-    def fewshot_description(self):
-        # TODO: figure out description
-        return ""
-
     def doc_to_text(self, doc):
         return doc["query"]
lm_eval/tasks/mc_taco.py  (View file @ 8ac99269)

@@ -39,9 +39,6 @@ class MCTACO(HFTask):
     def has_test_docs(self):
         return True
 
-    def fewshot_description(self):
-        return "Determine whether the candidate answer is plausible (\"yes\") or not (\"no\")"
-
     def doc_to_text(self, doc):
         return f"{doc['sentence']}\nQuestion: {doc['question']}\n" \
             f"Answer: {doc['answer']}\nPlausible:"
lm_eval/tasks/mutual.py  (View file @ 8ac99269)

@@ -70,10 +70,6 @@ class MuTualBase(Task):
     def test_docs(self):
         return NotImplemented
 
-    def fewshot_description(self):
-        # TODO: figure out fewshot description
-        return ""
-
     def doc_to_text(self, doc):
         return self.detokenize(doc["article"])
lm_eval/tasks/naturalqs.py  (View file @ 8ac99269)

@@ -21,10 +21,6 @@ class NaturalQs(HFTask):
     def has_test_docs(self):
         return False
 
-    def fewshot_description(self):
-        # TODO: figure out description
-        return ""
-
     def training_docs(self):
         # Cache training for faster few-shot.
         # Data is too large to fit in memory.
lm_eval/tasks/openbookqa.py  (View file @ 8ac99269)

@@ -25,9 +25,5 @@ class OpenBookQA(HFTask, MultipleChoiceTask):
         }
         return out_doc
 
-    def fewshot_description(self):
-        # TODO: figure out fewshot description
-        return ""
-
     def doc_to_text(self, doc):
         return doc["query"]
lm_eval/tasks/piqa.py  (View file @ 8ac99269)

@@ -18,10 +18,6 @@ class PiQA(HFTask, MultipleChoiceTask):
     def has_test_docs(self):
         return False
 
-    def fewshot_description(self):
-        # TODO: figure out fewshot description
-        return ""
-
     def _convert_standard(self, doc):
         out_doc = {
             "goal": doc["goal"],