gaoqiong / lm-evaluation-harness / Commits / 2bfa4518
"torchvision/vscode:/vscode.git/clone" did not exist on "3130b457934124ffc7e9bdb6b2d86efa9a8c71cf"
Commit 2bfa4518 authored Apr 25, 2022 by jon-tow

Fix prompt source rank choice accuracy

parent 9f388461
Showing 6 changed files with 69 additions and 110 deletions:
lm_eval/base.py        +15 -13
lm_eval/evaluator.py    +3  -6
lm_eval/tasks/coqa.py   +1  -2
lm_eval/tasks/glue.py  +14 -53
lm_eval/tasks/race.py  +35 -35
scripts/write_out.py    +1  -1
lm_eval/base.py (view file @ 2bfa4518)

@@ -644,7 +644,6 @@ class PromptSourceTask(Task):
         return f"{target}"

     def doc_to_text(self, doc):
-        print(doc)
         text, _ = self.prompt.apply(doc)
         return text
@@ -661,13 +660,14 @@ class PromptSourceTask(Task):
         """
         _requests = []
-        if self.prompt.metadata.choices_in_prompt:
-            for answer_choice in self.prompt.get_fixed_answer_choices_list():
+        answer_choices_list = self.prompt.get_answer_choices_list(doc)
+        if answer_choices_list:
+            for answer_choice in answer_choices_list:
                 ll_answer_choice, _ = rf.loglikelihood(ctx, f"{answer_choice}")
                 _requests.append(ll_answer_choice)
         else:
             # TODO(Albert): What is the stop symbol? Is it model specific?
-            ll_greedy, _ = rf.greedy_until(ctx, ["\nQ:"])
+            ll_greedy = rf.greedy_until(ctx, ["\nQ:"])
             _requests.append(ll_greedy)

         return _requests
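Net effect of the hunk above: ranking requests are now built from the prompt's per-document answer choices (get_answer_choices_list(doc)) instead of the template's fixed list, so one loglikelihood request is issued per option actually offered for each example. A minimal standalone sketch of that pattern follows; toy_loglikelihood and build_requests are invented stand-ins, not harness APIs.

    import math

    def toy_loglikelihood(context: str, continuation: str) -> float:
        # Stand-in scorer: rewards word overlap with the context. A real model
        # would return the log-probability of `continuation` given `context`.
        overlap = sum(1 for w in continuation.split() if w in context)
        return math.log1p(overlap) - 0.01 * len(continuation)

    def build_requests(context: str, answer_choices: list) -> list:
        # Mirrors the new branch: one scoring request per choice that is
        # actually offered for this document.
        return [toy_loglikelihood(context, f" {choice}") for choice in answer_choices]

    scores = build_requests("The sky is blue today.", ["blue", "green", "red"])
    print(scores.index(max(scores)))  # -> 0, i.e. "blue"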
@@ -682,20 +682,22 @@ class PromptSourceTask(Task):
         :param results:
             The results of the requests created in construct_requests.
         """
-        raise NotImplementedError(
-            "Implement process results using the `prompt.metadata.metrics`. See below."
-        )
-        if self.prompt.metadata.choices_in_prompt:
-            for result, answer_choice in zip(prompt.get_fixed_answer_choices_list(), results):
-                pass
+        # raise NotImplementedError(
+        #     "Implement process results using the `prompt.metadata.metrics`. See below."
+        # )
+        target = self.doc_to_target(doc).strip()
+        answer_choices_list = self.prompt.get_answer_choices_list(doc)
+        if answer_choices_list:
+            pred = answer_choices_list[np.argmax(results)]
+            return {"acc": pred == target}
+        else:
+            continuation = results

         # Map metric name to HF metric.
         # TODO(Albert): What is Other?
-        metric_names = prompt.metadata.metrics
+        # metric_names = prompt.metadata.metrics


 class MultipleChoiceTask(Task):
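The process_results hunk above replaces the NotImplementedError with a rank-choice accuracy: the stripped target string is compared against the answer choice whose request scored highest. A minimal sketch of that step, assuming results holds one loglikelihood per entry of answer_choices_list in the same order (nothing from the harness is imported here):

    import numpy as np

    def rank_choice_accuracy(results, answer_choices_list, target):
        # argmax over per-choice loglikelihoods -> predicted answer string
        pred = answer_choices_list[np.argmax(results)]
        return {"acc": pred == target}

    print(rank_choice_accuracy([-4.2, -1.3, -7.8], ["yes", "no", "maybe"], "no"))
    # -> {'acc': True}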
lm_eval/evaluator.py (view file @ 2bfa4518)

@@ -241,15 +241,12 @@ def evaluate(
             for metric, value in metrics.items():
                 vals[(task_prompt_name, metric)].append(value)

     # aggregate results
     for (task_prompt_name, metric), items in vals.items():
-        task_name, prompt_name = task_prompt_name.split("+")
-        results[task_prompt_name]["task_name"] = task_name
-        results[task_prompt_name]["prompt_name"] = prompt_name
-        task = task_dict[task_name]
+        task = task_dict[task_prompt_name]
         results[task_prompt_name][metric] = task.aggregation()[metric](items)

         # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap

@@ -276,13 +273,13 @@ def make_table(result_dict):
     latex_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]

     values = []
     for k, dic in result_dict["results"].items():
         version = result_dict["versions"][k]
         for m, v in dic.items():
             if m.endswith("_stderr"):
                 continue
             if "_name" in m:
                 continue
             if m + "_stderr" in dic:
                 se = dic[m + "_stderr"]
                 values.append([k, version, m, "%.4f" % v, "±", "%.4f" % se])
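In evaluate(), per-example metric values are keyed by the combined "<task>+<prompt>" name, and after this change the Task object is looked up by that same combined key rather than by the split-off task name. A toy sketch of the aggregation loop under that assumption; the key string and the plain mean are illustrative stand-ins for a real task and its aggregation()[metric] callable.

    import collections

    # hypothetical combined key of the form "<task>+<prompt_name>"
    task_prompt_name = "wnli+does entailment follow"

    vals = collections.defaultdict(list)
    for value in [1, 0, 1, 1]:                 # per-document "acc" results
        vals[(task_prompt_name, "acc")].append(value)

    task_dict = {task_prompt_name: object()}   # stand-in for the Task object
    results = collections.defaultdict(dict)

    for (name, metric), items in vals.items():
        task = task_dict[name]                 # keyed by the combined name now
        results[name][metric] = sum(items) / len(items)  # stand-in aggregation

    print(dict(results))  # {'wnli+does entailment follow': {'acc': 0.75}}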
lm_eval/tasks/coqa.py (view file @ 2bfa4518)

@@ -30,7 +30,7 @@ _CITATION = """
 class CoQA(PromptSourceTask):
     VERSION = 1
-    DATASET_PATH = inspect.getfile(lm_eval.datasets.coqa.coqa)
+    DATASET_PATH = "coqa"
     DATASET_NAME = None

     def has_training_docs(self):

@@ -57,7 +57,6 @@ class CoQA(PromptSourceTask):
         # answers = []
         # answer_forturn = doc["answers"]["input_text"][turn_id - 1]
         # answers.append(answer_forturn)
-        # additional_answers = doc.get("additional_answers")
         # if additional_answers:
         #     for key in additional_answers:
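With DATASET_PATH switched from the local dataset script to the plain string "coqa", the task resolves CoQA through the Hugging Face datasets identifier. An illustration of what that path string loads; this snippet is not part of the commit and needs the datasets package plus network access on first run.

    import datasets

    # Same path string the task now declares; the config name stays None.
    coqa = datasets.load_dataset(path="coqa")
    print(coqa)  # shows the available splits and their sizes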
lm_eval/tasks/glue.py (view file @ 2bfa4518)

@@ -14,7 +14,7 @@ respect to a wide range of linguistic phenomena found in natural language.
 Homepage: https://gluebenchmark.com/
 """
 import numpy as np

-from lm_eval.base import rf, Task
+from lm_eval.base import PromptSourceTask, rf, Task
 from lm_eval.metrics import mean, matthews_corrcoef, f1_score, yesno
 from lm_eval.utils import general_detokenize

@@ -286,7 +286,7 @@ class QNLI(Task):
     }


-class WNLI(Task):
+class WNLI(PromptSourceTask):
     VERSION = 1
     DATASET_PATH = "glue"
     DATASET_NAME = "wnli"

@@ -301,37 +301,14 @@ class WNLI(Task):
         return False

     def training_docs(self):
-        if self._training_docs is None:
-            self._training_docs = list(self.dataset["train"])
-        return self._training_docs
+        # if self._training_docs is None:
+        #     self._training_docs = list()
+        # return self._training_docs
+        return self.dataset["train"]

     def validation_docs(self):
         return self.dataset["validation"]

-    def doc_to_text(self, doc):
-        return "{}\nQuestion: {} True or False?\nAnswer:".format(
-            doc["sentence1"],
-            doc["sentence2"],
-        )
-
-    def doc_to_target(self, doc):
-        # True = entailment
-        # False = not_entailment
-        return " {}".format({0: "False", 1: "True"}[doc["label"]])
-
-    def construct_requests(self, doc, ctx):
-        ll_true, _ = rf.loglikelihood(ctx, " True")
-        ll_false, _ = rf.loglikelihood(ctx, " False")
-        return ll_true, ll_false
-
-    def process_results(self, doc, results):
-        ll_true, ll_false = results
-        pred = ll_true > ll_false
-        gold = doc["label"]
-        return {"acc": pred == gold}
-
     def higher_is_better(self):
         return {
             "acc": True

@@ -343,7 +320,7 @@ class WNLI(Task):
     }


-class RTE(Task):
+class RTE(PromptSourceTask):
     VERSION = 0
     DATASET_PATH = "glue"
     DATASET_NAME = "rte"

@@ -365,29 +342,13 @@ class RTE(Task):
     def validation_docs(self):
         return self.dataset["validation"]

-    def doc_to_text(self, doc):
-        return "{}\nQuestion: {} True or False?\nAnswer:".format(
-            doc["sentence1"],
-            doc["sentence2"],
-        )
-
-    def doc_to_target(self, doc):
-        # 0 = entailment
-        # 1 = not_entailment
-        return " {}".format({0: "True", 1: "False"}[doc["label"]])
-
-    def construct_requests(self, doc, ctx):
-        ll_true, _ = rf.loglikelihood(ctx, " True")
-        ll_false, _ = rf.loglikelihood(ctx, " False")
-        return ll_true, ll_false
-
-    def process_results(self, doc, results):
-        ll_true, ll_false = results
-        pred = ll_false > ll_true
-        gold = doc["label"]
-        return {"acc": pred == gold}
+    # def process_results(self, doc, results):
+    #     ll_true, ll_false = results
+    #     pred = ll_false > ll_true
+    #     gold = doc["label"]
+    #     return {
+    #         "acc": pred == gold
+    #     }

     def higher_is_better(self):
         return {
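After this change WNLI and RTE inherit prompt formatting, request construction, and scoring from PromptSourceTask instead of hand-rolling " True"/" False" loglikelihood comparisons. A hedged skeleton of what such a subclass reduces to; the has_* methods are assumed from unchanged parts of the file, and the snippet presumes this fork of lm-evaluation-harness is importable.

    from lm_eval.base import PromptSourceTask


    class WNLI(PromptSourceTask):
        # Illustrative skeleton, not the full file: prompts, requests, and
        # metrics all come from PromptSourceTask.
        VERSION = 1
        DATASET_PATH = "glue"
        DATASET_NAME = "wnli"

        def has_training_docs(self):
            return True

        def has_validation_docs(self):
            return True

        def has_test_docs(self):
            return False

        def training_docs(self):
            return self.dataset["train"]

        def validation_docs(self):
            return self.dataset["validation"]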
lm_eval/tasks/race.py (view file @ 2bfa4518)

@@ -51,47 +51,47 @@ class RACE(PromptSourceTask):
     def has_test_docs(self):
         return True

-    def _collate_data(self, set):
-        if set in self.cache:
-            return self.cache[set]
-        # One big issue with HF's implementation of this dataset: it makes a
-        # separate document for each question; meanwhile, in the GPT3 paper it
-        # is shown that one document is made per passage.
-        r = collections.defaultdict(list)
-        for item in datasets.load_dataset(
-            path=self.DATASET_PATH, name=self.DATASET_NAME
-        )[set]:
-            r[item["article"]].append(item)
-        res = list(
-            r.values()
-            >> each(
-                lambda x: {
-                    "article": x[0]["article"],
-                    "problems": x
-                    >> each(
-                        lambda y: {
-                            "question": y["question"],
-                            "answer": y["answer"],
-                            "options": y["options"],
-                        }
-                    ),
-                }
-            )
-        )
-        self.cache[set] = res
-        return res
+    # def _collate_data(self, set):
+    #     if set in self.cache:
+    #         return self.cache[set]
+    #     # One big issue with HF's implementation of this dataset: it makes a
+    #     # separate document for each question; meanwhile, in the GPT3 paper it
+    #     # is shown that one document is made per passage.
+    #     r = collections.defaultdict(list)
+    #     for item in datasets.load_dataset(
+    #         path=self.DATASET_PATH, name=self.DATASET_NAME
+    #     )[set]:
+    #         r[item["article"]].append(item)
+    #     res = list(
+    #         r.values()
+    #         >> each(
+    #             lambda x: {
+    #                 "article": x[0]["article"],
+    #                 "problems": x
+    #                 >> each(
+    #                     lambda y: {
+    #                         "question": y["question"],
+    #                         "answer": y["answer"],
+    #                         "options": y["options"],
+    #                     }
+    #                 ),
+    #             }
+    #         )
+    #     )
+    #     self.cache[set] = res
+    #     return res

     def training_docs(self):
-        return self._collate_data("train")
+        return self.dataset["train"]

     def validation_docs(self):
-        return self._collate_data("validation")
+        return self.dataset["validation"]

     def test_docs(self):
-        return self._collate_data("test")
+        return self.dataset["test"]

     @classmethod
     def get_answer_option(cls, problem):
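The commented-out _collate_data grouped RACE's per-question rows back into one document per passage using the pipe library's >> each(...) operators; the task now reads self.dataset splits directly. For reference, a plain-Python equivalent of that grouping, written as an illustration rather than a restoration; the "high" config name is an assumption, not taken from this diff.

    import collections
    import datasets


    def collate_race(split: str):
        # Group the per-question rows by passage so each returned doc carries
        # one article plus all of its problems, mirroring the commented logic.
        grouped = collections.defaultdict(list)
        for item in datasets.load_dataset("race", "high")[split]:
            grouped[item["article"]].append(item)
        return [
            {
                "article": items[0]["article"],
                "problems": [
                    {"question": y["question"], "answer": y["answer"], "options": y["options"]}
                    for y in items
                ],
            }
            for items in grouped.values()
        ]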
scripts/write_out.py (view file @ 2bfa4518)

@@ -30,7 +30,7 @@ def main():
         task_names = tasks.ALL_TASKS
     else:
         task_names = args.tasks.split(",")
-    task_dict = tasks.get_task_dict(task_names)
+    task_dict = tasks.get_task_dict_promptsource(task_names)
     description_dict = {}
     if args.description_dict_path:
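write_out.py now requests the promptsource task dictionary. A hedged sketch of consuming it; the "<task>+<prompt>" key shape is inferred from the task_prompt_name handling elsewhere in this commit, so treat the split below as an assumption, and the snippet again presumes this fork is installed.

    from lm_eval import tasks

    task_dict = tasks.get_task_dict_promptsource(["wnli"])
    for task_prompt_name, task in task_dict.items():
        # assumed "<task>+<prompt_name>" key shape
        task_name, prompt_name = task_prompt_name.split("+", 1)
        print(f"{task_name} | {prompt_name} | {type(task).__name__}")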