gaoqiong / lm-evaluation-harness · Commits

Unverified commit 2c20cd1f, authored Jul 14, 2023 by Lintang Sutawika, committed by GitHub on Jul 14, 2023.

Merge pull request #671 from EleutherAI/revamp-process

Revamp process

Parents: 6862fa7d, 0dadc92a
Changes: 46 changed files in total; this page shows 20 of them, with 106 additions and 91 deletions (+106 -91).
lm_eval/tasks/qa4mre/qa4mre_2011.yaml (+4, -4)
lm_eval/tasks/qa4mre/qa4mre_2012.yaml (+1, -17)
lm_eval/tasks/qa4mre/qa4mre_2013.yaml (+1, -17)
lm_eval/tasks/race/preprocess_race.py (+1, -1)
lm_eval/tasks/race/race.yaml (+1, -1)
lm_eval/tasks/sciq/sciq.yaml (+4, -3)
lm_eval/tasks/super_glue/boolq/default.yaml (+6, -9)
lm_eval/tasks/super_glue/boolq/seq2seq.yaml (+3, -4)
lm_eval/tasks/super_glue/cb/default.yaml (+3, -4)
lm_eval/tasks/super_glue/copa/default.yaml (+2, -3)
lm_eval/tasks/super_glue/copa/utils.py (+4, -0)
lm_eval/tasks/super_glue/multirc/default.yaml (+13, -0)
lm_eval/tasks/super_glue/record/default.yaml (+14, -0)
lm_eval/tasks/super_glue/record/promptsource-01.yaml (+0, -5)
lm_eval/tasks/super_glue/record/promptsource-02.yaml (+0, -5)
lm_eval/tasks/super_glue/record/util.py (+15, -0)
lm_eval/tasks/super_glue/wic/default.yaml (+3, -4)
lm_eval/tasks/super_glue/wic/utils.py (+0, -13)
lm_eval/tasks/super_glue/wsc/default.yaml (+13, -0)
lm_eval/tasks/super_glue/wsc/preprocess_wsc.py (+18, -1)
lm_eval/tasks/qa4mre/qa4mre_2011.yaml

...
@@ -5,10 +5,10 @@ dataset_path: qa4mre
 dataset_name: 2011.main.EN
 output_type: multiple_choice
 test_split: train
-template_aliases: "{% set answer_choices = answer_options['answer_str'] %}"
-doc_to_text: "{{document_str.strip()}}\nQuestion: {{question_str}}\nChoices:\n- {{answer_choices|join('\n- ')}}\nAnswer:"
-doc_to_target: !function preprocess_qa4mre.doc_to_target
-gold_alias: !function preprocess_qa4mre.qa4mre_process
+# doc_to_text: "{{document_str.strip()}}\nQuestion: {{question_str}}\nChoices:\n- {{answer_choices|join('\n- ')}}\nAnswer:"
+doc_to_text: "{{document_str.strip()}}\nQuestion: {{question_str}}\nAnswer:"
+doc_to_target: "{{correct_answer_id|int - 1}}"
+doc_to_choice: "{{answer_options.answer_str}}"
 should_decontaminate: true
 doc_to_decontamination_query: "{{document_str.strip()}} + ' ' + {{question_str}}"
 metric_list:
...
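The key change here is that gold indices and choice lists move out of Python preprocessing hooks (!function) and into plain Jinja templates. A minimal sketch of how the three new templates resolve, assuming jinja2 and an invented record whose field names mirror the templates above:

from jinja2 import Template

# Invented qa4mre-style record, for illustration only.
doc = {
    "document_str": "The quick brown fox jumps over the lazy dog.  ",
    "question_str": "What does the fox jump over?",
    "answer_options": {"answer_str": ["the dog", "the fence", "the moon"]},
    "correct_answer_id": "1",  # the dataset's answer ids are 1-based
}

prompt = Template("{{document_str.strip()}}\nQuestion: {{question_str}}\nAnswer:").render(**doc)
target = Template("{{correct_answer_id|int - 1}}").render(**doc)   # "0": a 0-based index into the choices
choices = Template("{{answer_options.answer_str}}").render(**doc)  # "['the dog', 'the fence', 'the moon']"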
lm_eval/tasks/qa4mre/qa4mre_2012.yaml

 group:
   - multiple_choice
+include: qa4mre_2011.yaml
 task: qa4mre_2012
-dataset_path: qa4mre
 dataset_name: 2012.main.EN
-output_type: multiple_choice
-test_split: train
-template_aliases: "{% set answer_choices = answer_options['answer_str'] %}"
-doc_to_text: "{{document_str.strip()}}\nQuestion: {{question_str}}\nChoices:\n- {{answer_choices|join('\n- ')}}\nAnswer:"
-doc_to_target: !function preprocess_qa4mre.doc_to_target
-gold_alias: !function preprocess_qa4mre.qa4mre_process
-should_decontaminate: true
-doc_to_decontamination_query: "{{document_str.strip()}} + ' ' + {{question_str}}"
-metric_list:
-  - metric: acc
-    aggregation: mean
-    higher_is_better: true
-  - metric: acc_norm
-    aggregation: mean
-    higher_is_better: true
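With include, the 2012 and 2013 configs shrink to just the keys that differ from qa4mre_2011.yaml. A rough sketch of that inheritance, assuming PyYAML; the harness's real loader also handles path resolution and its !function tags properly, which this ignores:

import yaml

# Let safe_load accept the harness's !function tags by keeping them as plain strings.
yaml.SafeLoader.add_constructor("!function", lambda loader, node: node.value)

def load_with_include(path):
    with open(path) as f:
        cfg = yaml.safe_load(f)
    base = cfg.pop("include", None)
    if base is None:
        return cfg
    merged = load_with_include(base)  # assumes the base path is resolvable from cwd
    merged.update(cfg)                # child keys (task, dataset_name) win
    return merged

# load_with_include("qa4mre_2012.yaml") -> the 2011 config with
# task: qa4mre_2012 and dataset_name: 2012.main.EN overridden.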
lm_eval/tasks/qa4mre/qa4mre_2013.yaml

 group:
   - multiple_choice
+include: qa4mre_2011.yaml
 task: qa4mre_2013
-dataset_path: qa4mre
 dataset_name: 2013.main.EN
-output_type: multiple_choice
-test_split: train
-template_aliases: "{% set answer_choices = answer_options['answer_str'] %}"
-doc_to_text: "{{document_str.strip()}}\nQuestion: {{question_str}}\nChoices:\n- {{answer_choices|join('\n- ')}}\nAnswer:"
-doc_to_target: !function preprocess_qa4mre.doc_to_target
-gold_alias: !function preprocess_qa4mre.qa4mre_process
-should_decontaminate: true
-doc_to_decontamination_query: "{{document_str.strip()}} + ' ' + {{question_str}}"
-metric_list:
-  - metric: acc
-    aggregation: mean
-    higher_is_better: true
-  - metric: acc_norm
-    aggregation: mean
-    higher_is_better: true
lm_eval/tasks/race/preprocess_race.py

...
@@ -15,7 +15,7 @@ def get_answer_option(problem):
     return problem["options"][answer]


-def create_choices(doc):
+def doc_to_choice(doc):
     problem = last_problem(doc)
     choices = [problem["options"][i] for i in range(4)]
     return choices
...
lm_eval/tasks/race/race.yaml

...
@@ -5,9 +5,9 @@ dataset_path: EleutherAI/race
 dataset_name: high
 output_type: multiple_choice
 test_split: test
-create_choices: !function preprocess_race.create_choices
 doc_to_text: !function preprocess_race.doc_to_text
 doc_to_target: !function preprocess_race.doc_to_target
+doc_to_choice: !function preprocess_race.doc_to_choice
 metric_list:
   - metric: acc
     aggregation: mean
...
lm_eval/tasks/sciq/sciq.yaml

...
@@ -7,10 +7,11 @@ output_type: multiple_choice
 training_split: train
 validation_split: validation
 test_split: test
-template_aliases: "{% set answer_choices = [distractor1, distractor2, distractor3, correct_answer] %}{% set gold = 3 %}" # set the list of possible answer choices, and set what this doc's gold label idx is
 doc_to_text: "{{support.lstrip()}}\nQuestion: {{question}}\nAnswer:"
-doc_to_target: "{{correct_answer}}"
-gold_alias: "{{gold}}" # this will be cast to an int.
+doc_to_target: 3
+doc_to_choice: "{{[distractor1, distractor2, distractor3, correct_answer]}}"
 should_decontaminate: true
 doc_to_decontamination_query: "{{support}} {{question}}"
 metric_list:
   - metric: acc
     aggregation: mean
...
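doc_to_target can now be a constant: the gold answer is always the last entry of the four-item doc_to_choice list, so index 3 is always correct. A trivial check with an invented record:

# Invented SciQ-style record; correct_answer is deliberately placed last.
doc = {
    "distractor1": "mitochondria",
    "distractor2": "ribosomes",
    "distractor3": "lysosomes",
    "correct_answer": "chloroplasts",
}
choices = [doc["distractor1"], doc["distractor2"], doc["distractor3"], doc["correct_answer"]]
assert choices[3] == doc["correct_answer"]  # doc_to_target: 3 always points at the gold answer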
lm_eval/tasks/super_glue/boolq/default.yaml

...
@@ -6,13 +6,10 @@ dataset_name: boolq
 output_type: multiple_choice
 training_split: train
 validation_split: validation
-doc_to_text: "{{passage}}\nQuestion: {{question}}\nAnswer:"
-doc_to_target: "{{answer_choices[label]}}"
-gold_alias: "{{label}}" # this will be cast to an int.
-template_aliases: "{% set answer_choices = ['no', 'yes'] %}"
+doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:"
+doc_to_target: label
+doc_to_choice: ["no", "yes"]
+should_decontaminate: true
+doc_to_decontamination_query: passage
 metric_list:
-  - metric: exact_match
-    aggregation: mean
-    higher_is_better: true
-    ignore_case: true
-    ignore_punctuation: true
+  - metric: acc
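Under output_type: multiple_choice, doc_to_target: label is an index into the doc_to_choice list, and accuracy is whether the highest-likelihood choice matches it. A stand-in sketch; score here is hypothetical, while the harness actually queries the model for the log-likelihood of each choice string after the rendered prompt:

def mc_accuracy(prompt, choices, label, score):
    # score(context, continuation) -> log-likelihood; hypothetical stand-in
    lls = [score(prompt, " " + choice) for choice in choices]
    pred = max(range(len(choices)), key=lambda i: lls[i])  # argmax over choices
    return float(pred == label)

# e.g. mc_accuracy(rendered_boolq_prompt, ["no", "yes"], doc["label"], score)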
lm_eval/tasks/super_glue/boolq/seq2seq.yaml

...
@@ -6,16 +6,15 @@ dataset_name: boolq
 output_type: greedy_until
 training_split: train
 validation_split: validation
-doc_to_text: "{{passage}}\nQuestion: {{question}}\nAnswer:"
-doc_to_target: "{{answer_choices[label]}}"
-gold_alias: "{{answer_choices[label]}}" # this will be cast to an int.
+doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:"
+doc_to_target: "{{['no', 'yes'][label]}}"
+target_delimiter: " "
 generation_kwargs:
   until:
     - "\n\n"
     - "\n"
   do_sample: false
   temperature: 0.0
-template_aliases: "{% set answer_choices = ['no', 'yes'] %}"
 metric_list:
   - metric: exact_match
     aggregation: mean
...
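The generative variant instead compares the greedy continuation against the rendered target with exact_match. A rough sketch of that metric under ignore_case / ignore_punctuation; the harness's actual implementation may differ in details such as whitespace handling:

import string

def exact_match(pred, gold, ignore_case=True, ignore_punctuation=True):
    if ignore_case:
        pred, gold = pred.lower(), gold.lower()
    if ignore_punctuation:
        drop = str.maketrans("", "", string.punctuation)
        pred, gold = pred.translate(drop), gold.translate(drop)
    return float(pred.strip() == gold.strip())

exact_match("Yes.", "yes")  # -> 1.0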
lm_eval/tasks/super_glue/cb/default.yaml

 group:
   - super-glue-lm-eval-v1
-task: "cb"
+task: cb
 dataset_path: super_glue
 dataset_name: cb
 output_type: multiple_choice
 training_split: train
 validation_split: validation
 doc_to_text: "{{premise}}\nQuestion: {{hypothesis}}. True, False, or Neither?\nAnswer:"
-doc_to_target: "{{answer_choices[label]}}"
-gold_alias: "{{label}}" # this will be cast to an int.
-template_aliases: "{% set answer_choices = ['True', 'False', 'Neither'] %}"
+doc_to_target: label
+doc_to_choice: ['True', 'False', 'Neither']
 metric_list:
   - metric: acc
   - metric: f1
...
lm_eval/tasks/super_glue/copa/default.yaml

 group:
   - super-glue-lm-eval-v1
-task: "copa"
+task: copa
 dataset_path: super_glue
 dataset_name: copa
 output_type: multiple_choice
...
@@ -8,7 +8,6 @@ training_split: train
 validation_split: validation
 doc_to_text: !function utils.doc_to_text
 doc_to_target: !function utils.doc_to_target
-gold_alias: "{{label}}" # this will be cast to an int.
-template_aliases: "{% set answer_choices = [{{doc.choice1}}, 'b'] %} {{answer_choices}}"
+doc_to_choice: !function utils.doc_to_choice
 metric_list:
   - metric: acc
lm_eval/tasks/super_glue/copa/utils.py

...
@@ -15,3 +15,7 @@ def doc_to_target(doc):
     correct_choice = doc["choice1"] if doc["label"] == 0 else doc["choice2"]
     # Connect the sentences
     return " " + convert_choice(correct_choice)
+
+
+def doc_to_choice(doc):
+    return [" " + convert_choice(doc["choice1"]), " " + convert_choice(doc["choice2"])]
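The new doc_to_choice mirrors what doc_to_target already did for the gold choice. For context, convert_choice is defined earlier in utils.py, outside this hunk; it is assumed here to lower-case the first character so each alternative reads as a continuation of the premise:

def convert_choice(choice):
    # assumed behavior of the helper defined earlier in utils.py
    return choice[0].lower() + choice[1:]

def doc_to_choice(doc):
    return [" " + convert_choice(doc["choice1"]), " " + convert_choice(doc["choice2"])]

doc = {"choice1": "The toilet filled with water.", "choice2": "Water flowed from the spout.", "label": 1}
doc_to_choice(doc)  # [' the toilet filled with water.', ' water flowed from the spout.']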
lm_eval/tasks/super_glue/multirc/default.yaml (new file)

group:
  - super-glue-lm-eval-v1
task: multirc
dataset_path: super_glue
dataset_name: multirc
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: "{{paragraph}}\nQuestion: {{question}}\nAnswer:"
doc_to_target: label
doc_to_choice: "['''{{answer}}\\nIs the answer correct? yes''', '''{{answer}}\\nIs the answer correct? no''']"
metric_list:
  - metric: acc
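This doc_to_choice is a Jinja template whose rendered output is itself a Python-style list literal (note the triple quotes and the escaped \n). Assuming, as the harness appears to do for list-shaped templates, that the rendered string is then evaluated as a literal:

import ast
from jinja2 import Template

tmpl = "['''{{answer}}\\nIs the answer correct? yes''', '''{{answer}}\\nIs the answer correct? no''']"
rendered = Template(tmpl).render(answer="Mars")  # invented answer field
choices = ast.literal_eval(rendered)             # assumption: list-shaped renders are literal-eval'd
print(choices[0])  # "Mars\nIs the answer correct? yes" (with a real newline)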
lm_eval/tasks/super_glue/record/default.yaml (new file)

# group:
#   - super-glue-lm-eval-v1
task: record
dataset_path: super_glue
dataset_name: record
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: !function util.doc_to_text
doc_to_target: "{{answers}}"
doc_to_choice: "{{entities}}"
metric_list:
  - metric: f1
  - metric: em
lm_eval/tasks/super_glue/record/promptsource-01.yaml (deleted)

include: promptsource-00.yaml
group:
  - super-glue-promptsource
task: "Add sentence after after (continuation choices)"
use_prompt: "promptsource:Add sentence after after (continuation choices)"
lm_eval/tasks/super_glue/record/promptsource-02.yaml (deleted)

include: promptsource-00.yaml
group:
  - super-glue-promptsource
task: "Can you figure out…"
use_prompt: "promptsource:Can you figure out…"
lm_eval/tasks/super_glue/record/util.py (new file)

def doc_to_text(doc):
    initial_text, *highlights = doc["passage"].strip().split("\n@highlight\n")
    text = initial_text + "\n\n"
    for highlight in highlights:
        text += f" - {highlight}.\n"
    return text


def format_answer(query, entity):
    return f" - {query}".replace("@placeholder", entity)


def doc_to_target(doc):
    # We only output the first correct entity in a doc
    return format_answer(query=doc["query"], entity=doc["answers"][0])
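For illustration, here is what the helpers above produce on an invented ReCoRD-style record (passage highlights are separated by @highlight markers, and @placeholder in the query is filled with the first gold entity):

doc = {
    "passage": "Big storm hits coast.\n@highlight\nThousands evacuated\n@highlight\nPower out in Smithville",
    "query": "Officials said @placeholder will reopen schools on Monday.",
    "answers": ["Smithville"],
}
doc_to_text(doc)
# "Big storm hits coast.\n\n - Thousands evacuated.\n - Power out in Smithville.\n"
doc_to_target(doc)
# " - Officials said Smithville will reopen schools on Monday."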
lm_eval/tasks/super_glue/wic/default.yaml

...
@@ -6,9 +6,8 @@ dataset_name: wic
 output_type: multiple_choice
 training_split: train
 validation_split: validation
-doc_to_text: !function utils.doc_to_text
-doc_to_target: !function utils.doc_to_target
-gold_alias: "{{label}}" # this will be cast to an int.
-template_aliases: "{% set answer_choices = ['no', 'yes'] %}"
+doc_to_text: "Sentence 1: {{sentence1}}\nSentence 2: {{sentence2}}\nQuestion: Is the word '{{sentence1[start1:end1]}}' used in the same way in the two sentences above?\nAnswer:"
+doc_to_target: label
+doc_to_choice: ['no', 'yes']
 metric_list:
   - metric: acc
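The inlined template reproduces the deleted utils.py logic below, including the word extraction: Jinja supports Python-style slicing, so sentence1[start1:end1] pulls out the target word directly. A quick check with an invented record:

from jinja2 import Template

doc = {"sentence1": "The bank was closed.", "start1": 4, "end1": 8}  # invented WiC-style fields
Template("{{sentence1[start1:end1]}}").render(**doc)  # -> "bank"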
lm_eval/tasks/super_glue/wic/utils.py (deleted)

def doc_to_text(doc):
    return (
        "Sentence 1: {}\nSentence 2: {}\nQuestion: Is the word '{}' used in the same way in the"
        " two sentences above?\nAnswer:".format(
            doc["sentence1"],
            doc["sentence2"],
            doc["sentence1"][doc["start1"] : doc["end1"]],
        )
    )


def doc_to_target(doc):
    return " {}".format({0: "no", 1: "yes"}[doc["label"]])
lm_eval/tasks/super_glue/wsc/default.yaml (new file)

group:
  - super-glue-lm-eval-v1
task: wsc
dataset_path: super_glue
dataset_name: wsc
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: !function preprocess_wsc.default_doc_to_text
doc_to_target: label
doc_to_choice: ['no', 'yes']
metric_list:
  - metric: acc
lm_eval/tasks/super_glue/wsc/preprocess_wsc.py

 import re

 from lm_eval.utils import general_detokenize


-def doc_to_text(x):
+def t5_prompt_doc_to_text(x):
     def _mark_span(text, span_str, span_idx, mark):
         pattern_tmpl = r"^((?:\S+\s){N})(W)"
         pattern = re.sub("N", str(span_idx), pattern_tmpl)
...
@@ -15,3 +16,19 @@ def doc_to_text(x):
     text = _mark_span(text, x["span2_text"], span2_index, "#")
     return text
+
+
+def default_doc_to_text(doc):
+    raw_passage = doc["text"]
+    # NOTE: HuggingFace span indices are word-based not character-based.
+    pre = " ".join(raw_passage.split()[: doc["span2_index"]])
+    post = raw_passage[len(pre) + len(doc["span2_text"]) + 1 :]
+    passage = general_detokenize(pre + " *{}*".format(doc["span2_text"]) + post)
+    noun = doc["span1_text"]
+    pronoun = doc["span2_text"]
+    text = (
+        f"Passage: {passage}\n"
+        + f'Question: In the passage above, does the pronoun "*{pronoun}*" refer to "*{noun}*"?\n'
+        + "Answer:"
+    )
+    return text
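A usage sketch for the new default_doc_to_text, with an invented WSC-style record; note that span2_index counts words, not characters, per the NOTE above:

doc = {
    "text": "Mark told Pete many lies about himself, which Pete included in his book. He should have been more skeptical.",
    "span1_text": "Mark",
    "span2_index": 13,  # "He" is the 14th word (0-based index 13)
    "span2_text": "He",
}
print(default_doc_to_text(doc))
# Passage: Mark told Pete many lies about himself, which Pete included in his book. *He* should have been more skeptical.
# Question: In the passage above, does the pronoun "*He*" refer to "*Mark*"?
# Answer: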