Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
9822b06e
Unverified
Commit
9822b06e
authored
Mar 01, 2024
by
Lintang Sutawika
Committed by
GitHub
Mar 01, 2024
Browse files
Merge branch 'main' into weight_by_size
parents
51f27158
b177c82c
Changes
656
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
223 additions
and
5 deletions
+223
-5
lm_eval/tasks/bbh/cot_zeroshot/web_of_lies.yaml
lm_eval/tasks/bbh/cot_zeroshot/web_of_lies.yaml
+16
-1
lm_eval/tasks/bbh/cot_zeroshot/word_sorting.yaml
lm_eval/tasks/bbh/cot_zeroshot/word_sorting.yaml
+11
-1
lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml
lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml
+1
-0
lm_eval/tasks/bbh/zeroshot/_zeroshot_template_yaml
lm_eval/tasks/bbh/zeroshot/_zeroshot_template_yaml
+9
-3
lm_eval/tasks/bbh/zeroshot/boolean_expressions.yaml
lm_eval/tasks/bbh/zeroshot/boolean_expressions.yaml
+11
-0
lm_eval/tasks/bbh/zeroshot/causal_judgement.yaml
lm_eval/tasks/bbh/zeroshot/causal_judgement.yaml
+11
-0
lm_eval/tasks/bbh/zeroshot/date_understanding.yaml
lm_eval/tasks/bbh/zeroshot/date_understanding.yaml
+13
-0
lm_eval/tasks/bbh/zeroshot/disambiguation_qa.yaml
lm_eval/tasks/bbh/zeroshot/disambiguation_qa.yaml
+13
-0
lm_eval/tasks/bbh/zeroshot/dyck_languages.yaml
lm_eval/tasks/bbh/zeroshot/dyck_languages.yaml
+10
-0
lm_eval/tasks/bbh/zeroshot/formal_fallacies.yaml
lm_eval/tasks/bbh/zeroshot/formal_fallacies.yaml
+11
-0
lm_eval/tasks/bbh/zeroshot/geometric_shapes.yaml
lm_eval/tasks/bbh/zeroshot/geometric_shapes.yaml
+13
-0
lm_eval/tasks/bbh/zeroshot/hyperbaton.yaml
lm_eval/tasks/bbh/zeroshot/hyperbaton.yaml
+13
-0
lm_eval/tasks/bbh/zeroshot/logical_deduction_five_objects.yaml
...al/tasks/bbh/zeroshot/logical_deduction_five_objects.yaml
+12
-0
lm_eval/tasks/bbh/zeroshot/logical_deduction_seven_objects.yaml
...l/tasks/bbh/zeroshot/logical_deduction_seven_objects.yaml
+12
-0
lm_eval/tasks/bbh/zeroshot/logical_deduction_three_objects.yaml
...l/tasks/bbh/zeroshot/logical_deduction_three_objects.yaml
+12
-0
lm_eval/tasks/bbh/zeroshot/movie_recommendation.yaml
lm_eval/tasks/bbh/zeroshot/movie_recommendation.yaml
+12
-0
lm_eval/tasks/bbh/zeroshot/multistep_arithmetic_two.yaml
lm_eval/tasks/bbh/zeroshot/multistep_arithmetic_two.yaml
+11
-0
lm_eval/tasks/bbh/zeroshot/navigate.yaml
lm_eval/tasks/bbh/zeroshot/navigate.yaml
+10
-0
lm_eval/tasks/bbh/zeroshot/object_counting.yaml
lm_eval/tasks/bbh/zeroshot/object_counting.yaml
+10
-0
lm_eval/tasks/bbh/zeroshot/penguins_in_a_table.yaml
lm_eval/tasks/bbh/zeroshot/penguins_in_a_table.yaml
+12
-0
No files found.
lm_eval/tasks/bbh/cot_zeroshot/web_of_lies.yaml
View file @
9822b06e
"
dataset_name"
:
"
web_of_lies"
"
dataset_name"
:
"
web_of_lies"
"
description"
:
"
Evaluate
a
random
boolean
function
expressed
as
a
word
problem.
\n\n
"
"
description"
:
"
Evaluate
a
random
boolean
function
expressed
as
a
word
problem.
\n\n
"
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:
Let's
think
step
by
step.
\n
"
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:
Let's
think
step
by
step."
"
include"
:
"
_cot_zeroshot_template_yaml"
"
include"
:
"
_cot_zeroshot_template_yaml"
"
task"
:
"
bbh_cot_zeroshot_web_of_lies"
"
task"
:
"
bbh_cot_zeroshot_web_of_lies"
filter_list
:
-
name
:
"
flexible-extract"
filter
:
-
function
:
!function
utils.MapRegexFilter
group_select
:
-1
ignore_case
:
true
regex_pattern_to_value
:
\b(no|does not tell the truth|is not telling the truth)\b
:
"
no"
\b(yes|tells the truth|is telling the truth)\b
:
"
yes"
-
function
:
"
take_first"
-
name
:
"
strict-match"
filter
:
-
function
:
"
regex"
regex_pattern
:
"
((?<=The
answer
is
)(.*)(?=.)|(?<=the
answer
is
)(.*)(?=.)|(?<=The
answer:
)(.*)(?=.)|(?<=The
final
answer:
)(.*)(?=.))"
-
function
:
"
take_first"
lm_eval/tasks/bbh/cot_zeroshot/word_sorting.yaml
View file @
9822b06e
"
dataset_name"
:
"
word_sorting"
"
dataset_name"
:
"
word_sorting"
"
description"
:
"
Sort
a
list
of
words.
\n\n
"
"
description"
:
"
Sort
a
list
of
words.
\n\n
"
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:
Let's
think
step
by
step.
\n
"
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:
Let's
think
step
by
step."
"
include"
:
"
_cot_zeroshot_template_yaml"
"
include"
:
"
_cot_zeroshot_template_yaml"
"
task"
:
"
bbh_cot_zeroshot_word_sorting"
"
task"
:
"
bbh_cot_zeroshot_word_sorting"
filter_list
:
-
name
:
"
flexible-extract"
filter
:
-
function
:
!function
utils.WordSortFilter
-
function
:
"
take_first"
-
name
:
"
strict-match"
filter
:
-
function
:
"
regex"
regex_pattern
:
"
((?<=The
answer
is
)(.*)(?=.)|(?<=the
answer
is
)(.*)(?=.)|(?<=The
answer:
)(.*)(?=.)|(?<=The
final
answer:
)(.*)(?=.))"
-
function
:
"
take_first"
lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml
View file @
9822b06e
...
@@ -19,3 +19,4 @@ generation_kwargs:
...
@@ -19,3 +19,4 @@ generation_kwargs:
num_fewshot: 0
num_fewshot: 0
metadata:
metadata:
version: 1.0
version: 1.0
num_fewshot: 3 # will be printed in results table
lm_eval/tasks/bbh/zeroshot/_zeroshot_template_yaml
View file @
9822b06e
...
@@ -7,15 +7,21 @@ metric_list:
...
@@ -7,15 +7,21 @@ metric_list:
- metric: exact_match
- metric: exact_match
aggregation: mean
aggregation: mean
higher_is_better: true
higher_is_better: true
#
ignore_case: true
ignore_case: true
# ignore_punctuation: true
# ignore_punctuation: true
regexes_to_ignore:
- "\\.$"
- ","
- "\n"
- "\\\\"
- '"'
generation_kwargs:
generation_kwargs:
until:
until:
- "</s>"
- "</s>"
- "Q:"
- "Q:"
- "
\n\n
"
- "
<|im_end|>
"
do_sample: false
do_sample: false
temperature: 0.0
temperature: 0.0
num_fewshot: 0
num_fewshot: 0
metadata:
metadata:
version:
1
.0
version:
2
.0
lm_eval/tasks/bbh/zeroshot/boolean_expressions.yaml
View file @
9822b06e
...
@@ -3,3 +3,14 @@
...
@@ -3,3 +3,14 @@
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
include"
:
"
_zeroshot_template_yaml"
"
include"
:
"
_zeroshot_template_yaml"
"
task"
:
"
bbh_zeroshot_boolean_expressions"
"
task"
:
"
bbh_zeroshot_boolean_expressions"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
"
regex"
group_select
:
0
regex_pattern
:
"
\\
b(True|False)
\\
b"
-
function
:
"
take_first"
lm_eval/tasks/bbh/zeroshot/causal_judgement.yaml
View file @
9822b06e
...
@@ -3,3 +3,14 @@
...
@@ -3,3 +3,14 @@
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
include"
:
"
_zeroshot_template_yaml"
"
include"
:
"
_zeroshot_template_yaml"
"
task"
:
"
bbh_zeroshot_causal_judgement"
"
task"
:
"
bbh_zeroshot_causal_judgement"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
"
regex"
group_select
:
0
regex_pattern
:
"
\\
b(Yes|No|yes|no)
\\
b"
-
function
:
"
take_first"
lm_eval/tasks/bbh/zeroshot/date_understanding.yaml
View file @
9822b06e
...
@@ -3,3 +3,16 @@
...
@@ -3,3 +3,16 @@
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
include"
:
"
_zeroshot_template_yaml"
"
include"
:
"
_zeroshot_template_yaml"
"
task"
:
"
bbh_zeroshot_date_understanding"
"
task"
:
"
bbh_zeroshot_date_understanding"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
!function
utils.MultiChoiceRegexFilter
group_select
:
0
ignore_case
:
true
ignore_punctuation
:
true
regex_pattern
:
"
(
\\
([A-Z]
\\
))"
-
function
:
"
take_first"
lm_eval/tasks/bbh/zeroshot/disambiguation_qa.yaml
View file @
9822b06e
...
@@ -3,3 +3,16 @@
...
@@ -3,3 +3,16 @@
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
include"
:
"
_zeroshot_template_yaml"
"
include"
:
"
_zeroshot_template_yaml"
"
task"
:
"
bbh_zeroshot_disambiguation_qa"
"
task"
:
"
bbh_zeroshot_disambiguation_qa"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
!function
utils.MultiChoiceRegexFilter
group_select
:
0
ignore_case
:
true
ignore_punctuation
:
true
regex_pattern
:
"
(
\\
([A-Z]
\\
))"
-
function
:
"
take_first"
lm_eval/tasks/bbh/zeroshot/dyck_languages.yaml
View file @
9822b06e
...
@@ -3,3 +3,13 @@
...
@@ -3,3 +3,13 @@
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
include"
:
"
_zeroshot_template_yaml"
"
include"
:
"
_zeroshot_template_yaml"
"
task"
:
"
bbh_zeroshot_dyck_languages"
"
task"
:
"
bbh_zeroshot_dyck_languages"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
"
regex"
group_select
:
0
regex_pattern
:
"
(?<=
)([
\"
\\
[
\\
(<{}>
\\
)
\\
]]+)|([
\"
\\
[
\\
(<{}>
\\
)
\\
]]+)"
-
function
:
"
take_first"
lm_eval/tasks/bbh/zeroshot/formal_fallacies.yaml
View file @
9822b06e
...
@@ -3,3 +3,14 @@
...
@@ -3,3 +3,14 @@
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
include"
:
"
_zeroshot_template_yaml"
"
include"
:
"
_zeroshot_template_yaml"
"
task"
:
"
bbh_zeroshot_formal_fallacies"
"
task"
:
"
bbh_zeroshot_formal_fallacies"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
"
regex"
group_select
:
0
regex_pattern
:
"
\\
b(valid|invalid)
\\
b"
-
function
:
"
take_first"
lm_eval/tasks/bbh/zeroshot/geometric_shapes.yaml
View file @
9822b06e
...
@@ -3,3 +3,16 @@
...
@@ -3,3 +3,16 @@
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
include"
:
"
_zeroshot_template_yaml"
"
include"
:
"
_zeroshot_template_yaml"
"
task"
:
"
bbh_zeroshot_geometric_shapes"
"
task"
:
"
bbh_zeroshot_geometric_shapes"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
!function
utils.MultiChoiceRegexFilter
group_select
:
0
ignore_case
:
true
ignore_punctuation
:
true
regex_pattern
:
"
(
\\
([A-Z]
\\
))"
-
function
:
"
take_first"
lm_eval/tasks/bbh/zeroshot/hyperbaton.yaml
View file @
9822b06e
...
@@ -3,3 +3,16 @@
...
@@ -3,3 +3,16 @@
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
include"
:
"
_zeroshot_template_yaml"
"
include"
:
"
_zeroshot_template_yaml"
"
task"
:
"
bbh_zeroshot_hyperbaton"
"
task"
:
"
bbh_zeroshot_hyperbaton"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
!function
utils.MultiChoiceRegexFilter
group_select
:
0
ignore_case
:
true
ignore_punctuation
:
true
regex_pattern
:
"
(
\\
([A-Z]
\\
))"
-
function
:
"
take_first"
lm_eval/tasks/bbh/zeroshot/logical_deduction_five_objects.yaml
View file @
9822b06e
...
@@ -3,3 +3,15 @@
...
@@ -3,3 +3,15 @@
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
include"
:
"
_zeroshot_template_yaml"
"
include"
:
"
_zeroshot_template_yaml"
"
task"
:
"
bbh_zeroshot_logical_deduction_five_objects"
"
task"
:
"
bbh_zeroshot_logical_deduction_five_objects"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
!function
utils.MultiChoiceRegexFilter
group_select
:
0
ignore_case
:
true
ignore_punctuation
:
true
regex_pattern
:
"
(
\\
([A-Z]
\\
))"
-
function
:
"
take_first"
lm_eval/tasks/bbh/zeroshot/logical_deduction_seven_objects.yaml
View file @
9822b06e
...
@@ -3,3 +3,15 @@
...
@@ -3,3 +3,15 @@
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
include"
:
"
_zeroshot_template_yaml"
"
include"
:
"
_zeroshot_template_yaml"
"
task"
:
"
bbh_zeroshot_logical_deduction_seven_objects"
"
task"
:
"
bbh_zeroshot_logical_deduction_seven_objects"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
!function
utils.MultiChoiceRegexFilter
group_select
:
0
ignore_case
:
true
ignore_punctuation
:
true
regex_pattern
:
"
(
\\
([A-Z]
\\
))"
-
function
:
"
take_first"
lm_eval/tasks/bbh/zeroshot/logical_deduction_three_objects.yaml
View file @
9822b06e
...
@@ -3,3 +3,15 @@
...
@@ -3,3 +3,15 @@
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
include"
:
"
_zeroshot_template_yaml"
"
include"
:
"
_zeroshot_template_yaml"
"
task"
:
"
bbh_zeroshot_logical_deduction_three_objects"
"
task"
:
"
bbh_zeroshot_logical_deduction_three_objects"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
!function
utils.MultiChoiceRegexFilter
group_select
:
0
ignore_case
:
true
ignore_punctuation
:
true
regex_pattern
:
"
(
\\
([A-Z]
\\
))"
-
function
:
"
take_first"
lm_eval/tasks/bbh/zeroshot/movie_recommendation.yaml
View file @
9822b06e
...
@@ -3,3 +3,15 @@
...
@@ -3,3 +3,15 @@
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
include"
:
"
_zeroshot_template_yaml"
"
include"
:
"
_zeroshot_template_yaml"
"
task"
:
"
bbh_zeroshot_movie_recommendation"
"
task"
:
"
bbh_zeroshot_movie_recommendation"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
!function
utils.MultiChoiceRegexFilter
group_select
:
0
ignore_case
:
true
ignore_punctuation
:
true
regex_pattern
:
"
(
\\
([A-Z]
\\
))"
-
function
:
"
take_first"
lm_eval/tasks/bbh/zeroshot/multistep_arithmetic_two.yaml
View file @
9822b06e
...
@@ -3,3 +3,14 @@
...
@@ -3,3 +3,14 @@
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
include"
:
"
_zeroshot_template_yaml"
"
include"
:
"
_zeroshot_template_yaml"
"
task"
:
"
bbh_zeroshot_multistep_arithmetic_two"
"
task"
:
"
bbh_zeroshot_multistep_arithmetic_two"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
!function
utils.NumberParseRegexFilter
group_select
:
0
regex_pattern
:
"
([-0-9]+)"
-
function
:
"
take_first"
lm_eval/tasks/bbh/zeroshot/navigate.yaml
View file @
9822b06e
...
@@ -3,3 +3,13 @@
...
@@ -3,3 +3,13 @@
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
include"
:
"
_zeroshot_template_yaml"
"
include"
:
"
_zeroshot_template_yaml"
"
task"
:
"
bbh_zeroshot_navigate"
"
task"
:
"
bbh_zeroshot_navigate"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
"
regex"
group_select
:
0
regex_pattern
:
"
\\
b(Yes|No|yes|no)
\\
b"
-
function
:
"
take_first"
lm_eval/tasks/bbh/zeroshot/object_counting.yaml
View file @
9822b06e
...
@@ -3,3 +3,13 @@
...
@@ -3,3 +3,13 @@
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
include"
:
"
_zeroshot_template_yaml"
"
include"
:
"
_zeroshot_template_yaml"
"
task"
:
"
bbh_zeroshot_object_counting"
"
task"
:
"
bbh_zeroshot_object_counting"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
!function
utils.NumberParseRegexFilter
group_select
:
0
regex_pattern
:
"
([-0-9]+)"
-
function
:
"
take_first"
lm_eval/tasks/bbh/zeroshot/penguins_in_a_table.yaml
View file @
9822b06e
...
@@ -3,3 +3,15 @@
...
@@ -3,3 +3,15 @@
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
doc_to_text"
:
"
Q:
{{input}}
\n
A:"
"
include"
:
"
_zeroshot_template_yaml"
"
include"
:
"
_zeroshot_template_yaml"
"
task"
:
"
bbh_zeroshot_penguins_in_a_table"
"
task"
:
"
bbh_zeroshot_penguins_in_a_table"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
!function
utils.MultiChoiceRegexFilter
group_select
:
0
ignore_case
:
true
ignore_punctuation
:
true
regex_pattern
:
"
(
\\
([A-Z]
\\
))"
-
function
:
"
take_first"
Prev
1
…
3
4
5
6
7
8
9
10
11
…
33
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment