Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
52f75f0e
Commit
52f75f0e
authored
Nov 28, 2023
by
lintangsutawika
Browse files
Merge branch 'big-refactor' of
https://github.com/EleutherAI/lm-evaluation-harness
into versioning
parents
331d7c51
b072bb0d
Changes
72
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
37 additions
and
20 deletions
+37
-20
lm_eval/api/task.py
lm_eval/api/task.py
+1
-1
lm_eval/evaluator.py
lm_eval/evaluator.py
+18
-2
lm_eval/tasks/blimp/_template_yaml
lm_eval/tasks/blimp/_template_yaml
+1
-0
lm_eval/tasks/blimp/adjunct_island.yaml
lm_eval/tasks/blimp/adjunct_island.yaml
+1
-1
lm_eval/tasks/blimp/anaphor_gender_agreement.yaml
lm_eval/tasks/blimp/anaphor_gender_agreement.yaml
+1
-1
lm_eval/tasks/blimp/anaphor_number_agreement.yaml
lm_eval/tasks/blimp/anaphor_number_agreement.yaml
+1
-1
lm_eval/tasks/blimp/animate_subject_passive.yaml
lm_eval/tasks/blimp/animate_subject_passive.yaml
+1
-1
lm_eval/tasks/blimp/animate_subject_trans.yaml
lm_eval/tasks/blimp/animate_subject_trans.yaml
+1
-1
lm_eval/tasks/blimp/causative.yaml
lm_eval/tasks/blimp/causative.yaml
+1
-1
lm_eval/tasks/blimp/complex_NP_island.yaml
lm_eval/tasks/blimp/complex_NP_island.yaml
+1
-1
lm_eval/tasks/blimp/coordinate_structure_constraint_complex_left_branch.yaml
.../coordinate_structure_constraint_complex_left_branch.yaml
+1
-1
lm_eval/tasks/blimp/coordinate_structure_constraint_object_extraction.yaml
...mp/coordinate_structure_constraint_object_extraction.yaml
+1
-1
lm_eval/tasks/blimp/determiner_noun_agreement_1.yaml
lm_eval/tasks/blimp/determiner_noun_agreement_1.yaml
+1
-1
lm_eval/tasks/blimp/determiner_noun_agreement_2.yaml
lm_eval/tasks/blimp/determiner_noun_agreement_2.yaml
+1
-1
lm_eval/tasks/blimp/determiner_noun_agreement_irregular_1.yaml
...al/tasks/blimp/determiner_noun_agreement_irregular_1.yaml
+1
-1
lm_eval/tasks/blimp/determiner_noun_agreement_irregular_2.yaml
...al/tasks/blimp/determiner_noun_agreement_irregular_2.yaml
+1
-1
lm_eval/tasks/blimp/determiner_noun_agreement_with_adj_2.yaml
...val/tasks/blimp/determiner_noun_agreement_with_adj_2.yaml
+1
-1
lm_eval/tasks/blimp/determiner_noun_agreement_with_adj_irregular_1.yaml
...blimp/determiner_noun_agreement_with_adj_irregular_1.yaml
+1
-1
lm_eval/tasks/blimp/determiner_noun_agreement_with_adj_irregular_2.yaml
...blimp/determiner_noun_agreement_with_adj_irregular_2.yaml
+1
-1
lm_eval/tasks/blimp/determiner_noun_agreement_with_adjective_1.yaml
...sks/blimp/determiner_noun_agreement_with_adjective_1.yaml
+1
-1
No files found.
lm_eval/api/task.py
View file @
52f75f0e
...
...
@@ -81,7 +81,7 @@ class TaskConfig(dict):
fewshot_delimiter
:
str
=
"
\n\n
"
fewshot_config
:
dict
=
None
# runtime configuration options
num_fewshot
:
int
=
0
num_fewshot
:
int
=
-
1
# scoring options
metric_list
:
list
=
None
output_type
:
str
=
"generate_until"
...
...
lm_eval/evaluator.py
View file @
52f75f0e
...
...
@@ -134,13 +134,17 @@ def simple_evaluate(
config
[
"generation_kwargs"
].
update
(
gen_kwargs
)
if
num_fewshot
is
not
None
:
if
config
[
"num_fewshot"
]
>
0
:
if
config
[
"num_fewshot"
]
==
0
:
eval_logger
.
info
(
f
"num_fewshot has been set to 0 for
{
task_name
}
in its config. Manual configuration will be ignored."
)
else
:
default_num_fewshot
=
config
[
"num_fewshot"
]
eval_logger
.
warning
(
f
"Overwriting default num_fewshot of
{
task_name
}
from
{
default_num_fewshot
}
to
{
num_fewshot
}
"
)
task_obj
.
_config
[
"num_fewshot"
]
=
num_fewshot
task_obj
.
_config
[
"num_fewshot"
]
=
num_fewshot
if
check_integrity
:
run_task_tests
(
task_list
=
tasks
)
...
...
@@ -233,6 +237,8 @@ def evaluate(
# store the ordering of tasks and groups
task_order
=
collections
.
defaultdict
(
int
)
task_group_alias
=
collections
.
defaultdict
(
dict
)
# store num-fewshot value per task
num_fewshot
=
collections
.
defaultdict
(
int
)
# get lists of each type of request
for
task_name
,
task
in
task_dict
.
items
():
...
...
@@ -251,6 +257,12 @@ def evaluate(
versions
[
task_name
]
=
task
.
VERSION
configs
[
task_name
]
=
dict
(
task
.
dump_config
())
if
"num_fewshot"
in
configs
[
task_name
]:
n_shot
=
configs
[
task_name
][
"num_fewshot"
]
else
:
n_shot
=
-
1
num_fewshot
[
task_name
]
=
n_shot
if
"task_alias"
in
configs
[
task_name
]:
task_group_alias
[
task_name
]
=
configs
[
task_name
][
"task_alias"
]
...
...
@@ -612,11 +624,15 @@ def evaluate(
else
:
groups_agg
[
group
][
"alias"
]
=
tab_string
+
group
for
group_name
,
task_list
in
task_hierarchy
.
items
():
num_fewshot
[
group_name
]
=
num_fewshot
[
task_list
[
0
]]
results_dict
=
{
"results"
:
dict
(
results_agg
.
items
()),
**
({
"groups"
:
dict
(
groups_agg
.
items
())}
if
bool
(
groups_agg
)
else
{}),
"configs"
:
dict
(
sorted
(
configs
.
items
())),
"versions"
:
dict
(
sorted
(
versions
.
items
())),
"n-shot"
:
dict
(
sorted
(
num_fewshot
.
items
())),
}
if
log_samples
:
results_dict
[
"samples"
]
=
dict
(
samples
)
...
...
lm_eval/tasks/blimp/template_yaml
→
lm_eval/tasks/blimp/
_
template_yaml
View file @
52f75f0e
...
...
@@ -5,6 +5,7 @@ validation_split: train
doc_to_text: ""
doc_to_target: 0
doc_to_choice: "{{[sentence_good, sentence_bad]}}"
num_fewshot: 0
should_decontaminate: true
doc_to_decontamination_query: "{{sentence_good}} {{sentence_bad}}"
metric_list:
...
...
lm_eval/tasks/blimp/adjunct_island.yaml
View file @
52f75f0e
# Generated by utils.py
dataset_name
:
adjunct_island
include
:
template_yaml
include
:
_
template_yaml
task
:
blimp_adjunct_island
lm_eval/tasks/blimp/anaphor_gender_agreement.yaml
View file @
52f75f0e
# Generated by utils.py
dataset_name
:
anaphor_gender_agreement
include
:
template_yaml
include
:
_
template_yaml
task
:
blimp_anaphor_gender_agreement
lm_eval/tasks/blimp/anaphor_number_agreement.yaml
View file @
52f75f0e
# Generated by utils.py
dataset_name
:
anaphor_number_agreement
include
:
template_yaml
include
:
_
template_yaml
task
:
blimp_anaphor_number_agreement
lm_eval/tasks/blimp/animate_subject_passive.yaml
View file @
52f75f0e
# Generated by utils.py
dataset_name
:
animate_subject_passive
include
:
template_yaml
include
:
_
template_yaml
task
:
blimp_animate_subject_passive
lm_eval/tasks/blimp/animate_subject_trans.yaml
View file @
52f75f0e
# Generated by utils.py
dataset_name
:
animate_subject_trans
include
:
template_yaml
include
:
_
template_yaml
task
:
blimp_animate_subject_trans
lm_eval/tasks/blimp/causative.yaml
View file @
52f75f0e
# Generated by utils.py
dataset_name
:
causative
include
:
template_yaml
include
:
_
template_yaml
task
:
blimp_causative
lm_eval/tasks/blimp/complex_NP_island.yaml
View file @
52f75f0e
# Generated by utils.py
dataset_name
:
complex_NP_island
include
:
template_yaml
include
:
_
template_yaml
task
:
blimp_complex_NP_island
lm_eval/tasks/blimp/coordinate_structure_constraint_complex_left_branch.yaml
View file @
52f75f0e
# Generated by utils.py
dataset_name
:
coordinate_structure_constraint_complex_left_branch
include
:
template_yaml
include
:
_
template_yaml
task
:
blimp_coordinate_structure_constraint_complex_left_branch
lm_eval/tasks/blimp/coordinate_structure_constraint_object_extraction.yaml
View file @
52f75f0e
# Generated by utils.py
dataset_name
:
coordinate_structure_constraint_object_extraction
include
:
template_yaml
include
:
_
template_yaml
task
:
blimp_coordinate_structure_constraint_object_extraction
lm_eval/tasks/blimp/determiner_noun_agreement_1.yaml
View file @
52f75f0e
# Generated by utils.py
dataset_name
:
determiner_noun_agreement_1
include
:
template_yaml
include
:
_
template_yaml
task
:
blimp_determiner_noun_agreement_1
lm_eval/tasks/blimp/determiner_noun_agreement_2.yaml
View file @
52f75f0e
# Generated by utils.py
dataset_name
:
determiner_noun_agreement_2
include
:
template_yaml
include
:
_
template_yaml
task
:
blimp_determiner_noun_agreement_2
lm_eval/tasks/blimp/determiner_noun_agreement_irregular_1.yaml
View file @
52f75f0e
# Generated by utils.py
dataset_name
:
determiner_noun_agreement_irregular_1
include
:
template_yaml
include
:
_
template_yaml
task
:
blimp_determiner_noun_agreement_irregular_1
lm_eval/tasks/blimp/determiner_noun_agreement_irregular_2.yaml
View file @
52f75f0e
# Generated by utils.py
dataset_name
:
determiner_noun_agreement_irregular_2
include
:
template_yaml
include
:
_
template_yaml
task
:
blimp_determiner_noun_agreement_irregular_2
lm_eval/tasks/blimp/determiner_noun_agreement_with_adj_2.yaml
View file @
52f75f0e
# Generated by utils.py
dataset_name
:
determiner_noun_agreement_with_adj_2
include
:
template_yaml
include
:
_
template_yaml
task
:
blimp_determiner_noun_agreement_with_adj_2
lm_eval/tasks/blimp/determiner_noun_agreement_with_adj_irregular_1.yaml
View file @
52f75f0e
# Generated by utils.py
dataset_name
:
determiner_noun_agreement_with_adj_irregular_1
include
:
template_yaml
include
:
_
template_yaml
task
:
blimp_determiner_noun_agreement_with_adj_irregular_1
lm_eval/tasks/blimp/determiner_noun_agreement_with_adj_irregular_2.yaml
View file @
52f75f0e
# Generated by utils.py
dataset_name
:
determiner_noun_agreement_with_adj_irregular_2
include
:
template_yaml
include
:
_
template_yaml
task
:
blimp_determiner_noun_agreement_with_adj_irregular_2
lm_eval/tasks/blimp/determiner_noun_agreement_with_adjective_1.yaml
View file @
52f75f0e
# Generated by utils.py
dataset_name
:
determiner_noun_agreement_with_adjective_1
include
:
template_yaml
include
:
_
template_yaml
task
:
blimp_determiner_noun_agreement_with_adjective_1
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment