Commit 52f75f0e authored by lintangsutawika
Browse files

Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into versioning

parents 331d7c51 b072bb0d
...@@ -81,7 +81,7 @@ class TaskConfig(dict): ...@@ -81,7 +81,7 @@ class TaskConfig(dict):
fewshot_delimiter: str = "\n\n" fewshot_delimiter: str = "\n\n"
fewshot_config: dict = None fewshot_config: dict = None
# runtime configuration options # runtime configuration options
num_fewshot: int = 0 num_fewshot: int = -1
# scoring options # scoring options
metric_list: list = None metric_list: list = None
output_type: str = "generate_until" output_type: str = "generate_until"
......
...@@ -134,13 +134,17 @@ def simple_evaluate( ...@@ -134,13 +134,17 @@ def simple_evaluate(
config["generation_kwargs"].update(gen_kwargs) config["generation_kwargs"].update(gen_kwargs)
if num_fewshot is not None: if num_fewshot is not None:
if config["num_fewshot"] > 0: if config["num_fewshot"] == 0:
eval_logger.info(
f"num_fewshot has been set to 0 for {task_name} in its config. Manual configuration will be ignored."
)
else:
default_num_fewshot = config["num_fewshot"] default_num_fewshot = config["num_fewshot"]
eval_logger.warning( eval_logger.warning(
f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}" f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
) )
task_obj._config["num_fewshot"] = num_fewshot task_obj._config["num_fewshot"] = num_fewshot
if check_integrity: if check_integrity:
run_task_tests(task_list=tasks) run_task_tests(task_list=tasks)
...@@ -233,6 +237,8 @@ def evaluate( ...@@ -233,6 +237,8 @@ def evaluate(
# store the ordering of tasks and groups # store the ordering of tasks and groups
task_order = collections.defaultdict(int) task_order = collections.defaultdict(int)
task_group_alias = collections.defaultdict(dict) task_group_alias = collections.defaultdict(dict)
# store num-fewshot value per task
num_fewshot = collections.defaultdict(int)
# get lists of each type of request # get lists of each type of request
for task_name, task in task_dict.items(): for task_name, task in task_dict.items():
...@@ -251,6 +257,12 @@ def evaluate( ...@@ -251,6 +257,12 @@ def evaluate(
versions[task_name] = task.VERSION versions[task_name] = task.VERSION
configs[task_name] = dict(task.dump_config()) configs[task_name] = dict(task.dump_config())
if "num_fewshot" in configs[task_name]:
n_shot = configs[task_name]["num_fewshot"]
else:
n_shot = -1
num_fewshot[task_name] = n_shot
if "task_alias" in configs[task_name]: if "task_alias" in configs[task_name]:
task_group_alias[task_name] = configs[task_name]["task_alias"] task_group_alias[task_name] = configs[task_name]["task_alias"]
...@@ -612,11 +624,15 @@ def evaluate( ...@@ -612,11 +624,15 @@ def evaluate(
else: else:
groups_agg[group]["alias"] = tab_string + group groups_agg[group]["alias"] = tab_string + group
for group_name, task_list in task_hierarchy.items():
num_fewshot[group_name] = num_fewshot[task_list[0]]
results_dict = { results_dict = {
"results": dict(results_agg.items()), "results": dict(results_agg.items()),
**({"groups": dict(groups_agg.items())} if bool(groups_agg) else {}), **({"groups": dict(groups_agg.items())} if bool(groups_agg) else {}),
"configs": dict(sorted(configs.items())), "configs": dict(sorted(configs.items())),
"versions": dict(sorted(versions.items())), "versions": dict(sorted(versions.items())),
"n-shot": dict(sorted(num_fewshot.items())),
} }
if log_samples: if log_samples:
results_dict["samples"] = dict(samples) results_dict["samples"] = dict(samples)
......
...@@ -5,6 +5,7 @@ validation_split: train ...@@ -5,6 +5,7 @@ validation_split: train
doc_to_text: "" doc_to_text: ""
doc_to_target: 0 doc_to_target: 0
doc_to_choice: "{{[sentence_good, sentence_bad]}}" doc_to_choice: "{{[sentence_good, sentence_bad]}}"
num_fewshot: 0
should_decontaminate: true should_decontaminate: true
doc_to_decontamination_query: "{{sentence_good}} {{sentence_bad}}" doc_to_decontamination_query: "{{sentence_good}} {{sentence_bad}}"
metric_list: metric_list:
......
# Generated by utils.py # Generated by utils.py
dataset_name: adjunct_island dataset_name: adjunct_island
include: template_yaml include: _template_yaml
task: blimp_adjunct_island task: blimp_adjunct_island
# Generated by utils.py # Generated by utils.py
dataset_name: anaphor_gender_agreement dataset_name: anaphor_gender_agreement
include: template_yaml include: _template_yaml
task: blimp_anaphor_gender_agreement task: blimp_anaphor_gender_agreement
# Generated by utils.py # Generated by utils.py
dataset_name: anaphor_number_agreement dataset_name: anaphor_number_agreement
include: template_yaml include: _template_yaml
task: blimp_anaphor_number_agreement task: blimp_anaphor_number_agreement
# Generated by utils.py # Generated by utils.py
dataset_name: animate_subject_passive dataset_name: animate_subject_passive
include: template_yaml include: _template_yaml
task: blimp_animate_subject_passive task: blimp_animate_subject_passive
# Generated by utils.py # Generated by utils.py
dataset_name: animate_subject_trans dataset_name: animate_subject_trans
include: template_yaml include: _template_yaml
task: blimp_animate_subject_trans task: blimp_animate_subject_trans
# Generated by utils.py # Generated by utils.py
dataset_name: causative dataset_name: causative
include: template_yaml include: _template_yaml
task: blimp_causative task: blimp_causative
# Generated by utils.py # Generated by utils.py
dataset_name: complex_NP_island dataset_name: complex_NP_island
include: template_yaml include: _template_yaml
task: blimp_complex_NP_island task: blimp_complex_NP_island
# Generated by utils.py # Generated by utils.py
dataset_name: coordinate_structure_constraint_complex_left_branch dataset_name: coordinate_structure_constraint_complex_left_branch
include: template_yaml include: _template_yaml
task: blimp_coordinate_structure_constraint_complex_left_branch task: blimp_coordinate_structure_constraint_complex_left_branch
# Generated by utils.py # Generated by utils.py
dataset_name: coordinate_structure_constraint_object_extraction dataset_name: coordinate_structure_constraint_object_extraction
include: template_yaml include: _template_yaml
task: blimp_coordinate_structure_constraint_object_extraction task: blimp_coordinate_structure_constraint_object_extraction
# Generated by utils.py # Generated by utils.py
dataset_name: determiner_noun_agreement_1 dataset_name: determiner_noun_agreement_1
include: template_yaml include: _template_yaml
task: blimp_determiner_noun_agreement_1 task: blimp_determiner_noun_agreement_1
# Generated by utils.py # Generated by utils.py
dataset_name: determiner_noun_agreement_2 dataset_name: determiner_noun_agreement_2
include: template_yaml include: _template_yaml
task: blimp_determiner_noun_agreement_2 task: blimp_determiner_noun_agreement_2
# Generated by utils.py # Generated by utils.py
dataset_name: determiner_noun_agreement_irregular_1 dataset_name: determiner_noun_agreement_irregular_1
include: template_yaml include: _template_yaml
task: blimp_determiner_noun_agreement_irregular_1 task: blimp_determiner_noun_agreement_irregular_1
# Generated by utils.py # Generated by utils.py
dataset_name: determiner_noun_agreement_irregular_2 dataset_name: determiner_noun_agreement_irregular_2
include: template_yaml include: _template_yaml
task: blimp_determiner_noun_agreement_irregular_2 task: blimp_determiner_noun_agreement_irregular_2
# Generated by utils.py # Generated by utils.py
dataset_name: determiner_noun_agreement_with_adj_2 dataset_name: determiner_noun_agreement_with_adj_2
include: template_yaml include: _template_yaml
task: blimp_determiner_noun_agreement_with_adj_2 task: blimp_determiner_noun_agreement_with_adj_2
# Generated by utils.py # Generated by utils.py
dataset_name: determiner_noun_agreement_with_adj_irregular_1 dataset_name: determiner_noun_agreement_with_adj_irregular_1
include: template_yaml include: _template_yaml
task: blimp_determiner_noun_agreement_with_adj_irregular_1 task: blimp_determiner_noun_agreement_with_adj_irregular_1
# Generated by utils.py # Generated by utils.py
dataset_name: determiner_noun_agreement_with_adj_irregular_2 dataset_name: determiner_noun_agreement_with_adj_irregular_2
include: template_yaml include: _template_yaml
task: blimp_determiner_noun_agreement_with_adj_irregular_2 task: blimp_determiner_noun_agreement_with_adj_irregular_2
# Generated by utils.py # Generated by utils.py
dataset_name: determiner_noun_agreement_with_adjective_1 dataset_name: determiner_noun_agreement_with_adjective_1
include: template_yaml include: _template_yaml
task: blimp_determiner_noun_agreement_with_adjective_1 task: blimp_determiner_noun_agreement_with_adjective_1
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment