Commit 52f75f0e authored by lintangsutawika's avatar lintangsutawika
Browse files

Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into versioning

parents 331d7c51 b072bb0d
......@@ -81,7 +81,7 @@ class TaskConfig(dict):
fewshot_delimiter: str = "\n\n"
fewshot_config: dict = None
# runtime configuration options
num_fewshot: int = 0
num_fewshot: int = -1
# scoring options
metric_list: list = None
output_type: str = "generate_until"
......
......@@ -134,13 +134,17 @@ def simple_evaluate(
config["generation_kwargs"].update(gen_kwargs)
if num_fewshot is not None:
if config["num_fewshot"] > 0:
if config["num_fewshot"] == 0:
eval_logger.info(
f"num_fewshot has been set to 0 for {task_name} in its config. Manual configuration will be ignored."
)
else:
default_num_fewshot = config["num_fewshot"]
eval_logger.warning(
f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
)
task_obj._config["num_fewshot"] = num_fewshot
task_obj._config["num_fewshot"] = num_fewshot
if check_integrity:
run_task_tests(task_list=tasks)
......@@ -233,6 +237,8 @@ def evaluate(
# store the ordering of tasks and groups
task_order = collections.defaultdict(int)
task_group_alias = collections.defaultdict(dict)
# store num-fewshot value per task
num_fewshot = collections.defaultdict(int)
# get lists of each type of request
for task_name, task in task_dict.items():
......@@ -251,6 +257,12 @@ def evaluate(
versions[task_name] = task.VERSION
configs[task_name] = dict(task.dump_config())
if "num_fewshot" in configs[task_name]:
n_shot = configs[task_name]["num_fewshot"]
else:
n_shot = -1
num_fewshot[task_name] = n_shot
if "task_alias" in configs[task_name]:
task_group_alias[task_name] = configs[task_name]["task_alias"]
......@@ -612,11 +624,15 @@ def evaluate(
else:
groups_agg[group]["alias"] = tab_string + group
for group_name, task_list in task_hierarchy.items():
num_fewshot[group_name] = num_fewshot[task_list[0]]
results_dict = {
"results": dict(results_agg.items()),
**({"groups": dict(groups_agg.items())} if bool(groups_agg) else {}),
"configs": dict(sorted(configs.items())),
"versions": dict(sorted(versions.items())),
"n-shot": dict(sorted(num_fewshot.items())),
}
if log_samples:
results_dict["samples"] = dict(samples)
......
......@@ -5,6 +5,7 @@ validation_split: train
doc_to_text: ""
doc_to_target: 0
doc_to_choice: "{{[sentence_good, sentence_bad]}}"
num_fewshot: 0
should_decontaminate: true
doc_to_decontamination_query: "{{sentence_good}} {{sentence_bad}}"
metric_list:
......
# Generated by utils.py
dataset_name: adjunct_island
include: template_yaml
include: _template_yaml
task: blimp_adjunct_island
# Generated by utils.py
dataset_name: anaphor_gender_agreement
include: template_yaml
include: _template_yaml
task: blimp_anaphor_gender_agreement
# Generated by utils.py
dataset_name: anaphor_number_agreement
include: template_yaml
include: _template_yaml
task: blimp_anaphor_number_agreement
# Generated by utils.py
dataset_name: animate_subject_passive
include: template_yaml
include: _template_yaml
task: blimp_animate_subject_passive
# Generated by utils.py
dataset_name: animate_subject_trans
include: template_yaml
include: _template_yaml
task: blimp_animate_subject_trans
# Generated by utils.py
dataset_name: causative
include: template_yaml
include: _template_yaml
task: blimp_causative
# Generated by utils.py
dataset_name: complex_NP_island
include: template_yaml
include: _template_yaml
task: blimp_complex_NP_island
# Generated by utils.py
dataset_name: coordinate_structure_constraint_complex_left_branch
include: template_yaml
include: _template_yaml
task: blimp_coordinate_structure_constraint_complex_left_branch
# Generated by utils.py
dataset_name: coordinate_structure_constraint_object_extraction
include: template_yaml
include: _template_yaml
task: blimp_coordinate_structure_constraint_object_extraction
# Generated by utils.py
dataset_name: determiner_noun_agreement_1
include: template_yaml
include: _template_yaml
task: blimp_determiner_noun_agreement_1
# Generated by utils.py
dataset_name: determiner_noun_agreement_2
include: template_yaml
include: _template_yaml
task: blimp_determiner_noun_agreement_2
# Generated by utils.py
dataset_name: determiner_noun_agreement_irregular_1
include: template_yaml
include: _template_yaml
task: blimp_determiner_noun_agreement_irregular_1
# Generated by utils.py
dataset_name: determiner_noun_agreement_irregular_2
include: template_yaml
include: _template_yaml
task: blimp_determiner_noun_agreement_irregular_2
# Generated by utils.py
dataset_name: determiner_noun_agreement_with_adj_2
include: template_yaml
include: _template_yaml
task: blimp_determiner_noun_agreement_with_adj_2
# Generated by utils.py
dataset_name: determiner_noun_agreement_with_adj_irregular_1
include: template_yaml
include: _template_yaml
task: blimp_determiner_noun_agreement_with_adj_irregular_1
# Generated by utils.py
dataset_name: determiner_noun_agreement_with_adj_irregular_2
include: template_yaml
include: _template_yaml
task: blimp_determiner_noun_agreement_with_adj_irregular_2
# Generated by utils.py
dataset_name: determiner_noun_agreement_with_adjective_1
include: template_yaml
include: _template_yaml
task: blimp_determiner_noun_agreement_with_adjective_1
Markdown is supported
Attach a file by drag & drop or click to upload.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.