Commit f77a3a27 authored by lintangsutawika's avatar lintangsutawika
Browse files

Merge branch 'big-refactor' of...

Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into mmlu_subgroups
parents 109ed1c7 f8342178
...@@ -6,7 +6,7 @@ task: ...@@ -6,7 +6,7 @@ task:
use_prompt: promptsource:* use_prompt: promptsource:*
training_split: train training_split: train
validation_split: validation validation_split: validation
output_type: greedy_until output_type: generate_until
metric_list: metric_list:
- metric: exact_match - metric: exact_match
aggregation: mean aggregation: mean
...@@ -19,7 +19,7 @@ task: ...@@ -19,7 +19,7 @@ task:
use_prompt: promptsource:* use_prompt: promptsource:*
training_split: train training_split: train
validation_split: validation validation_split: validation
output_type: greedy_until output_type: generate_until
metric_list: metric_list:
- metric: exact_match - metric: exact_match
aggregation: mean aggregation: mean
...@@ -32,7 +32,7 @@ task: ...@@ -32,7 +32,7 @@ task:
use_prompt: promptsource:* use_prompt: promptsource:*
training_split: train training_split: train
validation_split: validation validation_split: validation
output_type: greedy_until output_type: generate_until
metric_list: metric_list:
- metric: exact_match - metric: exact_match
aggregation: mean aggregation: mean
...@@ -44,7 +44,7 @@ task: ...@@ -44,7 +44,7 @@ task:
use_prompt: promptsource:* use_prompt: promptsource:*
training_split: train training_split: train
validation_split: validation validation_split: validation
output_type: greedy_until output_type: generate_until
metric_list: metric_list:
- metric: exact_match - metric: exact_match
aggregation: mean aggregation: mean
...@@ -56,7 +56,7 @@ task: ...@@ -56,7 +56,7 @@ task:
use_prompt: promptsource:* use_prompt: promptsource:*
training_split: train_r1 training_split: train_r1
validation_split: dev_r1 validation_split: dev_r1
output_type: greedy_until output_type: generate_until
metric_list: metric_list:
- metric: exact_match - metric: exact_match
aggregation: mean aggregation: mean
...@@ -68,7 +68,7 @@ task: ...@@ -68,7 +68,7 @@ task:
use_prompt: promptsource:* use_prompt: promptsource:*
training_split: train_r2 training_split: train_r2
validation_split: dev_r2 validation_split: dev_r2
output_type: greedy_until output_type: generate_until
metric_list: metric_list:
- metric: exact_match - metric: exact_match
aggregation: mean aggregation: mean
...@@ -80,7 +80,7 @@ task: ...@@ -80,7 +80,7 @@ task:
use_prompt: promptsource:* use_prompt: promptsource:*
training_split: train_r3 training_split: train_r3
validation_split: dev_r3 validation_split: dev_r3
output_type: greedy_until output_type: generate_until
metric_list: metric_list:
- metric: exact_match - metric: exact_match
aggregation: mean aggregation: mean
...@@ -93,7 +93,7 @@ task: ...@@ -93,7 +93,7 @@ task:
use_prompt: promptsource:* use_prompt: promptsource:*
training_split: train training_split: train
validation_split: validation validation_split: validation
output_type: greedy_until output_type: generate_until
metric_list: metric_list:
- metric: exact_match - metric: exact_match
aggregation: mean aggregation: mean
...@@ -105,7 +105,7 @@ task: ...@@ -105,7 +105,7 @@ task:
use_prompt: promptsource:* use_prompt: promptsource:*
training_split: train training_split: train
validation_split: validation validation_split: validation
output_type: greedy_until output_type: generate_until
metric_list: metric_list:
- metric: exact_match - metric: exact_match
aggregation: mean aggregation: mean
...@@ -118,7 +118,7 @@ task: ...@@ -118,7 +118,7 @@ task:
use_prompt: promptsource:* use_prompt: promptsource:*
training_split: train training_split: train
validation_split: validation validation_split: validation
output_type: greedy_until output_type: generate_until
metric_list: metric_list:
- metric: exact_match - metric: exact_match
aggregation: mean aggregation: mean
......
...@@ -175,8 +175,8 @@ all_subtasks = [ ...@@ -175,8 +175,8 @@ all_subtasks = [
def main() -> None: def main() -> None:
for path, task_type in zip( for path, task_type in zip(
["multiple_choice", "greedy_until"], ["multiple_choice", "generate_until"],
["multiple_choice_template_yaml", "greedy_until_template_yaml"], ["multiple_choice_template_yaml", "generate_until_template_yaml"],
): ):
os.makedirs(path, exist_ok=True) os.makedirs(path, exist_ok=True)
for task in all_subtasks: for task in all_subtasks:
......
# Generated by utils.py # Generated by utils.py
dataset_name: abstract_narrative_understanding_zero_shot dataset_name: abstract_narrative_understanding_zero_shot
include: ../greedy_until_template_yaml include: ../generate_until_template_yaml
task: bigbench_abstract_narrative_understanding_greedy_until task: bigbench_abstract_narrative_understanding_generate_until
# Generated by utils.py # Generated by utils.py
dataset_name: anachronisms_zero_shot dataset_name: anachronisms_zero_shot
include: ../greedy_until_template_yaml include: ../generate_until_template_yaml
task: bigbench_anachronisms_greedy_until task: bigbench_anachronisms_generate_until
# Generated by utils.py # Generated by utils.py
dataset_name: analogical_similarity_zero_shot dataset_name: analogical_similarity_zero_shot
include: ../greedy_until_template_yaml include: ../generate_until_template_yaml
task: bigbench_analogical_similarity_greedy_until task: bigbench_analogical_similarity_generate_until
# Generated by utils.py # Generated by utils.py
dataset_name: analytic_entailment_zero_shot dataset_name: analytic_entailment_zero_shot
include: ../greedy_until_template_yaml include: ../generate_until_template_yaml
task: bigbench_analytic_entailment_greedy_until task: bigbench_analytic_entailment_generate_until
# Generated by utils.py # Generated by utils.py
dataset_name: arithmetic_zero_shot dataset_name: arithmetic_zero_shot
include: ../greedy_until_template_yaml include: ../generate_until_template_yaml
task: bigbench_arithmetic_greedy_until task: bigbench_arithmetic_generate_until
# Generated by utils.py # Generated by utils.py
dataset_name: ascii_word_recognition_zero_shot dataset_name: ascii_word_recognition_zero_shot
include: ../greedy_until_template_yaml include: ../generate_until_template_yaml
task: bigbench_ascii_word_recognition_greedy_until task: bigbench_ascii_word_recognition_generate_until
# Generated by utils.py # Generated by utils.py
dataset_name: authorship_verification_zero_shot dataset_name: authorship_verification_zero_shot
include: ../greedy_until_template_yaml include: ../generate_until_template_yaml
task: bigbench_authorship_verification_greedy_until task: bigbench_authorship_verification_generate_until
# Generated by utils.py # Generated by utils.py
dataset_name: auto_categorization_zero_shot dataset_name: auto_categorization_zero_shot
include: ../greedy_until_template_yaml include: ../generate_until_template_yaml
task: bigbench_auto_categorization_greedy_until task: bigbench_auto_categorization_generate_until
# Generated by utils.py # Generated by utils.py
dataset_name: auto_debugging_zero_shot dataset_name: auto_debugging_zero_shot
include: ../greedy_until_template_yaml include: ../generate_until_template_yaml
task: bigbench_auto_debugging_greedy_until task: bigbench_auto_debugging_generate_until
# Generated by utils.py # Generated by utils.py
dataset_name: bbq_lite_json_zero_shot dataset_name: bbq_lite_json_zero_shot
include: ../greedy_until_template_yaml include: ../generate_until_template_yaml
task: bigbench_bbq_lite_json_greedy_until task: bigbench_bbq_lite_json_generate_until
# Generated by utils.py # Generated by utils.py
dataset_name: bridging_anaphora_resolution_barqa_zero_shot dataset_name: bridging_anaphora_resolution_barqa_zero_shot
include: ../greedy_until_template_yaml include: ../generate_until_template_yaml
task: bigbench_bridging_anaphora_resolution_barqa_greedy_until task: bigbench_bridging_anaphora_resolution_barqa_generate_until
# Generated by utils.py # Generated by utils.py
dataset_name: causal_judgment_zero_shot dataset_name: causal_judgment_zero_shot
include: ../greedy_until_template_yaml include: ../generate_until_template_yaml
task: bigbench_causal_judgment_greedy_until task: bigbench_causal_judgment_generate_until
# Generated by utils.py # Generated by utils.py
dataset_name: cause_and_effect_zero_shot dataset_name: cause_and_effect_zero_shot
include: ../greedy_until_template_yaml include: ../generate_until_template_yaml
task: bigbench_cause_and_effect_greedy_until task: bigbench_cause_and_effect_generate_until
# Generated by utils.py # Generated by utils.py
dataset_name: checkmate_in_one_zero_shot dataset_name: checkmate_in_one_zero_shot
include: ../greedy_until_template_yaml include: ../generate_until_template_yaml
task: bigbench_checkmate_in_one_greedy_until task: bigbench_checkmate_in_one_generate_until
# Generated by utils.py # Generated by utils.py
dataset_name: chess_state_tracking_zero_shot dataset_name: chess_state_tracking_zero_shot
include: ../greedy_until_template_yaml include: ../generate_until_template_yaml
task: bigbench_chess_state_tracking_greedy_until task: bigbench_chess_state_tracking_generate_until
# Generated by utils.py # Generated by utils.py
dataset_name: chinese_remainder_theorem_zero_shot dataset_name: chinese_remainder_theorem_zero_shot
include: ../greedy_until_template_yaml include: ../generate_until_template_yaml
task: bigbench_chinese_remainder_theorem_greedy_until task: bigbench_chinese_remainder_theorem_generate_until
# Generated by utils.py # Generated by utils.py
dataset_name: cifar10_classification_zero_shot dataset_name: cifar10_classification_zero_shot
include: ../greedy_until_template_yaml include: ../generate_until_template_yaml
task: bigbench_cifar10_classification_greedy_until task: bigbench_cifar10_classification_generate_until
# Generated by utils.py # Generated by utils.py
dataset_name: code_line_description_zero_shot dataset_name: code_line_description_zero_shot
include: ../greedy_until_template_yaml include: ../generate_until_template_yaml
task: bigbench_code_line_description_greedy_until task: bigbench_code_line_description_generate_until
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment