Merge branch 'main' into inverse-scaling-tasks

60c9c170 · haileyschoelkopf · 4b2d565b · b4cd85d4
Commit 60c9c170 authored May 29, 2024 by haileyschoelkopf
20 changed files
--- a/lm_eval/tasks/basqueglue/wic.yaml
+++ b/lm_eval/tasks/basqueglue/wic.yaml
+group: basque-glue
+task: wiceu
+dataset_path: orai-nlp/basqueGLUE
+dataset_name: wic
+output_type: multiple_choice
+validation_split: validation
+test_split: test
+process_docs: !function utils.process_wic_docs
+doc_to_text: "1. esaldia: {{sentence1}}\n2. esaldia: {{sentence2}}\nGaldera: Aurreko bi esaldietan, \"{{word}}\" hitzak esanahi berdina du?\nErantzuna:"
+doc_to_target: label
+doc_to_choice: ['ez', 'bai']
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  - version: 1.0
--- a/lm_eval/tasks/bigbench/generate_tasks.py
+++ b/lm_eval/tasks/bigbench/generate_tasks.py
 import os

+import datasets
 import yaml


@@ -173,6 +174,11 @@ all_subtasks = [
    "word_unscrambling",
 ]

+skip_tasks = [
+    "simple_arithmetic_json_multiple_choice",
+    "simple_arithmetic_multiple_targets_json",
+]
+

 def main() -> None:
    for path, task_type in zip(
@@ -183,11 +189,29 @@ def main() -> None:
        for task in all_subtasks:
            file_name = f"{task}.yaml"
            try:
+                template_file = task_type
+                if path == "multiple_choice":
+                    print(f"Checking {task} for multiple choices")
+                    if task in skip_tasks:
+                        continue
+                    data = datasets.load_dataset("hails/bigbench", task + "_zero_shot")
+                    multiple_choice_targets = data["default"][0][
+                        "multiple_choice_targets"
+                    ]
+                    if len(multiple_choice_targets) == 0:
+                        continue
+                    else:
+                        template_file = "multiple_choice_template_b_yaml"
+                        if set(data["default"][0]["targets"]) < set(
+                            multiple_choice_targets
+                        ):
+                            template_file = "multiple_choice_template_a_yaml"
+
                with open(f"{path}/{file_name}", "w", encoding="utf-8") as f:
                    f.write("# Generated by utils.py\n")
                    yaml.dump(
                        {
-                            "include": f"../{task_type}",
+                            "include": f"../{template_file}",
                            "task": "bigbench_"
                            + task
                            + "_{}".format(task_type.split("_template_yaml")[0]),

--- a/lm_eval/tasks/bigbench/multiple_choice/abstract_narrative_understanding.yaml
+++ b/lm_eval/tasks/bigbench/multiple_choice/abstract_narrative_understanding.yaml
 # Generated by utils.py
 dataset_name: abstract_narrative_understanding_zero_shot
-include: ../multiple_choice_template_yaml
+include: ../multiple_choice_template_a_yaml
 task: bigbench_abstract_narrative_understanding_multiple_choice
--- a/lm_eval/tasks/bigbench/multiple_choice/anachronisms.yaml
+++ b/lm_eval/tasks/bigbench/multiple_choice/anachronisms.yaml
 # Generated by utils.py
 dataset_name: anachronisms_zero_shot
-include: ../multiple_choice_template_yaml
+include: ../multiple_choice_template_a_yaml
 task: bigbench_anachronisms_multiple_choice
--- a/lm_eval/tasks/bigbench/multiple_choice/analogical_similarity.yaml
+++ b/lm_eval/tasks/bigbench/multiple_choice/analogical_similarity.yaml
 # Generated by utils.py
 dataset_name: analogical_similarity_zero_shot
-include: ../multiple_choice_template_yaml
+include: ../multiple_choice_template_a_yaml
 task: bigbench_analogical_similarity_multiple_choice
--- a/lm_eval/tasks/bigbench/multiple_choice/analytic_entailment.yaml
+++ b/lm_eval/tasks/bigbench/multiple_choice/analytic_entailment.yaml
 # Generated by utils.py
 dataset_name: analytic_entailment_zero_shot
-include: ../multiple_choice_template_yaml
+include: ../multiple_choice_template_a_yaml
 task: bigbench_analytic_entailment_multiple_choice
--- a/lm_eval/tasks/bigbench/multiple_choice/arithmetic.yaml
+++ b/lm_eval/tasks/bigbench/multiple_choice/arithmetic.yaml
 # Generated by utils.py
 dataset_name: arithmetic_zero_shot
-include: ../multiple_choice_template_yaml
+include: ../multiple_choice_template_a_yaml
 task: bigbench_arithmetic_multiple_choice
--- a/lm_eval/tasks/bigbench/multiple_choice/ascii_word_recognition.yaml
+++ b/lm_eval/tasks/bigbench/multiple_choice/ascii_word_recognition.yaml
-# Generated by utils.py
-dataset_name: ascii_word_recognition_zero_shot
-include: ../multiple_choice_template_yaml
-task: bigbench_ascii_word_recognition_multiple_choice
--- a/lm_eval/tasks/bigbench/multiple_choice/authorship_verification.yaml
+++ b/lm_eval/tasks/bigbench/multiple_choice/authorship_verification.yaml
 # Generated by utils.py
 dataset_name: authorship_verification_zero_shot
-include: ../multiple_choice_template_yaml
+include: ../multiple_choice_template_a_yaml
 task: bigbench_authorship_verification_multiple_choice
--- a/lm_eval/tasks/bigbench/multiple_choice/auto_categorization.yaml
+++ b/lm_eval/tasks/bigbench/multiple_choice/auto_categorization.yaml
-# Generated by utils.py
-dataset_name: auto_categorization_zero_shot
-include: ../multiple_choice_template_yaml
-task: bigbench_auto_categorization_multiple_choice
--- a/lm_eval/tasks/bigbench/multiple_choice/auto_debugging.yaml
+++ b/lm_eval/tasks/bigbench/multiple_choice/auto_debugging.yaml
-# Generated by utils.py
-dataset_name: auto_debugging_zero_shot
-include: ../multiple_choice_template_yaml
-task: bigbench_auto_debugging_multiple_choice
--- a/lm_eval/tasks/bigbench/multiple_choice/bbq_lite_json.yaml
+++ b/lm_eval/tasks/bigbench/multiple_choice/bbq_lite_json.yaml
 # Generated by utils.py
 dataset_name: bbq_lite_json_zero_shot
-include: ../multiple_choice_template_yaml
+include: ../multiple_choice_template_a_yaml
 task: bigbench_bbq_lite_json_multiple_choice
--- a/lm_eval/tasks/bigbench/multiple_choice/bridging_anaphora_resolution_barqa.yaml
+++ b/lm_eval/tasks/bigbench/multiple_choice/bridging_anaphora_resolution_barqa.yaml
-# Generated by utils.py
-dataset_name: bridging_anaphora_resolution_barqa_zero_shot
-include: ../multiple_choice_template_yaml
-task: bigbench_bridging_anaphora_resolution_barqa_multiple_choice
--- a/lm_eval/tasks/bigbench/multiple_choice/causal_judgement.yaml
+++ b/lm_eval/tasks/bigbench/multiple_choice/causal_judgement.yaml
-# Generated by utils.py
-dataset_name: causal_judgment_zero_shot
-include: ../multiple_choice_template_yaml
-task: bigbench_causal_judgement_multiple_choice
--- a/lm_eval/tasks/bigbench/multiple_choice/causal_judgment.yaml
+++ b/lm_eval/tasks/bigbench/multiple_choice/causal_judgment.yaml
 # Generated by utils.py
 dataset_name: causal_judgment_zero_shot
-include: ../multiple_choice_template_yaml
+include: ../multiple_choice_template_a_yaml
 task: bigbench_causal_judgment_multiple_choice
--- a/lm_eval/tasks/bigbench/multiple_choice/cause_and_effect.yaml
+++ b/lm_eval/tasks/bigbench/multiple_choice/cause_and_effect.yaml
 # Generated by utils.py
 dataset_name: cause_and_effect_zero_shot
-include: ../multiple_choice_template_yaml
+include: ../multiple_choice_template_a_yaml
 task: bigbench_cause_and_effect_multiple_choice
--- a/lm_eval/tasks/bigbench/multiple_choice/checkmate_in_one.yaml
+++ b/lm_eval/tasks/bigbench/multiple_choice/checkmate_in_one.yaml
 # Generated by utils.py
 dataset_name: checkmate_in_one_zero_shot
-include: ../multiple_choice_template_yaml
+include: ../multiple_choice_template_a_yaml
 task: bigbench_checkmate_in_one_multiple_choice
--- a/lm_eval/tasks/bigbench/multiple_choice/chess_state_tracking.yaml
+++ b/lm_eval/tasks/bigbench/multiple_choice/chess_state_tracking.yaml
-# Generated by utils.py
-dataset_name: chess_state_tracking_zero_shot
-include: ../multiple_choice_template_yaml
-task: bigbench_chess_state_tracking_multiple_choice
--- a/lm_eval/tasks/bigbench/multiple_choice/chinese_remainder_theorem.yaml
+++ b/lm_eval/tasks/bigbench/multiple_choice/chinese_remainder_theorem.yaml
-# Generated by utils.py
-dataset_name: chinese_remainder_theorem_zero_shot
-include: ../multiple_choice_template_yaml
-task: bigbench_chinese_remainder_theorem_multiple_choice
--- a/lm_eval/tasks/bigbench/multiple_choice/cifar10_classification.yaml
+++ b/lm_eval/tasks/bigbench/multiple_choice/cifar10_classification.yaml
 # Generated by utils.py
 dataset_name: cifar10_classification_zero_shot
-include: ../multiple_choice_template_yaml
+include: ../multiple_choice_template_a_yaml
 task: bigbench_cifar10_classification_multiple_choice