Merge branch 'main' into weight_by_size

9822b06e · Lintang Sutawika · GitHub · 51f27158 · b177c82c · 9822b06e
Unverified commit 9822b06e, authored Mar 01, 2024 by Lintang Sutawika; committed by GitHub, Mar 01, 2024
20 changed files
--- a/lm_eval/tasks/ammlu/ammlu_professional_accounting.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_professional_accounting.yaml
+"dataset_name": "professional_accounting"
+"description": "فم بعملية التقييم في مجال علوم أخرى \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_professional_accounting"
--- a/lm_eval/tasks/ammlu/ammlu_professional_law.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_professional_law.yaml
+"dataset_name": "professional_law"
+"description": "فم بعملية التقييم في مجال العلوم الانسانية \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_professional_law"
--- a/lm_eval/tasks/ammlu/ammlu_professional_medicine.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_professional_medicine.yaml
+"dataset_name": "professional_medicine"
+"description": "فم بعملية التقييم في مجال علوم أخرى \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_professional_medicine"
--- a/lm_eval/tasks/ammlu/ammlu_professional_psychology.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_professional_psychology.yaml
+"dataset_name": "professional_psychology"
+"description": "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_professional_psychology"
--- a/lm_eval/tasks/ammlu/ammlu_public_relations.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_public_relations.yaml
+"dataset_name": "public_relations"
+"description": "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_public_relations"
--- a/lm_eval/tasks/ammlu/ammlu_security_studies.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_security_studies.yaml
+"dataset_name": "security_studies"
+"description": "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_security_studies"
--- a/lm_eval/tasks/ammlu/ammlu_sociology.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_sociology.yaml
+"dataset_name": "sociology"
+"description": "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_sociology"
--- a/lm_eval/tasks/ammlu/ammlu_us_foreign_policy.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_us_foreign_policy.yaml
+"dataset_name": "us_foreign_policy"
+"description": "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_us_foreign_policy"
--- a/lm_eval/tasks/ammlu/ammlu_virology.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_virology.yaml
+"dataset_name": "virology"
+"description": "فم بعملية التقييم في مجال علوم أخرى \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_virology"
--- a/lm_eval/tasks/ammlu/ammlu_world_religions.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_world_religions.yaml
+"dataset_name": "world_religions"
+"description": "فم بعملية التقييم في مجال العلوم الانسانية \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_world_religions"
--- a/lm_eval/tasks/arc/README.md
+++ b/lm_eval/tasks/arc/README.md
@@ -38,7 +38,7 @@ Homepage: https://allenai.org/data/arc
 #### Tasks
 * `arc_easy`
-* `arc_challange`
+* `arc_challenge`
 ### Checklist

--- a/lm_eval/tasks/bbh/_generate_configs.py
+++ b/lm_eval/tasks/bbh/_generate_configs.py
 """
 Take in a YAML, and output all other splits with this YAML
 """
+import argparse
 import os
 import re
-import yaml
-import requests
-import argparse
 import datasets
+import requests
+import yaml
 from tqdm import tqdm
 from lm_eval import utils

--- a/lm_eval/tasks/bbh/cot_fewshot/_cot_fewshot_template_yaml
+++ b/lm_eval/tasks/bbh/cot_fewshot/_cot_fewshot_template_yaml
@@ -28,3 +28,4 @@ filter_list:
 num_fewshot: 0
 metadata:
  version: 2.0
+  num_fewshot: 3 # controls what is printed in n-shot
--- a/lm_eval/tasks/bbh/cot_zeroshot/_cot_zeroshot_template_yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/_cot_zeroshot_template_yaml
@@ -7,21 +7,21 @@ metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
-    # ignore_case: true
+    ignore_case: true
    # ignore_punctuation: true
+    regexes_to_ignore:
+      - "\\.$"
+      - ","
+      - "\\\\"
+      - "\n"
+      - '"'
 generation_kwargs:
  until:
    - "</s>"
-    - "Q"
+    - "Q:"
-    - "\n\n"
+    - "<|im_end|>"
  do_sample: false
  temperature: 0.0
-filter_list:
-  - name: "get-answer"
-    filter:
-      - function: "regex"
-        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
-      - function: "take_first"
 num_fewshot: 0
 metadata:
-  version: 1.0
+  version: 2.0
--- a/lm_eval/tasks/bbh/cot_zeroshot/boolean_expressions.yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/boolean_expressions.yaml
 "dataset_name": "boolean_expressions"
 "description": "Evaluate the result of a random Boolean expression.\n\n"
-"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
 "include": "_cot_zeroshot_template_yaml"
 "task": "bbh_cot_zeroshot_boolean_expressions"
+filter_list:
+  - name: "flexible-extract"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: "\\b(True|False)\\b"
+      - function: "take_first"
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/cot_zeroshot/causal_judgement.yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/causal_judgement.yaml
 "dataset_name": "causal_judgement"
 "description": "Answer questions about causal attribution.\n\n"
-"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
 "include": "_cot_zeroshot_template_yaml"
 "task": "bbh_cot_zeroshot_causal_judgement"
+filter_list:
+  - name: "flexible-extract"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: "\\b(Yes|No|yes|no)\\b"
+      - function: "take_first"
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/cot_zeroshot/date_understanding.yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/date_understanding.yaml
 "dataset_name": "date_understanding"
 "description": "Infer the date from context.\n\n"
-"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
 "include": "_cot_zeroshot_template_yaml"
 "task": "bbh_cot_zeroshot_date_understanding"
+filter_list:
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: -1
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: "(\\([A-Z]\\))"
+      - function: "take_first"
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/cot_zeroshot/disambiguation_qa.yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/disambiguation_qa.yaml
 "dataset_name": "disambiguation_qa"
 "description": "Clarify the meaning of sentences with ambiguous pronouns.\n\n"
-"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
 "include": "_cot_zeroshot_template_yaml"
 "task": "bbh_cot_zeroshot_disambiguation_qa"
+filter_list:
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: -1
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: "(\\([A-Z]\\))"
+      - function: "take_first"
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/cot_zeroshot/dyck_languages.yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/dyck_languages.yaml
 "dataset_name": "dyck_languages"
 "description": "Correctly close a Dyck-n word.\n\n"
-"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
 "include": "_cot_zeroshot_template_yaml"
 "task": "bbh_cot_zeroshot_dyck_languages"
+filter_list:
+  - name: "flexible-extract"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: "(?<= )([\" \\[\\(<{}>\\)\\]]+)|([\" \\[\\(<{}>\\)\\]]+)"
+      - function: "take_first"
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/cot_zeroshot/formal_fallacies.yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/formal_fallacies.yaml
 "dataset_name": "formal_fallacies"
 "description": "Distinguish deductively valid arguments from formal fallacies.\n\n"
-"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
 "include": "_cot_zeroshot_template_yaml"
 "task": "bbh_cot_zeroshot_formal_fallacies"
+filter_list:
+  - name: "flexible-extract"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: "\\b(valid|invalid)\\b"
+      - function: "take_first"
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
+      - function: "take_first"