update bbh, gsm8k, mmlu parsing logic and prompts (Orca2 bbh_cot_zeroshot 0% -> 42%) (#1356)

* update bbh, gsm8k, mmlu parsing logic and prompts
* remove the formatting prompt (bbh) + minor update (mmlu)
* update bbh, gsm8k, mmlu zeroshot, revert fewshots
* update bbh, gsm8k, mmlu version, forward changes to gsm8k-cot
* remove take_last, update to use docs parameters
* add newline
* ruff formatting
* Update pyproject.toml
* fix format

---------

Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com>

update bbh, gsm8k, mmlu parsing logic and prompts (Orca2 bbh_cot_zeroshot 0% -> 42%) (#1356)
* update bbh, gsm8k, mmlu parsing logic and prompts
* remove the formatting prompt (bbh) + minor update (mmlu)
* update bbh, gsm8k, mmlu zeroshot, revert fewshots
* update bbh, gsm8k, mmlu version, forward changes to gsm8k-cot
* remove take_last, update to use docs parameters
* add newline
* ruff formatting
* Update pyproject.toml
* fix format

---------

Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com>
89deeeaf · thnkinbtfly · GitHub · 19cbb292 · 89deeeaf · 89deeeaf
Unverified commit 89deeeaf, authored Feb 20, 2024 by thnkinbtfly; committed by GitHub Feb 19, 2024.
20 changed files
--- a/lm_eval/filters/extraction.py
+++ b/lm_eval/filters/extraction.py
@@ -7,7 +7,10 @@ class RegexFilter(Filter):
    """ """

    def __init__(
-        self, regex_pattern: str = r"#### (\-?[0-9\.\,]+)", fallback: str = "[invalid]"
+        self,
+        regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
+        group_select=0,
+        fallback: str = "[invalid]",
    ) -> None:
        """
        pass a string `regex` to run `re.compile(r"regex")` on.
@@ -15,6 +18,7 @@ class RegexFilter(Filter):
        """
        self.regex_pattern = regex_pattern
        self.regex = re.compile(regex_pattern)
+        self.group_select = group_select
        self.fallback = fallback

    def apply(self, resps, docs):
@@ -25,9 +29,12 @@ class RegexFilter(Filter):
        def filter_set(inst):
            filtered = []
            for resp in inst:
-                match = self.regex.search(resp)
+                match = self.regex.findall(resp)
                if match:
-                    match = match.group(1).strip()
+                    match = match[self.group_select]
+                    if isinstance(match, tuple):
+                        match = [m for m in match if m][0]
+                    match = match.strip()
                else:
                    match = self.fallback
                filtered.append(match)

--- a/lm_eval/tasks/bbh/cot_zeroshot/_cot_zeroshot_template_yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/_cot_zeroshot_template_yaml
@@ -7,21 +7,21 @@ metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
-    # ignore_case: true
+    ignore_case: true
    # ignore_punctuation: true
+    regexes_to_ignore:
+      - "\\.$"
+      - ","
+      - "\\\\"
+      - "\n"
+      - '"'
 generation_kwargs:
  until:
    - "</s>"
-    - "Q"
-    - "\n\n"
+    - "Q:"
+    - "<|im_end|>"
  do_sample: false
  temperature: 0.0
-filter_list:
-  - name: "get-answer"
-    filter:
-      - function: "regex"
-        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
-      - function: "take_first"
 num_fewshot: 0
 metadata:
-  version: 1.0
+  version: 2.0
--- a/lm_eval/tasks/bbh/cot_zeroshot/boolean_expressions.yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/boolean_expressions.yaml
 "dataset_name": "boolean_expressions"
 "description": "Evaluate the result of a random Boolean expression.\n\n"
-"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
 "include": "_cot_zeroshot_template_yaml"
 "task": "bbh_cot_zeroshot_boolean_expressions"
+
+filter_list:
+  - name: "flexible-extract"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: "\\b(True|False)\\b"
+      - function: "take_first"
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/cot_zeroshot/causal_judgement.yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/causal_judgement.yaml
 "dataset_name": "causal_judgement"
 "description": "Answer questions about causal attribution.\n\n"
-"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
 "include": "_cot_zeroshot_template_yaml"
 "task": "bbh_cot_zeroshot_causal_judgement"
+
+filter_list:
+  - name: "flexible-extract"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: "\\b(Yes|No|yes|no)\\b"
+      - function: "take_first"
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/cot_zeroshot/date_understanding.yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/date_understanding.yaml
 "dataset_name": "date_understanding"
 "description": "Infer the date from context.\n\n"
-"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
 "include": "_cot_zeroshot_template_yaml"
 "task": "bbh_cot_zeroshot_date_understanding"
+
+filter_list:
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: -1
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: "(\\([A-Z]\\))"
+      - function: "take_first"
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/cot_zeroshot/disambiguation_qa.yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/disambiguation_qa.yaml
 "dataset_name": "disambiguation_qa"
 "description": "Clarify the meaning of sentences with ambiguous pronouns.\n\n"
-"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
 "include": "_cot_zeroshot_template_yaml"
 "task": "bbh_cot_zeroshot_disambiguation_qa"
+
+filter_list:
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: -1
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: "(\\([A-Z]\\))"
+      - function: "take_first"
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/cot_zeroshot/dyck_languages.yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/dyck_languages.yaml
 "dataset_name": "dyck_languages"
 "description": "Correctly close a Dyck-n word.\n\n"
-"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
 "include": "_cot_zeroshot_template_yaml"
 "task": "bbh_cot_zeroshot_dyck_languages"
+filter_list:
+  - name: "flexible-extract"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: "(?<= )([\" \\[\\(<{}>\\)\\]]+)|([\" \\[\\(<{}>\\)\\]]+)"
+      - function: "take_first"
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/cot_zeroshot/formal_fallacies.yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/formal_fallacies.yaml
 "dataset_name": "formal_fallacies"
 "description": "Distinguish deductively valid arguments from formal fallacies.\n\n"
-"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
 "include": "_cot_zeroshot_template_yaml"
 "task": "bbh_cot_zeroshot_formal_fallacies"
+
+filter_list:
+  - name: "flexible-extract"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: "\\b(valid|invalid)\\b"
+      - function: "take_first"
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/cot_zeroshot/geometric_shapes.yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/geometric_shapes.yaml
 "dataset_name": "geometric_shapes"
 "description": "Name geometric shapes from their SVG paths.\n\n"
-"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
 "include": "_cot_zeroshot_template_yaml"
 "task": "bbh_cot_zeroshot_geometric_shapes"
+
+filter_list:
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: -1
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: "(\\([A-Z]\\))"
+      - function: "take_first"
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/cot_zeroshot/hyperbaton.yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/hyperbaton.yaml
 "dataset_name": "hyperbaton"
 "description": "Order adjectives correctly in English sentences.\n\n"
-"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
 "include": "_cot_zeroshot_template_yaml"
 "task": "bbh_cot_zeroshot_hyperbaton"
+
+filter_list:
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: -1
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: "(\\([A-Z]\\))"
+      - function: "take_first"
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/cot_zeroshot/logical_deduction_five_objects.yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/logical_deduction_five_objects.yaml
 "dataset_name": "logical_deduction_five_objects"
 "description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n"
-"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
 "include": "_cot_zeroshot_template_yaml"
 "task": "bbh_cot_zeroshot_logical_deduction_five_objects"
+filter_list:
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: -1
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: "(\\([A-Z]\\))"
+      - function: "take_first"
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/cot_zeroshot/logical_deduction_seven_objects.yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/logical_deduction_seven_objects.yaml
 "dataset_name": "logical_deduction_seven_objects"
 "description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n"
-"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
 "include": "_cot_zeroshot_template_yaml"
 "task": "bbh_cot_zeroshot_logical_deduction_seven_objects"
+filter_list:
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: -1
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: "(\\([A-Z]\\))"
+      - function: "take_first"
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/cot_zeroshot/logical_deduction_three_objects.yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/logical_deduction_three_objects.yaml
 "dataset_name": "logical_deduction_three_objects"
 "description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n"
-"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
 "include": "_cot_zeroshot_template_yaml"
 "task": "bbh_cot_zeroshot_logical_deduction_three_objects"
+filter_list:
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: -1
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: "(\\([A-Z]\\))"
+      - function: "take_first"
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/cot_zeroshot/movie_recommendation.yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/movie_recommendation.yaml
 "dataset_name": "movie_recommendation"
 "description": "Recommend movies similar to the given list of movies.\n\n"
-"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
 "include": "_cot_zeroshot_template_yaml"
 "task": "bbh_cot_zeroshot_movie_recommendation"
+filter_list:
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: -1
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: "(\\([A-Z]\\))"
+      - function: "take_first"
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/cot_zeroshot/multistep_arithmetic_two.yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/multistep_arithmetic_two.yaml
 "dataset_name": "multistep_arithmetic_two"
 "description": "Solve multi-step arithmetic problems.\n\n"
-"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
 "include": "_cot_zeroshot_template_yaml"
 "task": "bbh_cot_zeroshot_multistep_arithmetic_two"
+
+filter_list:
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.NumberParseRegexFilter
+        group_select: -1
+        regex_pattern: "([-0-9]+)"
+      - function: "take_first"
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
+      - function: "take_first"
+
--- a/lm_eval/tasks/bbh/cot_zeroshot/navigate.yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/navigate.yaml
 "dataset_name": "navigate"
 "description": "Given a series of navigation instructions, determine whether one would end up back at the starting point.\n\n"
-"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
 "include": "_cot_zeroshot_template_yaml"
 "task": "bbh_cot_zeroshot_navigate"
+filter_list:
+  - name: "flexible-extract"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: "\\b(Yes|No|yes|no)\\b"
+      - function: "take_first"
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/cot_zeroshot/object_counting.yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/object_counting.yaml
 "dataset_name": "object_counting"
 "description": "Questions that involve enumerating objects and asking the model to count them.\n\n"
-"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
 "include": "_cot_zeroshot_template_yaml"
 "task": "bbh_cot_zeroshot_object_counting"
+filter_list:
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.NumberParseRegexFilter
+        group_select: -1
+        regex_pattern: "([-0-9]+)"
+      - function: "take_first"
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
+      - function: "take_first"
+
--- a/lm_eval/tasks/bbh/cot_zeroshot/penguins_in_a_table.yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/penguins_in_a_table.yaml
 "dataset_name": "penguins_in_a_table"
 "description": "Answer questions about a table of penguins and their attributes.\n\n"
-"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
 "include": "_cot_zeroshot_template_yaml"
 "task": "bbh_cot_zeroshot_penguins_in_a_table"
+filter_list:
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: -1
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: "(\\([A-Z]\\))"
+      - function: "take_first"
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/cot_zeroshot/reasoning_about_colored_objects.yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/reasoning_about_colored_objects.yaml
 "dataset_name": "reasoning_about_colored_objects"
 "description": "Answer extremely simple questions about the colors of objects on a surface.\n\n"
-"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
 "include": "_cot_zeroshot_template_yaml"
 "task": "bbh_cot_zeroshot_reasoning_about_colored_objects"
+filter_list:
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: -1
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: "(\\([A-Z]\\))"
+      - function: "take_first"
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/cot_zeroshot/ruin_names.yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/ruin_names.yaml
 "dataset_name": "ruin_names"
 "description": "Select the humorous edit that 'ruins' the input movie or musical artist name.\n\n"
-"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
 "include": "_cot_zeroshot_template_yaml"
 "task": "bbh_cot_zeroshot_ruin_names"
+filter_list:
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: -1
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: "(\\([A-Z]\\))"
+      - function: "take_first"
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
+      - function: "take_first"