update bbh, gsm8k, mmlu parsing logic and prompts (Orca2 bbh_cot_zeroshot 0% -> 42%) (#1356)

* update bbh, gsm8k, mmlu parsing logic and prompts
* remove the formatting prompt (bbh) + minor update (mmlu)
* update bbh, gsm8k, mmlu zeroshot, revert fewshots
* update bbh, gsm8k, mmlu version, forward changes to gsm8k-cot
* remove take_last, update to use docs parameters
* add newline
* ruff formatting
* Update pyproject.toml
* fix format

---------

Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com>

update bbh, gsm8k, mmlu parsing logic and prompts (Orca2 bbh_cot_zeroshot 0% -> 42%) (#1356)
* update bbh, gsm8k, mmlu parsing logic and prompts
* remove the formatting prompt (bbh) + minor update (mmlu)
* update bbh, gsm8k, mmlu zeroshot, revert fewshots
* update bbh, gsm8k, mmlu version, forward changes to gsm8k-cot
* remove take_last, update to use docs parameters
* add newline
* ruff formatting
* Update pyproject.toml
* fix format

---------

Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com>
89deeeaf · thnkinbtfly · GitHub · 19cbb292 · 89deeeaf · 89deeeaf
Unverified Commit 89deeeaf authored Feb 20, 2024 by thnkinbtfly Committed by GitHub Feb 19, 2024
7 changed files
--- a/lm_eval/tasks/gsm8k/gsm8k-cot.yaml
+++ b/lm_eval/tasks/gsm8k/gsm8k-cot.yaml
@@ -25,20 +25,27 @@ metric_list:
      - ","
      - "\\$"
      - "(?s).*#### "
-      - "\n\n"
+      - "\\.$"
 generation_kwargs:
  until:
    - "Q:"
-    - "\n\n"
+    - "</s>"
+    - "<|im_end|>"
  do_sample: false
 repeats: 1
 num_fewshot: 0
 filter_list:
-  - name: "get-answer"
+  - name: "strict-match"
    filter:
      - function: "regex"
        regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)."
      - function: "take_first"
+  - name: "flexible-extract"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: "(-?[$0-9.,]{2,})|(-?[0-9]+)"
+      - function: "take_first"
 metadata:
-  version: 2.0
+  version: 3.0
  num_fewshot: 8
--- a/lm_eval/tasks/gsm8k/gsm8k.yaml
+++ b/lm_eval/tasks/gsm8k/gsm8k.yaml
@@ -19,19 +19,27 @@ metric_list:
      - ","
      - "\\$"
      - "(?s).*#### "
+      - "\\.$"
 generation_kwargs:
  until:
-    - "\n\n"
    - "Question:"
+    - "</s>"
+    - "<|im_end|>"
  do_sample: false
  temperature: 0.0
 repeats: 1
 num_fewshot: 5
 filter_list:
-  - name: "get-answer"
+  - name: "strict-match"
    filter:
      - function: "regex"
        regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
      - function: "take_first"
+  - name: "flexible-extract"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: "(-?[$0-9.,]{2,})|(-?[0-9]+)"
+      - function: "take_first"
 metadata:
-  version: 2.0
+  version: 3.0
--- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml
@@ -5,14 +5,24 @@ output_type: generate_until
 doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step."
 doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
 filter_list:
-  - name: "get-answer"
+  - name: "strict-match"
    filter:
      - function: "regex"
        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
      - function: "take_first"
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: -1
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: "(\\([A-Z]\\))"
+      - function: "take_first"
 generation_kwargs:
  until:
    - "</s>"
+    - "Q:"
+    - "<|im_end|>"
  do_sample: false
  temperature: 0.0
 num_fewshot: 0
@@ -23,4 +33,4 @@ metric_list:
    ignore_case: true
    ignore_punctuation: true
 metadata:
-  version: 0.0
+  version: 1.0
--- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/utils.py
+++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/utils.py
+import re
+import sys
+
+import unicodedata
+
+from lm_eval.filters.extraction import RegexFilter
+
+
+class MultiChoiceRegexFilter(RegexFilter):
+    """Extract a multiple-choice answer label such as ``(A)`` from model responses.
+
+    The configured ``regex_pattern`` is tried first. When it finds nothing, two
+    fallbacks derived from the document's ``choices`` are tried in order:
+
+    1. search the normalized response for the text of one of the choices and
+       map it back to its letter, e.g. ``"paris"`` -> ``"(B)"``;
+    2. search the raw response for a bare letter following a colon, e.g. ``": B"``.
+
+    If every step fails, ``fallback`` is emitted for that response.
+    """
+
+    # Translation table that deletes every Unicode punctuation code point.
+    # Building it scans the whole code space, so it is created lazily and
+    # shared instead of being rebuilt on every ``apply`` call.
+    _punct_tbl = None
+
+    def __init__(
+            self, regex_pattern: str = r"#### (\-?[0-9\.\,]+)", group_select=0, fallback: str = "[invalid]",
+            ignore_case=False, ignore_punctuation=False, regexes_to_ignore=None,
+    ) -> None:
+        r"""
+        regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
+                        - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
+                        - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices.
+        group_select: Selects the (group_select)th match from the findall result.
+        fallback: Value emitted when none of the matching steps finds an answer.
+        ignore_case: Ignores the case during step 1 matching
+        ignore_punctuation: Remove the punctuation during step 1 matching
+        regexes_to_ignore: Remove these regexes during step 1 matching
+        """
+        # NOTE(review): ``apply`` reads ``self.regex``, ``self.group_select`` and
+        # ``self.fallback``; these are presumably set by ``RegexFilter.__init__``.
+        super().__init__(regex_pattern, group_select, fallback)
+        self.ignore_case = ignore_case
+        self.ignore_punctuation = ignore_punctuation
+        self.regexes_to_ignore = regexes_to_ignore
+
+    @classmethod
+    def _punctuation_table(cls):
+        """Return the cached ``str.translate`` table that strips punctuation."""
+        if cls._punct_tbl is None:
+            cls._punct_tbl = dict.fromkeys(
+                i for i in range(sys.maxunicode)
+                if unicodedata.category(chr(i)).startswith("P")
+            )
+        return cls._punct_tbl
+
+    def apply(self, resps, docs):
+        """Map every model response to an answer label.
+
+        resps: a list in which each element is the list of model responses for
+               one input/target pair.
+        docs: the documents paired with ``resps``; each must provide a
+              ``"choices"`` sequence of answer strings.
+
+        Returns a list shaped like ``resps`` holding, for each response, the
+        extracted label or ``self.fallback`` when nothing could be extracted.
+        """
+
+        def find_match(regex, resp, convert_dict=None):
+            # Returns the selected match (optionally translated through
+            # ``convert_dict``) or an empty string when there is no usable match.
+            found = regex.findall(resp)
+            if not found:
+                return ""
+            match = found[self.group_select]
+            if isinstance(match, tuple):
+                # With several capture groups findall yields tuples; keep the
+                # first non-empty group and treat an all-empty tuple as a miss
+                # instead of raising IndexError.
+                match = next((m for m in match if m), "")
+            match = match.strip()
+            if match and convert_dict:
+                match = convert_dict.get(match, match)
+            return match
+
+        def filter_ignores(st):
+            # Normalization applied to both the choices and the response
+            # before step 1 so that they compare on equal footing.
+            if self.regexes_to_ignore is not None:
+                for s in self.regexes_to_ignore:
+                    st = re.sub(s, "", st)
+
+            if self.ignore_case:
+                st = st.lower()
+
+            if self.ignore_punctuation:
+                # https://stackoverflow.com/a/266162
+                st = st.translate(self._punctuation_table())
+            return st
+
+        filtered_resps = []
+
+        for r, doc in zip(resps, docs):
+            choice_patterns = []
+            choice_to_alpha = {}
+
+            letters = []
+            without_paren_to_target = {}
+
+            for idx, c in enumerate(doc["choices"]):
+                letter = chr(ord("A") + idx)
+                target = f"({letter})"
+
+                m = filter_ignores(c.strip())
+                if m:
+                    # A choice that normalizes to "" would add an empty
+                    # alternative that matches everywhere and hides the real
+                    # choices, so it is left out of the step 1 pattern.
+                    choice_patterns.append(re.escape(m))
+                    choice_to_alpha[m] = target
+
+                letters.append(letter)
+                without_paren_to_target[letter] = target
+
+            fallback_regex = re.compile("|".join(choice_patterns)) if choice_patterns else None
+            # Raw f-string: ``\s`` must reach the regex engine untouched.
+            without_paren_fallback_regex = re.compile(rf":[\s]*({'|'.join(letters)})")
+
+            filtered = []
+            for resp in r:
+                match = find_match(self.regex, resp)
+                if not match and fallback_regex is not None:
+                    # step 1: look for the text of a choice in the response
+                    match = find_match(fallback_regex, filter_ignores(resp), choice_to_alpha)
+                if not match:
+                    # step 2: look for a bare letter after a colon
+                    match = find_match(without_paren_fallback_regex, resp, without_paren_to_target)
+                if not match:
+                    match = self.fallback
+                filtered.append(match)
+            filtered_resps.append(filtered)
+
+        return filtered_resps
--- a/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml
@@ -5,12 +5,26 @@ fewshot_split: dev
 output_type: generate_until
 doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: "
 doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "take_first"
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: 0
+        regex_pattern: "(\\([A-Z]\\))"
+        ignore_case: true
+        ignore_punctuation: true
+      - function: "take_first"
 generation_kwargs:
  until:
    - "</s>"
+    - "Q:"
+    - "<|im_end|>"
 metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
 metadata:
-  version: 0.0
+  version: 1.0
--- a/lm_eval/tasks/mmlu/flan_n_shot/generative/utils.py
+++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/utils.py
+import re
+import sys
+
+import unicodedata
+
+from lm_eval.filters.extraction import RegexFilter
+
+
+class MultiChoiceRegexFilter(RegexFilter):
+    """Extract a multiple-choice answer label such as ``(A)`` from model responses.
+
+    The configured ``regex_pattern`` is tried first. When it finds nothing, two
+    fallbacks derived from the document's ``choices`` are tried in order:
+
+    1. search the normalized response for the text of one of the choices and
+       map it back to its letter, e.g. ``"paris"`` -> ``"(B)"``;
+    2. search the raw response for a bare letter following a colon, e.g. ``": B"``.
+
+    If every step fails, ``fallback`` is emitted for that response.
+    """
+
+    # Translation table that deletes every Unicode punctuation code point.
+    # Building it scans the whole code space, so it is created lazily and
+    # shared instead of being rebuilt on every ``apply`` call.
+    _punct_tbl = None
+
+    def __init__(
+            self, regex_pattern: str = r"#### (\-?[0-9\.\,]+)", group_select=0, fallback: str = "[invalid]",
+            ignore_case=False, ignore_punctuation=False, regexes_to_ignore=None,
+    ) -> None:
+        r"""
+        regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
+                        - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
+                        - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices.
+        group_select: Selects the (group_select)th match from the findall result.
+        fallback: Value emitted when none of the matching steps finds an answer.
+        ignore_case: Ignores the case during step 1 matching
+        ignore_punctuation: Remove the punctuation during step 1 matching
+        regexes_to_ignore: Remove these regexes during step 1 matching
+        """
+        # NOTE(review): ``apply`` reads ``self.regex``, ``self.group_select`` and
+        # ``self.fallback``; these are presumably set by ``RegexFilter.__init__``.
+        super().__init__(regex_pattern, group_select, fallback)
+        self.ignore_case = ignore_case
+        self.ignore_punctuation = ignore_punctuation
+        self.regexes_to_ignore = regexes_to_ignore
+
+    @classmethod
+    def _punctuation_table(cls):
+        """Return the cached ``str.translate`` table that strips punctuation."""
+        if cls._punct_tbl is None:
+            cls._punct_tbl = dict.fromkeys(
+                i for i in range(sys.maxunicode)
+                if unicodedata.category(chr(i)).startswith("P")
+            )
+        return cls._punct_tbl
+
+    def apply(self, resps, docs):
+        """Map every model response to an answer label.
+
+        resps: a list in which each element is the list of model responses for
+               one input/target pair.
+        docs: the documents paired with ``resps``; each must provide a
+              ``"choices"`` sequence of answer strings.
+
+        Returns a list shaped like ``resps`` holding, for each response, the
+        extracted label or ``self.fallback`` when nothing could be extracted.
+        """
+
+        def find_match(regex, resp, convert_dict=None):
+            # Returns the selected match (optionally translated through
+            # ``convert_dict``) or an empty string when there is no usable match.
+            found = regex.findall(resp)
+            if not found:
+                return ""
+            match = found[self.group_select]
+            if isinstance(match, tuple):
+                # With several capture groups findall yields tuples; keep the
+                # first non-empty group and treat an all-empty tuple as a miss
+                # instead of raising IndexError.
+                match = next((m for m in match if m), "")
+            match = match.strip()
+            if match and convert_dict:
+                match = convert_dict.get(match, match)
+            return match
+
+        def filter_ignores(st):
+            # Normalization applied to both the choices and the response
+            # before step 1 so that they compare on equal footing.
+            if self.regexes_to_ignore is not None:
+                for s in self.regexes_to_ignore:
+                    st = re.sub(s, "", st)
+
+            if self.ignore_case:
+                st = st.lower()
+
+            if self.ignore_punctuation:
+                # https://stackoverflow.com/a/266162
+                st = st.translate(self._punctuation_table())
+            return st
+
+        filtered_resps = []
+
+        for r, doc in zip(resps, docs):
+            choice_patterns = []
+            choice_to_alpha = {}
+
+            letters = []
+            without_paren_to_target = {}
+
+            for idx, c in enumerate(doc["choices"]):
+                letter = chr(ord("A") + idx)
+                target = f"({letter})"
+
+                m = filter_ignores(c.strip())
+                if m:
+                    # A choice that normalizes to "" would add an empty
+                    # alternative that matches everywhere and hides the real
+                    # choices, so it is left out of the step 1 pattern.
+                    choice_patterns.append(re.escape(m))
+                    choice_to_alpha[m] = target
+
+                letters.append(letter)
+                without_paren_to_target[letter] = target
+
+            fallback_regex = re.compile("|".join(choice_patterns)) if choice_patterns else None
+            # Raw f-string: ``\s`` must reach the regex engine untouched.
+            without_paren_fallback_regex = re.compile(rf":[\s]*({'|'.join(letters)})")
+
+            filtered = []
+            for resp in r:
+                match = find_match(self.regex, resp)
+                if not match and fallback_regex is not None:
+                    # step 1: look for the text of a choice in the response
+                    match = find_match(fallback_regex, filter_ignores(resp), choice_to_alpha)
+                if not match:
+                    # step 2: look for a bare letter after a colon
+                    match = find_match(without_paren_fallback_regex, resp, without_paren_to_target)
+                if not match:
+                    match = self.fallback
+                filtered.append(match)
+            filtered_resps.append(filtered)
+
+        return filtered_resps
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,6 +36,7 @@ dependencies = [
    "tqdm-multiprocess",
    "transformers>=4.1",
    "zstandard",
+    "word2number",
 ]

 [tool.setuptools.packages.find]