merged with latest update

ab96fc7e · lintangsutawika · bf2517cc · 8680e938 · ab96fc7e · ab96fc7e
Commit ab96fc7e authored Feb 20, 2024 by lintangsutawika
20 changed files
--- a/lm_eval/tasks/bbh/cot_zeroshot/tracking_shuffled_objects_seven_objects.yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/tracking_shuffled_objects_seven_objects.yaml
 "dataset_name": "tracking_shuffled_objects_seven_objects"
 "description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n"
-"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
 "include": "_cot_zeroshot_template_yaml"
 "task": "bbh_cot_zeroshot_tracking_shuffled_objects_seven_objects"
+filter_list:
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: -1
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: "(\\([A-Z]\\))"
+      - function: "take_first"
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/cot_zeroshot/tracking_shuffled_objects_three_objects.yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/tracking_shuffled_objects_three_objects.yaml
 "dataset_name": "tracking_shuffled_objects_three_objects"
 "description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n"
-"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
 "include": "_cot_zeroshot_template_yaml"
 "task": "bbh_cot_zeroshot_tracking_shuffled_objects_three_objects"
+filter_list:
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: -1
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: "(\\([A-Z]\\))"
+      - function: "take_first"
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/cot_zeroshot/utils.py
+++ b/lm_eval/tasks/bbh/cot_zeroshot/utils.py
+import collections
+import re
+import sys
+
+import unicodedata
+
+from lm_eval.filters.extraction import RegexFilter, Filter
+
+
+class ExtendedRegexFilter(RegexFilter):
+    punct_tbl = dict.fromkeys(i for i in range(sys.maxunicode)
+                              if unicodedata.category(chr(i)).startswith('P'))
+
+    def __init__(
+            self, regex_pattern: str = r"#### (\-?[0-9\.\,]+)", group_select=0, fallback: str = "[invalid]",
+            ignore_case=False, ignore_punctuation=False, regexes_to_ignore=None,
+    ) -> None:
+        super().__init__(regex_pattern, group_select, fallback)
+        self.ignore_case = ignore_case
+        self.ignore_punctuation = ignore_punctuation
+        self.regexes_to_ignore = regexes_to_ignore
+
+    def filter_ignores(self, st):
+        if self.regexes_to_ignore is not None:
+            for s in self.regexes_to_ignore:
+                st = re.sub(s, "", st)
+
+        if self.ignore_case:
+            st = st.lower()
+
+        if self.ignore_punctuation:
+            # https://stackoverflow.com/a/266162
+            st = st.translate(self.punct_tbl)
+        return st
+
+    def find_match(self, regex, resp, convert_dict={}):
+        match = regex.findall(resp)
+        if match:
+            match = match[self.group_select]
+            if isinstance(match, tuple):
+                match = [m for m in match if m][0]
+            match = match.strip()
+            if match and match in convert_dict:
+                match = convert_dict[match]
+        return match
+
+
+class MapRegexFilter(ExtendedRegexFilter):
+    def __init__(
+            self, regex_pattern_to_value: dict = {}, group_select=0, fallback: str = "[invalid]",
+            ignore_case=False, ignore_punctuation=False, regexes_to_ignore=None,
+    ) -> None:
+        """
+        regex_pattern_to_value: Match the regex pattern and change the result into the value
+        group_select: Selects the (group_select)th match from the findall result. We use the whole regex_patterns, concatenated by |
+        ignore_case: Lowers the case of response before matching with the given regex
+        ignore_punctuation: Remove the punctuation before matching with the given regex
+        regexes_to_ignore: Remove these regexes before matching with the given regex
+        """
+        super().__init__('|'.join(list(regex_pattern_to_value.keys())), group_select, fallback, ignore_case, ignore_punctuation, regexes_to_ignore)
+        self.regex_to_value = {re.compile(r): v for r, v in regex_pattern_to_value.items()}
+
+    def apply(self, resps, docs):
+        filtered_resps = []
+
+        for r in resps:
+            filtered = []
+            for resp in r:
+                whole_match_considering_group_select = self.find_match(self.regex, self.filter_ignores(resp))
+                if whole_match_considering_group_select:
+                    for regex, mapped_value in self.regex_to_value.items():
+                        match = self.find_match(regex, self.filter_ignores(whole_match_considering_group_select))
+                        if match:
+                            match = mapped_value
+                            break
+                if not whole_match_considering_group_select or not match:
+                    match = self.fallback
+
+                filtered.append(match)
+            filtered_resps.append(filtered)
+
+        return filtered_resps
+
+
+class NumberParseRegexFilter(ExtendedRegexFilter):
+    def apply(self, resps, docs):
+        # here, we assume we have a list, in which each element is
+        # a list of model responses for some particular input/target pair.
+        # so we process each of these (same input/target response sets)
+        # independently (and keep them a list.)
+        filtered_resps = []
+        import regex
+        from word2number import w2n
+        # https://www.reddit.com/r/regex/comments/11a38uk/parsing_numbers_written_out_as_english_words
+        english_number_regex = regex.compile(
+            "((?:(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?:|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion)(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?:|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion|[^\S\r\n]|,|and|&)+)?(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion))")
+
+        for r in resps:
+            filtered = []
+            for resp in r:
+                match = self.find_match(self.regex, resp)
+                if not match:
+                    match = self.find_match(english_number_regex, resp.lower())
+                    if match:
+                        match = str(w2n.word_to_num(match))
+                if not match:
+                    match = self.fallback
+                filtered.append(match)
+            filtered_resps.append(filtered)
+
+        return filtered_resps
+
+
+class WordSortFilter(Filter):
+    """ """
+
+    def apply(self, resps, docs):
+        filtered_resps = []
+
+        for r, doc in zip(resps, docs):
+            words = doc['input'].split("List:")[1].strip().split()
+            regex = re.compile('|'.join([f"\\b{w}\\b" for w in words]))
+            filtered = []
+            for resp in r:
+                match = regex.findall(resp)
+                match.reverse()
+                ordered_words = reversed(collections.OrderedDict(zip(match, [None] * len(match))))
+                filtered.append(' '.join(ordered_words))
+            filtered_resps.append(filtered)
+
+        return filtered_resps
+
+
+class MultiChoiceRegexFilter(ExtendedRegexFilter):
+
+    def __init__(self, *args, **kwargs):
+        """
+        regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
+                        - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
+                        - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices.
+        group_select: Selects the (group_select)th match from the findall result.
+        ignore_case: Ignores the case during step 1 matching
+        ignore_punctuation: Remove the punctuation during step 1 matching
+        regexes_to_ignore: Remove these regexes during step 1 matching
+        """
+        super().__init__(*args, **kwargs)
+
+    def apply(self, resps, docs):
+        # here, we assume we have a list, in which each element is
+        # a list of model responses for some particular input/target pair.
+        # so we process each of these (same input/target response sets)
+        # independently (and keep them a list.)
+
+        filtered_resps = []
+
+        for r, doc in zip(resps, docs):
+            fallback_regexes = []
+            choice_to_alpha = {}
+            next_alpha = 'A'
+
+            without_paren_fallback_regexes = []
+            without_paren_to_target = {}
+
+            multiple_choices_regex = re.compile(r"\([A-Z]\)([^\n^(]*)")
+            match = multiple_choices_regex.findall(doc['input'])
+            for m in match:
+                m = self.filter_ignores(m.strip())
+                fallback_regexes.append(f"{re.escape(m)}")
+                choice_to_alpha[m] = f"({next_alpha})"
+
+                without_paren_fallback_regexes.append(next_alpha)
+                without_paren_to_target[next_alpha] = f"({next_alpha})"
+
+                next_alpha = chr(ord(next_alpha) + 1)
+            fallback_regex = re.compile('|'.join(fallback_regexes))
+            without_paren_fallback_regex = '|'.join(without_paren_fallback_regexes)
+            without_paren_fallback_regex = re.compile(f":[\s]*({without_paren_fallback_regex})")
+
+            filtered = []
+            for resp in r:
+                match = self.find_match(self.regex, resp)
+                if not match:
+                    match = self.find_match(fallback_regex, self.filter_ignores(resp), choice_to_alpha)
+                    if not match:
+                        match = self.find_match(without_paren_fallback_regex, resp, without_paren_to_target)
+                if not match:
+                    match = self.fallback
+                filtered.append(match)
+            filtered_resps.append(filtered)
+
+        return filtered_resps
--- a/lm_eval/tasks/bbh/cot_zeroshot/web_of_lies.yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/web_of_lies.yaml
 "dataset_name": "web_of_lies"
 "description": "Evaluate a random boolean function expressed as a word problem.\n\n"
-"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
 "include": "_cot_zeroshot_template_yaml"
 "task": "bbh_cot_zeroshot_web_of_lies"
+filter_list:
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.MapRegexFilter
+        group_select: -1
+        ignore_case: true
+        regex_pattern_to_value:
+          \b(no|does not tell the truth|is not telling the truth)\b: "no"
+          \b(yes|tells the truth|is telling the truth)\b: "yes"
+      - function: "take_first"
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/cot_zeroshot/word_sorting.yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/word_sorting.yaml
 "dataset_name": "word_sorting"
 "description": "Sort a list of words.\n\n"
-"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n"
+"doc_to_text": "Q: {{input}}\nA: Let's think step by step."
 "include": "_cot_zeroshot_template_yaml"
 "task": "bbh_cot_zeroshot_word_sorting"
+filter_list:
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.WordSortFilter
+      - function: "take_first"
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/zeroshot/_zeroshot_template_yaml
+++ b/lm_eval/tasks/bbh/zeroshot/_zeroshot_template_yaml
@@ -7,16 +7,22 @@ metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
-    # ignore_case: true
+    ignore_case: true
    # ignore_punctuation: true
+    regexes_to_ignore:
+      - "\\.$"
+      - ","
+      - "\n"
+      - "\\\\"
+      - '"'
 generation_kwargs:
  until:
    - "</s>"
    - "Q:"
-    - "\n\n"
+    - "<|im_end|>"
    - "<0x0A>"
  do_sample: false
  temperature: 0.0
 num_fewshot: 0
 metadata:
-  version: 1.0
+  version: 2.0
--- a/lm_eval/tasks/bbh/zeroshot/boolean_expressions.yaml
+++ b/lm_eval/tasks/bbh/zeroshot/boolean_expressions.yaml
@@ -3,3 +3,14 @@
 "doc_to_text": "Q: {{input}}\nA:"
 "include": "_zeroshot_template_yaml"
 "task": "bbh_zeroshot_boolean_expressions"
+
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "take_first"
+  - name: "flexible-extract"
+    filter:
+      - function: "regex"
+        group_select: 0
+        regex_pattern: "\\b(True|False)\\b"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/zeroshot/causal_judgement.yaml
+++ b/lm_eval/tasks/bbh/zeroshot/causal_judgement.yaml
@@ -3,3 +3,14 @@
 "doc_to_text": "Q: {{input}}\nA:"
 "include": "_zeroshot_template_yaml"
 "task": "bbh_zeroshot_causal_judgement"
+
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "take_first"
+  - name: "flexible-extract"
+    filter:
+      - function: "regex"
+        group_select: 0
+        regex_pattern: "\\b(Yes|No|yes|no)\\b"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/zeroshot/date_understanding.yaml
+++ b/lm_eval/tasks/bbh/zeroshot/date_understanding.yaml
@@ -3,3 +3,16 @@
 "doc_to_text": "Q: {{input}}\nA:"
 "include": "_zeroshot_template_yaml"
 "task": "bbh_zeroshot_date_understanding"
+
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "take_first"
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: 0
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: "(\\([A-Z]\\))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/zeroshot/disambiguation_qa.yaml
+++ b/lm_eval/tasks/bbh/zeroshot/disambiguation_qa.yaml
@@ -3,3 +3,16 @@
 "doc_to_text": "Q: {{input}}\nA:"
 "include": "_zeroshot_template_yaml"
 "task": "bbh_zeroshot_disambiguation_qa"
+
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "take_first"
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: 0
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: "(\\([A-Z]\\))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/zeroshot/dyck_languages.yaml
+++ b/lm_eval/tasks/bbh/zeroshot/dyck_languages.yaml
@@ -3,3 +3,13 @@
 "doc_to_text": "Q: {{input}}\nA:"
 "include": "_zeroshot_template_yaml"
 "task": "bbh_zeroshot_dyck_languages"
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "take_first"
+  - name: "flexible-extract"
+    filter:
+      - function: "regex"
+        group_select: 0
+        regex_pattern: "(?<= )([\" \\[\\(<{}>\\)\\]]+)|([\" \\[\\(<{}>\\)\\]]+)"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/zeroshot/formal_fallacies.yaml
+++ b/lm_eval/tasks/bbh/zeroshot/formal_fallacies.yaml
@@ -3,3 +3,14 @@
 "doc_to_text": "Q: {{input}}\nA:"
 "include": "_zeroshot_template_yaml"
 "task": "bbh_zeroshot_formal_fallacies"
+
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "take_first"
+  - name: "flexible-extract"
+    filter:
+      - function: "regex"
+        group_select: 0
+        regex_pattern: "\\b(valid|invalid)\\b"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/zeroshot/geometric_shapes.yaml
+++ b/lm_eval/tasks/bbh/zeroshot/geometric_shapes.yaml
@@ -3,3 +3,16 @@
 "doc_to_text": "Q: {{input}}\nA:"
 "include": "_zeroshot_template_yaml"
 "task": "bbh_zeroshot_geometric_shapes"
+
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "take_first"
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: 0
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: "(\\([A-Z]\\))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/zeroshot/hyperbaton.yaml
+++ b/lm_eval/tasks/bbh/zeroshot/hyperbaton.yaml
@@ -3,3 +3,16 @@
 "doc_to_text": "Q: {{input}}\nA:"
 "include": "_zeroshot_template_yaml"
 "task": "bbh_zeroshot_hyperbaton"
+
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "take_first"
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: 0
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: "(\\([A-Z]\\))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/zeroshot/logical_deduction_five_objects.yaml
+++ b/lm_eval/tasks/bbh/zeroshot/logical_deduction_five_objects.yaml
@@ -3,3 +3,15 @@
 "doc_to_text": "Q: {{input}}\nA:"
 "include": "_zeroshot_template_yaml"
 "task": "bbh_zeroshot_logical_deduction_five_objects"
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "take_first"
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: 0
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: "(\\([A-Z]\\))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/zeroshot/logical_deduction_seven_objects.yaml
+++ b/lm_eval/tasks/bbh/zeroshot/logical_deduction_seven_objects.yaml
@@ -3,3 +3,15 @@
 "doc_to_text": "Q: {{input}}\nA:"
 "include": "_zeroshot_template_yaml"
 "task": "bbh_zeroshot_logical_deduction_seven_objects"
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "take_first"
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: 0
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: "(\\([A-Z]\\))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/zeroshot/logical_deduction_three_objects.yaml
+++ b/lm_eval/tasks/bbh/zeroshot/logical_deduction_three_objects.yaml
@@ -3,3 +3,15 @@
 "doc_to_text": "Q: {{input}}\nA:"
 "include": "_zeroshot_template_yaml"
 "task": "bbh_zeroshot_logical_deduction_three_objects"
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "take_first"
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: 0
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: "(\\([A-Z]\\))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/zeroshot/movie_recommendation.yaml
+++ b/lm_eval/tasks/bbh/zeroshot/movie_recommendation.yaml
@@ -3,3 +3,15 @@
 "doc_to_text": "Q: {{input}}\nA:"
 "include": "_zeroshot_template_yaml"
 "task": "bbh_zeroshot_movie_recommendation"
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "take_first"
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: 0
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: "(\\([A-Z]\\))"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/zeroshot/multistep_arithmetic_two.yaml
+++ b/lm_eval/tasks/bbh/zeroshot/multistep_arithmetic_two.yaml
@@ -3,3 +3,14 @@
 "doc_to_text": "Q: {{input}}\nA:"
 "include": "_zeroshot_template_yaml"
 "task": "bbh_zeroshot_multistep_arithmetic_two"
+
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "take_first"
+  - name: "flexible-extract"
+    filter:
+      - function: !function utils.NumberParseRegexFilter
+        group_select: 0
+        regex_pattern: "([-0-9]+)"
+      - function: "take_first"
--- a/lm_eval/tasks/bbh/zeroshot/navigate.yaml
+++ b/lm_eval/tasks/bbh/zeroshot/navigate.yaml
@@ -3,3 +3,13 @@
 "doc_to_text": "Q: {{input}}\nA:"
 "include": "_zeroshot_template_yaml"
 "task": "bbh_zeroshot_navigate"
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "take_first"
+  - name: "flexible-extract"
+    filter:
+      - function: "regex"
+        group_select: 0
+        regex_pattern: "\\b(Yes|No|yes|no)\\b"
+      - function: "take_first"