added deepseekv2

74df9bea · zhaoying1 · 74df9bea · 74df9bea · 74df9bea · 74df9bea
Commit 74df9bea authored Sep 02, 2024 by zhaoying1
20 changed files
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/logical_deduction_three_objects.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/logical_deduction_three_objects.yaml
+# Zero-shot chain-of-thought config for the logical_deduction_three_objects subset.
+dataset_name: logical_deduction_three_objects
+description: "A logical deduction task which requires deducing the order of a sequence of objects.\n\n"
+doc_to_text: "Q: {{input}}\nA: Let's think step by step."
+include: _cot_zeroshot_template_yaml
+task: bbh_cot_zeroshot_logical_deduction_three_objects
+filter_list:
+  # Lenient extraction: last "(X)" option label found in the response.
+  - name: flexible-extract
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: -1
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: '(\([A-Z]\))'
+      - function: take_first
+  # Text after an "answer is" lead-in; (?=.) leaves off the line's last character.
+  - name: strict-match
+    filter:
+      - function: regex
+        regex_pattern: '((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))'
+      - function: take_first
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/movie_recommendation.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/movie_recommendation.yaml
+# Zero-shot chain-of-thought config for the movie_recommendation subset.
+dataset_name: movie_recommendation
+description: "Recommend movies similar to the given list of movies.\n\n"
+doc_to_text: "Q: {{input}}\nA: Let's think step by step."
+include: _cot_zeroshot_template_yaml
+task: bbh_cot_zeroshot_movie_recommendation
+filter_list:
+  # Lenient extraction: last "(X)" option label found in the response.
+  - name: flexible-extract
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: -1
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: '(\([A-Z]\))'
+      - function: take_first
+  # Text after an "answer is" lead-in; (?=.) leaves off the line's last character.
+  - name: strict-match
+    filter:
+      - function: regex
+        regex_pattern: '((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))'
+      - function: take_first
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/multistep_arithmetic_two.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/multistep_arithmetic_two.yaml
+# Zero-shot chain-of-thought config for the multistep_arithmetic_two subset.
+dataset_name: multistep_arithmetic_two
+description: "Solve multi-step arithmetic problems.\n\n"
+doc_to_text: "Q: {{input}}\nA: Let's think step by step."
+include: _cot_zeroshot_template_yaml
+task: bbh_cot_zeroshot_multistep_arithmetic_two
+
+filter_list:
+  # Lenient extraction: last run of digits/hyphens found in the response.
+  - name: flexible-extract
+    filter:
+      - function: !function utils.NumberParseRegexFilter
+        group_select: -1
+        regex_pattern: '([-0-9]+)'
+      - function: take_first
+  # Text after an "answer is" lead-in; (?=.) leaves off the line's last character.
+  - name: strict-match
+    filter:
+      - function: regex
+        regex_pattern: '((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))'
+      - function: take_first
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/navigate.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/navigate.yaml
+# Zero-shot chain-of-thought config for the navigate subset.
+dataset_name: navigate
+description: "Given a series of navigation instructions, determine whether one would end up back at the starting point.\n\n"
+doc_to_text: "Q: {{input}}\nA: Let's think step by step."
+include: _cot_zeroshot_template_yaml
+task: bbh_cot_zeroshot_navigate
+filter_list:
+  # Lenient extraction: a standalone Yes/No word in the response.
+  - name: flexible-extract
+    filter:
+      - function: regex
+        group_select: -1
+        regex_pattern: '\b(Yes|No|yes|no)\b'
+      - function: take_first
+  # Text after an "answer is" lead-in; (?=.) leaves off the line's last character.
+  - name: strict-match
+    filter:
+      - function: regex
+        regex_pattern: '((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))'
+      - function: take_first
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/object_counting.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/object_counting.yaml
+# Zero-shot chain-of-thought config for the object_counting subset.
+dataset_name: object_counting
+description: "Questions that involve enumerating objects and asking the model to count them.\n\n"
+doc_to_text: "Q: {{input}}\nA: Let's think step by step."
+include: _cot_zeroshot_template_yaml
+task: bbh_cot_zeroshot_object_counting
+filter_list:
+  # Lenient extraction: last run of digits/hyphens found in the response.
+  - name: flexible-extract
+    filter:
+      - function: !function utils.NumberParseRegexFilter
+        group_select: -1
+        regex_pattern: '([-0-9]+)'
+      - function: take_first
+  # Text after an "answer is" lead-in; (?=.) leaves off the line's last character.
+  - name: strict-match
+    filter:
+      - function: regex
+        regex_pattern: '((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))'
+      - function: take_first
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/penguins_in_a_table.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/penguins_in_a_table.yaml
+# Zero-shot chain-of-thought config for the penguins_in_a_table subset.
+dataset_name: penguins_in_a_table
+description: "Answer questions about a table of penguins and their attributes.\n\n"
+doc_to_text: "Q: {{input}}\nA: Let's think step by step."
+include: _cot_zeroshot_template_yaml
+task: bbh_cot_zeroshot_penguins_in_a_table
+filter_list:
+  # Lenient extraction: last "(X)" option label found in the response.
+  - name: flexible-extract
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: -1
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: '(\([A-Z]\))'
+      - function: take_first
+  # Text after an "answer is" lead-in; (?=.) leaves off the line's last character.
+  - name: strict-match
+    filter:
+      - function: regex
+        regex_pattern: '((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))'
+      - function: take_first
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/reasoning_about_colored_objects.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/reasoning_about_colored_objects.yaml
+# Zero-shot chain-of-thought config for the reasoning_about_colored_objects subset.
+dataset_name: reasoning_about_colored_objects
+description: "Answer extremely simple questions about the colors of objects on a surface.\n\n"
+doc_to_text: "Q: {{input}}\nA: Let's think step by step."
+include: _cot_zeroshot_template_yaml
+task: bbh_cot_zeroshot_reasoning_about_colored_objects
+filter_list:
+  # Lenient extraction: last "(X)" option label found in the response.
+  - name: flexible-extract
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: -1
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: '(\([A-Z]\))'
+      - function: take_first
+  # Text after an "answer is" lead-in; (?=.) leaves off the line's last character.
+  - name: strict-match
+    filter:
+      - function: regex
+        regex_pattern: '((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))'
+      - function: take_first
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/ruin_names.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/ruin_names.yaml
+# Zero-shot chain-of-thought config for the ruin_names subset.
+dataset_name: ruin_names
+description: "Select the humorous edit that 'ruins' the input movie or musical artist name.\n\n"
+doc_to_text: "Q: {{input}}\nA: Let's think step by step."
+include: _cot_zeroshot_template_yaml
+task: bbh_cot_zeroshot_ruin_names
+filter_list:
+  # Lenient extraction: last "(X)" option label found in the response.
+  - name: flexible-extract
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: -1
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: '(\([A-Z]\))'
+      - function: take_first
+  # Text after an "answer is" lead-in; (?=.) leaves off the line's last character.
+  - name: strict-match
+    filter:
+      - function: regex
+        regex_pattern: '((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))'
+      - function: take_first
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/salient_translation_error_detection.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/salient_translation_error_detection.yaml
+# Zero-shot chain-of-thought config for the salient_translation_error_detection subset.
+dataset_name: salient_translation_error_detection
+description: "Detect the type of error in an English translation of a German source sentence.\n\n"
+doc_to_text: "Q: {{input}}\nA: Let's think step by step."
+include: _cot_zeroshot_template_yaml
+task: bbh_cot_zeroshot_salient_translation_error_detection
+filter_list:
+  # Lenient extraction: last "(X)" option label found in the response.
+  - name: flexible-extract
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: -1
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: '(\([A-Z]\))'
+      - function: take_first
+  # Text after an "answer is" lead-in; (?=.) leaves off the line's last character.
+  - name: strict-match
+    filter:
+      - function: regex
+        regex_pattern: '((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))'
+      - function: take_first
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/snarks.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/snarks.yaml
+# Zero-shot chain-of-thought config for the snarks subset.
+dataset_name: snarks
+description: "Determine which of two sentences is sarcastic.\n\nAccording to Cambridge University Dictionary, sarcasm is \"the use of remarks that clearly mean the opposite of what they say, made in order to hurt someone's feelings or to criticize something in a humorous way.\" Sarcastic sentences often contain satirical or ironic utterances, hyperboles, ambivalent or witty remarks.\n\n"
+doc_to_text: "Q: {{input}}\nA: Let's think step by step."
+include: _cot_zeroshot_template_yaml
+task: bbh_cot_zeroshot_snarks
+filter_list:
+  # Lenient extraction: last "(X)" option label found in the response.
+  - name: flexible-extract
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: -1
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: '(\([A-Z]\))'
+      - function: take_first
+  # Text after an "answer is" lead-in; (?=.) leaves off the line's last character.
+  - name: strict-match
+    filter:
+      - function: regex
+        regex_pattern: '((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))'
+      - function: take_first
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/sports_understanding.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/sports_understanding.yaml
+# Zero-shot chain-of-thought config for the sports_understanding subset.
+dataset_name: sports_understanding
+description: "Determine whether an artificially constructed sentence relating to sports is plausible or not.\n\n"
+doc_to_text: "Q: {{input}}\nA: Let's think step by step."
+include: _cot_zeroshot_template_yaml
+task: bbh_cot_zeroshot_sports_understanding
+
+filter_list:
+  # Lenient extraction: map the last plausibility phrase onto "no"/"yes".
+  - name: flexible-extract
+    filter:
+      - function: !function utils.MapRegexFilter
+        group_select: -1
+        ignore_case: true
+        # Mapped values stay quoted so YAML 1.1 loaders keep them as strings,
+        # not booleans. Patterns are tried in the order listed.
+        regex_pattern_to_value:
+          '\b(no|not plausible)\b': "no"
+          '\b(yes|plausible)\b': "yes"
+      - function: take_first
+  # Text after an "answer is" lead-in; (?=.) leaves off the line's last character.
+  - name: strict-match
+    filter:
+      - function: regex
+        regex_pattern: '((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))'
+      - function: take_first
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/temporal_sequences.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/temporal_sequences.yaml
+# Zero-shot chain-of-thought config for the temporal_sequences subset.
+dataset_name: temporal_sequences
+description: "Task description: Answer questions about which times certain events could have occurred.\n\n"
+doc_to_text: "Q: {{input}}\nA: Let's think step by step."
+include: _cot_zeroshot_template_yaml
+task: bbh_cot_zeroshot_temporal_sequences
+filter_list:
+  # Lenient extraction: last "(X)" option label found in the response.
+  - name: flexible-extract
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: -1
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: '(\([A-Z]\))'
+      - function: take_first
+  # Text after an "answer is" lead-in; (?=.) leaves off the line's last character.
+  - name: strict-match
+    filter:
+      - function: regex
+        regex_pattern: '((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))'
+      - function: take_first
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/tracking_shuffled_objects_five_objects.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/tracking_shuffled_objects_five_objects.yaml
+# Zero-shot chain-of-thought config for the tracking_shuffled_objects_five_objects subset.
+dataset_name: tracking_shuffled_objects_five_objects
+description: "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n"
+doc_to_text: "Q: {{input}}\nA: Let's think step by step."
+include: _cot_zeroshot_template_yaml
+task: bbh_cot_zeroshot_tracking_shuffled_objects_five_objects
+filter_list:
+  # Lenient extraction: last "(X)" option label found in the response.
+  - name: flexible-extract
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: -1
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: '(\([A-Z]\))'
+      - function: take_first
+  # Text after an "answer is" lead-in; (?=.) leaves off the line's last character.
+  - name: strict-match
+    filter:
+      - function: regex
+        regex_pattern: '((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))'
+      - function: take_first
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/tracking_shuffled_objects_seven_objects.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/tracking_shuffled_objects_seven_objects.yaml
+# Zero-shot chain-of-thought config for the tracking_shuffled_objects_seven_objects subset.
+dataset_name: tracking_shuffled_objects_seven_objects
+description: "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n"
+doc_to_text: "Q: {{input}}\nA: Let's think step by step."
+include: _cot_zeroshot_template_yaml
+task: bbh_cot_zeroshot_tracking_shuffled_objects_seven_objects
+filter_list:
+  # Lenient extraction: last "(X)" option label found in the response.
+  - name: flexible-extract
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: -1
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: '(\([A-Z]\))'
+      - function: take_first
+  # Text after an "answer is" lead-in; (?=.) leaves off the line's last character.
+  - name: strict-match
+    filter:
+      - function: regex
+        regex_pattern: '((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))'
+      - function: take_first
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/tracking_shuffled_objects_three_objects.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/tracking_shuffled_objects_three_objects.yaml
+# Zero-shot chain-of-thought config for the tracking_shuffled_objects_three_objects subset.
+dataset_name: tracking_shuffled_objects_three_objects
+description: "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n"
+doc_to_text: "Q: {{input}}\nA: Let's think step by step."
+include: _cot_zeroshot_template_yaml
+task: bbh_cot_zeroshot_tracking_shuffled_objects_three_objects
+filter_list:
+  # Lenient extraction: last "(X)" option label found in the response.
+  - name: flexible-extract
+    filter:
+      - function: !function utils.MultiChoiceRegexFilter
+        group_select: -1
+        ignore_case: true
+        ignore_punctuation: true
+        regex_pattern: '(\([A-Z]\))'
+      - function: take_first
+  # Text after an "answer is" lead-in; (?=.) leaves off the line's last character.
+  - name: strict-match
+    filter:
+      - function: regex
+        regex_pattern: '((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))'
+      - function: take_first
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/utils.py
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/utils.py
+import collections
+import re
+import sys
+import unicodedata
+
+from lm_eval.filters.extraction import Filter, RegexFilter
+
+
+class ExtendedRegexFilter(RegexFilter):
+    punct_tbl = dict.fromkeys(
+        i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P")
+    )
+
+    def __init__(
+        self,
+        regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
+        group_select=0,
+        fallback: str = "[invalid]",
+        ignore_case=False,
+        ignore_punctuation=False,
+        regexes_to_ignore=None,
+    ) -> None:
+        super().__init__(regex_pattern, group_select, fallback)
+        self.ignore_case = ignore_case
+        self.ignore_punctuation = ignore_punctuation
+        self.regexes_to_ignore = regexes_to_ignore
+
+    def filter_ignores(self, st):
+        if self.regexes_to_ignore is not None:
+            for s in self.regexes_to_ignore:
+                st = re.sub(s, "", st)
+
+        if self.ignore_case:
+            st = st.lower()
+
+        if self.ignore_punctuation:
+            # https://stackoverflow.com/a/266162
+            st = st.translate(self.punct_tbl)
+        return st
+
+    def find_match(self, regex, resp, convert_dict={}):
+        match = regex.findall(resp)
+        if match:
+            match = match[self.group_select]
+            if isinstance(match, tuple):
+                match = [m for m in match if m][0]
+            match = match.strip()
+            if match and match in convert_dict:
+                match = convert_dict[match]
+        return match
+
+
+class MapRegexFilter(ExtendedRegexFilter):
+    def __init__(
+        self,
+        regex_pattern_to_value: dict = {},
+        group_select=0,
+        fallback: str = "[invalid]",
+        ignore_case=False,
+        ignore_punctuation=False,
+        regexes_to_ignore=None,
+    ) -> None:
+        """
+        regex_pattern_to_value: Match the regex pattern and change the result into the value
+        group_select: Selects the (group_select)th match from the findall result. We use the whole regex_patterns, concatenated by |
+        ignore_case: Lowers the case of response before matching with the given regex
+        ignore_punctuation: Remove the punctuation before matching with the given regex
+        regexes_to_ignore: Remove these regexes before matching with the given regex
+        """
+        super().__init__(
+            "|".join(list(regex_pattern_to_value.keys())),
+            group_select,
+            fallback,
+            ignore_case,
+            ignore_punctuation,
+            regexes_to_ignore,
+        )
+        self.regex_to_value = {
+            re.compile(r): v for r, v in regex_pattern_to_value.items()
+        }
+
+    def apply(self, resps, docs):
+        filtered_resps = []
+
+        for r in resps:
+            filtered = []
+            for resp in r:
+                whole_match_considering_group_select = self.find_match(
+                    self.regex, self.filter_ignores(resp)
+                )
+                if whole_match_considering_group_select:
+                    for regex, mapped_value in self.regex_to_value.items():
+                        match = self.find_match(
+                            regex,
+                            self.filter_ignores(whole_match_considering_group_select),
+                        )
+                        if match:
+                            match = mapped_value
+                            break
+                if not whole_match_considering_group_select or not match:
+                    match = self.fallback
+
+                filtered.append(match)
+            filtered_resps.append(filtered)
+
+        return filtered_resps
+
+
+class NumberParseRegexFilter(ExtendedRegexFilter):
+    def apply(self, resps, docs):
+        # here, we assume we have a list, in which each element is
+        # a list of model responses for some particular input/target pair.
+        # so we process each of these (same input/target response sets)
+        # independently (and keep them a list.)
+        filtered_resps = []
+        import regex
+        from word2number import w2n
+
+        # https://www.reddit.com/r/regex/comments/11a38uk/parsing_numbers_written_out_as_english_words
+        english_number_regex = regex.compile(
+            "((?:(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?:|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion)(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?:|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion|[^\S\r\n]|,|and|&)+)?(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion))"
+        )
+
+        for r in resps:
+            filtered = []
+            for resp in r:
+                match = self.find_match(self.regex, resp)
+                if not match:
+                    match = self.find_match(english_number_regex, resp.lower())
+                    if match:
+                        match = str(w2n.word_to_num(match))
+                if not match:
+                    match = self.fallback
+                filtered.append(match)
+            filtered_resps.append(filtered)
+
+        return filtered_resps
+
+
+class WordSortFilter(Filter):
+    """ """
+
+    def apply(self, resps, docs):
+        filtered_resps = []
+
+        for r, doc in zip(resps, docs):
+            words = doc["input"].split("List:")[1].strip().split()
+            regex = re.compile("|".join([f"\\b{w}\\b" for w in words]))
+            filtered = []
+            for resp in r:
+                match = regex.findall(resp)
+                match.reverse()
+                ordered_words = reversed(
+                    collections.OrderedDict(zip(match, [None] * len(match)))
+                )
+                filtered.append(" ".join(ordered_words))
+            filtered_resps.append(filtered)
+
+        return filtered_resps
+
+
+class MultiChoiceRegexFilter(ExtendedRegexFilter):
+    def __init__(self, *args, **kwargs):
+        """
+        regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
+                        - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
+                        - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices.
+        group_select: Selects the (group_select)th match from the findall result.
+        ignore_case: Ignores the case during step 1 matching
+        ignore_punctuation: Remove the punctuation during step 1 matching
+        regexes_to_ignore: Remove these regexes during step 1 matching
+        """
+        super().__init__(*args, **kwargs)
+
+    def apply(self, resps, docs):
+        # here, we assume we have a list, in which each element is
+        # a list of model responses for some particular input/target pair.
+        # so we process each of these (same input/target response sets)
+        # independently (and keep them a list.)
+
+        filtered_resps = []
+
+        for r, doc in zip(resps, docs):
+            fallback_regexes = []
+            choice_to_alpha = {}
+            next_alpha = "A"
+
+            without_paren_fallback_regexes = []
+            without_paren_to_target = {}
+
+            multiple_choices_regex = re.compile(r"\([A-Z]\)([^\n^(]*)")
+            match = multiple_choices_regex.findall(doc["input"])
+            for m in match:
+                m = self.filter_ignores(m.strip())
+                fallback_regexes.append(f"{re.escape(m)}")
+                choice_to_alpha[m] = f"({next_alpha})"
+
+                without_paren_fallback_regexes.append(next_alpha)
+                without_paren_to_target[next_alpha] = f"({next_alpha})"
+
+                next_alpha = chr(ord(next_alpha) + 1)
+            fallback_regex = re.compile("|".join(fallback_regexes))
+            without_paren_fallback_regex = "|".join(without_paren_fallback_regexes)
+            without_paren_fallback_regex = re.compile(
+                f":[\s]*({without_paren_fallback_regex})"
+            )
+
+            filtered = []
+            for resp in r:
+                match = self.find_match(self.regex, resp)
+                if not match:
+                    match = self.find_match(
+                        fallback_regex, self.filter_ignores(resp), choice_to_alpha
+                    )
+                    if not match:
+                        match = self.find_match(
+                            without_paren_fallback_regex, resp, without_paren_to_target
+                        )
+                if not match:
+                    match = self.fallback
+                filtered.append(match)
+            filtered_resps.append(filtered)
+
+        return filtered_resps
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/web_of_lies.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/web_of_lies.yaml
+# Zero-shot chain-of-thought config for the web_of_lies subset.
+dataset_name: web_of_lies
+description: "Evaluate a random boolean function expressed as a word problem.\n\n"
+doc_to_text: "Q: {{input}}\nA: Let's think step by step."
+include: _cot_zeroshot_template_yaml
+task: bbh_cot_zeroshot_web_of_lies
+filter_list:
+  # Lenient extraction: map the last truthfulness phrase onto "no"/"yes".
+  - name: flexible-extract
+    filter:
+      - function: !function utils.MapRegexFilter
+        group_select: -1
+        ignore_case: true
+        # Mapped values stay quoted so YAML 1.1 loaders keep them as strings,
+        # not booleans. Patterns are tried in the order listed.
+        regex_pattern_to_value:
+          '\b(no|does not tell the truth|is not telling the truth)\b': "no"
+          '\b(yes|tells the truth|is telling the truth)\b': "yes"
+      - function: take_first
+  # Text after an "answer is" lead-in; (?=.) leaves off the line's last character.
+  - name: strict-match
+    filter:
+      - function: regex
+        regex_pattern: '((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))'
+      - function: take_first
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/word_sorting.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/cot_zeroshot/word_sorting.yaml
+# Zero-shot chain-of-thought config for the word_sorting subset.
+dataset_name: word_sorting
+description: "Sort a list of words.\n\n"
+doc_to_text: "Q: {{input}}\nA: Let's think step by step."
+include: _cot_zeroshot_template_yaml
+task: bbh_cot_zeroshot_word_sorting
+filter_list:
+  # Lenient extraction: rebuild the word list from the response.
+  - name: flexible-extract
+    filter:
+      - function: !function utils.WordSortFilter
+      - function: take_first
+  # Text after an "answer is" lead-in; (?=.) leaves off the line's last character.
+  - name: strict-match
+    filter:
+      - function: regex
+        regex_pattern: '((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))'
+      - function: take_first
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml
+# Base settings for the bbh_fewshot group; task files pull them in through
+# `include: _fewshot_template_yaml`.
+group: bbh_fewshot
+dataset_path: lukaemon/bbh
+output_type: generate_until
+test_split: test
+doc_to_target: "{{target}}"
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    # ignore_case: true
+    # ignore_punctuation: true
+generation_kwargs:
+  # Generation stops at the first of these strings.
+  until: ["</s>", "Q", "\n\n"]
+  do_sample: false
+  temperature: 0.0
+# NOTE(review): 0 here while metadata reports 3 — presumably because task
+# files embed their exemplars in doc_to_text; confirm.
+num_fewshot: 0
+metadata:
+  version: 1.0
+  num_fewshot: 3 # will be printed in results table
--- a/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/fewshot/boolean_expressions.yaml
+++ b/LM-Evaluation-Harness-240310/lm_eval/tasks/bbh/fewshot/boolean_expressions.yaml
+# Few-shot config for the boolean_expressions subset; the three worked
+# examples are written directly into doc_to_text ahead of the real question.
+dataset_name: boolean_expressions
+description: "Evaluate the result of a random Boolean expression.\n\n"
+doc_to_text: "Q: not ( ( not not True ) ) is\nA: False\n\nQ: True and False and not True and True is\nA: False\n\nQ: not not ( not ( False ) ) is\nA: True\n\nQ: {{input}}\nA:"
+include: _fewshot_template_yaml
+task: bbh_fewshot_boolean_expressions