Commit 5add46aa authored by hepj

Add Megatron project

parent deb8370c
"dataset_name": "temporal_sequences"
"description": "Task description: Answer questions about which times certain events could have occurred.\n\n"
"doc_to_text": "Q: {{input}}\nA:"
"include": "_zeroshot_template_yaml"
"task": "bbh_zeroshot_temporal_sequences"
filter_list:
  - name: "strict-match"
    filter:
      - function: "take_first"
  - name: "flexible-extract"
    filter:
      - function: !function utils.MultiChoiceRegexFilter
        group_select: 0
        ignore_case: true
        ignore_punctuation: true
        regex_pattern: "(\\([A-Z]\\))"
      - function: "take_first"
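The "flexible-extract" filter above relies on `utils.MultiChoiceRegexFilter` (defined later in this commit). As a minimal sketch with a hypothetical model response, this is what its `regex_pattern` pulls out before `take_first` keeps the first match:

```python
import re

# The pattern from the YAML above: a parenthesized capital letter, e.g. "(B)".
pattern = re.compile(r"(\([A-Z]\))")

resp = "Let's think step by step. The events fit option (B), so the answer is (B)."
print(pattern.findall(resp))     # ['(B)', '(B)']
print(pattern.findall(resp)[0])  # group_select: 0 -> '(B)'
```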
"dataset_name": "tracking_shuffled_objects_five_objects"
"description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n"
"doc_to_text": "Q: {{input}}\nA:"
"include": "_zeroshot_template_yaml"
"task": "bbh_zeroshot_tracking_shuffled_objects_five_objects"
filter_list:
  - name: "strict-match"
    filter:
      - function: "take_first"
  - name: "flexible-extract"
    filter:
      - function: !function utils.MultiChoiceRegexFilter
        group_select: 0
        ignore_case: true
        ignore_punctuation: true
        regex_pattern: "(\\([A-Z]\\))"
      - function: "take_first"
"dataset_name": "tracking_shuffled_objects_seven_objects"
"description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n"
"doc_to_text": "Q: {{input}}\nA:"
"include": "_zeroshot_template_yaml"
"task": "bbh_zeroshot_tracking_shuffled_objects_seven_objects"
filter_list:
  - name: "strict-match"
    filter:
      - function: "take_first"
  - name: "flexible-extract"
    filter:
      - function: !function utils.MultiChoiceRegexFilter
        group_select: 0
        ignore_case: true
        ignore_punctuation: true
        regex_pattern: "(\\([A-Z]\\))"
      - function: "take_first"
"dataset_name": "tracking_shuffled_objects_three_objects"
"description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n"
"doc_to_text": "Q: {{input}}\nA:"
"include": "_zeroshot_template_yaml"
"task": "bbh_zeroshot_tracking_shuffled_objects_three_objects"
filter_list:
  - name: "strict-match"
    filter:
      - function: "take_first"
  - name: "flexible-extract"
    filter:
      - function: !function utils.MultiChoiceRegexFilter
        group_select: 0
        ignore_case: true
        ignore_punctuation: true
        regex_pattern: "(\\([A-Z]\\))"
      - function: "take_first"
import collections
import re
import sys
import unicodedata

from lm_eval.filters.extraction import Filter, RegexFilter


class ExtendedRegexFilter(RegexFilter):
    punct_tbl = dict.fromkeys(
        i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P")
    )

    def __init__(
        self,
        regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
        group_select=0,
        fallback: str = "[invalid]",
        ignore_case=False,
        ignore_punctuation=False,
        regexes_to_ignore=None,
    ) -> None:
        super().__init__(regex_pattern, group_select, fallback)
        self.ignore_case = ignore_case
        self.ignore_punctuation = ignore_punctuation
        self.regexes_to_ignore = regexes_to_ignore

    def filter_ignores(self, st):
        if self.regexes_to_ignore is not None:
            for s in self.regexes_to_ignore:
                st = re.sub(s, "", st)
        if self.ignore_case:
            st = st.lower()
        if self.ignore_punctuation:
            # https://stackoverflow.com/a/266162
            st = st.translate(self.punct_tbl)
        return st

    def find_match(self, regex, resp, convert_dict={}):
        match = regex.findall(resp)
        if match:
            match = match[self.group_select]
            if isinstance(match, tuple):
                match = [m for m in match if m][0]
            match = match.strip()
            if match and match in convert_dict:
                match = convert_dict[match]
        return match
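A standalone sketch of what `filter_ignores` does before fallback matching, with the class's normalization steps inlined and the `regexes_to_ignore` step omitted (the input string is hypothetical):

```python
import sys
import unicodedata

# Same construction as the punct_tbl class attribute above: maps every
# Unicode punctuation codepoint to None so str.translate deletes it.
punct_tbl = dict.fromkeys(
    i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P")
)

def normalize(st, ignore_case=True, ignore_punctuation=True):
    if ignore_case:
        st = st.lower()
    if ignore_punctuation:
        st = st.translate(punct_tbl)
    return st

print(normalize("The answer is (A)."))  # -> "the answer is a"
```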
class MapRegexFilter(ExtendedRegexFilter):
    def __init__(
        self,
        regex_pattern_to_value: dict = {},
        group_select=0,
        fallback: str = "[invalid]",
        ignore_case=False,
        ignore_punctuation=False,
        regexes_to_ignore=None,
    ) -> None:
        """
        regex_pattern_to_value: Match each regex pattern and map the matched result to its value
        group_select: Selects the (group_select)th match from the findall result. The whole set of regex patterns, concatenated by |, is used for matching
        ignore_case: Lowercases the response before matching with the given regex
        ignore_punctuation: Removes punctuation before matching with the given regex
        regexes_to_ignore: Removes these regexes before matching with the given regex
        """
        super().__init__(
            "|".join(list(regex_pattern_to_value.keys())),
            group_select,
            fallback,
            ignore_case,
            ignore_punctuation,
            regexes_to_ignore,
        )
        self.regex_to_value = {
            re.compile(r): v for r, v in regex_pattern_to_value.items()
        }

    def apply(self, resps, docs):
        filtered_resps = []
        for r in resps:
            filtered = []
            for resp in r:
                whole_match_considering_group_select = self.find_match(
                    self.regex, self.filter_ignores(resp)
                )
                if whole_match_considering_group_select:
                    for regex, mapped_value in self.regex_to_value.items():
                        match = self.find_match(
                            regex,
                            self.filter_ignores(whole_match_considering_group_select),
                        )
                        if match:
                            match = mapped_value
                            break
                if not whole_match_considering_group_select or not match:
                    match = self.fallback
                filtered.append(match)
            filtered_resps.append(filtered)
        return filtered_resps
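A hypothetical usage of `MapRegexFilter`, mirroring the `web_of_lies` configuration later in this commit (assumes `lm_eval` is importable so the class above can be constructed):

```python
# Any phrase matching the first pattern is normalized to "no", the second to "yes".
f = MapRegexFilter(
    regex_pattern_to_value={
        r"\b(no|does not tell the truth|is not telling the truth)\b": "no",
        r"\b(yes|tells the truth|is telling the truth)\b": "yes",
    },
    ignore_case=True,
)
resps = [["So Vina is not telling the truth."], ["Yes, she tells the truth."]]
print(f.apply(resps, docs=[{}, {}]))  # -> [['no'], ['yes']]
```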
class NumberParseRegexFilter(ExtendedRegexFilter):
    def apply(self, resps, docs):
        # here, we assume we have a list, in which each element is
        # a list of model responses for some particular input/target pair.
        # so we process each of these (same input/target response sets)
        # independently (and keep them a list.)
        filtered_resps = []
        import regex
        from word2number import w2n

        # https://www.reddit.com/r/regex/comments/11a38uk/parsing_numbers_written_out_as_english_words
        english_number_regex = regex.compile(
            "((?:(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?:|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion)(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?:|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion|[^\S\r\n]|,|and|&)+)?(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion))"
        )
        for r in resps:
            filtered = []
            for resp in r:
                match = self.find_match(self.regex, resp)
                if not match:
                    match = self.find_match(english_number_regex, resp.lower())
                    if match:
                        match = str(w2n.word_to_num(match))
                if not match:
                    match = self.fallback
                filtered.append(match)
            filtered_resps.append(filtered)
        return filtered_resps
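A hypothetical usage sketch: the numeric regex is tried first, and spelled-out English numbers are parsed only as a fallback (requires the third-party `regex` and `word2number` packages imported inside `apply`):

```python
f = NumberParseRegexFilter(regex_pattern=r"(\-?[0-9]+)")
resps = [["The total is forty two apples."], ["I count 7."]]
print(f.apply(resps, docs=[{}, {}]))  # -> [['42'], ['7']]
```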
class WordSortFilter(Filter):
    """Extracts the candidate words (everything after "List:" in the prompt)
    from each response, keeping only each word's last occurrence, in order."""

    def apply(self, resps, docs):
        filtered_resps = []
        for r, doc in zip(resps, docs):
            words = doc["input"].split("List:")[1].strip().split()
            regex = re.compile("|".join([f"\\b{w}\\b" for w in words]))
            filtered = []
            for resp in r:
                match = regex.findall(resp)
                match.reverse()
                ordered_words = reversed(
                    collections.OrderedDict(zip(match, [None] * len(match)))
                )
                filtered.append(" ".join(ordered_words))
            filtered_resps.append(filtered)
        return filtered_resps
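A hypothetical input/response pair showing the effect: only words from the prompt's "List:" survive, deduplicated, in the order the response mentions them:

```python
doc = {"input": "Sort the following words alphabetically: List: pear apple mango"}
resps = [["The sorted order is: apple mango pear"]]
print(WordSortFilter().apply(resps, [doc]))  # -> [['apple mango pear']]
```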
class MultiChoiceRegexFilter(ExtendedRegexFilter):
    def __init__(self, *args, **kwargs):
        """
        regex_pattern: The basic regex pattern to use. If it fails to match, we will use the customized match procedure
            - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
            - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices.
        group_select: Selects the (group_select)th match from the findall result.
        ignore_case: Ignores the case during step 1 matching
        ignore_punctuation: Removes punctuation during step 1 matching
        regexes_to_ignore: Removes these regexes during step 1 matching
        """
        super().__init__(*args, **kwargs)

    def apply(self, resps, docs):
        # here, we assume we have a list, in which each element is
        # a list of model responses for some particular input/target pair.
        # so we process each of these (same input/target response sets)
        # independently (and keep them a list.)
        filtered_resps = []
        for r, doc in zip(resps, docs):
            fallback_regexes = []
            choice_to_alpha = {}
            next_alpha = "A"

            without_paren_fallback_regexes = []
            without_paren_to_target = {}

            multiple_choices_regex = re.compile(r"\([A-Z]\)([^\n^(]*)")
            match = multiple_choices_regex.findall(doc["input"])
            for m in match:
                m = self.filter_ignores(m.strip())
                fallback_regexes.append(f"{re.escape(m)}")
                choice_to_alpha[m] = f"({next_alpha})"

                without_paren_fallback_regexes.append(next_alpha)
                without_paren_to_target[next_alpha] = f"({next_alpha})"

                next_alpha = chr(ord(next_alpha) + 1)

            fallback_regex = re.compile("|".join(fallback_regexes))
            without_paren_fallback_regex = "|".join(without_paren_fallback_regexes)
            without_paren_fallback_regex = re.compile(
                rf":[\s]*({without_paren_fallback_regex})"
            )

            filtered = []
            for resp in r:
                match = self.find_match(self.regex, resp)
                if not match:
                    match = self.find_match(
                        fallback_regex, self.filter_ignores(resp), choice_to_alpha
                    )
                    if not match:
                        match = self.find_match(
                            without_paren_fallback_regex, resp, without_paren_to_target
                        )
                if not match:
                    match = self.fallback
                filtered.append(match)
            filtered_resps.append(filtered)
        return filtered_resps
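A hypothetical example of the fallback chain: the primary regex finds no "(X)" in the response, so the filter matches the verbatim choice text parsed from `doc["input"]` and maps it back to its letter:

```python
f = MultiChoiceRegexFilter(
    regex_pattern=r"(\([A-Z]\))", ignore_case=True, ignore_punctuation=True
)
doc = {"input": "Which fruit?\n(A) apple\n(B) pear\n(C) mango"}
resps = [["I believe the answer is pear."]]
print(f.apply(resps, [doc]))  # -> [['(B)']]
```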
"dataset_name": "web_of_lies"
"description": "Evaluate a random boolean function expressed as a word problem.\n\n"
"doc_to_text": "Q: {{input}}\nA:"
"include": "_zeroshot_template_yaml"
"task": "bbh_zeroshot_web_of_lies"
filter_list:
  - name: "strict-match"
    filter:
      - function: "take_first"
  - name: "flexible-extract"
    filter:
      - function: !function utils.MapRegexFilter
        group_select: 0
        ignore_case: true
        regex_pattern_to_value:
          \b(no|does not tell the truth|is not telling the truth)\b: "no"
          \b(yes|tells the truth|is telling the truth)\b: "yes"
      - function: "take_first"
"dataset_name": "word_sorting"
"description": "Sort a list of words.\n\n"
"doc_to_text": "Q: {{input}}\nA:"
"include": "_zeroshot_template_yaml"
"task": "bbh_zeroshot_word_sorting"
filter_list:
  - name: "strict-match"
    filter:
      - function: "take_first"
  - name: "flexible-extract"
    filter:
      - function: !function utils.WordSortFilter
      - function: "take_first"
# Belebele
### Paper
The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants
https://arxiv.org/abs/2308.16884
Belebele is a multiple-choice machine reading comprehension (MRC) dataset spanning 122 language variants. This dataset enables the evaluation of mono- and multi-lingual models in high-, medium-, and low-resource languages. Each question has four multiple-choice answers and is linked to a short passage from the FLORES-200 dataset. The human annotation procedure was carefully curated to create questions that discriminate between different levels of generalizable language comprehension and is reinforced by extensive quality checks. While all questions directly relate to the passage, the English dataset on its own proves difficult enough to challenge state-of-the-art language models. Being fully parallel, this dataset enables direct comparison of model performance across all languages. Belebele opens up new avenues for evaluating and analyzing the multilingual abilities of language models and NLP systems.
Homepage: https://github.com/facebookresearch/belebele
### Citation
```bibtex
@misc{bandarkar2023belebele,
title={The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants},
author={Lucas Bandarkar and Davis Liang and Benjamin Muller and Mikel Artetxe and Satya Narayan Shukla and Donald Husa and Naman Goyal and Abhinandan Krishnan and Luke Zettlemoyer and Madian Khabsa},
year={2023},
eprint={2308.16884},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
### Groups and Tasks
#### Groups
- `belebele`: All 122 languages of the Belebele dataset, evaluated following the methodology in MMLU's original implementation.
#### Tasks
The following tasks evaluate languages in the Belebele dataset using loglikelihood-based multiple-choice scoring:
- `belebele_{language}`
The variant evaluated here is the zero-shot or few-shot setup with English instructions.
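The following is a rough, self-contained sketch of loglikelihood-based multiple-choice scoring, not the harness's internal code; `model` and `tokenizer` are assumed to be a Hugging Face causal LM and its tokenizer:

```python
import torch
import torch.nn.functional as F

def loglikelihood(model, tokenizer, context, continuation):
    # log P(continuation | context) under the model, summed over tokens
    ctx_ids = tokenizer(context, return_tensors="pt").input_ids
    full_ids = tokenizer(context + continuation, return_tensors="pt").input_ids
    with torch.no_grad():
        logits = model(full_ids).logits          # [1, seq_len, vocab]
    logprobs = F.log_softmax(logits, dim=-1)
    cont_len = full_ids.shape[1] - ctx_ids.shape[1]
    cont_tokens = full_ids[0, -cont_len:]
    # logits at position i predict token i + 1, hence the shift by one
    picked = logprobs[0, -cont_len - 1 : -1].gather(-1, cont_tokens.unsqueeze(-1))
    return picked.sum().item()

prompt = "P: <passage>\nQ: <question>\nA: ...\nB: ...\nC: ...\nD: ...\nAnswer:"
best = max("ABCD", key=lambda c: loglikelihood(model, tokenizer, prompt, " " + c))
```

`acc` takes the raw argmax; `acc_norm` normalizes each continuation's log-likelihood by its byte length before the argmax.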
### Checklist
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation?
* [ ] Yes, original implementation contributed by author of the benchmark
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group: belebele
dataset_path: facebook/belebele
fewshot_config:
  sampler: first_n
output_type: multiple_choice
should_decontaminate: true
doc_to_decontamination_query: "{{question}}"
doc_to_text: "P: {{flores_passage}}\nQ: {{question.strip()}}\nA: {{mc_answer1}}\nB: {{mc_answer2}}\nC: {{mc_answer3}}\nD: {{mc_answer4}}\nAnswer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: "{{['1', '2', '3', '4'].index(correct_answer_num)}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.0
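The `doc_to_target` entry above is a Jinja expression; a Python equivalent (with a hypothetical example value) makes the mapping explicit:

```python
# correct_answer_num arrives as a string "1".."4"; the target is its 0-based
# index, which selects the matching letter from doc_to_choice.
choices = ["A", "B", "C", "D"]
correct_answer_num = "3"  # hypothetical example value
target = ["1", "2", "3", "4"].index(correct_answer_num)  # -> 2
print(choices[target])  # -> "C"
```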
"""
Take in a YAML, and output all other splits with this YAML
"""
import argparse
import os
import requests
import yaml
from tqdm import tqdm
from lm_eval.utils import logging
API_URL = "https://datasets-server.huggingface.co/splits?dataset=facebook/belebele"
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--base_yaml_path", required=True)
parser.add_argument("--save_prefix_path", default="belebele")
parser.add_argument("--cot_prompt_path", default=None)
parser.add_argument("--task_prefix", default="")
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
# get filename of base_yaml so we can `"include": ` it in our other YAMLs.
base_yaml_name = os.path.split(args.base_yaml_path)[-1]
with open(args.base_yaml_path, encoding="utf-8") as f:
base_yaml = yaml.full_load(f)
if args.cot_prompt_path is not None:
import json
with open(args.cot_prompt_path, encoding="utf-8") as f:
cot_file = json.load(f)
def query():
response = requests.get(API_URL)
return response.json()["splits"]
print(query())
languages = [split["split"] for split in query()]
for lang in tqdm([lang for lang in languages if "default" not in lang]):
yaml_dict = {
"include": base_yaml_name,
"task": f"belebele_{args.task_prefix}_{lang}"
if args.task_prefix != ""
else f"belebele_{lang}",
"test_split": lang,
"fewshot_split": lang,
}
file_save_path = args.save_prefix_path + f"_{lang}.yaml"
logging.info(f"Saving yaml for subset {lang} to {file_save_path}")
with open(file_save_path, "w", encoding="utf-8") as yaml_file:
yaml.dump(
yaml_dict,
yaml_file,
width=float("inf"),
allow_unicode=True,
default_style='"',
)
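As a quick check of what that `yaml.dump` call emits for a single split (assuming PyYAML): `default_style='"'` double-quotes every scalar, and keys come out alphabetically sorted, which matches the generated files below:

```python
import yaml

yaml_dict = {
    "include": "_default_template_yaml",
    "task": "belebele_acm_Arab",
    "test_split": "acm_Arab",
    "fewshot_split": "acm_Arab",
}
# Prints, in sorted key order:
#   "fewshot_split": "acm_Arab"
#   "include": "_default_template_yaml"
#   "task": "belebele_acm_Arab"
#   "test_split": "acm_Arab"
print(yaml.dump(yaml_dict, width=float("inf"), allow_unicode=True, default_style='"'))
```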
"fewshot_split": "acm_Arab"
"include": "_default_template_yaml"
"task": "belebele_acm_Arab"
"test_split": "acm_Arab"
"fewshot_split": "afr_Latn"
"include": "_default_template_yaml"
"task": "belebele_afr_Latn"
"test_split": "afr_Latn"
"fewshot_split": "als_Latn"
"include": "_default_template_yaml"
"task": "belebele_als_Latn"
"test_split": "als_Latn"
"fewshot_split": "amh_Ethi"
"include": "_default_template_yaml"
"task": "belebele_amh_Ethi"
"test_split": "amh_Ethi"
"fewshot_split": "apc_Arab"
"include": "_default_template_yaml"
"task": "belebele_apc_Arab"
"test_split": "apc_Arab"
"fewshot_split": "arb_Arab"
"include": "_default_template_yaml"
"task": "belebele_arb_Arab"
"test_split": "arb_Arab"
"fewshot_split": "arb_Latn"
"include": "_default_template_yaml"
"task": "belebele_arb_Latn"
"test_split": "arb_Latn"
"fewshot_split": "ars_Arab"
"include": "_default_template_yaml"
"task": "belebele_ars_Arab"
"test_split": "ars_Arab"
"fewshot_split": "ary_Arab"
"include": "_default_template_yaml"
"task": "belebele_ary_Arab"
"test_split": "ary_Arab"
"fewshot_split": "arz_Arab"
"include": "_default_template_yaml"
"task": "belebele_arz_Arab"
"test_split": "arz_Arab"