Unverified commit 9822b06e, authored by Lintang Sutawika, committed by GitHub

Merge branch 'main' into weight_by_size

parents 51f27158 b177c82c
# Generated by utils.py
dataset_name: zh
doc_to_target: '{% if answer is not none %}{{answer[6:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\n逐步解答:"}}{% else %}{{"问题: "+question+"\n逐步解答:"}}{% endif %}'
filter_list:
- filter:
  - function: regex
    regex_pattern: 答案是 (\-?[0-9\.\,]+)。
  - function: take_first
  name: strict-match
- filter:
  - function: regex
    group_select: -1
    regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
  - function: take_first
  name: flexible-extract
generation_kwargs:
  do_sample: false
  until:
  - '问题:'
  - </s>
  - <|im_end|>
include: cot_yaml
task: mgsm_native_cot_zh
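As a sanity check on the two chains above, here is a minimal sketch (not part of the commit; the sample completion is invented) of what each filter extracts using plain `re`:

```python
import re

# Invented model completion for illustration.
resp = "她每天有8个鸡蛋,吃掉3个,剩下5个。答案是 5。"

# strict-match: only fires on the canonical answer phrase.
print(re.findall(r"答案是 (\-?[0-9\.\,]+)。", resp))  # ['5']

# flexible-extract: any number-like span; group_select -1 keeps the last match.
matches = re.findall(r"(-?[$0-9.,]{2,})|(-?[0-9]+)", resp)
print([g for g in matches[-1] if g][0])  # '5'
```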
import argparse
import yaml
LANGUAGES = {
    "bn": {  # Bengali
@@ -99,11 +100,24 @@ def add_regex_pattern(regex_pattern):
    return {
        "filter_list": [
            {
                "name": "strict-match",
                "filter": [
                    {
                        "function": "regex",
                        "regex_pattern": f"""{regex_pattern}""",
                    },
                    {
                        "function": "take_first",
                    },
                ],
            },
            {
                "name": "flexible-extract",
                "filter": [
                    {
                        "function": "regex",
                        "regex_pattern": """(-?[$0-9.,]{2,})|(-?[0-9]+)""",
                        "group_select": -1,
                    },
                    {
                        "function": "take_first",
@@ -128,23 +142,25 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
        yaml_template = "cot_yaml"
        filter_list = {}
        DELIMITER = None
        if mode == "direct":
            ANSWER = LANGUAGES[lang]["DIRECT"]
            REGEX = None
            task_name = f"mgsm_direct_{lang}"
            yaml_template = "direct_yaml"
        elif mode == "native-cot":
            ANSWER = LANGUAGES[lang]["ANSWER"]
            REGEX = LANGUAGES[lang]["REGEX"]
            task_name = f"mgsm_native_cot_{lang}"
            filter_list = add_regex_pattern(REGEX)
            DELIMITER = "" if lang in ["zh", "ja"] else None
        elif mode == "en-cot":
            ANSWER = LANGUAGES["en"]["ANSWER"]
            REGEX = LANGUAGES["en"]["REGEX"]
            task_name = f"mgsm_en_cot_{lang}"

        file_name = f"{task_name}.yaml"
        ANSWER_TO_SKIP = len(LANGUAGES[lang]["ANSWER"]) + 1
        with open(
            f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf8"
        ) as f:
@@ -153,18 +169,23 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
            {
                "include": yaml_template,
                "dataset_name": lang,
                "task": f"{task_name}",
                "doc_to_text": f"""{{% if answer is not none %}}"""
                f"""{{{{question+"\\n{ANSWER}"}}}}"""
                f"""{{% else %}}"""
                f"""{{{{"{QUESTION} "+question+"\\n{ANSWER}"}}}}"""
                f"""{{% endif %}}""",
                "doc_to_target": f"""{{% if answer is not none %}}"""
                f"""{{{{answer[{ANSWER_TO_SKIP}:]}}}}"""
                f"""{{% else %}}"""
                f"""{{{{answer_number|string}}}}"""
                f"""{{% endif %}}""",
                **filter_list,
                "generation_kwargs": {
                    "until": [QUESTION, "</s>", "<|im_end|>"],
                    "do_sample": False,
                },
                **({"target_delimiter": DELIMITER} if DELIMITER else {}),
            },
            f,
            allow_unicode=True,
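To make the brace escaping concrete, here is a sketch (values hard-coded for zh) of what the `doc_to_text` f-string evaluates to; it matches the generated template at the top of this diff:

```python
ANSWER = "逐步解答:"
QUESTION = "问题:"
doc_to_text = (
    f"""{{% if answer is not none %}}"""
    f"""{{{{question+"\\n{ANSWER}"}}}}"""
    f"""{{% else %}}"""
    f"""{{{{"{QUESTION} "+question+"\\n{ANSWER}"}}}}"""
    f"""{{% endif %}}"""
)
print(doc_to_text)
# {% if answer is not none %}{{question+"\n逐步解答:"}}{% else %}{{"问题: "+question+"\n逐步解答:"}}{% endif %}
```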
@@ -22,3 +22,4 @@ metric_list:
metadata:
  version: 1.0
num_fewshot: 4
import re
import signal
from typing import Dict, List, Optional

import datasets

from lm_eval.utils import eval_logger
try:
    import sympy
    from sympy.parsing.latex import parse_latex
except ModuleNotFoundError:
    raise ModuleNotFoundError(
        "`sympy` is required for generating translation task prompt templates. "
        "Please install sympy via `pip install lm-eval[math]` or `pip install -e .[math]`."
    )
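A hedged usage sketch of the guarded import: `parse_latex` additionally needs the ANTLR runtime, which is why the `[math]` extra is the recommended install path.

```python
from sympy.parsing.latex import parse_latex

# Requires sympy plus the antlr4 runtime to be installed.
expr = parse_latex(r"\frac{1}{2} + \frac{1}{3}")
print(expr.simplify())  # 5/6
```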
"""
Take in a YAML, and output all "other" splits with this YAML
"""
import argparse
import os
import yaml
from tqdm import tqdm
from lm_eval.logger import eval_logger
SUBJECTS = {
    "abstract_algebra": "stem",
    "anatomy": "stem",
@@ -124,7 +125,6 @@ if __name__ == "__main__":
        yaml.dump(
            yaml_dict,
            yaml_file,
            allow_unicode=True,
            default_style='"',
        )
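For context, a small sketch (not from the commit) of what `default_style='"'` does: every scalar in the dump is double-quoted.

```python
import yaml

print(yaml.dump({"task": "mmlu_anatomy", "group": "stem"}, default_style='"'))
# "group": "stem"
# "task": "mmlu_anatomy"
```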
@@ -5,14 +5,24 @@ output_type: generate_until
doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step."
doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
filter_list:
- name: "get-answer"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
- name: "flexible-extract"
filter:
- function: !function utils.MultiChoiceRegexFilter
group_select: -1
ignore_case: true
ignore_punctuation: true
regex_pattern: "(\\([A-Z]\\))"
- function: "take_first"
generation_kwargs:
until:
- "</s>"
- "Q:"
- "<|im_end|>"
do_sample: false
temperature: 0.0
num_fewshot: 0
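A quick sketch (sample string invented) of the strict-match lookbehind pattern in action:

```python
import re

pattern = (
    r"((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)"
    r"|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
)
groups = re.findall(pattern, "Let's think step by step. The answer is (B).")
print(groups[0][0])  # (B)
```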
@@ -23,4 +33,4 @@ metric_list:
  ignore_case: true
  ignore_punctuation: true
metadata:
  version: 1.0
import re
import sys
import unicodedata

from lm_eval.filters.extraction import RegexFilter


class MultiChoiceRegexFilter(RegexFilter):
    """ """

    def __init__(
        self,
        regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
        group_select=0,
        fallback: str = "[invalid]",
        ignore_case=False,
        ignore_punctuation=False,
        regexes_to_ignore=None,
    ) -> None:
        """
        regex_pattern: The basic regex pattern to use. If it fails to match, we will use the customized match procedure:
            - step 1: we parse the choices between ([A-Z])s, then try to find these choices in the response.
            - step 2: we parse the choice with the regex :[\s]*([A-?]), where ? varies by the number of choices.
        group_select: Selects the (group_select)th match from the findall result.
        ignore_case: Ignores case during step 1 matching.
        ignore_punctuation: Removes punctuation during step 1 matching.
        regexes_to_ignore: Removes these regexes during step 1 matching.
        """
        super().__init__(regex_pattern, group_select, fallback)
        self.ignore_case = ignore_case
        self.ignore_punctuation = ignore_punctuation
        self.regexes_to_ignore = regexes_to_ignore

    def apply(self, resps, docs):
        # Here we assume a list in which each element is a list of model
        # responses for one particular input/target pair, and we process each
        # of these response sets independently (keeping them a list).

        def find_match(regex, resp, convert_dict={}):
            match = regex.findall(resp)
            if match:
                match = match[self.group_select]
                if isinstance(match, tuple):
                    match = [m for m in match if m][0]
                match = match.strip()
                if match and match in convert_dict:
                    match = convert_dict[match]
            return match

        punct_tbl = dict.fromkeys(
            i
            for i in range(sys.maxunicode)
            if unicodedata.category(chr(i)).startswith("P")
        )

        def filter_ignores(st):
            if self.regexes_to_ignore is not None:
                for s in self.regexes_to_ignore:
                    st = re.sub(s, "", st)
            if self.ignore_case:
                st = st.lower()
            if self.ignore_punctuation:
                # https://stackoverflow.com/a/266162
                st = st.translate(punct_tbl)
            return st

        filtered_resps = []
        for r, doc in zip(resps, docs):
            fallback_regexes = []
            choice_to_alpha = {}
            next_alpha = "A"

            without_paren_fallback_regexes = []
            without_paren_to_target = {}

            choices = doc["choices"]
            for c in choices:
                m = filter_ignores(c.strip())
                fallback_regexes.append(f"{re.escape(m)}")
                choice_to_alpha[m] = f"({next_alpha})"

                without_paren_fallback_regexes.append(next_alpha)
                without_paren_to_target[next_alpha] = f"({next_alpha})"

                next_alpha = chr(ord(next_alpha) + 1)
            fallback_regex = re.compile("|".join(fallback_regexes))
            without_paren_fallback_regex = "|".join(without_paren_fallback_regexes)
            without_paren_fallback_regex = re.compile(
                rf":[\s]*({without_paren_fallback_regex})"
            )

            filtered = []
            for resp in r:
                match = find_match(self.regex, resp)
                if not match:
                    match = find_match(
                        fallback_regex, filter_ignores(resp), choice_to_alpha
                    )
                    if not match:
                        match = find_match(
                            without_paren_fallback_regex, resp, without_paren_to_target
                        )
                if not match:
                    match = self.fallback
                filtered.append(match)
            filtered_resps.append(filtered)

        return filtered_resps
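A usage sketch under stated assumptions: `RegexFilter.__init__` is assumed to compile `regex_pattern` into `self.regex` and store `fallback` (as the `super().__init__` call implies); the doc and responses are invented.

```python
flt = MultiChoiceRegexFilter(
    regex_pattern=r"(\([A-Z]\))",
    group_select=-1,
    ignore_case=True,
    ignore_punctuation=True,
)
docs = [{"choices": ["red", "green", "blue", "yellow"]}]
resps = [["I considered (A), but the answer is (C).", "Definitely blue."]]
print(flt.apply(resps, docs))
# [['(C)', '(C)']] -- the second response is rescued by the choice-text fallback
```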
@@ -5,12 +5,26 @@ fewshot_split: dev
output_type: generate_until
doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: "
doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
filter_list:
- name: "strict-match"
  filter:
  - function: "take_first"
- name: "flexible-extract"
  filter:
  - function: !function utils.MultiChoiceRegexFilter
    group_select: 0
    regex_pattern: "(\\([A-Z]\\))"
    ignore_case: true
    ignore_punctuation: true
  - function: "take_first"
generation_kwargs:
  until:
  - "</s>"
  - "Q:"
  - "<|im_end|>"
metric_list:
- metric: exact_match
  aggregation: mean
  higher_is_better: true
metadata:
  version: 1.0
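Note `group_select: 0` here versus `-1` in the CoT variant; a one-liner (invented string) showing the difference:

```python
import re

spans = re.findall(r"(\([A-Z]\))", "(A) is tempting, but I pick (D).")
print(spans[0], spans[-1])  # (A) (D)
```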
import datasets
import yaml
from tqdm import tqdm
# Multilingual ARC
### Paper
Title: `Okapi: Instruction-tuned Large Language Models in Multiple Languages with Reinforcement Learning from Human Feedback`
Abstract: https://arxiv.org/abs/2307.16039
A key technology for the development of large language models (LLMs) involves instruction tuning, which helps align the models' responses with human expectations to realize impressive learning abilities. Two major approaches for instruction tuning are supervised fine-tuning (SFT) and reinforcement learning from human feedback (RLHF), which are currently applied to produce the best commercial LLMs (e.g., ChatGPT). To improve the accessibility of LLMs for research and development efforts, various instruction-tuned open-source LLMs have also been introduced recently, e.g., Alpaca and Vicuna, to name a few. However, existing open-source LLMs have only been instruction-tuned for English and a few popular languages, thus hindering their impact and accessibility for many other languages in the world. Among the few very recent works exploring instruction tuning for LLMs in multiple languages, SFT has been used as the only approach to instruction-tune LLMs for multiple languages. This has left a significant gap for fine-tuned LLMs based on RLHF in diverse languages and raised important questions on how RLHF can boost the performance of multilingual instruction tuning. To overcome this issue, we present Okapi, the first system with instruction-tuned LLMs based on RLHF for multiple languages. Okapi introduces instruction and response-ranked data in 26 diverse languages to facilitate the experiments and development of future multilingual LLM research. We also present benchmark datasets to enable the evaluation of generative LLMs in multiple languages. Our experiments demonstrate the advantages of RLHF for multilingual instruction over SFT for different base models and datasets. Our framework and resources are released at this https URL.
Homepage: `https://github.com/nlp-uoregon/Okapi`
### Citation
```
@article{dac2023okapi,
title={Okapi: Instruction-tuned Large Language Models in Multiple Languages with Reinforcement Learning from Human Feedback},
author={Dac Lai, Viet and Van Nguyen, Chien and Ngo, Nghia Trung and Nguyen, Thuat and Dernoncourt, Franck and Rossi, Ryan A and Nguyen, Thien Huu},
journal={arXiv e-prints},
pages={arXiv--2307},
year={2023}
}
```
### Groups and Tasks
#### Groups
- arc_multilingual
#### Tasks
- `arc_{ar,bn,ca,da,de,es,eu,fr,gu,hi,hr,hu,hy,id,it,kn,ml,mr,ne,nl,pt,ro,ru,sk,sr,sv,ta,te,uk,vi,zh}`
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group:
- arc_multilingual
dataset_path: null
dataset_name: null
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "query"
doc_to_target: "gold"
doc_to_choice: "choices"
should_decontaminate: true
doc_to_decontamination_query: "query"
metric_list:
- metric: acc
  aggregation: mean
  higher_is_better: true
- metric: acc_norm
  aggregation: mean
  higher_is_better: true
metadata:
  version: 1.0
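For intuition, `include: _arc_yaml` makes each per-language file below inherit every key above and override only what it sets itself, roughly a dict update. A sketch with a hypothetical helper (not the harness's actual loader):

```python
import yaml

def load_with_include(path: str) -> dict:
    # Hypothetical simplification of include resolution.
    with open(path, encoding="utf8") as f:
        cfg = yaml.safe_load(f)
    base_path = cfg.pop("include", None)
    if base_path:
        with open(base_path, encoding="utf8") as f:
            base = yaml.safe_load(f)
        base.update(cfg)  # child keys win
        return base
    return cfg
```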
include: _arc_yaml
task: arc_ar
dataset_path: alexandrainst/m_arc
dataset_name: ar
training_split: train
validation_split: validation
test_split: test

include: _arc_yaml
task: arc_bn
dataset_path: alexandrainst/m_arc
dataset_name: bn
training_split: train
validation_split: validation
test_split: test

include: _arc_yaml
task: arc_ca
dataset_path: alexandrainst/m_arc
dataset_name: ca
training_split: train
validation_split: validation
test_split: test

include: _arc_yaml
task: arc_da
dataset_path: alexandrainst/m_arc
dataset_name: da
training_split: train
validation_split: validation
test_split: test

include: _arc_yaml
task: arc_de
dataset_path: alexandrainst/m_arc
dataset_name: de
training_split: train
validation_split: validation
test_split: test

include: _arc_yaml
task: arc_es
dataset_path: alexandrainst/m_arc
dataset_name: es
training_split: train
validation_split: validation
test_split: test

include: _arc_yaml
task: arc_eu
dataset_path: alexandrainst/m_arc
dataset_name: eu
training_split: train
validation_split: validation
test_split: test