Commit 0348ed97 authored by lintangsutawika

merged latest update

parents 451a1873 6769119f
# Generated by utils.py
dataset_name: zh
doc_to_target: '{% if answer is not none %}{{answer[5+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\n逐步解答:"}}{% else %}{{"问题: "+question+"\n逐步解答:"}}{%
endif %}'
include: cot_yaml
task: mgsm_zh_native_cot
import yaml
import argparse
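# Per-language prompt pieces: QUESTION/ANSWER prefixes for the chain-of-thought
# prompts, DIRECT for the direct-answer prompt, and REGEX to pull the final
# numeric answer out of a generation.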
LANGUAGES = {
"bn": { # Bengali
# "QUESTION": "প্রশ্ন:",
"QUESTION": "\u09aa\u09cd\u09b0\u09b6\u09cd\u09a8:",
# "ANSWER": "ধাপে ধাপে উত্তর:",
"ANSWER": "\u09a7\u09be\u09aa\u09c7 \u09a7\u09be\u09aa\u09c7 \u0989\u09a4\u09cd\u09a4\u09b0:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
},
"de": { # German
"QUESTION": "Frage:",
# "ANSWER": "Schritt-für-Schritt-Antwort:",
"ANSWER": "Schritt-f\u00fcr-Schritt-Antwort:",
"DIRECT": "Antwort:",
"REGEX": "Die Antwort lautet (\\-?[0-9\\.\\,]+)",
},
"en": { # English
"QUESTION": "Question:",
"ANSWER": "Step-by-Step Answer:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
},
"es": { # Spanish
"QUESTION": "Pregunta:",
"ANSWER": "Respuesta paso a paso:",
"DIRECT": "Respuesta:",
"REGEX": "La respuesta es (\\-?[0-9\\.\\,]+)",
},
"fr": { # French
"QUESTION": "Question :",
# "ANSWER": "Réponse étape par étape :"
"ANSWER": "R\u00e9ponse \u00e9tape par \u00e9tape :",
# "DIRECT": "Réponse :",
"DIRECT": "R\u00e9ponse :",
# "REGEX": "La réponse est (\\-?[0-9\\.\\,]+)",
"REGEX": "La r\u00e9ponse est (\\-?[0-9\\.\\,]+)",
},
"ru": { # Russian
# "QUESTION": "Задача:",
"QUESTION": "\u0417\u0430\u0434\u0430\u0447\u0430:",
# "ANSWER": "Пошаговоерешение:",
"ANSWER": "\u041f\u043e\u0448\u0430\u0433\u043e\u0432\u043e\u0435\u0440\u0435\u0448\u0435\u043d\u0438\u0435:",
"DIRECT": "Answer:",
# "REGEX": "Ответ — (\\-?[0-9\\.\\,]+)",
"REGEX": "\u041e\u0442\u0432\u0435\u0442 \u2014 (\\-?[0-9\\.\\,]+)",
},
"sw": { # Swahili
"QUESTION": "Swali:",
"ANSWER": "Jibu la Hatua kwa Hatua:",
"DIRECT": "Answer:",
"REGEX": "Jibu ni (\\-?[0-9\\.\\,]+)",
},
"te": { # Telugu
# "QUESTION": "ప్రశ్న:",
"QUESTION": "\u0c2a\u0c4d\u0c30\u0c36\u0c4d\u0c28:",
# "ANSWER": "దశలవారీగా సమాధానం:",
"ANSWER": "\u0c26\u0c36\u0c32\u0c35\u0c3e\u0c30\u0c40\u0c17\u0c3e \u0c38\u0c2e\u0c3e\u0c27\u0c3e\u0c28\u0c02:",
"DIRECT": "Answer:",
# "REGEX": "సమాధానం (\\-?[0-9\\.\\,]+)",
"REGEX": "\u0c38\u0c2e\u0c3e\u0c27\u0c3e\u0c28\u0c02 (\\-?[0-9\\.\\,]+)",
},
"th": { # Thai
# "QUESTION": "โจทย์:",
"QUESTION": "\u0e42\u0e08\u0e17\u0e22\u0e4c:",
# "ANSWER": "คำตอบทีละขั้นตอน:",
"ANSWER": "\u0e04\u0e33\u0e15\u0e2d\u0e1a\u0e17\u0e35\u0e25\u0e30\u0e02\u0e31\u0e49\u0e19\u0e15\u0e2d\u0e19:",
"DIRECT": "Answer:",
# "REGEX": "คำตอบคือ (\\-?[0-9\\.\\,]+)",
"REGEX": "\u0e04\u0e33\u0e15\u0e2d\u0e1a\u0e04\u0e37\u0e2d (\\-?[0-9\\.\\,]+)",
},
"ja": { # Japanese
# "QUESTION": "問題:",
"QUESTION": "\u554f\u984c:",
# "ANSWER": "ステップごとの答え:",
"ANSWER": "\u30b9\u30c6\u30c3\u30d7\u3054\u3068\u306e\u7b54\u3048:",
"DIRECT": "Answer:",
# "REGEX": "答えは(\\-?[0-9\\.\\,]+)です。",
"REGEX": "\u7b54\u3048\u306f(\\-?[0-9\\.\\,]+)\u3067\u3059\u3002",
},
"zh": { # Chinese
# "QUESTION": "问题:",
"QUESTION": "\u95ee\u9898:",
# "ANSWER": "逐步解答:",
"ANSWER": "\u9010\u6b65\u89e3\u7b54:",
"DIRECT": "Answer:",
# "REGEX": "答案是 (\\-?[0-9\\.\\,]+)。",
"REGEX": "\u7b54\u6848\u662f (\\-?[0-9\\.\\,]+)\u3002",
},
}
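# Wrap a regex in the harness's filter_list format: extract candidates with the
# regex, then keep only the first match.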
def add_regex_pattern(regex_pattern):
if regex_pattern is None:
return {}
return {
"filter_list": [
{
"name": "get-answer",
"filter": [
{
"function": "regex",
"regex_pattern": regex_pattern,
},
{
"function": "take_first",
},
],
},
],
}
def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
"""
Generate a yaml file for each language.
:param output_dir: The directory to output the files to.
    :param overwrite: Whether to overwrite files if they already exist.
    :param mode: Prompt style to generate: "direct", "native-cot", or "en-cot".
    """
err = []
for lang in LANGUAGES.keys():
try:
QUESTION = LANGUAGES[lang]["QUESTION"]
yaml_template = "cot_yaml"
filter_list = {}
if mode == "direct":
ANSWER = LANGUAGES[lang]["DIRECT"]
REGEX = None
task_name = f"mgsm_{lang}_direct"
yaml_template = "direct_yaml"
elif mode == "native-cot":
ANSWER = LANGUAGES[lang]["ANSWER"]
REGEX = LANGUAGES[lang]["REGEX"]
task_name = f"mgsm_{lang}_native-cot"
filter_list = add_regex_pattern(REGEX)
elif mode == "en-cot":
ANSWER = LANGUAGES["en"]["ANSWER"]
REGEX = LANGUAGES["en"]["REGEX"]
task_name = f"mgsm_{lang}_en-cot"
file_name = f"{task_name}.yaml"
with open(
f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf8"
) as f:
f.write("# Generated by utils.py\n")
yaml.dump(
{
"include": yaml_template,
"dataset_name": lang,
"task": f"mgsm_{lang}_direct",
"doc_to_text": f"""{{% if answer is not none %}}"""
f"""{{{{question+"\\n{ANSWER}"}}}}"""
f"""{{% else %}}"""
f"""{{{{"{QUESTION} "+question+"\\n{ANSWER}"}}}}"""
f"""{{% endif %}}""",
"doc_to_target": f"""{{% if answer is not none %}}"""
f"""{{{{answer[{len(ANSWER)}+1]}}}}"""
f"""{{% else %}}"""
f"""{{{{answer_number|string}}}}"""
f"""{{% endif %}}""",
**filter_list,
},
f,
allow_unicode=True,
width=float("inf"),
)
except FileExistsError:
err.append(file_name)
if len(err) > 0:
raise FileExistsError(
"Files were not created because they already exist (use --overwrite flag):"
f" {', '.join(err)}"
)
def main() -> None:
"""Parse CLI args and generate language-specific yaml files."""
parser = argparse.ArgumentParser()
parser.add_argument(
"--overwrite",
default=False,
action="store_true",
help="Overwrite files if they already exist",
)
parser.add_argument(
"--output-dir", default=".", help="Directory to write yaml files to"
)
parser.add_argument(
"--mode",
default="native-cot",
choices=["direct", "native-cot", "en-cot"],
help="Mode of chain-of-thought",
)
args = parser.parse_args()
gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite, mode=args.mode)
if __name__ == "__main__":
main()
# MATH
ℹ️ This is the 4-shot variant!
## Paper
Measuring Mathematical Problem Solving With the MATH Dataset
https://arxiv.org/abs/2103.03874
Many intellectual endeavors require mathematical problem solving, but this skill remains beyond the capabilities of computers. To measure this ability in machine learning models, we introduce MATH, a new dataset of 12,500 challenging competition mathematics problems. Each problem in MATH has a full step-by-step solution which can be used to teach models to generate answer derivations and explanations.
NOTE: The few-shot prompt and the answer extraction are based on [Minerva](https://arxiv.org/abs/2206.14858), and exact-match equivalence is computed with the `sympy` library. This requires additional dependencies, which can be installed via the `lm-eval[math]` extra.
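For reference, the check behind `exact_match` parses both the gold answer and the extracted answer with `sympy` and tests whether their difference simplifies to zero. A minimal sketch (hypothetical answer strings; the real logic lives in `is_equiv` in `utils.py` and adds a timeout and error handling):

```python
import sympy
from sympy.parsing.latex import parse_latex  # needs the antlr4 runtime from lm-eval[math]

# Hypothetical answers as they would look after normalize_final_answer().
gold, pred = r"\frac{1}{2}", r"\frac{2}{4}"

# Equivalent expressions differ by something that simplifies to zero.
diff = parse_latex(gold) - parse_latex(pred)
print(sympy.simplify(diff) == 0)  # True -> scored as exact_match = 1
```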
Homepage: https://github.com/hendrycks/math
## Citation
```
@article{hendrycksmath2021,
title={Measuring Mathematical Problem Solving With the MATH Dataset},
author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
journal={NeurIPS},
year={2021}
}
@misc{2206.14858,
Author = {Aitor Lewkowycz and Anders Andreassen and David Dohan and Ethan Dyer and Henryk Michalewski and Vinay Ramasesh and Ambrose Slone and Cem Anil and Imanol Schlag and Theo Gutman-Solo and Yuhuai Wu and Behnam Neyshabur and Guy Gur-Ari and Vedant Misra},
Title = {Solving Quantitative Reasoning Problems with Language Models},
Year = {2022},
Eprint = {arXiv:2206.14858},
}
```
### Groups, Benchmarks and Tasks
#### Benchmarks
- `minerva_math`
#### Groups
- `math_word_problems`
- `greedy_until`
#### Tasks
- `minerva_math_algebra`
- `minerva_math_counting_and_prob`
- `minerva_math_geometry`
- `minerva_math_intermediate_algebra`
- `minerva_math_num_theory`
- `minerva_math_prealgebra`
- `minerva_math_precalc`
### Checklist
The checklist is the following:
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
* The implementation in the original paper first fine-tunes the model on the dataset. The paper does include a few-shot evaluation for GPT-3; however, the few-shot context used here is sourced from [Lewkowycz et al.](https://arxiv.org/abs/2206.14858). The accuracy achieved with Llama-2 models is comparable to, though not identical to, that reported in the paper.
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
### Variant Wishlist
- [ ] zero-shot variant
group:
- math_word_problems
task: minerva_math_algebra
dataset_path: EleutherAI/hendrycks_math
process_docs: !function utils.process_docs
dataset_name: algebra
output_type: greedy_until
training_split: train
test_split: test
doc_to_text: !function utils.doc_to_text
process_results: !function utils.process_results
doc_to_target: "{{answer}}"
generation_kwargs:
until:
- "Problem:"
do_sample: false
temperature: 0
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
include: minerva_math_algebra.yaml
dataset_name: counting_and_probability
task: minerva_math_counting_and_prob
include: minerva_math_algebra.yaml
dataset_name: geometry
task: minerva_math_geometry
include: minerva_math_algebra.yaml
dataset_name: intermediate_algebra
task: minerva_math_intermediate_algebra
include: minerva_math_algebra.yaml
dataset_name: number_theory
task: minerva_math_num_theory
include: minerva_math_algebra.yaml
dataset_name: prealgebra
task: minerva_math_prealgebra
include: minerva_math_algebra.yaml
dataset_name: precalculus
task: minerva_math_precalc
import datasets
import re
import signal
from lm_eval.logger import eval_logger
from typing import Optional, List, Dict
try:
import sympy
from sympy.parsing.latex import parse_latex
except ModuleNotFoundError:
raise Exception(
"`sympy` is required for generating translation task prompt templates. \
please install sympy via pip install lm-eval[math] or pip install -e .[math]",
)
# taken from
# https://github.com/wellecks/lm-evaluation-harness/blob/master/lm_eval/tasks/minerva_math.py
def doc_to_text(doc: dict) -> str:
PROMPT = r"""Problem:
Find the domain of the expression $\frac{\sqrt{x-2}}{\sqrt{5-x}}$.}
Solution:
The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\boxed{[2,5)}$.
Final Answer: The final answer is $[2,5)$. I hope it is correct.
Problem:
If $\det \mathbf{A} = 2$ and $\det \mathbf{B} = 12,$ then find $\det (\mathbf{A} \mathbf{B}).$
Solution:
We have that $\det (\mathbf{A} \mathbf{B}) = (\det \mathbf{A})(\det \mathbf{B}) = (2)(12) = \boxed{24}.$
Final Answer: The final answer is $24$. I hope it is correct.
Problem:
Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?
Solution:
If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$:
\begin{align*}
30n&=480\\
\Rightarrow\qquad n&=480/30=\boxed{16}
\end{align*}
Final Answer: The final answer is $16$. I hope it is correct.
Problem:
If the system of equations
\begin{align*}
6x-4y&=a,\\
6y-9x &=b.
\end{align*}has a solution $(x, y)$ where $x$ and $y$ are both nonzero,
find $\frac{a}{b},$ assuming $b$ is nonzero.
Solution:
If we multiply the first equation by $-\frac{3}{2}$, we obtain
$$6y-9x=-\frac{3}{2}a.$$Since we also know that $6y-9x=b$, we have
$$-\frac{3}{2}a=b\Rightarrow\frac{a}{b}=\boxed{-\frac{2}{3}}.$$
Final Answer: The final answer is $-\frac{2}{3}$. I hope it is correct."""
return PROMPT + "\n\n" + "Problem:" + "\n" + doc["problem"] + "\n\n" + "Solution:"
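# Attach a normalized gold answer, taken from the last \boxed{...} in the
# reference solution, to every document.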
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
def _process_doc(doc: dict) -> dict:
out_doc = {
"problem": doc["problem"],
"solution": doc["solution"],
"answer": normalize_final_answer(
remove_boxed(last_boxed_only_string(doc["solution"]))
),
}
return out_doc
return dataset.map(_process_doc)
def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
    candidates = results[0]
    unnormalized_answer = get_unnormalized_answer(candidates)
    answer = normalize_final_answer(unnormalized_answer)
    retval = 1 if is_equiv(answer, doc["answer"]) else 0
    return {"exact_match": retval}
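# Return the last \boxed{...} (or \fbox{...}) substring of `string`, or None if absent.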
def last_boxed_only_string(string: str) -> Optional[str]:
idx = string.rfind("\\boxed")
if "\\boxed " in string:
return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]
if idx < 0:
idx = string.rfind("\\fbox")
if idx < 0:
return None
i = idx
right_brace_idx = None
num_left_braces_open = 0
while i < len(string):
if string[i] == "{":
num_left_braces_open += 1
if string[i] == "}":
num_left_braces_open -= 1
if num_left_braces_open == 0:
right_brace_idx = i
break
i += 1
if right_brace_idx is None:
retval = None
else:
retval = string[idx : right_brace_idx + 1]
return retval
def remove_boxed(s: str) -> str:
if "\\boxed " in s:
left = "\\boxed "
assert s[: len(left)] == left
return s[len(left) :]
left = "\\boxed{"
assert s[: len(left)] == left
assert s[-1] == "}"
return s[len(left) : -1]
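# Context manager that raises TimeoutError via SIGALRM once `seconds` have elapsed.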
class timeout:
def __init__(self, seconds=1, error_message="Timeout"):
self.seconds = seconds
self.error_message = error_message
def handle_timeout(self, signum, frame):
raise TimeoutError(self.error_message)
def __enter__(self):
signal.signal(signal.SIGALRM, self.handle_timeout)
signal.alarm(self.seconds)
def __exit__(self, type, value, traceback):
signal.alarm(0)
def is_equiv(x1: str, x2: str) -> bool:
"""
    x1 and x2 are normalized LaTeX strings.
"""
try:
with timeout(seconds=5):
try:
parsed_x1 = parse_latex(x1)
parsed_x2 = parse_latex(x2)
except (
sympy.parsing.latex.errors.LaTeXParsingError,
sympy.SympifyError,
TypeError,
):
eval_logger.debug(f"couldn't parse one of {x1} or {x2}")
return False
try:
diff = parsed_x1 - parsed_x2
except TypeError:
eval_logger.debug(f"couldn't subtract {x1} and {x2}")
return False
try:
if sympy.simplify(diff) == 0:
return True
else:
return False
except ValueError:
eval_logger.debug(
f"Had some trouble simplifying when comparing {x1} and {x2}"
)
except TimeoutError:
eval_logger.debug(f"Timed out comparing {x1} and {x2}")
return False
except ImportError as e:
eval_logger.error(e)
raise
except Exception as e:
eval_logger.debug(f"Failed comparing {x1} and {x2} with {e}")
return False
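# Extract the model's answer from the Minerva-style
# "Final Answer: The final answer is ... I hope it is correct." template.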
def get_unnormalized_answer(text: str) -> str:
INVALID_ANSWER = "[invalidanswer]"
end_seq = "I hope it is correct."
text += end_seq
match = re.search(
r"Final Answer: The final answer is(.*?). I hope it is correct.",
text,
)
if match:
return match.group(1).strip()
else:
return INVALID_ANSWER
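# Textual substitutions and unit words stripped during answer normalization,
# following appendix D of Lewkowycz et al. (2022).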
SUBSTITUTIONS = [
("an ", ""),
("a ", ""),
(".$", "$"),
("\\$", ""),
(r"\ ", ""),
(" ", ""),
("mbox", "text"),
(",\\text{and}", ","),
("\\text{and}", ","),
("\\text{m}", "\\text{}"),
]
REMOVED_EXPRESSIONS = [
"square",
"ways",
"integers",
"dollars",
"mph",
"inches",
"ft",
"hours",
"km",
"units",
"\\ldots",
"sue",
"points",
"feet",
"minutes",
"digits",
"cents",
"degrees",
"cm",
"gm",
"pounds",
"meters",
"meals",
"edges",
"students",
"childrentickets",
"multiples",
"\\text{s}",
"\\text{.}",
"\\text{\ns}",
"\\text{}^2",
"\\text{}^3",
"\\text{\n}",
"\\text{}",
r"\mathrm{th}",
r"^\circ",
r"^{\circ}",
r"\;",
r",\!",
"{,}",
'"',
"\\dots",
]
def normalize_final_answer(final_answer: str) -> str:
"""
Normalize a final answer to a quantitative reasoning question.
Copied character for character from appendix D of Lewkowycz et al. (2022)
"""
final_answer = final_answer.split("=")[-1]
for before, after in SUBSTITUTIONS:
final_answer = final_answer.replace(before, after)
for expr in REMOVED_EXPRESSIONS:
final_answer = final_answer.replace(expr, "")
# Extract answer that is in LaTeX math, is bold,
# is surrounded by a box, etc.
final_answer = re.sub(r"(.*?)(\$)(.*?)(\$)(.*)", "$\\3$", final_answer)
final_answer = re.sub(r"(\\text\{)(.*?)(\})", "\\2", final_answer)
final_answer = re.sub(r"(\\textbf\{)(.*?)(\})", "\\2", final_answer)
final_answer = re.sub(r"(\\overline\{)(.*?)(\})", "\\2", final_answer)
final_answer = re.sub(r"(\\boxed\{)(.*)(\})", "\\2", final_answer)
# Normalize shorthand TeX:
# \fracab -> \frac{a}{b}
# \frac{abc}{bef} -> \frac{abc}{bef}
# \fracabc -> \frac{a}{b}c
# \sqrta -> \sqrt{a}
# \sqrtab -> sqrt{a}b
final_answer = re.sub(r"(frac)([^{])(.)", "frac{\\2}{\\3}", final_answer)
final_answer = re.sub(r"(sqrt)([^{])", "sqrt{\\2}", final_answer)
final_answer = final_answer.replace("$", "")
# Normalize 100,000 -> 100000
if final_answer.replace(",", "").isdigit():
final_answer = final_answer.replace(",", "")
return final_answer
"""
Take in a base YAML config and generate a per-subject YAML for every MMLU subject from it.
"""
import os
import yaml
import argparse
from tqdm import tqdm
from lm_eval import utils
from lm_eval.logger import eval_logger
SUBJECTS = [
"abstract_algebra",
"anatomy",
"astronomy",
"business_ethics",
"clinical_knowledge",
"college_biology",
"college_chemistry",
"college_computer_science",
"college_mathematics",
"college_medicine",
"college_physics",
"computer_security",
"conceptual_physics",
"econometrics",
"electrical_engineering",
"elementary_mathematics",
"formal_logic",
"global_facts",
"high_school_biology",
"high_school_chemistry",
"high_school_computer_science",
"high_school_european_history",
"high_school_geography",
"high_school_government_and_politics",
"high_school_macroeconomics",
"high_school_mathematics",
"high_school_microeconomics",
"high_school_physics",
"high_school_psychology",
"high_school_statistics",
"high_school_us_history",
"high_school_world_history",
"human_aging",
"human_sexuality",
"international_law",
"jurisprudence",
"logical_fallacies",
"machine_learning",
"management",
"marketing",
"medical_genetics",
"miscellaneous",
"moral_disputes",
"moral_scenarios",
"nutrition",
"philosophy",
"prehistory",
"professional_accounting",
"professional_law",
"professional_medicine",
"professional_psychology",
"public_relations",
"security_studies",
"sociology",
"us_foreign_policy",
"virology",
"world_religions",
]
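# CLI: base/template yaml path, output filename prefix, optional JSON file of
# per-subject chain-of-thought prompts, and an optional task-name prefix.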
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--base_yaml_path", required=True)
parser.add_argument("--save_prefix_path", default="flan")
parser.add_argument("--cot_prompt_path", default=None)
parser.add_argument("--task_prefix", default="")
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
# get filename of base_yaml so we can `"include": ` it in our other YAMLs.
base_yaml_name = os.path.split(args.base_yaml_path)[-1]
with open(args.base_yaml_path) as f:
base_yaml = yaml.full_load(f)
if args.cot_prompt_path is not None:
import json
with open(args.cot_prompt_path) as f:
cot_file = json.load(f)
for subject in tqdm(SUBJECTS):
if args.cot_prompt_path is not None:
description = cot_file[subject]
else:
description = f"The following are multiple choice questions (with answers) about {' '.join(subject.split('_'))}.\n\n"
yaml_dict = {
"include": base_yaml_name,
"task": f"mmlu_{args.task_prefix}_{subject}"
if args.task_prefix != ""
else f"mmlu_{subject}",
"dataset_name": subject,
"description": description,
}
file_save_path = args.save_prefix_path + f"_{subject}.yaml"
eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
with open(file_save_path, "w") as yaml_file:
yaml.dump(
yaml_dict,
yaml_file,
width=float("inf"),
allow_unicode=True,
default_style='"',
)
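# Shared template that each per-subject MMLU yaml pulls in via `include`.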
group: mmlu
dataset_path: cais/mmlu
test_split: test
fewshot_split: dev
fewshot_config:
sampler: first_n
output_type: multiple_choice
doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: answer
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
"dataset_name": "abstract_algebra"
"description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n"
"include": "_default_template_yaml"
"task": "mmlu_abstract_algebra"
"dataset_name": "anatomy"
"description": "The following are multiple choice questions (with answers) about anatomy.\n\n"
"include": "_default_template_yaml"
"task": "mmlu_anatomy"
"dataset_name": "astronomy"
"description": "The following are multiple choice questions (with answers) about astronomy.\n\n"
"include": "_default_template_yaml"
"task": "mmlu_astronomy"
"dataset_name": "business_ethics"
"description": "The following are multiple choice questions (with answers) about business ethics.\n\n"
"include": "_default_template_yaml"
"task": "mmlu_business_ethics"
"dataset_name": "clinical_knowledge"
"description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n"
"include": "_default_template_yaml"
"task": "mmlu_clinical_knowledge"
"dataset_name": "college_biology"
"description": "The following are multiple choice questions (with answers) about college biology.\n\n"
"include": "_default_template_yaml"
"task": "mmlu_college_biology"