Commit 6ac42518 authored by lintangsutawika

Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into openai_completions
parents 9c3ba7d4 e3644fcc
@@ -4,11 +4,5 @@ doc_to_target: '{% if answer is not none %}{{answer[24+1]}}{% else %}{{answer_nu
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nJibu la Hatua kwa Hatua:"}}{%
else %}{{"Swali: "+question+"\nJibu la Hatua kwa Hatua:"}}{% endif %}'
filter:
- function: regex
regex_pattern: The answer is (\-?[0-9\.\,]+)
- function: take_first
filter_list:
- name: get-answer
include: cot_yaml
task: mgsm_sw_direct
task: mgsm_sw_native_cot
@@ -4,11 +4,5 @@ doc_to_target: '{% if answer is not none %}{{answer[18+1]}}{% else %}{{answer_nu
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nదశలవారీగా సమాధానం:"}}{% else
%}{{"ప్రశ్న: "+question+"\nదశలవారీగా సమాధానం:"}}{% endif %}'
filter:
- function: regex
regex_pattern: The answer is (\-?[0-9\.\,]+)
- function: take_first
filter_list:
- name: get-answer
include: cot_yaml
task: mgsm_te_direct
task: mgsm_te_native_cot
@@ -4,11 +4,5 @@ doc_to_target: '{% if answer is not none %}{{answer[17+1]}}{% else %}{{answer_nu
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nคำตอบทีละขั้นตอน:"}}{% else
%}{{"โจทย์: "+question+"\nคำตอบทีละขั้นตอน:"}}{% endif %}'
filter:
- function: regex
regex_pattern: The answer is (\-?[0-9\.\,]+)
- function: take_first
filter_list:
- name: get-answer
include: cot_yaml
task: mgsm_th_direct
task: mgsm_th_native_cot
@@ -4,11 +4,5 @@ doc_to_target: '{% if answer is not none %}{{answer[5+1]}}{% else %}{{answer_num
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\n逐步解答:"}}{% else %}{{"问题: "+question+"\n逐步解答:"}}{%
endif %}'
filter:
- function: regex
regex_pattern: The answer is (\-?[0-9\.\,]+)
- function: take_first
filter_list:
- name: get-answer
include: cot_yaml
task: mgsm_zh_direct
task: mgsm_zh_native_cot
@@ -4,16 +4,19 @@ import argparse
LANGUAGES = {
"bn": { # Bengali
# "QUESTION": "প্রশ্ন:",
"QUESTION": "\u09aa\u09cd\u09b0\u09b6\u09cd\u09a8:",
# "ANSWER": "ধাপে ধাপে উত্তর:",
"ANSWER": "\u09a7\u09be\u09aa\u09c7 \u09a7\u09be\u09aa\u09c7 \u0989\u09a4\u09cd\u09a4\u09b0:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
},
"de": { # German
"QUESTION": "Frage:",
# "ANSWER": "Schritt-für-Schritt-Antwort:",
"ANSWER": "Schritt-f\u00fcr-Schritt-Antwort:",
"DIRECT": "Antwort:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
"REGEX": "Die Antwort lautet (\\-?[0-9\\.\\,]+)",
},
"en": { # English
"QUESTION": "Question:",
@@ -24,50 +27,68 @@ LANGUAGES = {
"es": { # Spanish
"QUESTION": "Pregunta:",
"ANSWER": "Respuesta paso a paso:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
"DIRECT": "Respuesta:",
"REGEX": "La respuesta es (\\-?[0-9\\.\\,]+)",
},
"fr": { # French
"QUESTION": "Question :",
# "ANSWER": "Réponse étape par étape :"
"ANSWER": "R\u00e9ponse \u00e9tape par \u00e9tape :",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
# "DIRECT": "Réponse :",
"DIRECT": "R\u00e9ponse :",
# "REGEX": "La réponse est (\\-?[0-9\\.\\,]+)",
"REGEX": "La r\u00e9ponse est (\\-?[0-9\\.\\,]+)",
},
"ru": { # Russian
# "QUESTION": "Задача:",
"QUESTION": "\u0417\u0430\u0434\u0430\u0447\u0430:",
# "ANSWER": "Пошаговоерешение:",
"ANSWER": "\u041f\u043e\u0448\u0430\u0433\u043e\u0432\u043e\u0435\u0440\u0435\u0448\u0435\u043d\u0438\u0435:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
# "REGEX": "Ответ — (\\-?[0-9\\.\\,]+)",
"REGEX": "\u041e\u0442\u0432\u0435\u0442 \u2014 (\\-?[0-9\\.\\,]+)",
},
"sw": { # Swahili
"QUESTION": "Swali:",
"ANSWER": "Jibu la Hatua kwa Hatua:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
"REGEX": "Jibu ni (\\-?[0-9\\.\\,]+)",
},
"te": { # Telugu
# "QUESTION": "ప్రశ్న:",
"QUESTION": "\u0c2a\u0c4d\u0c30\u0c36\u0c4d\u0c28:",
# "ANSWER": "దశలవారీగా సమాధానం:",
"ANSWER": "\u0c26\u0c36\u0c32\u0c35\u0c3e\u0c30\u0c40\u0c17\u0c3e \u0c38\u0c2e\u0c3e\u0c27\u0c3e\u0c28\u0c02:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
# "REGEX": "సమాధానం (\\-?[0-9\\.\\,]+)",
"REGEX": "\u0c38\u0c2e\u0c3e\u0c27\u0c3e\u0c28\u0c02 (\\-?[0-9\\.\\,]+)",
},
"th": { # Thai
# "QUESTION": "โจทย์:",
"QUESTION": "\u0e42\u0e08\u0e17\u0e22\u0e4c:",
# "ANSWER": "คำตอบทีละขั้นตอน:",
"ANSWER": "\u0e04\u0e33\u0e15\u0e2d\u0e1a\u0e17\u0e35\u0e25\u0e30\u0e02\u0e31\u0e49\u0e19\u0e15\u0e2d\u0e19:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
# "REGEX": "คำตอบคือ (\\-?[0-9\\.\\,]+)",
"REGEX": "\u0e04\u0e33\u0e15\u0e2d\u0e1a\u0e04\u0e37\u0e2d (\\-?[0-9\\.\\,]+)",
},
"ja": { # Japanese
# "QUESTION": "問題:",
"QUESTION": "\u554f\u984c:",
# "ANSWER": "ステップごとの答え:",
"ANSWER": "\u30b9\u30c6\u30c3\u30d7\u3054\u3068\u306e\u7b54\u3048:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
# "REGEX": "答えは(\\-?[0-9\\.\\,]+)です。",
"REGEX": "\u7b54\u3048\u306f(\\-?[0-9\\.\\,]+)\u3067\u3059\u3002",
},
"zh": { # Chinese
# "QUESTION": "问题:",
"QUESTION": "\u95ee\u9898:",
# "ANSWER": "逐步解答:",
"ANSWER": "\u9010\u6b65\u89e3\u7b54:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
# "REGEX": "答案是 (\\-?[0-9\\.\\,]+)。",
"REGEX": "\u7b54\u6848\u662f (\\-?[0-9\\.\\,]+)\u3002",
},
}
@@ -80,15 +101,15 @@ def add_regex_pattern(regex_pattern):
"filter_list": [
{
"name": "get-answer",
},
],
"filter": [
{
"function": "regex",
"regex_pattern": regex_pattern,
},
{
"function": "take_first",
"filter": [
{
"function": "regex",
"regex_pattern": regex_pattern,
},
{
"function": "take_first",
},
],
},
],
}
@@ -107,6 +128,7 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
QUESTION = LANGUAGES[lang]["QUESTION"]
yaml_template = "cot_yaml"
filter_list = {}
if mode == "direct":
ANSWER = LANGUAGES[lang]["DIRECT"]
REGEX = None
@@ -116,13 +138,13 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
ANSWER = LANGUAGES[lang]["ANSWER"]
REGEX = LANGUAGES[lang]["REGEX"]
task_name = f"mgsm_{lang}_native-cot"
filter_list = add_regex_pattern(REGEX)
elif mode == "en-cot":
ANSWER = LANGUAGES["en"]["ANSWER"]
REGEX = LANGUAGES["en"]["REGEX"]
task_name = f"mgsm_{lang}_en-cot"
file_name = f"{task_name}.yaml"
filter_list = add_regex_pattern(REGEX)
with open(
f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf8"
@@ -147,6 +169,7 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
},
f,
allow_unicode=True,
width=float("inf"),
)
except FileExistsError:
err.append(file_name)
# MATH
ℹ️ This is the 4-shot variant!
## Paper
Measuring Mathematical Problem Solving With the MATH Dataset
https://arxiv.org/abs/2103.03874
Many intellectual endeavors require mathematical problem solving, but this skill remains beyond the capabilities of computers. To measure this ability in machine learning models, we introduce MATH, a new dataset of 12,500 challenging competition mathematics problems. Each problem in MATH has a full step-by-step solution which can be used to teach models to generate answer derivations and explanations.
NOTE: The few-shot prompt and the extraction of generated answers are based on [Minerva](https://arxiv.org/abs/2206.14858), and exact-match equivalence is checked using the `sympy` library. This requires additional dependencies, which can be installed via the `lm-eval[math]` extra.
Homepage: https://github.com/hendrycks/math
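As a rough illustration of that pipeline (a minimal sketch only, not the exact code shipped in `utils.py`; the helper name below is made up for illustration): the final answer is pulled out of the generation with the Minerva answer pattern and then checked for symbolic equivalence against the gold answer with `sympy`.

```python
# Condensed, illustrative sketch -- the full implementation is in utils.py below.
import re

import sympy
from sympy.parsing.latex import parse_latex  # provided via the `lm-eval[math]` extra


def rough_exact_match(generation: str, gold_answer: str) -> bool:
    # Minerva-style solutions end with:
    #   "Final Answer: The final answer is <answer>. I hope it is correct."
    # (utils.py additionally appends the end sequence to catch truncated
    # generations and normalizes both answers before comparing.)
    match = re.search(
        r"Final Answer: The final answer is(.*?)\. I hope it is correct\.", generation
    )
    if match is None:
        return False
    candidate = match.group(1).strip().strip("$")
    try:
        # Answers count as equal if their difference simplifies to zero.
        return sympy.simplify(parse_latex(candidate) - parse_latex(gold_answer)) == 0
    except Exception:
        return False
```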
## Citation
```
@article{hendrycksmath2021,
title={Measuring Mathematical Problem Solving With the MATH Dataset},
author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
journal={NeurIPS},
year={2021}
}
@misc{2206.14858,
Author = {Aitor Lewkowycz and Anders Andreassen and David Dohan and Ethan Dyer and Henryk Michalewski and Vinay Ramasesh and Ambrose Slone and Cem Anil and Imanol Schlag and Theo Gutman-Solo and Yuhuai Wu and Behnam Neyshabur and Guy Gur-Ari and Vedant Misra},
Title = {Solving Quantitative Reasoning Problems with Language Models},
Year = {2022},
Eprint = {arXiv:2206.14858},
}
```
### Groups, Benchmarks and Tasks
#### Benchmarks
- `minerva_math`
#### Groups
- `math_word_problems`
- `generate_until`
#### Tasks
- `minerva_math_algebra`
- `minerva_math_counting_and_prob`
- `minerva_math_geometry`
- `minerva_math_intermediate_algebra`
- `minerva_math_num_theory`
- `minerva_math_prealgebra`
- `minerva_math_precalc`
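A minimal sketch of running one of the tasks above through the harness's Python entry point; the model backend and checkpoint are placeholders, and the exact `simple_evaluate` arguments may differ between harness versions, so check your installed release.

```python
# Illustrative only: backend and checkpoint are placeholders.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",                                      # HuggingFace backend
    model_args="pretrained=EleutherAI/pythia-1.4b",  # placeholder checkpoint
    tasks=["minerva_math_algebra"],                  # any task listed above
)
print(results["results"]["minerva_math_algebra"])
```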
### Checklist
The checklist is the following:
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
* The implementation in the original paper first fine-tunes the model on the data. The paper does include a few-shot evaluation for GPT-3; however, the few-shot context used here is sourced from [Lewkowycz et al.](https://arxiv.org/abs/2206.14858). The accuracy achieved with Llama-2 models is comparable to, though not identical to, that reported in the paper.
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
### Variant Wishlist
- [ ] zero-shot variant
group:
- math_word_problems
task: minerva_math_algebra
dataset_path: EleutherAI/hendrycks_math
process_docs: !function utils.process_docs
dataset_name: algebra
output_type: generate_until
training_split: train
test_split: test
doc_to_text: !function utils.doc_to_text
process_results: !function utils.process_results
doc_to_target: "{{answer}}"
generation_kwargs:
  until:
    - "Problem:"
  do_sample: false
  temperature: 0
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
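# The per-subject configs below include this file and override only `dataset_name` and `task`.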
include: minerva_math_algebra.yaml
dataset_name: counting_and_probability
task: minerva_math_counting_and_prob
include: minerva_math_algebra.yaml
dataset_name: geometry
task: minerva_math_geometry
include: minerva_math_algebra.yaml
dataset_name: intermediate_algebra
task: minerva_math_intermediate_algebra
include: minerva_math_algebra.yaml
dataset_name: number_theory
task: minerva_math_num_theory
include: minerva_math_algebra.yaml
dataset_name: prealgebra
task: minerva_math_prealgebra
include: minerva_math_algebra.yaml
dataset_name: precalculus
task: minerva_math_precalc
import datasets
import re
import signal
from lm_eval.utils import eval_logger
from typing import Optional, List, Dict
try:
    import sympy
    from sympy.parsing.latex import parse_latex
except ModuleNotFoundError:
    raise Exception(
        "`sympy` is required for the MATH exact-match equivalence check. "
        "Please install it via `pip install lm-eval[math]` or `pip install -e .[math]`."
    )
# taken from
# https://github.com/wellecks/lm-evaluation-harness/blob/master/lm_eval/tasks/minerva_math.py
def doc_to_text(doc: dict) -> str:
    PROMPT = r"""Problem:
Find the domain of the expression $\frac{\sqrt{x-2}}{\sqrt{5-x}}$.}

Solution:
The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\boxed{[2,5)}$.
Final Answer: The final answer is $[2,5)$. I hope it is correct.

Problem:
If $\det \mathbf{A} = 2$ and $\det \mathbf{B} = 12,$ then find $\det (\mathbf{A} \mathbf{B}).$

Solution:
We have that $\det (\mathbf{A} \mathbf{B}) = (\det \mathbf{A})(\det \mathbf{B}) = (2)(12) = \boxed{24}.$
Final Answer: The final answer is $24$. I hope it is correct.

Problem:
Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?

Solution:
If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$:
\begin{align*}
30n&=480\\
\Rightarrow\qquad n&=480/30=\boxed{16}
\end{align*}
Final Answer: The final answer is $16$. I hope it is correct.

Problem:
If the system of equations

\begin{align*}
6x-4y&=a,\\
6y-9x &=b.
\end{align*}has a solution $(x, y)$ where $x$ and $y$ are both nonzero,
find $\frac{a}{b},$ assuming $b$ is nonzero.

Solution:
If we multiply the first equation by $-\frac{3}{2}$, we obtain
$$6y-9x=-\frac{3}{2}a.$$Since we also know that $6y-9x=b$, we have
$$-\frac{3}{2}a=b\Rightarrow\frac{a}{b}=\boxed{-\frac{2}{3}}.$$
Final Answer: The final answer is $-\frac{2}{3}$. I hope it is correct."""

    return PROMPT + "\n\n" + "Problem:" + "\n" + doc["problem"] + "\n\n" + "Solution:"
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    def _process_doc(doc: dict) -> dict:
        out_doc = {
            "problem": doc["problem"],
            "solution": doc["solution"],
            "answer": normalize_final_answer(
                remove_boxed(last_boxed_only_string(doc["solution"]))
            ),
        }
        return out_doc

    return dataset.map(_process_doc)
def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
    candidates = results[0]

    unnormalized_answer = get_unnormalized_answer(candidates)
    answer = normalize_final_answer(unnormalized_answer)

    if is_equiv(answer, doc["answer"]):
        retval = 1
    else:
        retval = 0

    results = {
        "exact_match": retval,
    }
    return results
def last_boxed_only_string(string: str) -> Optional[str]:
    idx = string.rfind("\\boxed")
    if "\\boxed " in string:
        return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]
    if idx < 0:
        idx = string.rfind("\\fbox")
        if idx < 0:
            return None

    i = idx
    right_brace_idx = None
    num_left_braces_open = 0
    while i < len(string):
        if string[i] == "{":
            num_left_braces_open += 1
        if string[i] == "}":
            num_left_braces_open -= 1
            if num_left_braces_open == 0:
                right_brace_idx = i
                break
        i += 1

    if right_brace_idx is None:
        retval = None
    else:
        retval = string[idx : right_brace_idx + 1]

    return retval
def remove_boxed(s: str) -> str:
    if "\\boxed " in s:
        left = "\\boxed "
        assert s[: len(left)] == left
        return s[len(left) :]

    left = "\\boxed{"

    assert s[: len(left)] == left
    assert s[-1] == "}"

    return s[len(left) : -1]
class timeout:
    def __init__(self, seconds=1, error_message="Timeout"):
        self.seconds = seconds
        self.error_message = error_message

    def handle_timeout(self, signum, frame):
        raise TimeoutError(self.error_message)

    def __enter__(self):
        signal.signal(signal.SIGALRM, self.handle_timeout)
        signal.alarm(self.seconds)

    def __exit__(self, type, value, traceback):
        signal.alarm(0)
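# Symbolic-equivalence check between two normalized LaTeX answer strings, using
# sympy with the timeout above as a guard against pathological parses.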
def is_equiv(x1: str, x2: str) -> bool:
    """
    x1 and x2 are normalized latex string
    """
    try:
        with timeout(seconds=5):
            try:
                parsed_x1 = parse_latex(x1)
                parsed_x2 = parse_latex(x2)
            except (
                sympy.parsing.latex.errors.LaTeXParsingError,
                sympy.SympifyError,
                TypeError,
            ):
                eval_logger.debug(f"couldn't parse one of {x1} or {x2}")
                return False

            try:
                diff = parsed_x1 - parsed_x2
            except TypeError:
                eval_logger.debug(f"couldn't subtract {x1} and {x2}")
                return False

            try:
                if sympy.simplify(diff) == 0:
                    return True
                else:
                    return False
            except ValueError:
                eval_logger.debug(
                    f"Had some trouble simplifying when comparing {x1} and {x2}"
                )
    except TimeoutError:
        eval_logger.debug(f"Timed out comparing {x1} and {x2}")
        return False
    except ImportError as e:
        eval_logger.error(e)
        raise
    except Exception as e:
        eval_logger.debug(f"Failed comparing {x1} and {x2} with {e}")
        return False
def get_unnormalized_answer(text: str) -> str:
    INVALID_ANSWER = "[invalidanswer]"
    end_seq = "I hope it is correct."
    text += end_seq
    match = re.search(
        r"Final Answer: The final answer is(.*?). I hope it is correct.",
        text,
    )
    if match:
        return match.group(1).strip()
    else:
        return INVALID_ANSWER
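# Substitution and removal rules used by normalize_final_answer below;
# taken from Appendix D of Lewkowycz et al. (2022), the Minerva paper.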
SUBSTITUTIONS = [
("an ", ""),
("a ", ""),
(".$", "$"),
("\\$", ""),
(r"\ ", ""),
(" ", ""),
("mbox", "text"),
(",\\text{and}", ","),
("\\text{and}", ","),
("\\text{m}", "\\text{}"),
]
REMOVED_EXPRESSIONS = [
"square",
"ways",
"integers",
"dollars",
"mph",
"inches",
"ft",
"hours",
"km",
"units",
"\\ldots",
"sue",
"points",
"feet",
"minutes",
"digits",
"cents",
"degrees",
"cm",
"gm",
"pounds",
"meters",
"meals",
"edges",
"students",
"childrentickets",
"multiples",
"\\text{s}",
"\\text{.}",
"\\text{\ns}",
"\\text{}^2",
"\\text{}^3",
"\\text{\n}",
"\\text{}",
r"\mathrm{th}",
r"^\circ",
r"^{\circ}",
r"\;",
r",\!",
"{,}",
'"',
"\\dots",
]
def normalize_final_answer(final_answer: str) -> str:
    """
    Normalize a final answer to a quantitative reasoning question.

    Copied character for character from appendix D of Lewkowycz et al. (2022)
    """
    final_answer = final_answer.split("=")[-1]

    for before, after in SUBSTITUTIONS:
        final_answer = final_answer.replace(before, after)
    for expr in REMOVED_EXPRESSIONS:
        final_answer = final_answer.replace(expr, "")

    # Extract answer that is in LaTeX math, is bold,
    # is surrounded by a box, etc.
    final_answer = re.sub(r"(.*?)(\$)(.*?)(\$)(.*)", "$\\3$", final_answer)
    final_answer = re.sub(r"(\\text\{)(.*?)(\})", "\\2", final_answer)
    final_answer = re.sub(r"(\\textbf\{)(.*?)(\})", "\\2", final_answer)
    final_answer = re.sub(r"(\\overline\{)(.*?)(\})", "\\2", final_answer)
    final_answer = re.sub(r"(\\boxed\{)(.*)(\})", "\\2", final_answer)

    # Normalize shorthand TeX:
    #  \fracab -> \frac{a}{b}
    #  \frac{abc}{bef} -> \frac{abc}{bef}
    #  \fracabc -> \frac{a}{b}c
    #  \sqrta -> \sqrt{a}
    #  \sqrtab -> sqrt{a}b
    final_answer = re.sub(r"(frac)([^{])(.)", "frac{\\2}{\\3}", final_answer)
    final_answer = re.sub(r"(sqrt)([^{])", "sqrt{\\2}", final_answer)
    final_answer = final_answer.replace("$", "")

    # Normalize 100,000 -> 100000
    if final_answer.replace(",", "").isdigit():
        final_answer = final_answer.replace(",", "")

    return final_answer
"""
Take in a YAML, and output all "other" splits with this YAML
"""
import os
import yaml
import argparse
from tqdm import tqdm
from lm_eval import utils
from lm_eval.logger import eval_logger
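# Maps each MMLU subject to its top-level category; the categories become the
# mmlu_<category> groups emitted below.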
SUBJECTS = {
"abstract_algebra": "stem",
"anatomy": "stem",
"astronomy": "stem",
"business_ethics": "other",
"clinical_knowledge": "other",
"college_biology": "stem",
"college_chemistry": "stem",
"college_computer_science": "stem",
"college_mathematics": "stem",
"college_medicine": "other",
"college_physics": "stem",
"computer_security": "stem",
"conceptual_physics": "stem",
"econometrics": "social_sciences",
"electrical_engineering": "stem",
"elementary_mathematics": "stem",
"formal_logic": "humanities",
"global_facts": "other",
"high_school_biology": "stem",
"high_school_chemistry": "stem",
"high_school_computer_science": "stem",
"high_school_european_history": "humanities",
"high_school_geography": "social_sciences",
"high_school_government_and_politics": "social_sciences",
"high_school_macroeconomics": "social_sciences",
"high_school_mathematics": "stem",
"high_school_microeconomics": "social_sciences",
"high_school_physics": "stem",
"high_school_psychology": "social_sciences",
"high_school_statistics": "stem",
"high_school_us_history": "humanities",
"high_school_world_history": "humanities",
"human_aging": "other",
"human_sexuality": "social_sciences",
"international_law": "humanities",
"jurisprudence": "humanities",
"logical_fallacies": "humanities",
"machine_learning": "stem",
"management": "other",
"marketing": "other",
"medical_genetics": "other",
"miscellaneous": "other",
"moral_disputes": "humanities",
"moral_scenarios": "humanities",
"nutrition": "other",
"philosophy": "humanities",
"prehistory": "humanities",
"professional_accounting": "other",
"professional_law": "humanities",
"professional_medicine": "other",
"professional_psychology": "social_sciences",
"public_relations": "social_sciences",
"security_studies": "social_sciences",
"sociology": "social_sciences",
"us_foreign_policy": "social_sciences",
"virology": "other",
"world_religions": "humanities",
}
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_yaml_path", required=True)
    parser.add_argument("--save_prefix_path", default="mmlu")
    parser.add_argument("--cot_prompt_path", default=None)
    parser.add_argument("--task_prefix", default="")
    parser.add_argument("--group_prefix", default="")
    return parser.parse_args()
if __name__ == "__main__":
    args = parse_args()

    # get filename of base_yaml so we can `"include": ` it in our "other" YAMLs.
    base_yaml_name = os.path.split(args.base_yaml_path)[-1]
    with open(args.base_yaml_path) as f:
        base_yaml = yaml.full_load(f)

    if args.cot_prompt_path is not None:
        import json

        with open(args.cot_prompt_path) as f:
            cot_file = json.load(f)

    ALL_CATEGORIES = []
    for subject, category in tqdm(SUBJECTS.items()):
        if category not in ALL_CATEGORIES:
            ALL_CATEGORIES.append(category)

        if args.cot_prompt_path is not None:
            description = cot_file[subject]
        else:
            description = f"The following are multiple choice questions (with answers) about {' '.join(subject.split('_'))}.\n\n"

        yaml_dict = {
            "include": base_yaml_name,
            "group": f"mmlu_{args.task_prefix}_{category}"
            if args.task_prefix != ""
            else f"mmlu_{category}",
            "group_alias": category.replace("_", " "),
            "task": f"mmlu_{args.task_prefix}_{subject}"
            if args.task_prefix != ""
            else f"mmlu_{subject}",
            "task_alias": subject.replace("_", " "),
            "dataset_name": subject,
            "description": description,
        }

        file_save_path = args.save_prefix_path + f"_{subject}.yaml"
        eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
        with open(file_save_path, "w") as yaml_file:
            yaml.dump(
                yaml_dict,
                yaml_file,
                # width=float("inf"),
                allow_unicode=True,
                default_style='"',
            )

    if args.task_prefix != "":
        mmlu_subcategories = [
            f"mmlu_{args.task_prefix}_{category}" for category in ALL_CATEGORIES
        ]
    else:
        mmlu_subcategories = [f"mmlu_{category}" for category in ALL_CATEGORIES]

    if args.group_prefix != "":
        file_save_path = args.group_prefix + ".yaml"
    else:
        file_save_path = args.save_prefix_path + ".yaml"

    eval_logger.info(f"Saving benchmark config to {file_save_path}")
    with open(file_save_path, "w") as yaml_file:
        yaml.dump(
            {
                "group": f"mmlu_{args.task_prefix}"
                if args.task_prefix != ""
                else "mmlu",
                "task": mmlu_subcategories,
            },
            yaml_file,
            indent=4,
            default_flow_style=False,
        )
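# Example invocation (script name and paths are illustrative; the flags match the argparse above):
#   python _generate_configs.py --base_yaml_path _default_template_yaml --save_prefix_path mmlu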
dataset_path: hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split
test_split: test
fewshot_split: dev
fewshot_config:
  sampler: first_n
output_type: multiple_choice
doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: answer
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
group: mmlu
task:
- mmlu_stem
- mmlu_other
- mmlu_social_sciences
- mmlu_humanities
"dataset_name": "abstract_algebra"
"description": "The following are multiple choice questions (with answers) about abstract\
\ algebra.\n\n"
"group": "mmlu_stem"
"group_alias": "stem"
"include": "_default_template_yaml"
"task": "mmlu_abstract_algebra"
"task_alias": "abstract_algebra"
"dataset_name": "anatomy"
"description": "The following are multiple choice questions (with answers) about anatomy.\n\
\n"
"group": "mmlu_stem"
"group_alias": "stem"
"include": "_default_template_yaml"
"task": "mmlu_anatomy"
"task_alias": "anatomy"
"dataset_name": "astronomy"
"description": "The following are multiple choice questions (with answers) about astronomy.\n\
\n"
"group": "mmlu_stem"
"group_alias": "stem"
"include": "_default_template_yaml"
"task": "mmlu_astronomy"
"task_alias": "astronomy"