Unverified Commit f88ffeee, authored by Hailey Schoelkopf, committed by GitHub

Merge branch 'big-refactor' into add-fewshot-config

parents 2d5d94da 0f6cd358
include: minerva_math_algebra.yaml
dataset_name: number_theory
task: minerva_math_num_theory
include: minerva_math_algebra.yaml
dataset_name: prealgebra
task: minerva_math_prealgebra
include: minerva_math_algebra.yaml
dataset_name: precalculus
task: minerva_math_precalc
import datasets
import re
import sympy
from sympy.parsing.latex import parse_latex
import signal
from lm_eval.logger import eval_logger
from typing import Optional, List, Dict
# taken from
# https://github.com/wellecks/lm-evaluation-harness/blob/master/lm_eval/tasks/minerva_math.py
def doc_to_text(doc: dict) -> str:
    PROMPT = r"""Problem:
Find the domain of the expression $\frac{\sqrt{x-2}}{\sqrt{5-x}}$.}

Solution:
The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\boxed{[2,5)}$.
Final Answer: The final answer is $[2,5)$. I hope it is correct.

Problem:
If $\det \mathbf{A} = 2$ and $\det \mathbf{B} = 12,$ then find $\det (\mathbf{A} \mathbf{B}).$

Solution:
We have that $\det (\mathbf{A} \mathbf{B}) = (\det \mathbf{A})(\det \mathbf{B}) = (2)(12) = \boxed{24}.$
Final Answer: The final answer is $24$. I hope it is correct.

Problem:
Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?

Solution:
If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$:
\begin{align*}
30n&=480\\
\Rightarrow\qquad n&=480/30=\boxed{16}
\end{align*}
Final Answer: The final answer is $16$. I hope it is correct.

Problem:
If the system of equations
\begin{align*}
6x-4y&=a,\\
6y-9x &=b.
\end{align*}has a solution $(x, y)$ where $x$ and $y$ are both nonzero,
find $\frac{a}{b},$ assuming $b$ is nonzero.

Solution:
If we multiply the first equation by $-\frac{3}{2}$, we obtain
$$6y-9x=-\frac{3}{2}a.$$Since we also know that $6y-9x=b$, we have
$$-\frac{3}{2}a=b\Rightarrow\frac{a}{b}=\boxed{-\frac{2}{3}}.$$
Final Answer: The final answer is $-\frac{2}{3}$. I hope it is correct."""

    return PROMPT + "\n\n" + "Problem:" + "\n" + doc["problem"] + "\n\n" + "Solution:"
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    def _process_doc(doc: dict) -> dict:
        out_doc = {
            "problem": doc["problem"],
            "solution": doc["solution"],
            "answer": normalize_final_answer(
                remove_boxed(last_boxed_only_string(doc["solution"]))
            ),
        }
        return out_doc

    return dataset.map(_process_doc)
def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
    candidates = results[0]

    unnormalized_answer = get_unnormalized_answer(candidates)
    answer = normalize_final_answer(unnormalized_answer)

    if is_equiv(answer, doc["answer"]):
        retval = 1
    else:
        retval = 0

    results = {
        "exact_match": retval,
    }
    return results
def last_boxed_only_string(string: str) -> Optional[str]:
    idx = string.rfind("\\boxed")
    if "\\boxed " in string:
        return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]
    if idx < 0:
        idx = string.rfind("\\fbox")
        if idx < 0:
            return None

    i = idx
    right_brace_idx = None
    num_left_braces_open = 0
    while i < len(string):
        if string[i] == "{":
            num_left_braces_open += 1
        if string[i] == "}":
            num_left_braces_open -= 1
            if num_left_braces_open == 0:
                right_brace_idx = i
                break
        i += 1

    if right_brace_idx is None:
        retval = None
    else:
        retval = string[idx : right_brace_idx + 1]

    return retval
def remove_boxed(s: str) -> str:
    if "\\boxed " in s:
        left = "\\boxed "
        assert s[: len(left)] == left
        return s[len(left) :]

    left = "\\boxed{"

    assert s[: len(left)] == left
    assert s[-1] == "}"

    return s[len(left) : -1]
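# Worked example of the two helpers above (added for clarity; not part of the
# original task code). Given a MATH-style solution string:
#   solution = r"We have that $\det (\mathbf{A} \mathbf{B}) = (2)(12) = \boxed{24}.$"
#   last_boxed_only_string(solution)   # -> '\\boxed{24}'
#   remove_boxed('\\boxed{24}')        # -> '24'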
class timeout:
    def __init__(self, seconds=1, error_message="Timeout"):
        self.seconds = seconds
        self.error_message = error_message

    def handle_timeout(self, signum, frame):
        raise TimeoutError(self.error_message)

    def __enter__(self):
        signal.signal(signal.SIGALRM, self.handle_timeout)
        signal.alarm(self.seconds)

    def __exit__(self, type, value, traceback):
        signal.alarm(0)
def is_equiv(x1: str, x2: str) -> bool:
    """
    x1 and x2 are normalized latex strings
    """
    try:
        with timeout(seconds=5):
            try:
                parsed_x1 = parse_latex(x1)
                parsed_x2 = parse_latex(x2)
            except (
                sympy.parsing.latex.errors.LaTeXParsingError,
                sympy.SympifyError,
                TypeError,
            ):
                eval_logger.debug(f"couldn't parse one of {x1} or {x2}")
                return False

            try:
                diff = parsed_x1 - parsed_x2
            except TypeError:
                eval_logger.debug(f"couldn't subtract {x1} and {x2}")
                return False

            try:
                if sympy.simplify(diff) == 0:
                    return True
                else:
                    return False
            except ValueError:
                eval_logger.debug(
                    f"Had some trouble simplifying when comparing {x1} and {x2}"
                )
    except TimeoutError:
        eval_logger.debug(f"Timed out comparing {x1} and {x2}")
        return False
    except ImportError as e:
        eval_logger.error(e)
        raise
    except Exception as e:
        eval_logger.debug(f"Failed comparing {x1} and {x2} with {e}")
        return False
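# Illustrative behaviour (added for clarity; not part of the original file;
# assumes the antlr4 runtime required by sympy's LaTeX parser is installed):
#   is_equiv("\\frac{1}{2}", "0.5")  -> True  (the difference simplifies to 0)
#   is_equiv("x+1", "1+x")           -> True
# Inputs that cannot be parsed or compared are logged via eval_logger.debug and
# the comparison falls back to False.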
def get_unnormalized_answer(text: str) -> str:
    INVALID_ANSWER = "[invalidanswer]"
    end_seq = "I hope it is correct."
    text += end_seq
    match = re.search(
        r"Final Answer: The final answer is(.*?). I hope it is correct.",
        text,
    )
    if match:
        return match.group(1).strip()
    else:
        return INVALID_ANSWER
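# Worked example (added for clarity; not part of the original file): the regex
# captures the span between the Minerva-style sign-off markers, e.g.
#   get_unnormalized_answer("Final Answer: The final answer is $24$. I hope it is correct.")
#   -> '$24$'
# Completions that never produce the "Final Answer:" pattern return "[invalidanswer]".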
SUBSTITUTIONS = [
    ("an ", ""),
    ("a ", ""),
    (".$", "$"),
    ("\\$", ""),
    (r"\ ", ""),
    (" ", ""),
    ("mbox", "text"),
    (",\\text{and}", ","),
    ("\\text{and}", ","),
    ("\\text{m}", "\\text{}"),
]

REMOVED_EXPRESSIONS = [
    "square",
    "ways",
    "integers",
    "dollars",
    "mph",
    "inches",
    "ft",
    "hours",
    "km",
    "units",
    "\\ldots",
    "sue",
    "points",
    "feet",
    "minutes",
    "digits",
    "cents",
    "degrees",
    "cm",
    "gm",
    "pounds",
    "meters",
    "meals",
    "edges",
    "students",
    "childrentickets",
    "multiples",
    "\\text{s}",
    "\\text{.}",
    "\\text{\ns}",
    "\\text{}^2",
    "\\text{}^3",
    "\\text{\n}",
    "\\text{}",
    r"\mathrm{th}",
    r"^\circ",
    r"^{\circ}",
    r"\;",
    r",\!",
    "{,}",
    '"',
    "\\dots",
]
def normalize_final_answer(final_answer: str) -> str:
    """
    Normalize a final answer to a quantitative reasoning question.
    Copied character for character from appendix D of Lewkowycz et al. (2022)
    """
    final_answer = final_answer.split("=")[-1]

    for before, after in SUBSTITUTIONS:
        final_answer = final_answer.replace(before, after)
    for expr in REMOVED_EXPRESSIONS:
        final_answer = final_answer.replace(expr, "")

    # Extract answer that is in LaTeX math, is bold,
    # is surrounded by a box, etc.
    final_answer = re.sub(r"(.*?)(\$)(.*?)(\$)(.*)", "$\\3$", final_answer)
    final_answer = re.sub(r"(\\text\{)(.*?)(\})", "\\2", final_answer)
    final_answer = re.sub(r"(\\textbf\{)(.*?)(\})", "\\2", final_answer)
    final_answer = re.sub(r"(\\overline\{)(.*?)(\})", "\\2", final_answer)
    final_answer = re.sub(r"(\\boxed\{)(.*)(\})", "\\2", final_answer)

    # Normalize shorthand TeX:
    #  \fracab -> \frac{a}{b}
    #  \frac{abc}{bef} -> \frac{abc}{bef}
    #  \fracabc -> \frac{a}{b}c
    #  \sqrta -> \sqrt{a}
    #  \sqrtab -> \sqrt{a}b
    final_answer = re.sub(r"(frac)([^{])(.)", "frac{\\2}{\\3}", final_answer)
    final_answer = re.sub(r"(sqrt)([^{])", "sqrt{\\2}", final_answer)
    final_answer = final_answer.replace("$", "")

    # Normalize 100,000 -> 100000
    if final_answer.replace(",", "").isdigit():
        final_answer = final_answer.replace(",", "")

    return final_answer
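# --- Illustrative end-to-end sketch (added for clarity; not part of the original file) ---
# Traces how a hypothetical model completion is turned into a normalized answer:
# the answer span is extracted from the generation, normalized, and would then be
# compared against doc["answer"] via is_equiv inside process_results.
if __name__ == "__main__":
    completion = (
        "We have that $\\det (\\mathbf{A} \\mathbf{B}) = (2)(12) = \\boxed{24}.$\n"
        "Final Answer: The final answer is $24$. I hope it is correct."
    )
    extracted = get_unnormalized_answer(completion)  # '$24$'
    print(normalize_final_answer(extracted))         # '24'
    print(normalize_final_answer("\\frac12"))        # '\frac{1}{2}' (shorthand TeX is expanded)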
# MuTual
### Paper
Title: `MuTual: A Dataset for Multi-Turn Dialogue Reasoning`
Abstract: https://www.aclweb.org/anthology/2020.acl-main.130/
MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is
modified from Chinese high school English listening comprehension test data.
Homepage: https://github.com/Nealcly/MuTual
### Citation
```
@inproceedings{mutual,
title = "MuTual: A Dataset for Multi-Turn Dialogue Reasoning",
author = "Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming" ,
booktitle = "Proceedings of the 58th Conference of the Association for Computational Linguistics",
year = "2020",
publisher = "Association for Computational Linguistics",
}
```
### Groups and Tasks
#### Groups
* Not part of a group yet.
#### Tasks
* `mutual`
* `mutual_plus`
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
include: mutual.yaml
task: mutual_plus
dataset_name: mutual_plus
task: mutual
dataset_path: "EleutherAI/mutual"
dataset_name: mutual
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: "{{article}}"
doc_to_target: "{{['A', 'B', 'C', 'D'].index(answers)}}"
doc_to_choice: "{{options}}"
process_docs: !function utils.process_docs
process_results: !function utils.process_results
should_decontaminate: true
doc_to_decontamination_query: "{{article}}"
metric_list:
  - metric: r@1
    aggregation: mean
    higher_is_better: true
  - metric: r@2
    aggregation: mean
    higher_is_better: true
  - metric: mrr
    aggregation: mean
    higher_is_better: true
import numpy as np
def process_docs(dataset):
    def _detokenize(text):
        text = text.replace(" '", "'")
        text = text.replace(" \n", "\n")
        text = text.replace("\n ", "\n")
        text = text.replace(" n't", "n't")
        text = text.replace("`` ", '"')
        text = text.replace("''", '"')
        # punctuation
        text = text.replace(" :", ":")
        text = text.replace(" ;", ";")
        text = text.replace(" !", "!")
        text = text.replace(" ?", "?")
        text = text.replace(" ,", ",")
        text = text.replace(" .", ".")
        return text

    def _process(doc):
        return {
            "article": _detokenize(doc["article"]),
            "options": [_detokenize(option) for option in doc["options"]],
        }

    return dataset.map(_process)


def process_results(doc, results):
    gold = ["A", "B", "C", "D"].index(doc["answers"])
    r4_1 = np.argmax(results) == gold  # r4_1 = accuracy
    ranks = sorted(results, reverse=True)
    r4_2 = (ranks.index(results[gold]) == 1) + r4_1
    mrr = 1.0 / (ranks.index(results[gold]) + 1)  # `+ 1` for index offset
    return {"r@1": r4_1, "r@2": r4_2, "mrr": mrr}
# QASPER
### Paper
Title: `A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers`
Abstract: https://arxiv.org/abs/2105.03011
QASPER is a dataset of 5,049 questions over 1,585 Natural Language Processing papers.
Each question is written by an NLP practitioner who read only the title and abstract
of the corresponding paper, and the question seeks information present in the full
text. The questions are then answered by a separate set of NLP practitioners who also
provide supporting evidence to answers.
Homepage: https://allenai.org/data/qasper
### Citation
```
@article{DBLP:journals/corr/abs-2105-03011,
author = {Pradeep Dasigi and
Kyle Lo and
Iz Beltagy and
Arman Cohan and
Noah A. Smith and
Matt Gardner},
title = {A Dataset of Information-Seeking Questions and Answers Anchored in
Research Papers},
journal = {CoRR},
volume = {abs/2105.03011},
year = {2021},
url = {https://arxiv.org/abs/2105.03011},
eprinttype = {arXiv},
eprint = {2105.03011},
timestamp = {Fri, 14 May 2021 12:13:30 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2105-03011.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
```
### Groups and Tasks
#### Groups
* `qasper`: executes both `qasper_bool` and `qasper_freeform`
#### Tasks
* `qasper_bool`: Multiple-choice task that evaluates the samples with `answer_type="bool"`
* `qasper_freeform`: Greedy-generation task that evaluates the samples with `answer_type="free form answer"`
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group: qasper
task: qasper_bool
dataset_path: qasper
output_type: multiple_choice
training_split: train
validation_split: validation
process_docs: !function utils.process_docs_bool
doc_to_text: "TITLE: {{title}}\nABSTRACT: {{abstract}}\n\nQ: {{question}}\n\nA:"
doc_to_target: 1
doc_to_choice: ["no", "yes"]
metric_list:
- metric: f1
group: qasper
task: qasper_freeform
dataset_path: qasper
output_type: greedy_until
training_split: train
validation_split: validation
process_docs: !function utils.process_docs_freeform
doc_to_text: "TITLE: {{title}}\nABSTRACT: {{abstract}}\n\nQ: {{question}}\n\nA:"
doc_to_target: answer
generation_kwargs:
  until:
    - "\n"
metric_list:
  - metric: !function metrics.f1_abstractive
    aggregation: mean
    higher_is_better: true
import re
import string
from collections import Counter
def normalize_answer(s):
    """
    Taken from the official evaluation script for v1.1 of the SQuAD dataset.
    Lower text and remove punctuation, articles and extra whitespace.
    """

    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_abstractive(predictions, references):
    """
    Taken from the official evaluation script for v1.1 of the SQuAD dataset.
    """
    prediction_tokens = normalize_answer(predictions[0]).split()
    references_tokens = normalize_answer(references[0]).split()
    common = Counter(prediction_tokens) & Counter(references_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(references_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1
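# Worked example (added for clarity; not part of the original file):
#   f1_abstractive(["a convolutional network"], ["the convolutional neural network"])
# normalizes the two strings to "convolutional network" / "convolutional neural network",
# giving precision = 2/2, recall = 2/3 and F1 = 0.8.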
from datasets import Dataset
from functools import partial
def process_docs(dataset, set_answer_type="bool"):
    FEATURES = ["title", "abstract", "question", "answer", "answer_type"]

    def _categorise_answer(answer_blob):
        if answer_blob["unanswerable"]:
            answer = "unanswerable"
            answer_type = "unanswerable"
            return answer, answer_type
        elif answer_blob["yes_no"]:
            answer = "yes"
            answer_type = "bool"
            return answer, answer_type
        elif answer_blob["free_form_answer"]:
            answer = answer_blob["free_form_answer"]
            answer_type = "free form answer"
            return answer, answer_type
        elif answer_blob["extractive_spans"]:
            answer = answer_blob["extractive_spans"]
            answer_type = "extractive_spans"
            return answer, answer_type
        elif answer_blob["yes_no"] is False:
            answer = "no"
            answer_type = "bool"
            return answer, answer_type

    def _flatten(doc):
        """Given a `doc`, flatten it out so that each JSON blob
        contains exactly one question and one answer. Logic taken from
        the reference implementation available at
        https://github.com/allenai/qasper-led-baseline/blob/main/scripts/evaluator.py
        """
        obs_list = {
            "title": [],
            "abstract": [],
            "question": [],
            "answer": [],
            "answer_type": [],
        }
        title = doc.pop("title")
        abstract = doc.pop("abstract")
        for question, answer_list in zip(doc["qas"]["question"], doc["qas"]["answers"]):
            for answer_blob in answer_list["answer"]:
                answer, answer_type = _categorise_answer(answer_blob)
                if answer_type == set_answer_type:
                    obs_list["title"].append(title)
                    obs_list["abstract"].append(abstract)
                    obs_list["question"].append(question)
                    obs_list["answer_type"].append(answer_type)
                    if type(answer) == list:
                        answer = ", ".join(answer)
                    obs_list["answer"].append(answer)

        return obs_list

    dataset = dataset.map(
        _flatten,
        remove_columns=[key for key in dataset.features.keys() if key not in FEATURES],
    )
    new_dataset = {}
    for key in dataset.features.keys():
        new_dataset[key] = [x for row in dataset[key] for x in row]

    return Dataset.from_dict(new_dataset)


process_docs_bool = partial(process_docs, set_answer_type="bool")
process_docs_freeform = partial(process_docs, set_answer_type="free form answer")
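# Illustrative sketch (added for clarity; not part of the original file): the two
# partials above are what the YAML configs reference via `!function utils.process_docs_bool`
# and `!function utils.process_docs_freeform`. Given an answer blob, _categorise_answer
# resolves it in priority order, e.g.
#   {"unanswerable": False, "yes_no": True, "free_form_answer": "", "extractive_spans": []}
#       -> ("yes", "bool")                          (kept when set_answer_type="bool")
#   {"unanswerable": False, "yes_no": None, "free_form_answer": "BERT embeddings", "extractive_spans": []}
#       -> ("BERT embeddings", "free form answer")  (kept when set_answer_type="free form answer")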
# Task-name
### Paper
Title: `paper title goes here`
Abstract: `link to paper PDF or arXiv abstract goes here`
`Short description of paper / benchmark goes here:`
Homepage: `homepage to the benchmark's website goes here, if applicable`
### Citation
```
BibTeX-formatted citation goes here
```
### Subtasks
List or describe tasks defined in this folder, and their names here:
* `task_name`: `1-sentence description of what this particular task does`
* `task_name2`: .....
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
task: squadv2
dataset_path: squad_v2
output_type: greedy_until
training_split: train
validation_split: validation
doc_to_text: "Title: {{title}}\n\nBackground: {{context}}\n\nQuestion: {{question}}\n\n Answer:"
doc_to_target: "{% if answers.text| length > 0 %}{{answers.text}}{% else %}{{['']}}{% endif %}"
target_delimiter: ""
should_decontaminate: true
doc_to_decontamination_query: context
generation_kwargs:
  until:
    - "\n"
# filter_list:
#   - name: remove_whitespace
#     filter:
#       - function: remove_whitespace
#       - function: take_first
metric_list:
  - metric: !function utils.exact
    aggregation: mean
    higher_is_better: true
  - metric: !function utils.f1
    aggregation: mean
    higher_is_better: true
include: default.yaml
task: squadv2_noans_loglikelihood
dataset_path: squad_v2
output_type: loglikelihood
training_split: train
validation_split: validation
doc_to_target: " unanswerable"
metric_list:
- metric: perplexity
import re
import string
import collections
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def get_tokens(s):
    if not s:
        return []
    return normalize_answer(s).split()


# Exact match (the normalized answer exactly matches the gold answer)
def exact(predictions, references):
    return int(normalize_answer(references[0]) == normalize_answer(predictions[0]))


# The F-score of predicted tokens versus the gold answer
def f1(predictions, references):
    gold_toks = get_tokens(references[0])
    pred_toks = get_tokens(predictions[0])
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1
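# --- Illustrative checks (added for clarity; not part of the original file) ---
# Hypothetical prediction/reference pairs showing the intended behaviour of the
# two metric functions above.
if __name__ == "__main__":
    print(exact(["The Eiffel Tower"], ["Eiffel Tower"]))                  # 1: case/articles are normalized away
    print(f1(["the Eiffel Tower"], ["Eiffel Tower"]))                     # 1.0: identical token bags
    print(f1([""], [""]))                                                 # 1: both sides predict "no answer"
    print(round(f1(["Eiffel Tower in Paris"], ["Eiffel Tower"]), 2))      # 0.67: partial token overlap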
group: squadv2_complete
task:
- squadv2
- squadv2_noans_loglikelihood
@@ -10,7 +10,7 @@ try:
 except ModuleNotFoundError:
     raise Exception(
         "`pycountry` is required for generating translation task prompt templates. \
-        please install pycountry via pip install lm-eval[multilingua] or pip install -e .[multilingual]",
+        please install pycountry via pip install lm-eval[multilingual] or pip install -e .[multilingual]",
     )
@@ -16,7 +16,6 @@ import gc
 import torch
 import transformers
-from omegaconf import OmegaConf
 from jinja2 import BaseLoader, Environment, StrictUndefined
 from itertools import islice
@@ -55,8 +54,8 @@ def simple_parse_args_string(args_string):
     args_string = args_string.strip()
     if not args_string:
         return {}
-    arg_list = args_string.split(",")
-    args_dict = OmegaConf.to_object(OmegaConf.from_dotlist(arg_list))
+    arg_list = [arg for arg in args_string.split(",") if arg]
+    args_dict = {k: v for k, v in [arg.split("=") for arg in arg_list]}
     return args_dict
@@ -395,8 +394,10 @@ def import_function(loader, node):
     function_name = loader.construct_scalar(node)
     yaml_path = os.path.dirname(loader.name)
-    module_name, function_name = function_name.split(".")
-    module_path = os.path.join(yaml_path, "{}.py".format(module_name))
+    *module_name, function_name = function_name.split(".")
+    if type(module_name) == list:
+        module_name = ".".join(module_name)
+    module_path = os.path.normpath(os.path.join(yaml_path, "{}.py".format(module_name)))
     spec = importlib.util.spec_from_file_location(module_name, module_path)
     module = importlib.util.module_from_spec(spec)
@@ -430,8 +431,7 @@ def load_yaml_config(yaml_path):
             # If not found, assume the included yaml
             # is in the same dir as the original yaml
             if not os.path.isfile(path):
-                path = os.path.join(yaml_dir, path)
+                path = os.path.normpath(os.path.join(yaml_dir, path))
             try:
                 included_yaml_config = load_yaml_config(path)
                 final_yaml_config.update(included_yaml_config)