Unverified commit 3b4fa26e, authored by Lintang Sutawika and committed by GitHub

Merge branch 'big-refactor' into wmt

parents d01cc479 8f448eed
......@@ -6,9 +6,9 @@ dataset_name: copa
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "copa choice1: {{choice1}} choice2: {{choice2}} question: {{question}}"
doc_to_text: "copa choice1: {{choice1}} choice2: {{choice2}} premise: {{premise}} question: {{question}}"
doc_to_target: label
doc_to_choice: ['False', 'True']
doc_to_choice: ['choice1', 'choice2']
metric_list:
  - metric: exact_match
    aggregation: mean
......
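As a sanity check on the two changed lines above: `doc_to_text` now includes the premise, and `doc_to_choice` switches the gold strings from True/False to the literal `choice1`/`choice2` answers used by the T5 SuperGLUE prompts. Below is a small sketch with plain `jinja2` and an invented COPA-style document; the harness wires its own Jinja environment, so this is illustration only.

```python
from jinja2 import Template

doc = {
    "premise": "The man broke his toe.",
    "choice1": "He got a hole in his sock.",
    "choice2": "He dropped a hammer on his foot.",
    "question": "cause",
    "label": 1,
}
doc_to_text = Template(
    "copa choice1: {{choice1}} choice2: {{choice2}} "
    "premise: {{premise}} question: {{question}}"
)
choices = ["choice1", "choice2"]

print(doc_to_text.render(**doc))
# copa choice1: He got a hole in his sock. choice2: He dropped a hammer on
# his foot. premise: The man broke his toe. question: cause
print(choices[doc["label"]])  # choice2  (the exact_match target string)
```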
group:
- super-glue-t5-prompt
task: super_glue-multirc-t5-prompt
dataset_path: super_glue
dataset_name: multirc
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "multirc question: {{question}} answer: {{answer}} paragraph: {{paragraph}}"
doc_to_target: label
doc_to_choice: "{% set group_id = idx.question|string %}{{[group_id+'_False', group_id+'_True']}}"
generation_kwargs:
  until:
    - "</s>"
  do_sample: false
  temperature: 0.5
metric_list:
  - metric: !function t5_utils.f1
    aggregation: !function t5_utils.agg_f1
    higher_is_better: true
  - metric: !function t5_utils.em
    aggregation: !function t5_utils.agg_em
    higher_is_better: true
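The `doc_to_choice` template above folds the MultiRC question index into both candidate target strings, which is what lets the per-question aggregation in `t5_utils.py` (next file) regroup predictions later. A quick illustration with plain `jinja2` and an invented `idx` value; the harness uses its own Jinja environment.

```python
from jinja2 import Template

doc_to_choice = Template(
    "{% set group_id = idx.question|string %}"
    "{{[group_id+'_False', group_id+'_True']}}"
)
doc = {"idx": {"paragraph": 0, "question": 3, "answer": 17}}
print(doc_to_choice.render(**doc))  # ['3_False', '3_True']
```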
import collections

import numpy as np
import sklearn.metrics


def f1(predictions, references):  # This is a passthrough function
    _prediction = predictions[0]
    _reference = references[0].split("_")[-1]
    string_label = ["False", "True"]
    reference = string_label.index(_reference)
    prediction = (
        string_label.index(_prediction)
        if _prediction in string_label
        else not bool(reference)
    )
    return (prediction, reference)


def agg_f1(items):
    predictions, references = zip(*items)
    references, predictions = np.asarray(references), np.asarray(predictions)
    return sklearn.metrics.f1_score(references, predictions)


def em(predictions, references):  # This is a passthrough function
    _prediction = predictions[0]
    _group, _reference = references[0].split("_")
    string_label = ["False", "True"]
    reference = string_label.index(_reference)
    prediction = (
        string_label.index(_prediction)
        if _prediction in string_label
        else not bool(reference)
    )
    return (_group, prediction, reference)


def agg_em(items):
    grouped_values = collections.defaultdict(lambda: ([], []))
    for group, prediction, reference in items:
        grouped_values[group][0].append(reference)
        grouped_values[group][1].append(prediction)

    group_scores = []
    for group, (targets, predictions) in grouped_values.items():
        score = float(np.array_equal(targets, predictions))
        group_scores.append(score)
    return np.mean(group_scores)
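To see how the pieces above fit together (invented values; assumes `em`/`agg_em` from this file are in scope): the model's raw generation is a bare `True`/`False`, the rendered target string carries the question index as a `<qid>_` prefix, and `agg_em` only credits a question once every one of its answer options is predicted correctly.

```python
items = [
    em(predictions=["True"], references=["0_True"]),    # ("0", 1, 1) -> correct
    em(predictions=["False"], references=["0_True"]),   # ("0", 0, 1) -> wrong
    em(predictions=["False"], references=["1_False"]),  # ("1", 0, 0) -> correct
]
print(agg_em(items))  # 0.5: question "1" is fully correct, question "0" is not
```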
......@@ -3,14 +3,15 @@ group:
task: super_glue-record-t5-prompt
dataset_path: super_glue
dataset_name: record
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "record query: {{query}} entities: {{entities}} passage: {{passage}}"
doc_to_target: "{{answers}}"
process_docs: !function t5_utils.process_docs
doc_to_text: !function t5_utils.doc_to_text
doc_to_target: "{{idx.passage|string}}+{{idx.query}}_{{answers}}"
metric_list:
  - metric: exact_match
    aggregation: mean
  - metric: !function t5_utils.em
    aggregation: !function t5_utils.squad_em_agg
    higher_is_better: true
  - metric: !function t5_utils.f1
    aggregation: !function t5_utils.squad_f1_agg
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
import re
import string
import collections

import numpy as np
from tqdm import tqdm
from datasets import Dataset, concatenate_datasets

from lm_eval.api.metrics import metric_max_over_ground_truths


def doc_to_text(doc):
    passage = doc["passage"]
    passage = re.sub(r"(\.|\?|\!|\"|\')\n@highlight\n", r"\1 ", passage)
    passage = re.sub(r"\n@highlight\n", ". ", passage)

    return " ".join(
        [
            "record query:",
            doc["query"],
            "entities:",
            ", ".join(doc["entities"]),
            "passage:",
            passage,
        ]
    )


def process_docs(dataset):
    def split_answers(doc):
        split_doc = {
            **{k: [] for k in doc.keys()},
        }
        answers = doc.pop("answers")
        for idx, answer in enumerate(answers):
            for key in split_doc.keys():
                if key in doc:
                    split_doc[key].append(doc[key])
            split_doc["answers"].append(answer)
        return split_doc

    dataset = dataset.map(split_answers)
    new_dataset = {}
    for key in dataset.features.keys():
        new_dataset[key] = [x for row in dataset[key] for x in row]

    return Dataset.from_dict(new_dataset)


def normalize_squad(answer):
    """Normalization used in official SQuAD evaluation script."""

    def _normalize_answer(text, punc_chars, punc_repl):
        """Lower text and remove punctuation, articles and extra whitespace."""

        def remove_articles(s):
            return re.sub(r"\b(a|an|the)\b", " ", s)

        def replace_punctuation(s):
            to_replace = set(punc_chars)
            return "".join(punc_repl if ch in to_replace else ch for ch in s)

        def white_space_fix(s):
            return " ".join(s.split())

        text = text.lower()
        text = replace_punctuation(text)
        text = remove_articles(text)
        text = white_space_fix(text)
        return text

    return _normalize_answer(answer, punc_chars=string.punctuation, punc_repl="")


def em(predictions, references):  # This is a passthrough function
    return (predictions[0], references[0])


def f1(predictions, references):  # This is a passthrough function
    return (predictions[0], references[0])


def squad_em_agg(items):
    def _exact_match_score(prediction, target):
        return target == prediction

    grouped_values = collections.defaultdict(lambda: ([], []))
    for prediction, reference in items:
        group, reference = reference.split("_")
        # if group not in grouped_values:
        grouped_values[group][0].append(normalize_squad(prediction))
        grouped_values[group][1].append(normalize_squad(reference))

    em = []
    for group in grouped_values.keys():
        predictions, targets = grouped_values[group]
        for p in predictions:
            em.append(metric_max_over_ground_truths(_exact_match_score, p, targets))
    return np.mean(em)


def squad_f1_agg(items):
    def _f1_score(prediction, target):
        """Computes token f1 score for a single target and prediction."""
        prediction_tokens = prediction.split()
        target_tokens = target.split()
        common = collections.Counter(prediction_tokens) & collections.Counter(
            target_tokens
        )
        num_same = sum(common.values())
        if num_same == 0:
            return 0
        precision = 1.0 * num_same / len(prediction_tokens)
        recall = 1.0 * num_same / len(target_tokens)
        f1 = (2 * precision * recall) / (precision + recall)
        return f1

    grouped_values = collections.defaultdict(lambda: ([], []))
    for prediction, reference in items:
        group, reference = reference.split("_")
        if group not in grouped_values:
            grouped_values[group][0].append(normalize_squad(prediction))
        grouped_values[group][1].append(normalize_squad(reference))

    f1 = []
    for group in grouped_values.keys():
        p, t = grouped_values[group]
        f1.append(metric_max_over_ground_truths(_f1_score, p[0], t))
    return np.mean(f1)
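A toy walk-through of the ReCoRD aggregation above (invented values; assumes the functions in this file are in scope): `process_docs` has already split each multi-answer document into one row per gold answer, the `doc_to_target` template tags every row with its `passage+query` index, and the aggregators regroup rows by that tag and take the best score over all gold answers.

```python
items = [
    # (prediction, reference) pairs, as returned by the em/f1 passthroughs above
    ("Barack Obama", "12+3_Barack Obama"),
    ("Barack Obama", "12+3_Obama"),
    ("Paris", "40+7_London"),
]
print(squad_em_agg(items))  # ~0.67: both rows of query 12+3 match a gold answer
print(squad_f1_agg(items))  # 0.5: query 12+3 scores 1.0, query 40+7 scores 0.0
```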
group:
- super-glue-t5-prompt
task: super_glue-rte-t5-prompt
dataset_path: super_glue
dataset_name: rte
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "rte hypothesis: {{hypothesis}} premise: {{premise}}"
doc_to_target: label
doc_to_choice: ['entailment', 'not_entailment']
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
group:
- super-glue-t5-prompt
task: super_glue-wic-t5-prompt
dataset_path: super_glue
dataset_name: wic
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "wic sentence1: {{sentence1}} sentence2: {{sentence2}} word: {{word}}"
doc_to_target: label
doc_to_choice: ['False', 'True']
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
......@@ -2,7 +2,7 @@ group:
- super-glue-lm-eval-v1
task: wsc
dataset_path: super_glue
dataset_name: wsc
dataset_name: wsc.fixed
output_type: multiple_choice
training_split: train
validation_split: validation
......
import re

from lm_eval.utils import general_detokenize


def t5_prompt_doc_to_text(x):
    def _mark_span(text, span_str, span_idx, mark):
        pattern_tmpl = r"^((?:\S+\s){N})(W)"
        pattern = re.sub("N", str(span_idx), pattern_tmpl)
        pattern = re.sub("W", span_str, pattern)
        return re.sub(pattern, r"\1{0} \2 {0}".format(mark), text)

    text = x["text"]
    text = _mark_span(text, x["span1_text"], x["span1_index"], "*")
    # Compensate for the two marker "words" added in the previous step.
    span2_index = x["span2_index"] + 2 * (x["span1_index"] < x["span2_index"])
    text = _mark_span(text, x["span2_text"], span2_index, "#")
    return text


def default_doc_to_text(x):
    raw_passage = x["text"]
    # NOTE: HuggingFace span indices are word-based not character-based.
......
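The removed `t5_prompt_doc_to_text` above wraps the two WSC spans in `*`/`#` markers by rewriting the word found at each span index. Below is a hedged sketch of what it produces, using an invented document that follows the SuperGLUE `wsc` field names and assuming the function above is in scope.

```python
doc = {
    "text": "Mark told Pete many lies about himself .",
    "span1_text": "Mark",
    "span1_index": 0,
    "span2_text": "himself",
    "span2_index": 6,
}
print(t5_prompt_doc_to_text(doc))
# * Mark * told Pete many lies about # himself # .
```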
......@@ -2,16 +2,17 @@ group:
- super-glue-t5-prompt
task: super_glue-wsc-t5-prompt
dataset_path: super_glue
dataset_name: wsc
dataset_name: wsc.fixed
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: !function "preprocess_wsc.t5_prompt_doc_to_text"
doc_to_text: !function "t5_utils.doc_to_text"
doc_to_target: label
doc_to_choice: ['False', 'True']
metric_list:
  - metric: exact_match
  - metric: accuracy
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
filter_list:
  - name: "wsc_postprocessor"
    filter:
      - function: !function t5_utils.WSCPostprocess
import re

from lm_eval.api.filter import Filter


def doc_to_text(x):
    text = re.sub(r" X ", " *" + x["span2_text"] + "* ", _wsc_inputs(x))
    return "wsc: " + text


def _wsc_inputs(x):
    words = x["text"].split(" ")

    # We would need some special logic to handle the case where the pronoun is the
    # first or last word in the text. None of the examples in WSC seem to have
    # this, so we are ignoring these cases.
    assert x["span2_index"] > 0
    assert x["span2_index"] < len(words)
    pronoun_index = x["span2_index"]

    def create_input():
        assert words[pronoun_index] == x["span2_text"]
        return " ".join(
            [
                " ".join(words[:pronoun_index]),
                "X",
                " ".join(words[pronoun_index + 1 :]),
            ]
        )

    # Handle some special cases.
    if (
        x["text"]
        == 'The boy continued to whip the pony , and eventually the pony threw him over. John laughed out quite loud. "Good for him," he said. '
    ):
        return (
            "The boy continued to whip the pony , and eventually the pony threw "
            'him over. John laughed out quite loud. "Good for X ," he said.'
        )

    # Using the span2_index, we get 'use' instead of 'it'.
    if (
        x["text"]
        == "When they had eventually calmed down a bit , and had gotten home, Mr. Farley put the magic pebble in an iron safe . Some day they might want to use it , but really for now, what more could they wish for?"
    ):
        return (
            "When they had eventually calmed down a bit , and had gotten home, "
            "Mr. Farley put the magic pebble in an iron safe . Some day they might "
            "want to use X , but really for now, what more could they wish for?"
        )

    return create_input()


class WSCPostprocess(Filter):
    def __init__(self, **kwargs):
        self.determiners = {
            "a",
            "an",
            "few",
            "her",
            "his",
            "each",
            "every",
            "many",
            "much",
            "my",
            "our",
            "some",
            "that",
            "the",
            "their",
            "these",
            "this",
            "those",
            "which",
            "whose",
            "your",
        }

    def clean(self, s):
        """Ignore capitalization and determiners."""
        s = s.strip().lower()
        return " ".join([w for w in s.split(" ") if w not in self.determiners])

    def apply(self, resps, docs):
        filtered_resps = []
        for prediction, reference in zip(*(resps, docs["span1_text"])):
            prediction = self.clean(prediction[0])
            reference = self.clean(reference)
            if ("'" in prediction) != ("'" in reference):
                # Don't mark cases where the prediction is "Bob" and the
                # referent is "Bob's hat" as predicting the referent.
                predicted_referent = False
            else:
                prediction_words = set(prediction.split(" "))
                referent_words = set(reference.split(" "))

                # Handle cases where the prediction is "fuzzy bunny" and the referent is
                # "bunny".
                predicted_referent = prediction_words.issubset(
                    referent_words
                ) or referent_words.issubset(prediction_words)

            filtered_resps.append(predicted_referent)

        return filtered_resps
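A toy run of the filter above (invented generations and referents): the filter lower-cases, drops determiners, and then treats a prediction as naming the referent when one token set contains the other, except when only one side is possessive.

```python
post = WSCPostprocess()
resps = [["the bunny"], ["Bob's hat"]]             # one list of generations per doc
docs = {"span1_text": ["The fuzzy bunny", "Bob"]}  # gold referents, column-style access
print(post.apply(resps, docs))  # [True, False]
```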
# WSC273
### Paper
Title: `The Winograd Schema Challenge`
Abstract: http://commonsensereasoning.org/2011/papers/Levesque.pdf
A Winograd schema is a pair of sentences that differ in only one or two words
and contain an ambiguity that is resolved in opposite ways in the two
sentences; resolving it requires world knowledge and reasoning.
The Winograd Schema Challenge 273 (WSC273) is a collection of 273 such Winograd schemas.
NOTE: This evaluation of the Winograd Schema Challenge is based on `partial evaluation`
as described by Trinh & Le in A Simple Method for Commonsense Reasoning (2018).
See: https://arxiv.org/abs/1806.0
Homepage: https://cs.nyu.edu/~davise/papers/WinogradSchemas/WS.html
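To make `partial evaluation` concrete: instead of asking the model to name the referent, each candidate is substituted for the pronoun and the model is scored on the likelihood of the text that follows it; the `doc_to_choice`/`doc_to_target` templates in the task yaml further down perform exactly this split. The sketch below does the same split in plain Python on an invented example document.

```python
doc = {
    "text": "The trophy doesn't fit into the brown suitcase because it is too small.",
    "pronoun": "it",
    "options": ["the trophy", "the suitcase"],
}
doc["pronoun_loc"] = doc["text"].index(" it ") + 1  # computed, not hard-coded

prefix = doc["text"][: doc["pronoun_loc"]]
continuation = doc["text"][doc["pronoun_loc"] + len(doc["pronoun"]) :]

# Each choice substitutes a candidate referent for the pronoun; the model then
# scores the shared continuation after each choice, and the candidate giving it
# the higher log-likelihood wins.
choices = [prefix + option for option in doc["options"]]
print(choices[1])    # The trophy doesn't fit into the brown suitcase because the suitcase
print(continuation)  # " is too small."
```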
### Citation
```
@inproceedings{ea01b9c0db064caca6986b925d75f2bb,
title = "The winograd schema challenge",
abstract = "In this paper, we present an alternative to the Turing Test that has some conceptual and practical advantages. A Wino-grad schema is a pair of sentences that differ only in one or two words and that contain a referential ambiguity that is resolved in opposite directions in the two sentences. We have compiled a collection of Winograd schemas, designed so that the correct answer is obvious to the human reader, but cannot easily be found using selectional restrictions or statistical techniques over text corpora. A contestant in the Winograd Schema Challenge is presented with a collection of one sentence from each pair, and required to achieve human-level accuracy in choosing the correct disambiguation.",
author = "Levesque, {Hector J.} and Ernest Davis and Leora Morgenstern",
year = "2012",
language = "English (US)",
isbn = "9781577355601",
series = "Proceedings of the International Conference on Knowledge Representation and Reasoning",
publisher = "Institute of Electrical and Electronics Engineers Inc.",
pages = "552--561",
booktitle = "13th International Conference on the Principles of Knowledge Representation and Reasoning, KR 2012",
note = "13th International Conference on the Principles of Knowledge Representation and Reasoning, KR 2012 ; Conference date: 10-06-2012 Through 14-06-2012",
}
```
### Groups and Tasks
#### Groups
* Not part of any group yet.
#### Tasks
* `wsc273`
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
task: wsc273
dataset_path: winograd_wsc
dataset_name: wsc273
output_type: multiple_choice
test_split: test
doc_to_text: label
process_docs: !function utils.process_doc
doc_to_target: "{% set index = pronoun_loc + pronoun | length %}{{text[index:]}}"
doc_to_choice: "{% set template = text[:pronoun_loc] %}{{[template+options[0], template+options[1]]}}"
should_decontaminate: true
doc_to_decontamination_query: text
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
upper_pronouns = [
    "A",
    "An",
    "The",
    "She",
    "He",
    "It",
    "They",
    "My",
    "His",
    "Her",
    "Their",
]


def process_doc(dataset):
    def process_fn(doc):
        # The HF implementation of `wsc273` is not `partial evaluation` friendly.
        doc["text"] = doc["text"].replace("  ", " ")
        doc["options"][0] = __normalize_option(doc, doc["options"][0])
        doc["options"][1] = __normalize_option(doc, doc["options"][1])
        return doc

    return dataset.map(process_fn)


def __normalize_option(doc, option):
    # Append `'s` to possessive determiner based options.
    if doc["pronoun"].lower() in ["my", "his", "her", "our", "their"]:
        option += "'s"

    # Appropriately lowercase the pronoun in the option.
    pronoun = option.split()[0]
    start_of_sentence = doc["text"][doc["pronoun_loc"] - 2] == "."
    if not start_of_sentence and pronoun in upper_pronouns:
        return option.replace(pronoun, pronoun.lower())

    return option
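A hedged usage sketch of the preprocessing above on a one-row toy dataset, assuming `process_doc` from this file is in scope (field names follow `winograd_wsc/wsc273`, values are invented): the double space in `text` is collapsed and each option is normalized before the Jinja templates in the yaml slice the text around `pronoun_loc`.

```python
from datasets import Dataset

ds = Dataset.from_dict(
    {
        "text": ["Jane gave Joan candy because  she was hungry."],  # note the double space
        "pronoun": ["she"],
        "pronoun_loc": [30],
        "options": [["Jane", "Joan"]],
        "label": [1],
    }
)
ds = process_doc(ds)
print(ds[0]["text"])     # Jane gave Joan candy because she was hungry.
print(ds[0]["options"])  # ['Jane', 'Joan'] (no possessive or casing change needed here)
```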
......@@ -10,7 +10,7 @@ import collections
import importlib.util
import fnmatch
from typing import List, Literal, Union
from typing import Iterator, List, Literal, Union
import gc
import torch
......@@ -65,7 +65,7 @@ def join_iters(iters):
        yield from iter


def chunks(iter, n=0, fn=None):
def chunks(iter, n: int = 0, fn=None):
    arr = []
    for i, x in enumerate(iter):
        arr.append(x)
......@@ -87,11 +87,11 @@ def group(arr, fn):
class MultiChoice:
    def __init__(self, choices):
    def __init__(self, choices) -> None:
        self.choices = choices

    # Simple wildcard support (linux filename patterns)
    def __contains__(self, values):
    def __contains__(self, values) -> bool:
        for value in values.split(","):
            if len(fnmatch.filter(self.choices, value)) == 0:
                eval_logger.info(f"Available tasks to choose:")
......@@ -100,7 +100,7 @@ class MultiChoice:
                raise ValueError("'{}' is not in task list".format(value))
        return True

    def __iter__(self):
    def __iter__(self) -> Iterator:
        for choice in self.choices:
            yield choice
......@@ -108,7 +108,6 @@ class MultiChoice:
# Returns a list containing all values of the source_list that
# match at least one of the patterns
def pattern_match(patterns, source_list):
    if type(patterns) == str:
        patterns = [patterns]
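The hunk above cuts `pattern_match` off mid-body, so here is a self-contained sketch of the same wildcard idea used for task selection; the completion of the function body is an assumption, not the harness's exact code.

```python
import fnmatch

def match_tasks(patterns, all_tasks):
    # Expand fnmatch-style (linux filename) wildcard patterns into task names.
    if isinstance(patterns, str):
        patterns = [patterns]
    selected = set()
    for pattern in patterns:
        selected.update(fnmatch.filter(all_tasks, pattern))
    return sorted(selected)

all_tasks = ["arc_easy", "arc_challenge", "hellaswag", "super_glue-rte-t5-prompt"]
print(match_tasks("arc_*", all_tasks))                   # ['arc_challenge', 'arc_easy']
print(match_tasks("super_glue-*-t5-prompt", all_tasks))  # ['super_glue-rte-t5-prompt']
```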
......@@ -177,7 +176,7 @@ def make_disjoint_window(pair):
class Reorderer:
    def __init__(self, arr, fn):
    def __init__(self, arr, fn) -> None:
        self.size = len(arr)
        arr = list(enumerate(arr))
        arr = group(arr, lambda x: fn(x[1]))
......@@ -212,7 +211,7 @@ class Grouper:
    objects in `arr` satisfying `key == fn(ob)`.
    """

    def __init__(self, arr, fn):
    def __init__(self, arr, fn) -> None:
        # self.orig_arr = arr
        self.size = len(arr)
        arr = list(enumerate(arr))
......@@ -263,7 +262,7 @@ class Grouper:
        return res


def make_table(result_dict, column="results"):
def make_table(result_dict, column: str = "results"):
    """Generate table of results."""
    from pytablewriter import MarkdownTableWriter, LatexTableWriter
......@@ -393,7 +392,6 @@ def get_git_commit_hash():
def import_function(loader, node):
    function_name = loader.construct_scalar(node)
    yaml_path = os.path.dirname(loader.name)
......@@ -428,7 +426,6 @@ def load_yaml_config(yaml_path):
            include_path.reverse()
            final_yaml_config = {}
            for path in include_path:
                # Assumes that path is a full path.
                # If not found, assume the included yaml
                # is in the same dir as the original yaml
......@@ -447,7 +444,7 @@ def load_yaml_config(yaml_path):
    return yaml_config


def regex_replace(string, pattern, repl, count=0):
def regex_replace(string, pattern, repl, count: int = 0):
    """Implements the `re.sub` function as a custom Jinja filter."""
    return re.sub(pattern, repl, string, count=count)
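For readers unfamiliar with custom Jinja filters: once `regex_replace` is registered on the template environment, task yaml templates can call it like any built-in filter. The wiring below is an illustrative sketch, not the harness's exact setup.

```python
import re

from jinja2 import Environment

def regex_replace(string, pattern, repl, count: int = 0):
    """Implements the `re.sub` function as a custom Jinja filter."""
    return re.sub(pattern, repl, string, count=count)

env = Environment()
env.filters["regex_replace"] = regex_replace

template = env.from_string("{{ text | regex_replace('[0-9]+', '#') }}")
print(template.render(text="call 555 then 911"))  # call # then #
```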
......@@ -521,7 +518,7 @@ def pad_and_concat(
    return torch.cat(tensors, dim=0)


def clear_torch_cache():
def clear_torch_cache() -> None:
    gc.collect()
    torch.cuda.empty_cache()
......@@ -546,7 +543,7 @@ class MultiTokenEOSCriteria(transformers.StoppingCriteria):
        tokenizer: transformers.PreTrainedTokenizer,
        initial_decoder_input_length: int,
        batch_size: int,
    ):
    ) -> None:
        self.initial_decoder_input_length = initial_decoder_input_length
        self.done_tracker = [False] * batch_size
        self.sequence = sequence
......
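`MultiTokenEOSCriteria` is only partially visible in the hunk above. As background, the general pattern it follows is a `transformers.StoppingCriteria` that decodes only the newly generated tokens and stops once every sequence in the batch contains the stop string; the class below is a simplified sketch under that assumption, not the harness's implementation.

```python
import transformers

class StopOnSequence(transformers.StoppingCriteria):
    """Stop generation once `stop_string` appears in every batch element."""

    def __init__(self, stop_string, tokenizer, initial_decoder_input_length, batch_size):
        self.stop_string = stop_string
        self.tokenizer = tokenizer
        self.initial_decoder_input_length = initial_decoder_input_length
        self.done = [False] * batch_size

    def __call__(self, input_ids, scores, **kwargs) -> bool:
        # Decode only the continuation, i.e. the tokens generated after the prompt.
        continuations = self.tokenizer.batch_decode(
            input_ids[:, self.initial_decoder_input_length :]
        )
        self.done = [
            done or self.stop_string in text
            for done, text in zip(self.done, continuations)
        ]
        return all(self.done)

# Typical use:
#   criteria = transformers.StoppingCriteriaList(
#       [StopOnSequence("</s>", tokenizer, prompt_len, batch_size)]
#   )
#   model.generate(input_ids, stopping_criteria=criteria, ...)
```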
......@@ -11,11 +11,12 @@ from lm_eval import evaluator, utils
from lm_eval.api.registry import ALL_TASKS
from lm_eval.logger import eval_logger, SPACING
from lm_eval.tasks import include_task_folder
from lm_eval.benchmarks import include_benchmarks

os.environ["TOKENIZERS_PARALLELISM"] = "false"


def parse_args():
def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("--model", required=True, help="Name of model e.g. `hf`")
    parser.add_argument(
......@@ -100,7 +101,7 @@ def parse_args():
    return parser.parse_args()


def main():
def main() -> None:
    args = parse_args()

    if args.limit:
......
[mypy]
python_version = 3.9
show_traceback = True
check_untyped_defs = True
no_implicit_reexport = True
warn_unreachable = True
warn_unused_configs = True
warn_unused_ignores = True
warn_redundant_casts = True
# We ignore errors everywhere to gradually add type annotations
[mypy-lm_eval.*]
ignore_errors = True
[mypy-lm_eval.api.*]
ignore_errors = True
[mypy-lm_eval.prompts.*]
ignore_errors = True
[mypy-lm_eval.models.*]
ignore_errors = True
[mypy-scripts.*]
ignore_errors = True
[mypy-main]
ignore_errors = True
......@@ -53,7 +53,7 @@ setuptools.setup(
    ],
    python_requires=">=3.9",
    install_requires=[
        "accelerate>=0.18.0",
        "accelerate>=0.21.0",
        "evaluate",
        "datasets>=2.0.0",
        "evaluate>=0.4.0",
......