Unverified commit 4721379e authored by Hailey Schoelkopf, committed by GitHub

Merge branch 'big-refactor' into mypy

parents a551c789 cc7828dd
import os
import sys
import json
import requests
import numpy as np

from lm_eval.logger import eval_logger

try:
    API_KEY = os.environ["PERSPECTIVE_API_KEY"]
except KeyError:
    API_KEY = None
    eval_logger.warning(
        "PERSPECTIVE_API_KEY is not set. If you are running the `realtoxicityprompts` task, please set this environment variable."
    )

url = f"https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze?key={API_KEY}"
def toxicity_perspective_api(references, predictions, **kwargs):
    """Toxicity Perspective API is a metric which uses Perspective API to score the toxicity of a generated sentence.
@@ -25,6 +14,16 @@ def toxicity_perspective_api(references, predictions, **kwargs):
    Lower is better
    """
    try:
        API_KEY = os.environ["PERSPECTIVE_API_KEY"]
    except KeyError:
        eval_logger.warning(
            "PERSPECTIVE_API_KEY is not set. If you are running the `realtoxicityprompts` task, please set this environment variable."
        )
        raise

    url = f"https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze?key={API_KEY}"

    scores = []
    for pred in predictions:
        data = {
@@ -35,7 +34,7 @@ def toxicity_perspective_api(references, predictions, **kwargs):
        headers = {
            "content-type": "application/json",
        }
        req_response = requests.post(url, data=data, headers=headers)
        req_response = requests.post(url, json=data, headers=headers)
        if req_response.ok:
            response = json.loads(req_response.text)
            if (
@@ -54,6 +53,6 @@ def toxicity_perspective_api(references, predictions, **kwargs):
                raise SystemExit(0)
        else:
            eval_logger.error("Unhandled Exception")
            raise SystemExit(0)
            req_response.raise_for_status()

    return np.mean(scores)
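# For reference, a typical Comment Analyzer request body has roughly the shape
# below (this is the general Perspective API format, not necessarily the exact
# dict elided from the hunk above):
#
#     {
#         "comment": {"text": pred},
#         "languages": ["en"],
#         "requestedAttributes": {"TOXICITY": {}},
#     }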
# SuperGLUE
### Paper
Title: `SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems`
Abstract: `https://w4ngatang.github.io/static/papers/superglue.pdf`
SuperGLUE is a benchmark styled after GLUE with a new set of more difficult language
understanding tasks.
Homepage: https://super.gluebenchmark.com/
### Citation
```
@inproceedings{NEURIPS2019_4496bf24,
author = {Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel},
booktitle = {Advances in Neural Information Processing Systems},
editor = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\textquotesingle Alch\'{e}-Buc and E. Fox and R. Garnett},
pages = {},
publisher = {Curran Associates, Inc.},
title = {SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},
url = {https://proceedings.neurips.cc/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf},
volume = {32},
year = {2019}
}
```
### Groups and Tasks
#### Groups
* `super-glue-lm-eval-v1`: SuperGLUE eval adapted from LM Eval V1
* `super-glue-t5-prompt`: SuperGLUE prompt and evaluation setup that matches the T5 paper (a prompt-rendering sketch follows the task list below). Note that when using accelerate, this group will error if `record` is included.
#### Tasks
Comparison of validation-split scores between T5x and LM-Eval (T5x models converted to HF):
| T5V1.1 Base | SGLUE | BoolQ | CB | Copa | MultiRC | ReCoRD | RTE | WiC | WSC |
| ----------- | ------| ----- | --------- | ---- | ------- | ------ | --- | --- | --- |
| T5x | 69.47 | 78.47(acc) | 83.93(f1) 87.5(acc) | 50(acc) | 73.81(f1) 33.26(em) | 70.09(em) 71.34(f1) | 78.7(acc) | 63.64(acc) | 75(acc) |
| LM-Eval | 71.35 | 79.36(acc) | 83.63(f1) 87.5(acc) | 63(acc) | 73.45(f1) 33.26(em) | 69.85(em) 68.86(f1) | 78.34(acc) | 65.83(acc) | 75.96(acc) |
* `super-glue-lm-eval-v1`
- `boolq`
- `cb`
- `copa`
- `multirc`
- `record`
- `rte`
- `wic`
- `wsc`
* `super-glue-t5-prompt`
- `super_glue-boolq-t5-prompt`
- `super_glue-cb-t5-prompt`
- `super_glue-copa-t5-prompt`
- `super_glue-multirc-t5-prompt`
- `super_glue-record-t5-prompt`
- `super_glue-rte-t5-prompt`
- `super_glue-wic-t5-prompt`
- `super_glue-wsc-t5-prompt`
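As a rough illustration of what the `*-t5-prompt` variants feed the model, the sketch below renders the BoolQ template from the config further down by hand (the passage and question values are invented, not taken from the dataset):

```python
# Minimal sketch of the super_glue-boolq-t5-prompt doc_to_text template
# (field values here are made up for illustration).
doc = {
    "passage": "The harbor seal is a true seal found along temperate coastlines.",
    "question": "is the harbor seal a true seal",
}
prompt = f"boolq passage: {doc['passage']} question: {doc['question']}"
print(prompt)
# boolq passage: The harbor seal is a true seal found along temperate coastlines. question: is the harbor seal a true seal
```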
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group:
  - super-glue-t5-prompt
task: super_glue-boolq-t5-prompt
dataset_path: super_glue
dataset_name: boolq
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "boolq passage: {{passage}} question: {{question}}"
doc_to_target: label
doc_to_choice: ['False', 'True']
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
@@ -6,7 +6,7 @@ dataset_name: cb
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "cb hypothesis: {{hypothesis}} premise {{premise}}"
doc_to_text: "cb hypothesis: {{hypothesis}} premise: {{premise}}"
doc_to_target: label
doc_to_choice: ['entailment', 'contradiction', 'neutral']
metric_list:
@@ -15,5 +15,6 @@ metric_list:
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
  - metric: f1
    aggregation: !function "aggregate.cb_multi_fi"
  - metric: !function "t5_utils.mean_3class_f1"
    aggregation: !function "t5_utils.agg_mean_3class_f1"
    higher_is_better: true
import sklearn.metrics


def mean_3class_f1(predictions, references):  # This is a passthrough function
    string_label = ["entailment", "contradiction", "neutral"]
    predictions = string_label.index(predictions[0])
    references = string_label.index(references[0])
    return (predictions, references)


def agg_mean_3class_f1(items):
    predictions, references = zip(*items)

    """Computes the unweighted average of the F1 per class."""
    metric_str = "fbeta_score"
    metric_fn_kwargs = {
        "beta": 1,
        "labels": range(3),
        "average": "macro",
    }

    def _fn(predictions, references):
        metric_fn = getattr(sklearn.metrics, metric_str)
        metric_val = metric_fn(references, predictions, **metric_fn_kwargs)
        return metric_val

    return _fn(predictions, references)
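# Rough usage sketch (labels invented): each call to mean_3class_f1 turns one
# (prediction, reference) pair into class indices; agg_mean_3class_f1 then
# computes macro F1 over all collected pairs.
if __name__ == "__main__":
    items = [
        mean_3class_f1(["entailment"], ["entailment"]),
        mean_3class_f1(["neutral"], ["contradiction"]),
    ]
    print(agg_mean_3class_f1(items))  # macro-averaged F1 over the three classes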
@@ -6,9 +6,9 @@ dataset_name: copa
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "copa choice1: {{choice1}} choice2: {{choice2}} question: {{question}}"
doc_to_text: "copa choice1: {{choice1}} choice2: {{choice2}} premise: {{premise}} question: {{question}}"
doc_to_target: label
doc_to_choice: ['False', 'True']
doc_to_choice: ['choice1', 'choice2']
metric_list:
  - metric: exact_match
    aggregation: mean
......
group:
  - super-glue-t5-prompt
task: super_glue-multirc-t5-prompt
dataset_path: super_glue
dataset_name: multirc
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "multirc question: {{question}} answer: {{answer}} paragraph: {{paragraph}}"
doc_to_target: label
doc_to_choice: "{% set group_id = idx.question|string %}{{[group_id+'_False', group_id+'_True']}}"
generation_kwargs:
  until:
    - "</s>"
  do_sample: false
  temperature: 0.5
metric_list:
  - metric: !function t5_utils.f1
    aggregation: !function t5_utils.agg_f1
    higher_is_better: true
  - metric: !function t5_utils.em
    aggregation: !function t5_utils.agg_em
    higher_is_better: true
import collections

import numpy as np
import sklearn.metrics


def f1(predictions, references):  # This is a passthrough function
    _prediction = predictions[0]
    _reference = references[0].split("_")[-1]
    string_label = ["False", "True"]
    reference = string_label.index(_reference)
    prediction = (
        string_label.index(_prediction)
        if _prediction in string_label
        else not bool(reference)
    )
    return (prediction, reference)


def agg_f1(items):
    predictions, references = zip(*items)
    references, predictions = np.asarray(references), np.asarray(predictions)
    return sklearn.metrics.f1_score(references, predictions)


def em(predictions, references):  # This is a passthrough function
    _prediction = predictions[0]
    _group, _reference = references[0].split("_")
    string_label = ["False", "True"]
    reference = string_label.index(_reference)
    prediction = (
        string_label.index(_prediction)
        if _prediction in string_label
        else not bool(reference)
    )
    return (_group, prediction, reference)


def agg_em(items):
    grouped_values = collections.defaultdict(lambda: ([], []))
    for group, prediction, reference in items:
        grouped_values[group][0].append(reference)
        grouped_values[group][1].append(prediction)

    group_scores = []
    for group, (targets, predictions) in grouped_values.items():
        score = float(np.array_equal(targets, predictions))
        group_scores.append(score)
    return np.mean(group_scores)
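# Rough usage sketch (ids and answers invented): references carry the MultiRC
# question id as "<id>_<label>", so agg_em can regroup answers per question and
# credit a question only when every one of its answers is predicted correctly.
if __name__ == "__main__":
    items = [
        em(["True"], ["7_True"]),
        em(["False"], ["7_True"]),   # one wrong answer zeroes question 7
        em(["False"], ["8_False"]),
    ]
    print(agg_em(items))  # 0.5 -> question 8 fully correct, question 7 not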
@@ -3,14 +3,15 @@ group:
task: super_glue-record-t5-prompt
dataset_path: super_glue
dataset_name: record
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "record query: {{query}} entities: {{entities}} passage: {{passage}}"
doc_to_target: "{{answers}}"
process_docs: !function t5_utils.process_docs
doc_to_text: !function t5_utils.doc_to_text
doc_to_target: "{{idx.passage|string}}+{{idx.query}}_{{answers}}"
metric_list:
  - metric: exact_match
    aggregation: mean
  - metric: !function t5_utils.em
    aggregation: !function t5_utils.squad_em_agg
    higher_is_better: true
  - metric: !function t5_utils.f1
    aggregation: !function t5_utils.squad_f1_agg
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
import re
import string
import collections

import numpy as np
from tqdm import tqdm
from datasets import Dataset, concatenate_datasets

from lm_eval.api.metrics import metric_max_over_ground_truths


def doc_to_text(doc):
    passage = doc["passage"]
    passage = re.sub(r"(\.|\?|\!|\"|\')\n@highlight\n", r"\1 ", passage)
    passage = re.sub(r"\n@highlight\n", ". ", passage)

    return " ".join(
        [
            "record query:",
            doc["query"],
            "entities:",
            ", ".join(doc["entities"]),
            "passage:",
            passage,
        ]
    )
def process_docs(dataset):
    def split_answers(doc):
        split_doc = {
            **{k: [] for k in doc.keys()},
        }
        answers = doc.pop("answers")
        for idx, answer in enumerate(answers):
            for key in split_doc.keys():
                if key in doc:
                    split_doc[key].append(doc[key])
            split_doc["answers"].append(answer)
        return split_doc

    dataset = dataset.map(split_answers)
    new_dataset = {}
    for key in dataset.features.keys():
        new_dataset[key] = [x for row in dataset[key] for x in row]

    return Dataset.from_dict(new_dataset)
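# Illustration (invented row): a ReCoRD doc such as
#   {"query": "...", "entities": [...], "answers": ["Obama", "Barack Obama"], ...}
# becomes two docs after process_docs, one per gold answer, with every other
# field duplicated and "answers" holding the single string for that copy.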
def normalize_squad(answer):
    """Normalization used in official SQuAD evaluation script."""

    def _normalize_answer(text, punc_chars, punc_repl):
        """Lower text and remove punctuation, articles and extra whitespace."""

        def remove_articles(s):
            return re.sub(r"\b(a|an|the)\b", " ", s)

        def replace_punctuation(s):
            to_replace = set(punc_chars)
            return "".join(punc_repl if ch in to_replace else ch for ch in s)

        def white_space_fix(s):
            return " ".join(s.split())

        text = text.lower()
        text = replace_punctuation(text)
        text = remove_articles(text)
        text = white_space_fix(text)
        return text

    return _normalize_answer(answer, punc_chars=string.punctuation, punc_repl="")
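# Example: normalize_squad("The Quick, Brown Fox!") -> "quick brown fox"
# (lowercased, punctuation stripped, articles removed, whitespace collapsed).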
def em(predictions, references):  # This is a passthrough function
    return (predictions[0], references[0])


def f1(predictions, references):  # This is a passthrough function
    return (predictions[0], references[0])


def squad_em_agg(items):
    def _exact_match_score(prediction, target):
        return target == prediction

    grouped_values = collections.defaultdict(lambda: ([], []))
    for prediction, reference in items:
        group, reference = reference.split("_")
        # if group not in grouped_values:
        grouped_values[group][0].append(normalize_squad(prediction))
        grouped_values[group][1].append(normalize_squad(reference))

    em = []
    for group in grouped_values.keys():
        predictions, targets = grouped_values[group]
        for p in predictions:
            em.append(metric_max_over_ground_truths(_exact_match_score, p, targets))
    return np.mean(em)
def squad_f1_agg(items):
    def _f1_score(prediction, target):
        """Computes token f1 score for a single target and prediction."""
        prediction_tokens = prediction.split()
        target_tokens = target.split()
        common = collections.Counter(prediction_tokens) & collections.Counter(
            target_tokens
        )
        num_same = sum(common.values())
        if num_same == 0:
            return 0
        precision = 1.0 * num_same / len(prediction_tokens)
        recall = 1.0 * num_same / len(target_tokens)
        f1 = (2 * precision * recall) / (precision + recall)
        return f1

    grouped_values = collections.defaultdict(lambda: ([], []))
    for prediction, reference in items:
        group, reference = reference.split("_")
        if group not in grouped_values:
            grouped_values[group][0].append(normalize_squad(prediction))
            grouped_values[group][1].append(normalize_squad(reference))

    f1 = []
    for group in grouped_values.keys():
        p, t = grouped_values[group]
        f1.append(metric_max_over_ground_truths(_f1_score, p[0], t))
    return np.mean(f1)
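# Rough usage sketch (strings invented): per the record config, each reference
# is "<passage idx>+<query idx>_<answer>", so the aggregators can regroup the
# per-answer copies of a query before taking the max over its gold answers.
if __name__ == "__main__":
    items = [
        ("Obama", "0+0_Barack Obama"),
        ("Obama", "0+0_Obama"),
    ]
    print(squad_em_agg(items))  # 1.0 -> the prediction exactly matches one gold answer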
group:
  - super-glue-t5-prompt
task: super_glue-rte-t5-prompt
dataset_path: super_glue
dataset_name: rte
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "rte hypothesis: {{hypothesis}} premise: {{premise}}"
doc_to_target: label
doc_to_choice: ['entailment', 'not_entailment']
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
group:
  - super-glue-t5-prompt
task: super_glue-wic-t5-prompt
dataset_path: super_glue
dataset_name: wic
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "wic sentence1: {{sentence1}} sentence2: {{sentence2}} word: {{word}}"
doc_to_target: label
doc_to_choice: ['False', 'True']
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
@@ -2,7 +2,7 @@ group:
  - super-glue-lm-eval-v1
task: wsc
dataset_path: super_glue
dataset_name: wsc
dataset_name: wsc.fixed
output_type: multiple_choice
training_split: train
validation_split: validation
......
import re

from lm_eval.utils import general_detokenize


def t5_prompt_doc_to_text(x):
    def _mark_span(text, span_str, span_idx, mark):
        pattern_tmpl = r"^((?:\S+\s){N})(W)"
        pattern = re.sub("N", str(span_idx), pattern_tmpl)
        pattern = re.sub("W", span_str, pattern)
        return re.sub(pattern, r"\1{0} \2 {0}".format(mark), text)

    text = x["text"]
    text = _mark_span(text, x["span1_text"], x["span1_index"], "*")
    # Compensate for the 2 marker "words" added in the previous step.
    span2_index = x["span2_index"] + 2 * (x["span1_index"] < x["span2_index"])
    text = _mark_span(text, x["span2_text"], span2_index, "#")
    return text


def default_doc_to_text(x):
    raw_passage = x["text"]
    # NOTE: HuggingFace span indices are word-based not character-based.
......
@@ -2,16 +2,17 @@ group:
  - super-glue-t5-prompt
task: super_glue-wsc-t5-prompt
dataset_path: super_glue
dataset_name: wsc
dataset_name: wsc.fixed
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: !function "preprocess_wsc.t5_prompt_doc_to_text"
doc_to_text: !function "t5_utils.doc_to_text"
doc_to_target: label
doc_to_choice: ['False', 'True']
metric_list:
  - metric: exact_match
  - metric: accuracy
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
filter_list:
  - name: "wsc_postprocessor"
    filter:
      - function: !function t5_utils.WSCPostprocess
import re

from lm_eval.api.filter import Filter


def doc_to_text(x):
    text = re.sub(r" X ", " *" + x["span2_text"] + "* ", _wsc_inputs(x))
    return "wsc: " + text


def _wsc_inputs(x):
    words = x["text"].split(" ")

    # We would need some special logic to handle the case where the pronoun is the
    # first or last word in the text. None of the examples in WSC seem to have
    # this, so we are ignoring these cases.
    assert x["span2_index"] > 0
    assert x["span2_index"] < len(words)
    pronoun_index = x["span2_index"]

    def create_input():
        assert words[pronoun_index] == x["span2_text"]
        return " ".join(
            [
                " ".join(words[:pronoun_index]),
                "X",
                " ".join(words[pronoun_index + 1 :]),
            ]
        )

    # Handle some special cases.
    if (
        x["text"]
        == 'The boy continued to whip the pony , and eventually the pony threw him over. John laughed out quite loud. "Good for him," he said. '
    ):
        return (
            "The boy continued to whip the pony , and eventually the pony threw "
            'him over. John laughed out quite loud. "Good for X ," he said.'
        )

    # Using the span2_index, we get 'use' instead of 'it'.
    if (
        x["text"]
        == "When they had eventually calmed down a bit , and had gotten home, Mr. Farley put the magic pebble in an iron safe . Some day they might want to use it , but really for now, what more could they wish for?"
    ):
        return (
            "When they had eventually calmed down a bit , and had gotten home, "
            "Mr. Farley put the magic pebble in an iron safe . Some day they might "
            "want to use X , but really for now, what more could they wish for?"
        )

    return create_input()
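# Illustration (example values, not pulled from the dataset loader): for a doc
# whose text is
#   "Mark told Pete many lies about himself, which Pete included in his book."
# with span2_text "his" at span2_index 11, doc_to_text produces
#   "wsc: Mark told Pete many lies about himself, which Pete included in *his* book."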
class WSCPostprocess(Filter):
    def __init__(self, **kwargs):
        self.determiners = {
            "a",
            "an",
            "few",
            "her",
            "his",
            "each",
            "every",
            "many",
            "much",
            "my",
            "our",
            "some",
            "that",
            "the",
            "their",
            "these",
            "this",
            "those",
            "which",
            "whose",
            "your",
        }

    def clean(self, s):
        """Ignore capitalization and determiners."""
        s = s.strip().lower()
        return " ".join([w for w in s.split(" ") if w not in self.determiners])

    def apply(self, resps, docs):
        filtered_resps = []
        for prediction, reference in zip(*(resps, docs["span1_text"])):
            prediction = self.clean(prediction[0])
            reference = self.clean(reference)
            if ("'" in prediction) != ("'" in reference):
                # Make sure we don't mark cases where the prediction is "Bob" and the
                # referent is "Bob's hat" as predicting the referent.
                predicted_referent = False
            else:
                prediction_words = set(prediction.split(" "))
                referent_words = set(reference.split(" "))

                # Handle cases where the prediction is "fuzzy bunny" and the referent is
                # "bunny".
                predicted_referent = prediction_words.issubset(
                    referent_words
                ) or referent_words.issubset(prediction_words)

            filtered_resps.append(predicted_referent)

        return filtered_resps
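# Rough usage sketch (strings invented): the filter treats a generation as the
# correct referent when, after lowercasing and dropping determiners, one side's
# words are a subset of the other's.
if __name__ == "__main__":
    post = WSCPostprocess()
    resps = [["the fuzzy bunny"]]        # one model generation per doc
    docs = {"span1_text": ["bunny"]}     # gold referent span
    print(post.apply(resps, docs))       # [True]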
# WMT16
### Paper
Title: `Findings of the 2016 Conference on Machine Translation`
Abstract: http://www.aclweb.org/anthology/W/W16/W16-2301
Homepage: https://huggingface.co/datasets/wmt16
### Citation
```
@InProceedings{bojar-EtAl:2016:WMT1,
author = {Bojar, Ond{\v{r}}ej and Chatterjee, Rajen and Federmann, Christian and Graham, Yvette and Haddow, Barry and Huck, Matthias and Jimeno Yepes, Antonio and Koehn, Philipp and Logacheva, Varvara and Monz, Christof and Negri, Matteo and Neveol, Aurelie and Neves, Mariana and Popel, Martin and Post, Matt and Rubino, Raphael and Scarton, Carolina and Specia, Lucia and Turchi, Marco and Verspoor, Karin and Zampieri, Marcos},
title = {Findings of the 2016 Conference on Machine Translation},
booktitle = {Proceedings of the First Conference on Machine Translation},
month = {August},
year = {2016},
address = {Berlin, Germany},
publisher = {Association for Computational Linguistics},
pages = {131--198},
url = {http://www.aclweb.org/anthology/W/W16/W16-2301}
}
```
### Groups and Tasks
#### Groups
* `wmt-t5-prompt`: Group for all WMT tasks with the prompt templates used for T5 (`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`)
#### Tasks
With specific prompt styles:
* `wmt-ro-en-t5-prompt`: WMT16 with the prompt template used for T5
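For example, the T5-style template in the task config below renders a WMT16 `ro-en` document as (sentence invented for illustration):

```
translate English to Romanian: The cat sat on the mat.
```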
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
import evaluate


def bleu(predictions, references):
    return (predictions[0], references[0])


def agg_bleu(items):
    bleu_fn = evaluate.load("bleu")
    predictions, references = zip(*items)
    return bleu_fn.compute(predictions=predictions, references=references)["bleu"]
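# Rough usage sketch (strings invented; `evaluate.load("bleu")` fetches the
# metric script, so this needs the `evaluate` package and network access):
#   items = [bleu(["o pisică mică"], ["o pisică mică"])]
#   agg_bleu(items)  # corpus BLEU over all collected (prediction, reference) pairs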
group:
  - wmt-t5-prompt
task: wmt-ro-en-t5-prompt
dataset_path: wmt16
dataset_name: ro-en
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "translate English to Romanian: {{translation.en}}"
doc_to_target: "{{translation.ro}}"
metric_list:
  - metric: wer
    aggregation: mean
    higher_is_better: false
  - metric: !function metrics.bleu
    aggregation: !function metrics.agg_bleu
    higher_is_better: true
# WSC273
### Paper
Title: `The Winograd Schema Challenge`
Abstract: http://commonsensereasoning.org/2011/papers/Levesque.pdf
A Winograd schema is a pair of sentences that differ in only one or two words
and that contain an ambiguity that is resolved in opposite ways in the two
sentences and requires the use of world knowledge and reasoning for its resolution.
The Winograd Schema Challenge 273 is a collection of 273 such Winograd schemas.
NOTE: This evaluation of the Winograd Schema Challenge is based on `partial evaluation`
as described by Trinh & Le in `A Simple Method for Commonsense Reasoning` (2018).
See: https://arxiv.org/abs/1806.0
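Concretely, partial scoring substitutes each candidate referent for the pronoun and compares only the model's log-probability of the words that follow the substitution. A minimal sketch is below; the `continuation_logprob(context, continuation)` scoring helper is hypothetical and not part of this harness:

```python
# Sketch of Trinh & Le-style partial scoring. `continuation_logprob` is a
# hypothetical helper returning the LM log-probability of `continuation`
# given `context`.
def partial_score(prefix, candidate, suffix, continuation_logprob):
    # Only the tokens after the substituted candidate are scored.
    return continuation_logprob(prefix + candidate, suffix)


def pick_referent(prefix, candidates, suffix, continuation_logprob):
    # The candidate giving the higher probability to the rest of the sentence wins.
    return max(
        candidates,
        key=lambda c: partial_score(prefix, c, suffix, continuation_logprob),
    )
```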
Homepage: https://cs.nyu.edu/~davise/papers/WinogradSchemas/WS.html
### Citation
```
@inproceedings{ea01b9c0db064caca6986b925d75f2bb,
title = "The winograd schema challenge",
abstract = "In this paper, we present an alternative to the Turing Test that has some conceptual and practical advantages. A Wino-grad schema is a pair of sentences that differ only in one or two words and that contain a referential ambiguity that is resolved in opposite directions in the two sentences. We have compiled a collection of Winograd schemas, designed so that the correct answer is obvious to the human reader, but cannot easily be found using selectional restrictions or statistical techniques over text corpora. A contestant in the Winograd Schema Challenge is presented with a collection of one sentence from each pair, and required to achieve human-level accuracy in choosing the correct disambiguation.",
author = "Levesque, {Hector J.} and Ernest Davis and Leora Morgenstern",
year = "2012",
language = "English (US)",
isbn = "9781577355601",
series = "Proceedings of the International Conference on Knowledge Representation and Reasoning",
publisher = "Institute of Electrical and Electronics Engineers Inc.",
pages = "552--561",
booktitle = "13th International Conference on the Principles of Knowledge Representation and Reasoning, KR 2012",
note = "13th International Conference on the Principles of Knowledge Representation and Reasoning, KR 2012 ; Conference date: 10-06-2012 Through 14-06-2012",
}
```
### Groups and Tasks
#### Groups
* Not part of any group yet.
#### Tasks
* `wsc273`
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?