Unverified commit 21e1ed17 authored by Lintang Sutawika, committed by GitHub

Merge pull request #769 from EleutherAI/superglue

[Refactor] Superglue T5 Parity
parents 4cda3a1c b7082722
......@@ -2,6 +2,7 @@ from dataclasses import dataclass
from typing import List
from lm_eval.api.instance import Instance
from datasets import Dataset
class Filter:
......@@ -18,7 +19,7 @@ class Filter:
Can define custom behavior here, if an individual instantiation of a Filter class should have state.
"""
def apply(self, resps):
def apply(self, resps, docs):
"""
Defines the operation to perform on a list of the `inst.resps` properties of `Instance` objects.
Should return the list of (filtered) response lists *in the same order as they were input*, e.g.
......@@ -40,14 +41,14 @@ class FilterEnsemble:
name: str
filters: List[Filter]
def apply(self, instances: List[Instance]):
def apply(self, instances: List[Instance], docs: List[Dataset]):
resps = [
inst.resps for inst in instances
] # operate just on the model responses
for f in self.filters:
# apply filters in sequence
resps = f.apply(resps)
resps = f.apply(resps, docs)
# add the end results after filtering to filtered_requests of their respective source instances.
# has key `self.name`: each FilterEnsemble applied in a given run should use a different name.
......
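For orientation, a minimal sketch of a filter written against the new signature (the `LengthFilter` class is hypothetical and the `lm_eval.api.filter` import path is assumed): every filter now receives the source docs alongside the model responses, so filters can condition on the document they are filtering for.

```
from lm_eval.api.filter import Filter


class LengthFilter(Filter):
    """Hypothetical filter: cap each response at a fixed character budget."""

    def __init__(self, max_chars=256):
        self.max_chars = max_chars

    def apply(self, resps, docs):
        # `resps` is a list of response lists (one inner list per doc); `docs`
        # holds the corresponding source documents (unused here, but available).
        return [[r[: self.max_chars] for r in doc_resps] for doc_resps in resps]
```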
......@@ -627,19 +627,19 @@ class ConfigurableTask(Task):
)
if self.has_test_docs():
docs = self.test_docs()
self.task_docs = self.test_docs()
elif self.has_validation_docs():
docs = self.validation_docs()
self.task_docs = self.validation_docs()
else:
assert (
False
), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
# Test One Doc
self.features = list(docs.features.keys())
self.features = list(self.task_docs.features.keys())
self.multiple_input = 0
self.multiple_target = 0
test_doc = docs[0]
test_doc = self.task_docs[0]
test_text = self.doc_to_text(test_doc)
test_target = self.doc_to_target(test_doc)
......@@ -743,6 +743,15 @@ class ConfigurableTask(Task):
)
return super().fewshot_docs()
def apply_filters(self):
if hasattr(self, "_filters"):
for f in self._filters:
f.apply(self._instances, self.task_docs)
else:
eval_logger.warning("No filter defined, passing through instances")
return self._instances
def should_decontaminate(self):
return self._config.should_decontaminate
......
......@@ -17,14 +17,16 @@ FILTER_REGISTRY = {
def get_filter(filter_name):
return FILTER_REGISTRY[filter_name]
if filter_name in FILTER_REGISTRY:
return FILTER_REGISTRY[filter_name]
else:
return filter_name
def build_filter_ensemble(filter_name, components):
"""
Create a filtering pipeline.
"""
filters = []
for (function, kwargs) in components:
if kwargs is None:
......
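Because `get_filter` now falls back to returning `filter_name` itself when it is not registered, a custom filter object can be passed straight through `build_filter_ensemble`. A sketch of that usage (the import paths, the `take_first` registry key, and the `UppercaseFilter` class are assumptions for illustration):

```
from lm_eval.api.filter import Filter
from lm_eval.filters import build_filter_ensemble


class UppercaseFilter(Filter):
    """Hypothetical custom filter, not present in FILTER_REGISTRY."""

    def apply(self, resps, docs):
        return [[r.upper() for r in doc_resps] for doc_resps in resps]


# A registered name resolves through FILTER_REGISTRY as before ...
take_first = build_filter_ensemble("take_first", [["take_first", None]])

# ... while an unregistered object is returned unchanged by get_filter(),
# so a custom Filter class can be supplied directly in `components`.
custom = build_filter_ensemble("custom", [[UppercaseFilter, None]])
```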
......@@ -17,7 +17,7 @@ class DecontaminationFilter(Filter):
"""
self._decontam_results = None
def apply(self, reps):
def apply(self, reps, docs):
"""
Return {"no_contamination", "only_contamination"} keys for the 2 different subsets
"""
......
......@@ -15,7 +15,7 @@ class RegexFilter(Filter):
self.regex = re.compile(regex_pattern)
self.fallback = fallback
def apply(self, resps):
def apply(self, resps, docs):
# here, we assume we have a list, in which each element is
# a list of model responses for some particular input/target pair.
# so we process each of these (same input/target response sets)
......@@ -44,7 +44,7 @@ class WhitespaceFilter(Filter):
def __init__(self):
pass
def apply(self, resps):
def apply(self, resps, docs):
def filter_set(inst):
filtered_resp = []
......
......@@ -9,7 +9,7 @@ class TakeFirstFilter(Filter):
Can define custom behavior here, if an individual instantiation of a Filter class should have state.
"""
def apply(self, resps):
def apply(self, resps, docs):
"""
Assuming each entry of `resps` is a list of model responses, we discard all but the first response.
"""
......@@ -23,7 +23,7 @@ class TakeKFilter(Filter):
super().__init__(*args, **kwargs)
def apply(self, resps):
def apply(self, resps, docs):
# check we have at least k responses per doc, else we can't take the first k
assert (
len(resps[0]) >= self.k
......@@ -37,7 +37,7 @@ class MajorityVoteFilter(Filter):
Can define custom behavior here, if an individual instantiation of a Filter class should have state.
"""
def apply(self, resps):
def apply(self, resps, docs):
"""
Each entry of `resps` is a list of model responses.
We select the response that occurs most frequently in each entry of `resps`.
......
import os
import torch
import transformers
from transformers.models.auto.modeling_auto import (
......@@ -67,6 +69,7 @@ class HFLM(LM):
revision: Optional[str] = "main",
subfolder: Optional[str] = None,
tokenizer: Optional[str] = None,
truncation: Optional[bool] = False,
max_length: Optional[int] = None,
device: Optional[str] = "cuda",
dtype: Optional[Union[str, torch.dtype]] = "auto",
......@@ -75,6 +78,7 @@ class HFLM(LM):
low_cpu_mem_usage: Optional[bool] = True,
trust_remote_code: Optional[bool] = False,
use_fast_tokenizer: Optional[bool] = True,
cache_dir: Optional[Union[str, os.PathLike]] = None,
# arguments used for splitting a model across GPUs naively.
# only used if `parallelize=True`.
parallelize: Optional[bool] = False,
......@@ -240,6 +244,8 @@ class HFLM(LM):
use_fast=use_fast_tokenizer,
)
self.truncation = truncation
self.vocab_size = self.tokenizer.vocab_size
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
......@@ -419,7 +425,11 @@ class HFLM(LM):
return encoding
def tok_batch_encode(
self, strings: List[str], padding_side="left", left_truncate_len=None
self,
strings: List[str],
padding_side="left",
left_truncate_len=None,
truncation=False,
):
# encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
old_padding_side = self.tokenizer.padding_side
......@@ -432,6 +442,7 @@ class HFLM(LM):
encoding = self.tokenizer(
strings,
truncation=truncation,
padding="longest",
return_tensors="pt",
add_special_tokens=add_special_tokens,
......@@ -856,7 +867,9 @@ class HFLM(LM):
# encode, pad, and truncate contexts for this batch
context_enc, attn_masks = self.tok_batch_encode(
contexts, left_truncate_len=max_ctx_len
contexts,
left_truncate_len=max_ctx_len,
truncation=self.truncation,
)
context_enc = context_enc.to(self.device)
attn_masks = attn_masks.to(self.device)
......
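A standalone sketch of what the new flag controls (the gpt2 tokenizer and the numbers are arbitrary): `truncation` is forwarded to the HF tokenizer call inside `tok_batch_encode`, and the existing `left_truncate_len` step then keeps only the rightmost tokens of each encoded context, roughly like this:

```
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # arbitrary example tokenizer
tok.pad_token = tok.eos_token
tok.padding_side = "left"                    # mirrors padding_side="left" above

enc = tok(
    ["a short prompt", "a much longer prompt " * 200],
    truncation=True,      # what self.truncation toggles when batch-encoding contexts
    padding="longest",
    return_tensors="pt",
)

max_ctx_len = 32
context_enc = enc["input_ids"][:, -max_ctx_len:]       # left-truncate to the budget
attn_masks = enc["attention_mask"][:, -max_ctx_len:]
```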
# SuperGLUE
### Paper
Title: `SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems`
Abstract: `https://w4ngatang.github.io/static/papers/superglue.pdf`
SuperGLUE is a benchmark styled after GLUE with a new set of more difficult language
understanding tasks.
Homepage: https://super.gluebenchmark.com/
### Citation
```
@inproceedings{NEURIPS2019_4496bf24,
author = {Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel},
booktitle = {Advances in Neural Information Processing Systems},
editor = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\textquotesingle Alch\'{e}-Buc and E. Fox and R. Garnett},
pages = {},
publisher = {Curran Associates, Inc.},
title = {SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},
url = {https://proceedings.neurips.cc/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf},
volume = {32},
year = {2019}
}
```
### Groups and Tasks
#### Groups
* `super-glue-lm-eval-v1`: SuperGLUE eval adapted from LM Eval V1
* `super-glue-t5-prompt`: SuperGLUE prompts and evaluation matching the T5 paper. (Note: when running with `accelerate`, including the `record` task will raise an error.) A usage sketch follows the task list below.
#### Tasks
Comparison of validation-split scores between T5x and LM-Eval (T5x models converted to HF):
| T5 v1.1 Base | SGLUE | BoolQ | CB | COPA | MultiRC | ReCoRD | RTE | WiC | WSC |
| ------------ | ----- | ----- | -- | ---- | ------- | ------ | --- | --- | --- |
| T5x | 69.47 | 78.47 (acc) | 83.93 (F1) / 87.5 (acc) | 50 (acc) | 73.81 (F1) / 33.26 (EM) | 70.09 (EM) / 71.34 (F1) | 78.7 (acc) | 63.64 (acc) | 75 (acc) |
| LM-Eval | 71.35 | 79.36 (acc) | 83.63 (F1) / 87.5 (acc) | 63 (acc) | 73.45 (F1) / 33.26 (EM) | 69.85 (EM) / 68.86 (F1) | 78.34 (acc) | 65.83 (acc) | 75.96 (acc) |
* `super-glue-lm-eval-v1`
- `boolq`
- `cb`
- `copa`
- `multirc`
- `record`
- `rte`
- `wic`
- `wsc`
* `super-glue-t5-prompt`
- `super_glue-boolq-t5-prompt`
- `super_glue-cb-t5-prompt`
- `super_glue-copa-t5-prompt`
- `super_glue-multirc-t5-prompt`
- `super_glue-record-t5-prompt`
- `super_glue-rte-t5-prompt`
- `super_glue-wic-t5-prompt`
- `super_glue-wsc-t5-prompt`
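A sketch of how one of these groups could be run programmatically (the `simple_evaluate` entry point, the `hf` model alias, and the checkpoint name are illustrative assumptions; consult the CLI/API of your installed harness version for the exact invocation):

```
from lm_eval import evaluator

results = evaluator.simple_evaluate(
    model="hf",                                   # HF model wrapper (alias may vary by version)
    model_args="pretrained=google/t5-v1_1-base",  # a converted T5 checkpoint
    tasks=["super-glue-t5-prompt"],               # or "super-glue-lm-eval-v1"
)
print(results["results"])
```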
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group:
- super-glue-t5-prompt
task: super_glue-boolq-t5-prompt
dataset_path: super_glue
dataset_name: boolq
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "boolq passage: {{passage}} question: {{question}}"
doc_to_target: label
doc_to_choice: ['False', 'True']
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
......@@ -6,7 +6,7 @@ dataset_name: cb
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "cb hypothesis: {{hypothesis}} premise {{premise}}"
doc_to_text: "cb hypothesis: {{hypothesis}} premise: {{premise}}"
doc_to_target: label
doc_to_choice: ['entailment', 'contradiction', 'neutral']
metric_list:
......
import sklearn.metrics
def mean_3class_f1(predictions, references): # This is a passthrough function
string_label = ["entailment", "contradiction", "neutral"]
predictions = string_label.index(predictions[0])
references = string_label.index(references[0])
return (predictions, references)
def agg_mean_3class_f1(items):
predictions, references = zip(*items)
"""Computes the unweighted average of the F1 per class."""
metric_str = "fbeta_score"
metric_fn_kwargs = {
"beta": 1,
"labels": range(3),
"average": "macro",
}
def _fn(predictions, references):
metric_fn = getattr(sklearn.metrics, metric_str)
metric_val = metric_fn(references, predictions, **metric_fn_kwargs)
return metric_val
return _fn(predictions, references)
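The passthrough/aggregation split above keeps per-example work minimal: each call just maps the label strings to class indices, and the macro F1 is computed once over all collected pairs. A small usage sketch:

```
items = [
    mean_3class_f1(["entailment"], ["entailment"]),
    mean_3class_f1(["neutral"], ["contradiction"]),
    mean_3class_f1(["contradiction"], ["contradiction"]),
]
macro_f1 = agg_mean_3class_f1(items)  # unweighted (macro) F1 over the 3 classes
```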
......@@ -6,9 +6,9 @@ dataset_name: copa
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "copa choice1: {{choice1}} choice2: {{choice2}} question: {{question}}"
doc_to_text: "copa choice1: {{choice1}} choice2: {{choice2}} premise: {{premise}} question: {{question}}"
doc_to_target: label
doc_to_choice: ['False', 'True']
doc_to_choice: ['choice1', 'choice2']
metric_list:
- metric: exact_match
aggregation: mean
......
group:
- super-glue-t5-prompt
task: super_glue-multirc-t5-prompt
dataset_path: super_glue
dataset_name: multirc
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "multirc question: {{question}} answer: {{answer}} paragraph: {{paragraph}}"
doc_to_target: label
doc_to_choice: "{% set group_id = idx.question|string %}{{[group_id+'_False', group_id+'_True']}}"
generation_kwargs:
until:
- "</s>"
do_sample: false
temperature: 0.5
metric_list:
- metric: !function t5_utils.f1
aggregation: !function t5_utils.agg_f1
higher_is_better: true
- metric: !function t5_utils.em
aggregation: !function t5_utils.agg_em
higher_is_better: true
import collections
import numpy as np
import sklearn.metrics
def f1(predictions, references): # This is a passthrough function
_prediction = predictions[0]
_reference = references[0].split("_")[-1]
string_label = ["False", "True"]
reference = string_label.index(_reference)
prediction = (
string_label.index(_prediction)
if _prediction in string_label
else not bool(reference)
)
return (prediction, reference)
def agg_f1(items):
predictions, references = zip(*items)
references, predictions = np.asarray(references), np.asarray(predictions)
return sklearn.metrics.f1_score(references, predictions)
def em(predictions, references): # This is a passthrough function
_prediction = predictions[0]
_group, _reference = references[0].split("_")
string_label = ["False", "True"]
reference = string_label.index(_reference)
prediction = (
string_label.index(_prediction)
if _prediction in string_label
else not bool(reference)
)
return (_group, prediction, reference)
def agg_em(items):
grouped_values = collections.defaultdict(lambda: ([], []))
for group, prediction, reference in items:
grouped_values[group][0].append(reference)
grouped_values[group][1].append(prediction)
group_scores = []
for group, (targets, predictions) in grouped_values.items():
score = float(np.array_equal(targets, predictions))
group_scores.append(score)
return np.mean(group_scores)
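Because the task's `doc_to_choice` prefixes each target with the question's `idx`, the references reaching these metrics look like `"5_True"`, and `agg_em` regroups answers by question to compute per-question exact match. A small sketch with made-up values:

```
items = [
    em(["True"], ["5_True"]),    # question 5, first answer: correct
    em(["True"], ["5_False"]),   # question 5, second answer: wrong
    em(["False"], ["7_False"]),  # question 7, only answer: correct
]
agg_em(items)  # -> 0.5: question 7 is fully correct, question 5 is not
```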
......@@ -3,14 +3,15 @@ group:
task: super_glue-record-t5-prompt
dataset_path: super_glue
dataset_name: record
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "record query: {{query}} entities: {{entities}} passage: {{passage}}"
doc_to_target: "{{answers}}"
process_docs: !function t5_utils.process_docs
doc_to_text: !function t5_utils.doc_to_text
doc_to_target: "{{idx.passage|string}}+{{idx.query}}_{{answers}}"
metric_list:
- metric: exact_match
aggregation: mean
- metric: !function t5_utils.em
aggregation: !function t5_utils.squad_em_agg
higher_is_better: true
- metric: !function t5_utils.f1
aggregation: !function t5_utils.squad_f1_agg
higher_is_better: true
ignore_case: true
ignore_punctuation: true
import re
import string
import collections
import numpy as np
from tqdm import tqdm
from datasets import Dataset, concatenate_datasets
from lm_eval.api.metrics import metric_max_over_ground_truths
def doc_to_text(doc):
passage = doc["passage"]
passage = re.sub(r"(\.|\?|\!|\"|\')\n@highlight\n", r"\1 ", passage)
passage = re.sub(r"\n@highlight\n", ". ", passage)
return " ".join(
[
"record query:",
doc["query"],
"entities:",
", ".join(doc["entities"]),
"passage:",
passage,
]
)
def process_docs(dataset):
def split_answers(doc):
split_doc = {
**{k: [] for k in doc.keys()},
}
answers = doc.pop("answers")
for idx, answer in enumerate(answers):
for key in split_doc.keys():
if key in doc:
split_doc[key].append(doc[key])
split_doc["answers"].append(answer)
return split_doc
dataset = dataset.map(split_answers)
new_dataset = {}
for key in dataset.features.keys():
new_dataset[key] = [x for row in dataset[key] for x in row]
return Dataset.from_dict(new_dataset)
def normalize_squad(answer):
"""Normalization used in official SQuAD evaluation script."""
def _normalize_answer(text, punc_chars, punc_repl):
"""Lower text and remove punctuation, articles and extra whitespace."""
def remove_articles(s):
return re.sub(r"\b(a|an|the)\b", " ", s)
def replace_punctuation(s):
to_replace = set(punc_chars)
return "".join(punc_repl if ch in to_replace else ch for ch in s)
def white_space_fix(s):
return " ".join(s.split())
text = text.lower()
text = replace_punctuation(text)
text = remove_articles(text)
text = white_space_fix(text)
return text
return _normalize_answer(answer, punc_chars=string.punctuation, punc_repl="")
def em(predictions, references): # This is a passthrough function
return (predictions[0], references[0])
def f1(predictions, references): # This is a passthrough function
return (predictions[0], references[0])
def squad_em_agg(items):
def _exact_match_score(prediction, target):
return target == prediction
grouped_values = collections.defaultdict(lambda: ([], []))
for prediction, reference in items:
group, reference = reference.split("_")
# if group not in grouped_values:
grouped_values[group][0].append(normalize_squad(prediction))
grouped_values[group][1].append(normalize_squad(reference))
em = []
for group in grouped_values.keys():
predictions, targets = grouped_values[group]
for p in predictions:
em.append(metric_max_over_ground_truths(_exact_match_score, p, targets))
return np.mean(em)
def squad_f1_agg(items):
def _f1_score(prediction, target):
"""Computes token f1 score for a single target and prediction."""
prediction_tokens = prediction.split()
target_tokens = target.split()
common = collections.Counter(prediction_tokens) & collections.Counter(
target_tokens
)
num_same = sum(common.values())
if num_same == 0:
return 0
precision = 1.0 * num_same / len(prediction_tokens)
recall = 1.0 * num_same / len(target_tokens)
f1 = (2 * precision * recall) / (precision + recall)
return f1
grouped_values = collections.defaultdict(lambda: ([], []))
for prediction, reference in items:
group, reference = reference.split("_")
if group not in grouped_values:
grouped_values[group][0].append(normalize_squad(prediction))
grouped_values[group][1].append(normalize_squad(reference))
f1 = []
for group in grouped_values.keys():
p, t = grouped_values[group]
f1.append(metric_max_over_ground_truths(_f1_score, p[0], t))
return np.mean(f1)
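Since `process_docs` emits one doc per (query, answer) pair and `doc_to_target` encodes `{passage idx}+{query idx}_{answer}`, the aggregators can regroup items by query and score each prediction against the gold answers after SQuAD-style normalization. An illustrative sketch with made-up indices and strings:

```
items = [
    em(["Paris"], ["3+0_Paris"]),    # query 3+0, gold answer "Paris"
    em(["Paris"], ["3+0_paris!"]),   # same query, an alternate gold answer
    em(["Berlin"], ["3+1_Munich"]),  # a different query, answered incorrectly
]
squad_em_agg(items)  # -> 2/3: both "3+0" predictions match a gold answer, "3+1" does not
```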
group:
- super-glue-t5-prompt
task: super_glue-rte-t5-prompt
dataset_path: super_glue
dataset_name: rte
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "rte hypothesis: {{hypothesis}} premise: {{premise}}"
doc_to_target: label
doc_to_choice: ['entailment', 'not_entailment']
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
group:
- super-glue-t5-prompt
task: super_glue-wic-t5-prompt
dataset_path: super_glue
dataset_name: wic
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "wic sentence1: {{sentence1}} sentence2: {{sentence2}} word: {{word}}"
doc_to_target: label
doc_to_choice: ['False', 'True']
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
......@@ -2,7 +2,7 @@ group:
- super-glue-lm-eval-v1
task: wsc
dataset_path: super_glue
dataset_name: wsc
dataset_name: wsc.fixed
output_type: multiple_choice
training_split: train
validation_split: validation
......
import re
from lm_eval.utils import general_detokenize
def t5_prompt_doc_to_text(x):
def _mark_span(text, span_str, span_idx, mark):
pattern_tmpl = r"^((?:\S+\s){N})(W)"
pattern = re.sub("N", str(span_idx), pattern_tmpl)
pattern = re.sub("W", span_str, pattern)
return re.sub(pattern, r"\1{0} \2 {0}".format(mark), text)
text = x["text"]
text = _mark_span(text, x["span1_text"], x["span1_index"], "*")
# Compensate for 2 added "words" added in previous step.
span2_index = x["span2_index"] + 2 * (x["span1_index"] < x["span2_index"])
text = _mark_span(text, x["span2_text"], span2_index, "#")
return text
def default_doc_to_text(x):
raw_passage = x["text"]
# NOTE: HuggingFace span indices are word-based not character-based.
......
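An illustrative input/output pair for the span-marking prompt above (the doc is made up; span indices are word-based):

```
doc = {
    "text": "Mark told Pete many lies about himself.",
    "span1_text": "Pete",
    "span1_index": 2,
    "span2_text": "himself",
    "span2_index": 6,
}
t5_prompt_doc_to_text(doc)
# -> 'Mark told * Pete * many lies about # himself #.'
```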