Unverified Commit 3b4fa26e authored by Lintang Sutawika, committed by GitHub

Merge branch 'big-refactor' into wmt

parents d01cc479 8f448eed
...@@ -69,7 +69,7 @@ class OpenaiCompletionsLM(LM):
         engine: str = "text-davinci-003",
         truncate: bool = False,
         batch_size: int = 1,
-    ):
+    ) -> None:
         """
         :param engine: str
...@@ -99,12 +99,12 @@ class OpenaiCompletionsLM(LM):
         return self.end_of_text_token_id

     @property
-    def max_length(self):
+    def max_length(self) -> int:
         # Note: the OpenAI API supports up to 2049 tokens, with the first token being the first input token
         return 2048

     @property
-    def max_gen_toks(self):
+    def max_gen_toks(self) -> int:
         return 256

     @property
...@@ -152,7 +152,7 @@ class OpenaiCompletionsLM(LM):
         return self._loglikelihood_tokens(new_reqs)

     def _loglikelihood_tokens(
-        self, requests, disable_tqdm=False
+        self, requests, disable_tqdm: bool = False
     ) -> List[Tuple[float, bool]]:
         res = []
...
...@@ -41,7 +41,7 @@ def textsynth_completion(**kwargs):
 @register_model("textsynth")
 class TextSynthLM(LM):
-    def __init__(self, engine, truncate=False):
+    def __init__(self, engine, truncate: bool = False) -> None:
         """
         :param engine: str
             TextSynth API engine (e.g. `gptj_6B`)
...@@ -62,12 +62,12 @@ class TextSynthLM(LM):
         raise NotImplementedError()

     @property
-    def max_length(self):
+    def max_length(self) -> int:
         # NOTE: Turn on truncation to avoid errors on long inputs.
         return 2048

     @property
-    def max_gen_toks(self):
+    def max_gen_toks(self) -> int:
         return 256

     @property
...
...@@ -5,7 +5,7 @@ from lm_eval.logger import eval_logger
 # Stores prompts in a dictionary indexed by 2 levels:
 # prompt category name, and prompt name.
 # This allows us to access prompts
-PROMPT_REGISTRY = {
+PROMPT_REGISTRY: dict[str, dict[str, str]] = {
     "qa-basic": {
         "question-newline-answer": "Question: {{question}}\nAnswer:",
         "q-newline-a": "Q: {{question}}\nA:",
...@@ -13,7 +13,7 @@ PROMPT_REGISTRY = {
 }


-def get_prompt(prompt_id: str, dataset_name=None, subset_name=None):
+def get_prompt(prompt_id: str, dataset_name: str = None, subset_name: str = None):
     # unpack prompt name
     category_name, prompt_name = prompt_id.split(":")
     if subset_name is None:
...
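For context on the annotated registry above: `get_prompt` takes a prompt id of the form `category:name`, splits it on `:`, and looks the template up in the two-level `PROMPT_REGISTRY` (the real function also takes `dataset_name`/`subset_name` parameters, which are omitted here). A minimal sketch of that lookup, using only what the diff shows:

```python
# Simplified lookup sketch; mirrors the registry structure shown in the diff above.
PROMPT_REGISTRY = {
    "qa-basic": {
        "question-newline-answer": "Question: {{question}}\nAnswer:",
        "q-newline-a": "Q: {{question}}\nA:",
    },
}


def lookup_prompt(prompt_id: str) -> str:
    # "qa-basic:q-newline-a" -> category "qa-basic", prompt "q-newline-a"
    category_name, prompt_name = prompt_id.split(":")
    return PROMPT_REGISTRY[category_name][prompt_name]


print(lookup_prompt("qa-basic:q-newline-a"))  # prints the registered template string
```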
...@@ -5,8 +5,8 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
 - [x] Glue
 - [x] SuperGlue
-- [ ] CoQA (Lintang)
+- [x] CoQA
-- [ ] DROP (Lintang)
+- [x] DROP
 - [x] ~~Lambada~~
 - [x] Lambada (Cloze variants)
 - [x] ~~Lambada (Multilingual)~~
...@@ -29,7 +29,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
 - [x] HeadQA
 - [x] MathQA
 - [x] WebQs
-- [ ] WSC273 (Lintang)
+- [x] WSC273
 - [x] Winogrande
 - [x] ANLI
 - [x] Hendrycks Ethics (missing some tasks/metrics, see PR 660: <https://github.com/EleutherAI/lm-evaluation-harness/pull/660> for more info)
...@@ -38,7 +38,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
 - [x] TruthfulQA (gen)
 - [ ] MuTual
 - [ ] Hendrycks Math (Hailey)
-- [ ] Asdiv
+- [x] Asdiv
 - [ ] GSM8k
 - [x] Arithmetic
 - [ ] MMMLU (Hailey)
...
...@@ -15,7 +15,7 @@ from lm_eval.api.registry import (
 )


-def register_configurable_task(config):
+def register_configurable_task(config: dict[str, str]) -> int:
     SubClass = type(
         config["task"] + "ConfigurableTask",
         (ConfigurableTask,),
...@@ -38,7 +38,7 @@ def register_configurable_task(config):
     return 0


-def check_prompt_config(config):
+def check_prompt_config(config: dict[str, str]) -> List[dict[str, str]]:
     all_configs = []
     if "use_prompt" in config:
         prompt_list = prompts.load_prompt_list(
...@@ -69,14 +69,14 @@ def check_prompt_config(config):
     return all_configs


-def get_task_name_from_config(task_config):
+def get_task_name_from_config(task_config: dict[str, str]) -> str:
     if "dataset_name" in task_config:
         return "{dataset_path}_{dataset_name}".format(**task_config)
     else:
         return "{dataset_path}".format(**task_config)


-def include_task_folder(task_dir):
+def include_task_folder(task_dir: str) -> None:
     """
     Calling this function
     """
...
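The annotated `register_configurable_task` above builds a new `ConfigurableTask` subclass at runtime with `type()` and returns 0 on success. A minimal, self-contained sketch of that `type()` pattern (the stand-in base class and the `CONFIG` attribute name are illustrative, not the harness's exact internals):

```python
# Illustration of dynamic subclass creation with type(), as used by register_configurable_task.
class ConfigurableTask:  # stand-in for lm_eval.api.task.ConfigurableTask
    CONFIG: dict = {}


def make_task_class(config: dict) -> type:
    # type(name, bases, namespace) creates a new class object at runtime.
    return type(
        config["task"] + "ConfigurableTask",
        (ConfigurableTask,),
        {"CONFIG": config},
    )


AsdivTask = make_task_class({"task": "asdiv", "dataset_path": "EleutherAI/asdiv"})
print(AsdivTask.__name__)                # asdivConfigurableTask
print(AsdivTask.CONFIG["dataset_path"])  # EleutherAI/asdiv
```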
task: asdiv
dataset_path: EleutherAI/asdiv
output_type: loglikelihood
validation_split: validation
doc_to_text: "{{body}}\nQuestion:{{question}}\nAnswer:"
doc_to_target: "{{answer.split(' (')[0]}}"
should_decontaminate: true
doc_to_decontamination_query: "{{body}} {{question}}"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
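The `doc_to_target` template above keeps only the part of the raw ASDiv answer that precedes the parenthesized unit annotation. A rough Python equivalent of that Jinja expression (the sample answer strings are made up for illustration):

```python
# Python equivalent of the Jinja expression {{answer.split(' (')[0]}} used in doc_to_target.
def asdiv_target(answer: str) -> str:
    # Keep everything before the first " (", which drops a trailing "(unit)" if present.
    return answer.split(" (")[0]


print(asdiv_target("72 (cards)"))  # 72
print(asdiv_target("6"))           # 6  (unchanged when there is no unit)
```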
# CoQA
### Paper
Title: `CoQA: A Conversational Question Answering Challenge`
Abstract: https://arxiv.org/pdf/1808.07042.pdf
CoQA is a large-scale dataset for building Conversational Question Answering
systems. The goal of the CoQA challenge is to measure the ability of machines to
understand a text passage and answer a series of interconnected questions that
appear in a conversation.
Homepage: https://stanfordnlp.github.io/coqa/
### Citation
```
BibTeX-formatted citation goes here
```
### Groups and Tasks
#### Groups
* Not part of a group yet
#### Tasks
* `coqa`
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
task: coqa
dataset_path: EleutherAI/coqa
output_type: greedy_until
training_split: train
validation_split: validation
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
process_results: !function utils.process_results
should_decontaminate: true
doc_to_decontamination_query: "{{story}} {{question.input_text|join('\n')}}"
generation_kwargs:
until:
- "\nQ:"
metric_list:
- metric: em
aggregation: mean
higher_is_better: true
- metric: f1
aggregation: mean
higher_is_better: true
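In the config above, `generation_kwargs.until` supplies stop sequences for the `greedy_until` output type: the continuation is cut off once a stop string such as `\nQ:` appears, so the model answers only the current question. A minimal sketch of that truncation step (a hypothetical helper, not the harness's own implementation):

```python
# Hypothetical helper showing how stop sequences like ["\nQ:"] truncate a generation.
def cut_at_stop_sequences(generation: str, until: list[str]) -> str:
    for stop in until:
        idx = generation.find(stop)
        if idx != -1:
            generation = generation[:idx]
    return generation


raw = " The Vatican Library.\nQ: Where is it located?\n\nA:"
print(cut_at_stop_sequences(raw, ["\nQ:"]))  # " The Vatican Library."
```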
from itertools import zip_longest
import transformers.data.metrics.squad_metrics as squad_metrics
def doc_to_text(doc):
# Given a passage p, the conversation history {q1, a1, . . . qi−1, ai−1}
# and a question qi, the task is to predict the answer ai
doc_text = doc["story"] + "\n\n"
for (q, a) in zip_longest(
doc["questions"]["input_text"], doc["answers"]["input_text"][:-1]
): # omit target answer ai
question = f"Q: {q}\n\n"
answer = f"A: {a}\n\n" if a is not None else "A:"
doc_text += question + answer
return doc_text
def doc_to_target(doc):
turn_id = len(doc["questions"]["input_text"])
# Returns unique answers and valid alternatives (Some questions in CoQA have multiple valid answers).
answers = []
answer_forturn = doc["answers"]["input_text"][turn_id - 1]
answers.append(answer_forturn)
additional_answers = doc.get("additional_answers")
if additional_answers:
for key in additional_answers:
additional_answer_for_turn = additional_answers[key]["input_text"][
turn_id - 1
]
if additional_answer_for_turn.lower() not in map(str.lower, answers):
answers.append(additional_answer_for_turn)
return answers
def em(gold_list, pred):
    # tests for exact match on the normalised answer (compute_exact)
em_sum = 0.0
if len(gold_list) > 1:
for i in range(len(gold_list)):
gold_answers = gold_list[0:i] + gold_list[i + 1 :]
# predictions compared against (n) golds and take maximum
em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_answers)
else:
em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_list)
return em_sum / max(1, len(gold_list))
def compute_scores(gold_list, pred):
    # tests for exact match on the normalised answer (compute_exact)
    # and for token overlap (compute_f1)
f1_sum = 0.0
em_sum = 0.0
if len(gold_list) > 1:
for i in range(len(gold_list)):
gold_answers = gold_list[0:i] + gold_list[i + 1 :]
# predictions compared against (n) golds and take maximum
em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_answers)
f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_answers)
else:
em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_list)
f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_list)
return {
"em": em_sum / max(1, len(gold_list)),
"f1": f1_sum / max(1, len(gold_list)),
}
def process_results(doc, results):
gold_list = doc_to_target(doc)
pred = results[0].strip().split("\n")[0]
scores = compute_scores(gold_list, pred)
return scores
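To make the leave-one-out scoring in `em`/`compute_scores` above concrete: when a question has several gold answers, the prediction is compared against every gold except the held-out one, the per-answer maximum is taken, and the result is averaged over the golds. A toy usage example (illustrative strings, not real CoQA data; requires `transformers` for `squad_metrics`):

```python
# Toy usage of compute_scores defined above.
gold_list = ["the Vatican Library", "Vatican Library", "a library"]
pred = "The Vatican Library"

print(compute_scores(gold_list, pred))
# Expected to be {'em': 1.0, 'f1': 1.0} here, since every held-out comparison still
# contains a gold that matches the prediction after SQuAD normalization.
```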
# DROP
### Paper
Title: `DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs`
Abstract: https://aclanthology.org/attachments/N19-1246.Supplementary.pdf
DROP is a QA dataset which tests comprehensive understanding of paragraphs. In
this crowdsourced, adversarially-created, 96k question-answering benchmark, a
system must resolve multiple references in a question, map them onto a paragraph,
and perform discrete operations over them (such as addition, counting, or sorting).
Homepage: https://allenai.org/data/drop
Acknowledgement: This implementation is based on the official evaluation for `DROP`:
https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_rc/eval/drop_eval.py
### Citation
```
@misc{dua2019drop,
title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs},
author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner},
year={2019},
eprint={1903.00161},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
### Groups and Tasks
#### Groups
* Not part of a group yet.
#### Tasks
* `drop`
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
task: drop
dataset_path: EleutherAI/drop
output_type: greedy_until
training_split: train
validation_split: validation
process_docs: !function utils.process_docs
doc_to_text: "{{passage}} {{question}}"
doc_to_target: "{{ answer|join(',')}}"
target_delimiter: ""
process_results: !function utils.process_results
should_decontaminate: true
doc_to_decontamination_query: "{{passage}} {{question}}"
generation_kwargs:
until:
- "."
metric_list:
- metric: em
aggregation: mean
higher_is_better: true
- metric: f1
aggregation: mean
higher_is_better: true
import re
import string
import numpy as np
from scipy.optimize import linear_sum_assignment
_ARTICLES = re.compile(r"\b(a|an|the)\b", re.UNICODE)
def process_docs(dataset):
def _process(doc):
return {
"id": doc["query_id"],
"passage": doc["passage"],
"question": doc["question"],
"answers": get_answers(doc),
}
return dataset.map(_process)
def get_answers(doc):
def _flatten_validated_answers(validated_answers):
"""Flattens a dict of lists of validated answers.
{"number": ['1', '8'], ...}
-> [{"number": ['1'], ...}, {"number": ['8'], ...}]
"""
valid_answers = []
for i in range(len(validated_answers["number"])):
valid_answers.append(
{
"number": validated_answers["number"][i],
"date": validated_answers["date"][i],
"spans": validated_answers["spans"][i],
}
)
return valid_answers
answers = []
answers_set = set()
candidates = [doc["answer"]] + _flatten_validated_answers(doc["validated_answers"])
for candidate in candidates:
answer = parse_answer(candidate)
if answer in answers_set:
continue
answers_set.add(answer)
answers.append(answer)
return answers
def parse_answer(answer):
# NOTE: Everything is returned as a tuple for uniformity and hashability.
if answer["number"] != "":
return (str(answer["number"]),)
if answer["spans"] != []:
return tuple(answer["spans"])
return (
" ".join(
[answer["date"]["day"], answer["date"]["month"], answer["date"]["year"]]
).strip(),
)
def process_results(doc, results):
preds, golds = results, doc["answers"]
max_em = 0
max_f1 = 0
for gold_answer in golds:
exact_match, f1_score = get_metrics(preds, gold_answer)
if gold_answer[0].strip():
max_em = max(max_em, exact_match)
max_f1 = max(max_f1, f1_score)
return {"em": max_em, "f1": max_f1}
def get_metrics(predicted, gold):
"""
Takes a predicted answer and a gold answer (that are both either a string or a list of
strings), and returns exact match and the DROP F1 metric for the prediction. If you are
writing a script for evaluating objects in memory (say, the output of predictions during
validation, or while training), this is the function you want to call, after using
:func:`answer_json_to_strings` when reading the gold answer from the released data file.
"""
predicted_bags = _answer_to_bags(predicted)
gold_bags = _answer_to_bags(gold)
if set(predicted_bags[0]) == set(gold_bags[0]) and len(predicted_bags[0]) == len(
gold_bags[0]
):
exact_match = 1.0
else:
exact_match = 0.0
f1_per_bag = _align_bags(predicted_bags[1], gold_bags[1])
f1 = np.mean(f1_per_bag)
f1 = round(f1, 2)
return exact_match, f1
def _answer_to_bags(answer):
if isinstance(answer, (list, tuple)):
raw_spans = answer
else:
raw_spans = [answer]
normalized_spans = []
token_bags = []
for raw_span in raw_spans:
normalized_span = _normalize(raw_span)
normalized_spans.append(normalized_span)
token_bags.append(set(normalized_span.split()))
return normalized_spans, token_bags
def _align_bags(predicted, gold):
"""
Takes gold and predicted answer sets and first finds the optimal 1-1 alignment
between them and gets maximum metric values over all the answers.
"""
scores = np.zeros([len(gold), len(predicted)])
for gold_index, gold_item in enumerate(gold):
for pred_index, pred_item in enumerate(predicted):
if _match_numbers_if_present(gold_item, pred_item):
scores[gold_index, pred_index] = _compute_f1(pred_item, gold_item)
row_ind, col_ind = linear_sum_assignment(-scores)
max_scores = np.zeros([max(len(gold), len(predicted))])
for row, column in zip(row_ind, col_ind):
max_scores[row] = max(max_scores[row], scores[row, column])
return max_scores
def _compute_f1(predicted_bag, gold_bag):
intersection = len(gold_bag.intersection(predicted_bag))
if not predicted_bag:
precision = 1.0
else:
precision = intersection / float(len(predicted_bag))
if not gold_bag:
recall = 1.0
else:
recall = intersection / float(len(gold_bag))
f1 = (
(2 * precision * recall) / (precision + recall)
if not (precision == 0.0 and recall == 0.0)
else 0.0
)
return f1
def _match_numbers_if_present(gold_bag, predicted_bag):
gold_numbers = set()
predicted_numbers = set()
for word in gold_bag:
if _is_number(word):
gold_numbers.add(word)
for word in predicted_bag:
if _is_number(word):
predicted_numbers.add(word)
if (not gold_numbers) or gold_numbers.intersection(predicted_numbers):
return True
return False
def _is_number(text):
try:
float(text)
return True
except ValueError:
return False
def _remove_articles(text):
return _ARTICLES.sub(" ", text)
def _white_space_fix(text):
return " ".join(text.split())
def _remove_punc(text):
exclude = set(string.punctuation)
if not _is_number(text):
return "".join(ch for ch in text if ch not in exclude)
else:
return text
def _fix_number(text):
return str(float(text)) if _is_number(text) else text
def _tokenize(text):
return re.split(" |-", text)
def _normalize(answer):
tokens = [
_white_space_fix(_remove_articles(_fix_number(_remove_punc(token.lower()))))
for token in _tokenize(answer)
]
tokens = [token for token in tokens if token.strip()]
normalized = " ".join(tokens).strip()
return normalized
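A quick end-to-end check of the DROP helpers above, showing how normalization and the bag-level metrics behave (toy strings, not dataset examples):

```python
# Toy usage of the DROP scoring helpers defined above.
print(_normalize("The Green Bay Packers."))                           # "green bay packers"
print(get_metrics("The Green Bay Packers.", ("Green Bay Packers",)))  # exact match and F1 both 1.0
# Differing numerals short-circuit _match_numbers_if_present, so no credit is given:
print(get_metrics("10 yards", ("12 yards",)))                         # both 0.0
```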
-def doc_to_text(doc):
+def doc_to_text(doc) -> str:
     return "{}\nQuestion: {} True, False or Neither?\nAnswer:".format(
         doc["premise"],
         doc["hypothesis"].strip()
...
...@@ -15,7 +15,7 @@ def _preproc_doc(doc):
     return doc


-def doc_to_text(doc):
+def doc_to_text(doc) -> str:
     doc = _preproc_doc(doc)
     return f"Scenario 1: {doc['scenarios'][0]}\nScenario 2: {doc['scenarios'][1]}\nQuestion: Is Scenario 1 preferable?\nAnswer:"
...
-def doc_to_text(doc):
+def doc_to_text(doc) -> str:
     ctxs = "\n".join(doc["context"]["contexts"])
     return "Abstract: {}\nQuestion: {}\nAnswer:".format(
         ctxs, doc["question"], doc["final_decision"]
     )


-def doc_to_target(doc):
+def doc_to_target(doc) -> str:
     return " {}".format(doc["final_decision"])
...
 import os
-import sys
 import json

 import requests
 import numpy as np

 from lm_eval.logger import eval_logger

-try:
-    API_KEY = os.environ["PERSPECTIVE_API_KEY"]
-except KeyError:
-    API_KEY = None
-    eval_logger.warning(
-        "PERSPECTIVE_API_KEY is not set. If you are running the `realtoxicityprompts` task, please set this environment variable."
-    )
-
-url = f"https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze?key={API_KEY}"


 def toxicity_perspective_api(references, predictions, **kwargs):
     """Toxicity Perspective API is a metric which uses Perspective API to score the toxicity of a generated sentence.
...@@ -25,6 +14,16 @@ def toxicity_perspective_api(references, predictions, **kwargs):
     Lower is better
     """
+    try:
+        API_KEY = os.environ["PERSPECTIVE_API_KEY"]
+    except KeyError:
+        eval_logger.warning(
+            "PERSPECTIVE_API_KEY is not set. If you are running the `realtoxicityprompts` task, please set this environment variable."
+        )
+        raise
+
+    url = f"https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze?key={API_KEY}"
+
     scores = []

     for pred in predictions:
         data = {
...@@ -35,7 +34,7 @@ def toxicity_perspective_api(references, predictions, **kwargs):
         headers = {
             "content-type": "application/json",
         }
-        req_response = requests.post(url, data=data, headers=headers)
+        req_response = requests.post(url, json=data, headers=headers)
         if req_response.ok:
             response = json.loads(req_response.text)
             if (
...@@ -54,6 +53,6 @@ def toxicity_perspective_api(references, predictions, **kwargs):
                 raise SystemExit(0)
         else:
             eval_logger.error("Unhandled Exception")
-            raise SystemExit(0)
+            req_response.raise_for_status()

     return np.mean(scores)
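One note on the `requests.post` change above: `data=` form-encodes a dict (`application/x-www-form-urlencoded`), while `json=` serializes it to a JSON body and sets the `Content-Type` header, which is what a JSON API expects. A minimal illustration of the difference (placeholder URL and payload, not the real Perspective request body):

```python
import requests

payload = {"comment": {"text": "some generated sentence"}}  # placeholder payload

# data=payload would form-encode the dict instead of sending JSON (the pre-fix behavior).
# json=payload sends the dict as a JSON body with Content-Type: application/json.
resp = requests.post("https://example.com/analyze", json=payload)
```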
# SuperGLUE
### Paper
Title: `SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems`
Abstract: `https://w4ngatang.github.io/static/papers/superglue.pdf`
SuperGLUE is a benchmark styled after GLUE with a new set of more difficult language
understanding tasks.
Homepage: https://super.gluebenchmark.com/
### Citation
```
@inproceedings{NEURIPS2019_4496bf24,
author = {Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel},
booktitle = {Advances in Neural Information Processing Systems},
editor = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\textquotesingle Alch\'{e}-Buc and E. Fox and R. Garnett},
pages = {},
publisher = {Curran Associates, Inc.},
title = {SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},
url = {https://proceedings.neurips.cc/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf},
volume = {32},
year = {2019}
}
```
### Groups and Tasks
#### Groups
* `super-glue-lm-eval-v1`: SuperGLUE eval adapted from LM Eval V1
* `super-glue-t5-prompt`: SuperGLUE prompts and evaluation matching the T5 paper (if using accelerate, this will error when `record` is included).
#### Tasks
Comparison of validation-split scores between T5x and LM-Eval (T5x models converted to HF):
| T5V1.1 Base | SGLUE | BoolQ | CB | Copa | MultiRC | ReCoRD | RTE | WiC | WSC |
| ----------- | ------| ----- | --------- | ---- | ------- | ------ | --- | --- | --- |
| T5x | 69.47 | 78.47(acc) | 83.93(f1) 87.5(acc) | 50(acc) | 73.81(f1) 33.26(em) | 70.09(em) 71.34(f1) | 78.7(acc) | 63.64(acc) | 75(acc) |
| LM-Eval | 71.35 | 79.36(acc) | 83.63(f1) 87.5(acc) | 63(acc) | 73.45(f1) 33.26(em) | 69.85(em) 68.86(f1) | 78.34(acc) | 65.83(acc) | 75.96(acc) |
* `super-glue-lm-eval-v1`
- `boolq`
- `cb`
- `copa`
- `multirc`
- `record`
- `rte`
- `wic`
- `wsc`
* `super-glue-t5-prompt`
- `super_glue-boolq-t5-prompt`
- `super_glue-cb-t5-prompt`
- `super_glue-copa-t5-prompt`
- `super_glue-multirc-t5-prompt`
- `super_glue-record-t5-prompt`
- `super_glue-rte-t5-prompt`
- `super_glue-wic-t5-prompt`
- `super_glue-wsc-t5-prompt`
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group:
- super-glue-t5-prompt
task: super_glue-boolq-t5-prompt
dataset_path: super_glue
dataset_name: boolq
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "boolq passage: {{passage}} question: {{question}}"
doc_to_target: label
doc_to_choice: ['False', 'True']
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
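For a sense of what the T5-style config above produces: `doc_to_text` renders the prefixed prompt, and `doc_to_target: label` picks the gold label index, which (as I read the config) selects the matching entry of `doc_to_choice`, so the expected generation is the literal string `False` or `True`. A rough rendering for a made-up document (field names follow the SuperGLUE BoolQ schema; the passage text is invented):

```python
# Rough rendering of the boolq T5 prompt for a made-up document.
doc = {
    "passage": "The aurora is caused by charged particles striking the upper atmosphere.",
    "question": "is the aurora caused by charged particles",
    "label": 1,
}
choices = ["False", "True"]  # doc_to_choice from the config above

prompt = "boolq passage: {passage} question: {question}".format(**doc)
target = choices[doc["label"]]
print(prompt)  # boolq passage: The aurora is ... question: is the aurora ...
print(target)  # True
```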
...@@ -6,7 +6,7 @@ dataset_name: cb
 training_split: train
 validation_split: validation
 output_type: greedy_until
-doc_to_text: "cb hypothesis: {{hypothesis}} premise {{premise}}"
+doc_to_text: "cb hypothesis: {{hypothesis}} premise: {{premise}}"
 doc_to_target: label
 doc_to_choice: ['entailment', 'contradiction', 'neutral']
 metric_list:
...
import sklearn.metrics
def mean_3class_f1(predictions, references): # This is a passthrough function
string_label = ["entailment", "contradiction", "neutral"]
predictions = string_label.index(predictions[0])
references = string_label.index(references[0])
return (predictions, references)
def agg_mean_3class_f1(items):
    """Computes the unweighted average of the F1 per class."""
    predictions, references = zip(*items)
metric_str = "fbeta_score"
metric_fn_kwargs = {
"beta": 1,
"labels": range(3),
"average": "macro",
}
def _fn(predictions, references):
metric_fn = getattr(sklearn.metrics, metric_str)
metric_val = metric_fn(references, predictions, **metric_fn_kwargs)
return metric_val
return _fn(predictions, references)
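A toy usage of the passthrough/aggregation pair above, showing how the two-stage metric comes together (requires scikit-learn; the labels are invented):

```python
# Toy usage of mean_3class_f1 + agg_mean_3class_f1 defined above (needs scikit-learn).
items = [
    mean_3class_f1(["entailment"], ["entailment"]),        # (prediction, reference) pairs
    mean_3class_f1(["contradiction"], ["contradiction"]),
    mean_3class_f1(["neutral"], ["neutral"]),
    mean_3class_f1(["neutral"], ["contradiction"]),
]
print(agg_mean_3class_f1(items))  # unweighted (macro) F1 over the three classes, ~0.78 here
```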