"web/vscode:/vscode.git/clone" did not exist on "a3a713b6c581f4c0487c58c5a20eca2a5e8e6bde"
Unverified commit 3d432b1a, authored by Charles Foster and committed by GitHub.

Merge pull request #4 from EleutherAI/master

Update cfoster0 fork
parents 4a294d8a 4d8ed7d5
import re
import numpy as np
from ..base import rf, mean
from . common import HFTask


class HellaSwag(HFTask):
    DATASET_PATH = "hellaswag"
    DATASET_NAME = None

    @classmethod
    def remove_brackets(cls, text):
        """ Removes brackets from HellaSwag documents.

        NOTE: The brackets are artifacts of the WikiHow dataset portion underlying
        HellaSwag.
        """
        text = re.sub('\[.*?\]', '', text)
        return text

    def has_training_docs(self):
        return True

@@ -30,25 +39,78 @@ class HellaSwag(HFTask):
        return self.data["test"]

    def fewshot_description(self):
        return "Label for the relevant action: Sentences describing the " \
            "context, with an incomplete sentence trailing\nanswer that " \
            "plausibly completes the situation."

    def doc_to_text(self, doc):
        text = doc['activity_label'] + ': ' + doc['ctx'] + '\n'
        return self.remove_brackets(text)

    def doc_to_target(self, doc):
        letter_answer = doc['label']
        if letter_answer == '0':
            index = 0
        elif letter_answer == '1':
            index = 1
        elif letter_answer == '2':
            index = 2
        elif letter_answer == '3':
            index = 3
        else:
            raise ValueError(
                "HellaSwag from HF datasets contained an invalid answer key")
        target = doc['endings'][index]
        return " " + self.remove_brackets(target)

    def construct_requests(self, doc, ctx):
        """ Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few-shot examples, and the question
            part of the document for `doc`.
        """
        ll_answers = []
        for i in range(4):
            continuation = " " + self.remove_brackets(doc['endings'][i])
            ll_answers.append(rf.loglikelihood(ctx, continuation))
        return ll_answers

    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a
        dict where keys are the names of submetrics and values are the values of
        the metric for that one document.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests.
        """
        gold = int(doc['label'])
        pred = np.argmax(results)
        acc = 1. if pred == gold else 0.
        return {
            "acc": acc
        }

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        return {
            "acc": mean
        }

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        return {
            "acc": True
        }

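# Illustrative sketch (not part of the diff): how the four loglikelihood
# requests built by HellaSwag.construct_requests turn into the "acc" value in
# process_results. `fake_loglikelihoods` stands in for the per-ending scores an
# LM backend would return; the values are hypothetical.
import numpy as np

def pick_ending(loglikelihoods):
    # Each candidate ending is scored by its log-likelihood given the context;
    # the highest-scoring ending is the prediction.
    return int(np.argmax(loglikelihoods))

fake_loglikelihoods = [-12.3, -9.8, -15.1, -11.0]
assert pick_ending(fake_loglikelihoods) == 1  # compared against int(doc['label'])
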
from lm_eval.base import Task, rf, mean, perplexity
from lm_eval.utils import sh
import json
import math
from best_download import download_file


class LAMBADA(Task):
    def download(self):
        sh("mkdir -p data/lambada")
        download_file(
            "https://storage.googleapis.com/gpt-2/data/lambada_test.jsonl",
            "data/lambada/lambada_test.jsonl",
            "4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226"
        )

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        pass

    def validation_docs(self):
        with open("data/lambada/lambada_test.jsonl") as fh:
            for line in fh:
                yield json.loads(line)

    def test_docs(self):
        pass

    def doc_to_text(self, doc):
        return doc['text'].rsplit(' ', 1)[0]

    def doc_to_target(self, doc):
        return " " + doc['text'].rsplit(' ', 1)[1]

    def fewshot_description(self):
        # TODO: figure out description
        return ""

    def construct_requests(self, doc, ctx):
        ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc))
        return ll, is_greedy

    def process_results(self, doc, results):
        ll, is_greedy = results

        return {
            'ppl': ll,
            'acc': int(is_greedy)
        }

    def aggregation(self):
        return {
            'ppl': perplexity,
            'acc': mean
        }

    def higher_is_better(self):
        return {
            'ppl': False,
            'acc': True
        }

from . common import HFTask
from itertools import islice
import random


class NaturalQs(HFTask):
    # TODO: naturalqs has a *really* large train set that huggingface just
    # automatically downloads even if you dont use it. we should try and only
    # download the val set and not even bother with the train set.
    DATASET_PATH = "natural_questions"
    DATASET_NAME = None

@@ -22,24 +28,68 @@ class NaturalQs(HFTask):
        # Data is too large to fit in memory.
        return self.data["train"]

    def fewshot_examples(self, k):
        # Data is too large to fit in memory. We just sample from the first bit.
        if self._training_docs is None:
            self._training_docs = list(islice(self.training_docs(), 0, 100000))

        return random.sample(self._training_docs, k)

    def doc_to_text(self, doc):
        return 'Q: ' + doc['question']['text'] + '\n\n' + 'A: '

    def doc_to_target(self, doc):
        # There's a short answer and a long answer. Based on the paper, I'm using the long answer.
        short_answer = doc['annotations']['short_answers'][0]['text']
        long_answer_start = doc['annotations']['long_answer'][0]['start_token']
        long_answer_end = doc['annotations']['long_answer'][0]['end_token']
        long_answer_span = doc['document']['tokens']['token'][long_answer_start:long_answer_end]
        long_answer_is_html = doc['document']['tokens']['is_html'][long_answer_start:long_answer_end]
        long_answer_chars = [tok for (tok, is_html) in zip(long_answer_span, long_answer_is_html) if not is_html]
        long_answer = " ".join(long_answer_chars)
        return long_answer  # Replace with short_answer[0] for the short answer

    def construct_requests(self, doc, ctx):
        """ Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few-shot examples, and the question
            part of the document for `doc`.
        """
        # TODO: implement evaluation.
        raise NotImplementedError('Evaluation not implemented')

    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a
        dict where keys are the names of submetrics and values are the values of
        the metric for that one document.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests.
        """
        # TODO: implement evaluation.
        raise NotImplementedError('Evaluation not implemented')

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        # TODO: implement evaluation.
        raise NotImplementedError('Evaluation not implemented')

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        # TODO: implement evaluation.
        raise NotImplementedError('Evaluation not implemented')

from lm_eval.base import MultipleChoiceTask
from .common import HFTask


class OpenBookQA(HFTask, MultipleChoiceTask):
    DATASET_PATH = "openbookqa"
    DATASET_NAME = "main"

@@ -17,40 +15,34 @@ class OpenBookQA(HFTask):
    def has_test_docs(self):
        return True

    def _convert_standard(self, doc):
        out_doc = {
            "id": doc["id"],
            "query": doc["question_stem"],
            "choices": doc["choices"]["text"],
            "gold": ["A", "B", "C", "D"].index(doc["answerKey"].strip()),
        }
        return out_doc

    def _load_docs(self, docs):
        for record in docs:
            yield self._convert_standard(record)

    def training_docs(self):
        docs = super().training_docs()
        return self._load_docs(docs)

    def validation_docs(self):
        docs = super().validation_docs()
        return self._load_docs(docs)

    def test_docs(self):
        docs = super().test_docs()
        return self._load_docs(docs)

    def fewshot_description(self):
        # TODO: figure out fewshot description
        return ""

    def doc_to_text(self, doc):
        return doc["query"]

import numpy as np
from lm_eval.base import rf, mean
from . common import HFTask


class PiQA(HFTask):
    DATASET_PATH = "piqa"
    DATASET_NAME = None

    def has_training_docs(self):
        return True

@@ -25,33 +14,35 @@ class PiQA(Dataset):
        return True

    def has_test_docs(self):
        return False

    def fewshot_description(self):
        # TODO: figure out fewshot description
        return ""

    def doc_to_text(self, doc):
        return "Question: " + doc["goal"] + "\nAnswer:"

    def doc_to_target(self, doc):
        solutions = [doc["sol1"], doc["sol2"]]
        return " " + solutions[doc["label"]]

    def construct_requests(self, doc, ctx):
        ll_1, _ = rf.loglikelihood(ctx, " " + doc['sol1'])
        ll_2, _ = rf.loglikelihood(ctx, " " + doc['sol2'])
        return ll_1, ll_2

    def process_results(self, doc, results):
        return {
            'acc': np.argmax(results) == doc["label"]
        }

    def aggregation(self):
        return {
            'acc': mean
        }

    def higher_is_better(self):
        return {
            'acc': True
        }

import numpy as np
import json
import random
from .common import HFTask
from lm_eval.base import rf, mean


class Pubmed_QA(HFTask):
    DATASET_PATH = "pubmed_qa"
    DATASET_NAME = "pqa_labeled"

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return False

    def has_test_docs(self):
        return True

    def test_docs(self):
        if self.has_test_docs():
            # HF labels this split as train, but it's really just for testing
            return self.data["train"]

    def fewshot_description(self):
        # Average ctx length in the labelled dataset is 238.9;
        # 2 few-shot examples pushes it beyond the context window.
        return ""

    def doc_to_text(self, doc):
        ctxs = "\n".join(doc["context"]["contexts"])
        return "Abstract: {}\nQuestion: {}\nAnswer:".format(
            ctxs,
            doc["question"],
        )

    def doc_to_target(self, doc):
        return " {}".format(doc["final_decision"])

    def fewshot_examples(self, k):
        # Since this task only has test docs, sample few-shot examples from them.
        if self._training_docs is None:
            self._training_docs = list(self.test_docs())
        return random.sample(self._training_docs, k)

    def construct_requests(self, doc, ctx):
        """ Uses RequestFactory to construct Requests and returns
        an iterable of Requests which will be sent to the LM.
        """
        ll_yes, _ = rf.loglikelihood(ctx, " yes")
        ll_no, _ = rf.loglikelihood(ctx, " no")
        ll_maybe, _ = rf.loglikelihood(ctx, " maybe")
        return ll_yes, ll_no, ll_maybe

    def process_results(self, doc, results):
        gold = doc["final_decision"]
        ll_yes, ll_no, ll_maybe = results
        pred = np.argmax(results)
        return {
            "acc": ["yes", "no", "maybe"][pred] == gold,
        }

    def aggregation(self):
        return {
            "acc": mean
        }

    def higher_is_better(self):
        return {
            "acc": True
        }

import json
import random
import os
from lm_eval.base import Task
from ..utils import sh


class QuAC(Task):
    def __init__(self):
        super().__init__()

@@ -37,13 +37,8 @@ class QuAC(Dataset):
    def test_docs(self):
        raise NotImplementedError("QuAC has no test docs.")

    def fewshot_description(self):
        # TODO: figure out fewshot description
        desc = "TITLE: Title of the context passage - subtitle of the passage\nPARAGRAPH: Passage describing the relevant information for answering questions.\n\nQ: Text of a question.\n\nA: Answer to the question, based on the passage. If it cannot be answered based on the passage, write CANNOTANSWER"
        return desc

@@ -59,11 +54,53 @@ class QuAC(Dataset):
            docs.append(doc)
        return docs

    def doc_to_text(self, doc):
        return 'TITLE: ' + doc['title'] + '\n' + 'PARAGRAPH: ' + doc['paragraph'] + '\n\n' + 'Q: ' + doc['question'] + '\n\n' + 'A: '

    def doc_to_target(self, doc):
        return doc['answer']

    def construct_requests(self, doc, ctx):
        """ Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few-shot examples, and the question
            part of the document for `doc`.
        """
        # TODO: implement evaluation.
        raise NotImplementedError('Evaluation not implemented')

    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a
        dict where keys are the names of submetrics and values are the values of
        the metric for that one document.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests.
        """
        # TODO: implement evaluation.
        raise NotImplementedError('Evaluation not implemented')

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        # TODO: implement evaluation.
        raise NotImplementedError('Evaluation not implemented')

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        # TODO: implement evaluation.
        raise NotImplementedError('Evaluation not implemented')

import collections
import datasets
import numpy as np
from lm_eval.base import rf, mean
from . common import HFTask
import os
from functools import reduce
import operator
from tqdm import tqdm
import json


class each:
    def __init__(self, f):
        self.f = f

    def __rrshift__(self, other):
        return list(map(self.f, other))


class RACE(HFTask):
@@ -9,6 +23,7 @@ class RACE(HFTask):
    DATASET_NAME = "high"

    cache = {}
    letter_to_num = {'A': 0, 'B': 1, 'C': 2, 'D': 3}

    def has_training_docs(self):
        return True

@@ -20,7 +35,8 @@ class RACE(HFTask):
        return True

    def _collate_data(self, set):
        if set in self.cache:
            return self.cache[set]
        # One big issue with HF's implementation of this dataset: it makes a
        # separate document for each question; meanwhile, in the GPT3 paper it
        # is shown that one document is made per passage.

@@ -54,17 +70,80 @@ class RACE(HFTask):
        # TODO: figure out description
        return ""

    @classmethod
    def get_answer_option(cls, problem):
        answer = cls.letter_to_num[problem['answer']]
        return problem['options'][answer]

    @classmethod
    def last_problem(cls, doc):
        return doc['problems'][-1]

    def doc_to_text(self, doc):
        text = 'Article: ' + doc['article'] + '\n\n'
        for problem in doc['problems'][:-1]:
            if problem['question'][-6:] == ' _ .':
                text += problem['question'][-5:] + self.get_answer_option(problem) + '\n'
            else:
                question = 'Question: ' + problem['question'] + '\n'
                answer = 'Answer: ' + self.get_answer_option(problem) + '\n'
                text += question + answer
        text += self.last_problem(doc)['question']
        return text

    def doc_to_target(self, doc):
        return " " + self.get_answer_option(self.last_problem(doc))

    def construct_requests(self, doc, ctx):
        """ Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few-shot examples, and the question
            part of the document for `doc`.
        """
        problem = self.last_problem(doc)
        ll_choices = [
            rf.loglikelihood(ctx, " " + problem['options'][i])[0]
            for i in range(4)
        ]
        return ll_choices

    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a
        dict where keys are the names of submetrics and values are the values of
        the metric for that one document.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests.
        """
        gold = self.letter_to_num[self.last_problem(doc)['answer']]
        pred = np.argmax(results)
        return {
            "acc": int(pred == gold)
        }

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        return {
            "acc": mean
        }

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        return {
            "acc": True
        }

import json
import random
import os
from lm_eval.base import MultipleChoiceTask, rf, mean
from tqdm import auto as tqdm_lib
from . common import simple_accuracy_metric
import numpy as np
from ..utils import sh


class SATAnalogies(MultipleChoiceTask):
    NEEDS_MANUAL_DL = True

    def __init__(self):
        super().__init__()

    def download(self):
        # We should be using a checksum here.
        # The canonical sha256 hash is below:
        # 9dece377d8d57253ef8c78370ff15de0bb1d9e90a82c815a67ba1e621e921bfc
        if not os.path.exists('data/sat/SAT-package-V3.txt'):
            raise NotImplementedError('SAT Analogies dataset is not provided. Follow instructions on https://aclweb.org/aclwiki/SAT_Analogy_Questions_(State_of_the_art) to locate.')

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        return []

    def test_docs(self):
        return []

    def validation_docs(self):
        data = []

        with open("data/sat/SAT-package-V3.txt", "r") as f:
            record = []
            for line in f:
                line = line.strip()
                if len(line) == 0 and record:
                    data.append(record)
                    record = []
                elif len(line) > 0 and line[0] == '#':
                    continue
                else:
                    record.append(line)
            data.append(record)

        for record in data:
            source = record[-8]
            query = record[-7]
            choices = record[-6:-1]
            answer_key = record[-1]

            doc = {
                'source': source,
                'query': query.split(' ')[:2],
                'choices': ["{} is to {}".format(*c.split(' ')[:2]) for c in choices],
                'gold': ['a', 'b', 'c', 'd', 'e'].index(answer_key.strip()),
            }
            yield doc

    def fewshot_description(self):
        # TODO: figure out actual description
        return ""

    def doc_to_text(self, doc):
        return "{} is to {} as".format(*doc['query'])

import os
import json
from ..utils import sh
from lm_eval.base import MultipleChoiceTask, rf, mean
import zipfile


class SciQ(MultipleChoiceTask):
    # Multiple languages and multiple years

    def download(self):
        if not os.path.exists('data/sciq'):
            os.mkdir('data/sciq')
            sh(
                "wget https://ai2-public-datasets.s3.amazonaws.com/sciq/SciQ.zip -O data/sciq/SciQ.zip"
            )
            with zipfile.ZipFile("data/sciq/SciQ.zip", "r") as zf:
                zf.extractall("data/sciq/")

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def _convert_standard(self, doc):
        choices = [
            doc["distractor1"],
            doc["distractor2"],
            doc["distractor3"],
            doc["correct_answer"],
        ]
        src = doc['support']
        out_doc = {
            "source": src,
            "query": doc['question'],
            "choices": choices,
            "gold": 3,
        }
        return out_doc

    def load_docs(self, textfilename):
        with open(textfilename, 'r') as j:
            docs = json.loads(j.read())
            for record in docs:
                yield self._convert_standard(record)

    def fewshot_description(self):
        # Average ctx length in the labelled dataset is 238.9;
        # 2 few-shot examples pushes it beyond the context window.
        return ""

    def training_docs(self):
        return self.load_docs("data/sciq/SciQ dataset-2 3/train.json")

    def validation_docs(self):
        return self.load_docs("data/sciq/SciQ dataset-2 3/valid.json")

    def test_docs(self):
        return self.load_docs("data/sciq/SciQ dataset-2 3/test.json")

    def doc_to_text(self, doc):
        return "{}\nQuestion: {}\nAnswer:".format(doc["source"], doc["query"]).strip()

@@ -26,19 +26,61 @@ class SQuAD(HFTask):
        return self.data["validation"]

    def fewshot_description(self):
        # TODO: redo description
        return "Title: The_Title_of_It\n\nBackground: A text passage as background to answer the question with.\n\nQ: Question about the passage.\n\nA: Answer."

    def doc_to_text(self, doc):
        return 'Title: ' + doc['title'] + '\n\n' + 'Background: ' + doc['context'] + '\n\n' + 'Q: ' + doc['question'] + '\n\n' + 'A: '

    def doc_to_target(self, doc):
        answer_list = doc['answers']['text']
        if len(answer_list) > 0:
            answer = answer_list[0]
        else:
            answer = 'unanswerable'
        return answer

    def construct_requests(self, doc, ctx):
        """ Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few-shot examples, and the question
            part of the document for `doc`.
        """
        # TODO: implement evaluation.
        raise NotImplementedError('Evaluation not implemented')

    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a
        dict where keys are the names of submetrics and values are the values of
        the metric for that one document.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests.
        """
        # TODO: implement evaluation.
        raise NotImplementedError('Evaluation not implemented')

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        # TODO: implement evaluation.
        raise NotImplementedError('Evaluation not implemented')

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        # TODO: implement evaluation.
        raise NotImplementedError('Evaluation not implemented')

import json
import random
from lm_eval.base import Task
from ..utils import sh
import csv


class StoryCloze(Task):
    NEEDS_MANUAL_DL = True

    def download(self):
        # TODO: replace with Eye link
        pass

@@ -30,21 +30,63 @@ class StoryCloze(Dataset):
    def validation_docs(self):
        return self.load_doc("data/storycloze/cloze_test_val__winter2018-cloze_test_ALL_val - 1 - 1.csv")

    def test_docs(self):
        return self.load_doc("data/storycloze/cloze_test_test__winter2018-cloze_test_ALL_test - 1.csv")

    def fewshot_description(self):
        # TODO: figure out fewshot description
        return ""

    def doc_to_text(self, doc):
        return ' '.join([*doc[1:5]])

    def doc_to_target(self, doc):
        return " " + doc[int(doc[-1]) - 4]

    def construct_requests(self, doc, ctx):
        """ Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few-shot examples, and the question
            part of the document for `doc`.
        """
        # TODO: implement evaluation.
        raise NotImplementedError('Evaluation not implemented')

    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a
        dict where keys are the names of submetrics and values are the values of
        the metric for that one document.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests.
        """
        # TODO: implement evaluation.
        raise NotImplementedError('Evaluation not implemented')

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        # TODO: implement evaluation.
        raise NotImplementedError('Evaluation not implemented')

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        # TODO: implement evaluation.
        raise NotImplementedError('Evaluation not implemented')

"""
To-do:
- WSC requires free-form generation
- ReCoRD
"""
import numpy as np import numpy as np
from tqdm import auto as tqdm_lib from . common import HFTask, yesno
from . common import HFTask, simple_accuracy_metric, yesno from lm_eval.base import rf, mean, acc_all, metric_max_over_ground_truths
import sklearn
import transformers.data.metrics.squad_metrics as squad_metrics
from ..utils import general_detokenize
class BoolQ(HFTask): class BoolQ(HFTask):
...@@ -17,23 +25,41 @@ class BoolQ(HFTask): ...@@ -17,23 +25,41 @@ class BoolQ(HFTask):
return True return True
def fewshot_description(self): def fewshot_description(self):
# TODO: figure out actual description
return "Read the following passages and answer each question with a yes or a no." return "Read the following passages and answer each question with a yes or a no."
def doc_to_text(self, doc, include_target=True): def doc_to_text(self, doc):
return f"{doc['passage']}\nquestion: {doc['question']}\nanswer: " \ return f"{doc['passage']}\nQuestion: {doc['question']}\nAnswer:"
+ (yesno(doc['label']) if include_target else "")
def doc_to_target(self, doc):
return " " + yesno(doc['label'])
def evaluate(self, docs, lm, provide_description, num_fewshot): def construct_requests(self, doc, ctx):
golds = [doc["label"] for doc in docs]
preds = [] ll_yes, _ = rf.loglikelihood(ctx, ' yes')
for doc in docs: ll_no, _ = rf.loglikelihood(ctx, ' no')
ctx = self.fewshot_context(
doc=doc, return ll_yes, ll_no
provide_description=provide_description,
num_fewshot=num_fewshot, def process_results(self, doc, results):
) ll_yes, ll_no = results
preds.append(lm.loglikelihood(ctx, ' yes') > lm.loglikelihood(ctx, ' no')) gold = doc["label"]
return simple_accuracy_metric(preds=preds, golds=golds)
acc = 1. if (ll_yes > ll_no) == gold else 0.
return {
"acc": acc
}
def higher_is_better(self):
return {
"acc": True
}
def aggregation(self):
return {
"acc": mean
}
class CommitmentBank(HFTask):
@@ -49,34 +75,62 @@ class CommitmentBank(HFTask):
    def has_test_docs(self):
        return True

    def fewshot_description(self):
        # TODO: figure out actual description
        return "Given a premise and a hypothesis, classify whether the author of the premise is committed " \
            "to the truth of the hypothesis. The three possible labels are true, false or neither."

    def doc_to_text(self, doc):
        return "{}\nQuestion: {}. True, False or Neither?\nAnswer:".format(
            doc["premise"],
            doc["hypothesis"],
        )

    def doc_to_target(self, doc):
        # True = entailment
        # False = contradiction
        # Neither = neutral
        return " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])

    def construct_requests(self, doc, ctx):
        ll_true, _ = rf.loglikelihood(ctx, ' True')
        ll_neither, _ = rf.loglikelihood(ctx, ' Neither')
        ll_false, _ = rf.loglikelihood(ctx, ' False')

        return ll_true, ll_neither, ll_false

    def process_results(self, doc, results):
        gold = doc["label"]
        pred = np.argmax(results)
        acc = 1. if pred == gold else 0.

        return {
            "acc": acc,
            "f1": (pred, gold)
        }

    def higher_is_better(self):
        return {
            "acc": True,
            "f1": True
        }

    @classmethod
    def cb_multi_fi(cls, items):
        preds, golds = zip(*items)
        preds = np.array(preds)
        golds = np.array(golds)
        f11 = sklearn.metrics.f1_score(y_true=golds == 0, y_pred=preds == 0)
        f12 = sklearn.metrics.f1_score(y_true=golds == 1, y_pred=preds == 1)
        f13 = sklearn.metrics.f1_score(y_true=golds == 2, y_pred=preds == 2)
        avg_f1 = mean([f11, f12, f13])
        return avg_f1

    def aggregation(self):
        return {
            "acc": mean,
            "f1": self.cb_multi_fi,
        }

class Copa(HFTask):
@@ -92,32 +146,51 @@ class Copa(HFTask):
    def has_test_docs(self):
        return True

    def fewshot_description(self):
        # TODO: figure out actual description
        return "Given a premise and one alternative with a causal relation to the premise and another without, " \
            "choose the more plausible alternative"

    def doc_to_text(self, doc):
        # Drop the period
        connector = {
            "cause": "because",
            "effect": "therefore",
        }[doc["question"]]
        return doc["premise"].strip()[:-1] + f" {connector}"

    def doc_to_target(self, doc):
        correct_choice = doc["choice1"] if doc["label"] == 0 else doc["choice2"]
        # Connect the sentences
        return " " + self.convert_choice(correct_choice)

    def construct_requests(self, doc, ctx):
        choice1 = " " + self.convert_choice(doc["choice1"])
        choice2 = " " + self.convert_choice(doc["choice2"])

        ll_choice1, _ = rf.loglikelihood(ctx, choice1)
        ll_choice2, _ = rf.loglikelihood(ctx, choice2)

        return ll_choice1, ll_choice2

    def process_results(self, doc, results):
        gold = doc["label"]
        pred = np.argmax(results)
        acc = 1. if pred == gold else 0.

        return {
            "acc": acc
        }

    def higher_is_better(self):
        return {
            "acc": True
        }

    def aggregation(self):
        return {
            "acc": mean
        }

    @staticmethod
    def convert_choice(choice):
@@ -138,46 +211,139 @@ class MultiRC(HFTask):
        return True

    def fewshot_description(self):
        # TODO: figure out actual description
        return "READING COMPREHENSION ANSWER KEY"

    def doc_to_text(self, doc):
        return f"{doc['paragraph']}\nQuestion: {doc['question']}\nAnswer:"

    def doc_to_target(self, doc):
        return self.format_answer(answer=doc["answer"], label=doc["label"])

    @staticmethod
    def format_answer(answer, label):
        label_str = "yes" if label else "no"
        return f"{label_str}, {answer}"

    def construct_requests(self, doc, ctx):
        true_choice = self.format_answer(answer=doc["answer"], label=True)
        false_choice = self.format_answer(answer=doc["answer"], label=False)

        ll_true_choice, _ = rf.loglikelihood(ctx, f' {true_choice}')
        ll_false_choice, _ = rf.loglikelihood(ctx, f' {false_choice}')

        return ll_true_choice, ll_false_choice

    def process_results(self, doc, results):
        pred = np.argmax(results)
        return {
            "acc": (pred, doc)
        }

    def higher_is_better(self):
        return {
            "acc": True
        }

    def aggregation(self):
        return {
            "acc": acc_all
        }

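# Illustrative sketch (not part of the diff): the behaviour assumed of
# lm_eval.base.acc_all, which MultiRC uses above. Each item pairs the argmax
# prediction with its doc; a question counts as correct only if every one of its
# candidate answers is classified correctly. The grouping key and label
# convention below are assumptions, not the harness's actual code.
import collections

def assumed_acc_all(items):
    per_question = collections.defaultdict(list)
    for pred, doc in items:
        gold_is_true = doc["label"] == 1
        predicted_true = pred == 0  # index 0 is the "yes, ..." choice above
        per_question[doc["idx"]["question"]].append(predicted_true == gold_is_true)
    return sum(all(v) for v in per_question.values()) / len(per_question)
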
class ReCoRD(HFTask):
    DATASET_PATH = "super_glue"
    DATASET_NAME = "record"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def fewshot_description(self):
        # TODO: figure out actual description
        return ""

    def training_docs(self):
        # In ReCoRD, each doc manifests multiple "examples" in the context of few-shot example packing.
        # Each doc consists of multiple answer candidates, each of which is scored yes/no.
        # Hence, we create one "doc" for each (context + passage, answer) pair.
        # Moreover, we only use the correct answers for context packing.
        # (This is not an issue for evaluation, where we can directly score multiple candidates at once.)
        if self._training_docs is None:
            self._training_docs = []
            for doc in self.data["train"]:
                for entity in list(set(doc["entities"])):
                    self._training_docs.append({
                        "passage": doc["passage"],
                        "query": doc["query"],
                        "entity": entity,
                        "label": entity in doc["answers"],
                    })
        return self._training_docs

    def validation_docs(self):
        for doc in self.data["validation"]:
            for entity in list(set(doc["entities"])):
                yield {
                    "passage": doc["passage"],
                    "query": doc["query"],
                    "entity": entity,
                    "label": entity in doc["answers"],
                }

    def doc_to_text(self, doc):
        initial_text, *highlights = doc["passage"].strip().split("\n@highlight\n")
        text = initial_text + "\n\n"
        for highlight in highlights:
            text += f" - {highlight}.\n"
        return text

    @classmethod
    def format_answer(cls, query, entity):
        return f' - {query}'.replace("@placeholder", entity)

    def doc_to_target(self, doc):
        return self.format_answer(query=doc["query"], entity=doc["entity"])

    def construct_requests(self, doc, ctx):
        requests = [
            rf.loglikelihood(ctx, self.format_answer(query=doc["query"], entity=entity))
            for entity in doc["entity"]
        ]
        return requests

    def process_results(self, doc, results):
        # ReCoRD's evaluation is actually deceptively simple:
        # - Pick the maximum likelihood prediction entity
        # - Evaluate the accuracy and token F1 PER EXAMPLE
        # - Average over all examples
        max_idx = np.argmax(np.array(results))
        prediction = doc["entities"][max_idx]
        gold_label_set = list(set(doc["answers"]))
        f1 = metric_max_over_ground_truths(squad_metrics.compute_f1, prediction, gold_label_set)
        em = metric_max_over_ground_truths(squad_metrics.compute_exact, prediction, gold_label_set)

        return {
            "f1": f1,
            "em": em,
        }

    def higher_is_better(self):
        return {
            "f1": True,
            "em": True,
        }

    def aggregation(self):
        return {
            "f1": mean,
            "em": mean,
        }

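# Illustrative sketch (not part of the diff): the behaviour assumed of
# metric_max_over_ground_truths, which ReCoRD uses to score its predicted entity
# against every gold answer and keep the best score. The wrapper name is a local
# stand-in; compute_f1/compute_exact are the SQuAD metrics imported above.
import transformers.data.metrics.squad_metrics as squad_metrics

def assumed_metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    return max(metric_fn(truth, prediction) for truth in ground_truths)

golds = ["Barack Obama", "Obama"]
print(assumed_metric_max_over_ground_truths(squad_metrics.compute_f1, "Obama", golds))
print(assumed_metric_max_over_ground_truths(squad_metrics.compute_exact, "Obama", golds))
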
@@ -194,31 +360,51 @@ class WordsInContext(HFTask):
    def has_test_docs(self):
        return True

    def fewshot_description(self):
        # TODO: figure out actual description
        return ""

    def doc_to_text(self, doc):
        return "Sentence 1: {}\nSentence 2: {}\nQuestion: Is the word '{}' used in the same way in the" \
            " two sentences above?\nAnswer:".format(
                doc["sentence1"],
                doc["sentence2"],
                doc["sentence1"][doc["start1"]:doc["end1"]],
            )

    def doc_to_target(self, doc):
        return " {}".format({0: "no", 1: "yes"}[doc["label"]])

    def construct_requests(self, doc, ctx):
        ll_yes, _ = rf.loglikelihood(ctx, ' yes')
        ll_no, _ = rf.loglikelihood(ctx, ' no')

        return ll_yes, ll_no

    def process_results(self, doc, results):
        ll_yes, ll_no = results
        gold = doc["label"]

        acc = 1. if (ll_yes > ll_no) == gold else 0.

        return {
            "acc": acc
        }

    def higher_is_better(self):
        return {
            "acc": True
        }

    def aggregation(self):
        return {
            "acc": mean
        }

class SGWinogradSchemaChallenge(HFTask):
    # Note: This implementation differs from Fig G.32 because this is the SuperGLUE,
    # binary version of the task.
    DATASET_PATH = "super_glue"
    DATASET_NAME = "wsc"

@@ -234,10 +420,10 @@ class SGWinogradSchemaChallenge(HFTask):
    def training_docs(self):
        if self.has_training_docs():
            if self._training_docs is None:
                # GPT-3 Paper's format only uses positive examples for fewshot "training"
                self._training_docs = [
                    doc for doc in
                    self.data["train"]
                    if doc["label"]
                ]
            return self._training_docs

@@ -248,59 +434,47 @@ class SGWinogradSchemaChallenge(HFTask):
            "For each passage, you must identify which noun the pronoun marked in *bold*" \
            " refers to.\n====="

    def doc_to_text(self, doc):
        raw_passage = doc["text"]
        # NOTE: HuggingFace span indices are word-based not character-based.
        pre = " ".join(raw_passage.split()[:doc["span2_index"]])
        post = raw_passage[len(pre) + len(doc["span2_text"]) + 1:]
        passage = general_detokenize(pre + " *{}*".format(doc['span2_text']) + post)
        noun = doc["span1_text"]
        pronoun = doc["span2_text"]
        text = (
            f"Passage: {passage}\n"
            + f"Question: In the passage above, does the pronoun \"*{pronoun}*\" refer to \"*{noun}*\"?\n"
            + "Answer:"
        )
        return text

    def doc_to_target(self, doc):
        return " " + yesno(doc['label'])

    def construct_requests(self, doc, ctx):
        ll_yes, _ = rf.loglikelihood(ctx, ' yes')
        ll_no, _ = rf.loglikelihood(ctx, ' no')

        return ll_yes, ll_no

    def process_results(self, doc, results):
        ll_yes, ll_no = results
        gold = doc["label"]

        acc = 1. if (ll_yes > ll_no) == gold else 0.

        return {
            "acc": acc
        }

    def higher_is_better(self):
        return {
            "acc": True
        }

    def aggregation(self):
        return {
            "acc": mean
        }

import os
import json
import random
from lm_eval.base import Task, mean, rf
from ..utils import sh


class TriviaQA(Task):
    def download(self):
        if not os.path.exists('data/triviaqa'):
            sh("""
                mkdir -p data/triviaqa
                wget http://nlp.cs.washington.edu/triviaqa/data/triviaqa-unfiltered.tar.gz -O data/triviaqa/trivia_qa-unfiltered.tar.gz
                tar -xf data/triviaqa/trivia_qa-unfiltered.tar.gz
                mv triviaqa-unfiltered/ data/triviaqa/
                """)

    def has_training_docs(self):
        return True

@@ -23,7 +21,7 @@ class TriviaQA(Dataset):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        return json.load(open('data/triviaqa/triviaqa-unfiltered/unfiltered-web-train.json'))['Data']

@@ -35,13 +33,45 @@ class TriviaQA(Dataset):
        return json.load(open('data/triviaqa/triviaqa-unfiltered/unfiltered-web-test.json'))['Data']

    def fewshot_description(self):
        # TODO: figure out fewshot description
        return ""

    def doc_to_text(self, doc):
        return ''.join(['Q:', doc['Question'], '\n\n', 'A:'])

    def doc_to_target(self, doc):
        return " " + doc['Answer']['Value']

    def _remove_prefixes(self, aliases):
        # Optimization: Remove any alias that has a strict prefix elsewhere in the list
        # we can do this because if the prefix is acceptable by isgreedy, we can stop looking
        aliases.sort()
        ret = [aliases[0]]
        for alias in aliases[1:]:
            if not alias.startswith(ret[-1]):
                ret.append(alias)
        return ret

    def construct_requests(self, doc, ctx):
        ret = []
        for alias in self._remove_prefixes(doc['Answer']['Aliases']):
            _, is_prediction = rf.loglikelihood(ctx, " " + alias)
            ret.append(is_prediction)
        return ret

    def process_results(self, doc, results):
        return {
            "acc": float(any(results))
        }

    def aggregation(self):
        return {
            "acc": mean,
        }

    def higher_is_better(self):
        return {
            "acc": True
        }

from . common import HFTask
from lm_eval.base import mean, rf


class WebQs(HFTask):
    DATASET_PATH = "web_questions"

@@ -17,16 +18,45 @@ class WebQs(HFTask):
        # TODO: figure out description
        return ""

    def doc_to_text(self, doc):
        return "Question: " + doc['question'] + '\nAnswer:'

    def doc_to_target(self, doc):
        # this picks one answer to be the "correct" one, despite sometimes
        # multiple correct answers being possible.
        # TODO: make sure we're actually handling multi-answer correctly
        return " " + doc['answers'][0]

    def _remove_prefixes(self, aliases):
        # Optimization: Remove any alias that has a strict prefix elsewhere in the list
        # we can do this because if the prefix is acceptable by isgreedy, we can stop looking
        aliases.sort()
        ret = [aliases[0]]
        for alias in aliases[1:]:
            if not alias.startswith(ret[-1]):
                ret.append(alias)
        return ret

    def construct_requests(self, doc, ctx):
        ret = []
        for alias in self._remove_prefixes(doc['answers']):
            _, is_prediction = rf.loglikelihood(ctx, " " + alias)
            ret.append(is_prediction)
        return ret

    def process_results(self, doc, results):
        return {
            "acc": float(any(results))
        }

    def aggregation(self):
        return {
            "acc": mean,
        }

    def higher_is_better(self):
        return {
            "acc": True
        }

@@ -9,22 +9,120 @@ class WikiText103(NLP_TASK):
    NLP_NAME = "wikitext-103-raw-v1"

    def fewshot_description(self):
        # TODO: figure out fewshot description
        return ""

    def doc_to_text(self, doc):
        # TODO: implement
        pass

    def doc_to_target(self, doc):
        # TODO: implement
        pass

    def construct_requests(self, doc, ctx):
        """ Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        """
        # TODO: implement evaluation.
        raise NotImplementedError('Evaluation not implemented')

    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a
        dict where keys are the names of submetrics and values are the values of
        the metric for that one document

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests.
        """
        # TODO: implement evaluation.
        raise NotImplementedError('Evaluation not implemented')

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        # TODO: implement evaluation.
        raise NotImplementedError('Evaluation not implemented')

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        # TODO: implement evaluation.
        raise NotImplementedError('Evaluation not implemented')


class WikiText2(NLP_TASK):
    NLP_PATH = "wikitext"
    NLP_NAME = "wikitext-2-raw-v1"

    def fewshot_description(self):
        # TODO: figure out fewshot description
        return ""

    def doc_to_text(self, doc):
        # TODO: implement
        pass

    def doc_to_target(self, doc):
        # TODO: implement
        pass

    def construct_requests(self, doc, ctx):
        """ Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        """
        # TODO: implement evaluation.
        raise NotImplementedError('Evaluation not implemented')

    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a
        dict where keys are the names of submetrics and values are the values of
        the metric for that one document

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests.
        """
        # TODO: implement evaluation.
        raise NotImplementedError('Evaluation not implemented')

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        # TODO: implement evaluation.
        raise NotImplementedError('Evaluation not implemented')

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        # TODO: implement evaluation.
        raise NotImplementedError('Evaluation not implemented')
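# Illustrative sketch (not part of the original file): the docstrings above describe
# the contract these stubs must eventually satisfy. `process_results` returns one
# {submetric: value} dict per document, and the harness presumably applies the
# matching function from `aggregation` to the list of per-document values.
# Hypothetical flow using the `mean` aggregator the other tasks rely on:
#   per_doc = [{"acc": 1.0}, {"acc": 0.0}, {"acc": 1.0}]
#   mean([d["acc"] for d in per_doc])  ->  0.666...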
import numpy as np
from . common import HFTask
from lm_eval.base import rf, mean

"""
This evaluation of Winogrande uses partial evaluation as described by
Trinh & Le in Simple Method for Commonsense Reasoning (2018).
Reference: https://arxiv.org/abs/1806.02847
"""


class Winogrande(HFTask):
    DATASET_PATH = "winogrande"
@@ -17,34 +22,80 @@ class Winogrande(HFTask):
    def has_test_docs(self):
        return True

    def fewshot_description(self):
        # TODO: redo description
        return "Winograd schema sentence including either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in."

    @classmethod
    def partial_context(cls, doc):
        # Substitute the pronoun in the sentence with each candidate choice
        # and ignore everything after.
        pronoun_loc = doc["sentence"].index("_")
        context1 = doc["sentence"][:pronoun_loc] + doc["option1"]
        context2 = doc["sentence"][:pronoun_loc] + doc["option2"]
        return context1, context2

    @classmethod
    def partial_target(cls, doc):
        # The target is everything after the document specified pronoun.
        pronoun_loc = doc["sentence"].index("_") + 1
        return doc["sentence"][pronoun_loc:].strip()

    def doc_to_text(self, doc):
        context1, context2 = self.partial_context(doc)
        return context1 + '\n' + context2 + '\n'

    def doc_to_target(self, doc):
        return self.partial_target(doc)

    def construct_requests(self, doc, ctx):
        """ Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        """
        target = self.partial_target(doc)
        context1, context2 = self.partial_context(doc)
        ll_context1, _ = rf.loglikelihood(context1, " " + target)
        ll_context2, _ = rf.loglikelihood(context2, " " + target)
        return ll_context1, ll_context2

    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a
        dict where keys are the names of submetrics and values are the values of
        the metric for that one document

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests.
        """
        answer = int(doc["answer"]) - 1  # `- 1` b/c doc["answer"] ∈ {'1', '2'}
        return {
            "acc": np.argmax(results) == answer
        }

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        return {
            "acc": mean
        }

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        return {
            "acc": True
        }
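# Illustrative sketch (not part of the original file): partial evaluation scores the
# shared continuation after the blank under each candidate-filled context and picks
# the candidate with the higher log-likelihood. Hypothetical doc:
#   doc = {"sentence": "The trophy does not fit in the suitcase because _ is too big.",
#          "option1": "the trophy", "option2": "the suitcase", "answer": "1"}
#   partial_context(doc) -> ("The trophy does not fit in the suitcase because the trophy",
#                            "The trophy does not fit in the suitcase because the suitcase")
#   partial_target(doc)  -> "is too big."
# The model is then queried with loglikelihood(context_i, " is too big.") and the
# argmax over the two scores is compared against int(answer) - 1.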
import numpy as np
import random
from lm_eval.base import rf, mean
from . common import HFTask

"""
NOTE: This evaluation of Winograd Schema Challenge is based on `partial evaluation`
as described by Trinh & Le in Simple Method for Commonsense Reasoning (2018).
See: https://arxiv.org/abs/1806.02847
"""


class WinogradSchemaChallenge273(HFTask):
    DATASET_PATH = "winograd_wsc"
    DATASET_NAME = "wsc273"

    upper_pronouns = ["A", "An", "The", "She", "He",
                      "It", "They", "My", "His", "Her", "Their"]

    def __init__(self):
        super().__init__()
        self.data = self.__clean_data()

    def __clean_data(self):
        # The HF implementation of `wsc273` is not `partial evaluation` friendly.
        data = []
        for doc in self.data["test"]:
            doc["text"] = doc["text"].replace("  ", " ")
            doc["options"][0] = self.__normalize_option(doc["options"][0], doc)
            doc["options"][1] = self.__normalize_option(doc["options"][1], doc)
            data.append(doc)
        return {"test": data}

    def __normalize_option(self, option, doc):
        # Append `'s` to possessive determiner based options.
        if doc["pronoun"].lower() in ["my", "his", "her", "our", "their"]:
            option += "'s"
        # Appropriately lowercase the pronoun in the option.
        pronoun = option.split()[0]
        start_of_sentence = doc["text"][doc['pronoun_loc'] - 2] == '.'
        if not start_of_sentence and pronoun in self.upper_pronouns:
            return option.replace(pronoun, pronoun.lower())
        return option

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return False

    def has_test_docs(self):
        return True

    def fewshot_examples(self, k):
        # NOTE: `super().fewshot_examples` samples from training docs which are
        # not available for this test-set-only dataset.
        return random.sample(list(self.test_docs()), k)

    def fewshot_description(self):
        # TODO: redo description
        return "Winograd schema sentence with correct continuation. True. Winograd schema sentence with incorrect continuation. False."

    @classmethod
    def partial_context(cls, doc):
        # Substitute the pronoun in the original text with each candidate
        # choice and ignore everything after.
        context1 = doc["text"][:doc["pronoun_loc"]] + doc["options"][0]
        context2 = doc["text"][:doc["pronoun_loc"]] + doc["options"][1]
        return context1, context2

    @classmethod
    def partial_target(cls, doc):
        # The target is everything after the document specified pronoun.
        start_index = doc["pronoun_loc"] + len(doc["pronoun"])
        return doc["text"][start_index:].strip()

    def doc_to_text(self, doc):
        context1, context2 = self.partial_context(doc)
        return context1 + '\n' + context2 + '\n'

    def doc_to_target(self, doc):
        return self.partial_target(doc)

    def construct_requests(self, doc, ctx):
        """ Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        """
        target = self.partial_target(doc)
        context1, context2 = self.partial_context(doc)
        ll_context1, _ = rf.loglikelihood(context1, " " + target)
        ll_context2, _ = rf.loglikelihood(context2, " " + target)
        return ll_context1, ll_context2

    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a
        dict where keys are the names of submetrics and values are the values of
        the metric for that one document

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests.
        """
        return {
            "acc": np.argmax(results) == doc["label"]
        }

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        return {
            "acc": mean
        }

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        return {
            "acc": True
        }
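# Illustrative sketch (not part of the original file): `__clean_data` rewrites each
# candidate so it can be spliced into `text` at `pronoun_loc`. Hypothetical doc:
#   text = "Jane gave Joan candy because she was hungry."   # pronoun_loc points at "she"
#   option "The girl"  ->  "the girl"   # lowercased, since the pronoun is not sentence-initial
# and with a possessive pronoun such as "his", an option like "Sam" would become "Sam's".
# Scoring then mirrors Winogrande: the continuation after the pronoun is scored under
# each candidate-filled context and the higher log-likelihood wins.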
import os
import re


class ExitCodeError(Exception):
@@ -25,3 +26,27 @@ def simple_parse_args_string(args_string):
        k, v = arg.split("=")
        args_dict[k] = v
    return args_dict


def join_iters(iters):
    for iter in iters:
        yield from iter


def chunks(iter, n):
    arr = []
    for x in iter:
        arr.append(x)
        if len(arr) == n:
            yield arr
            arr = []

    if arr: yield arr


def general_detokenize(string):
    string = string.replace(" n't", "n't")
    string = string.replace(" )", ")")
    string = string.replace("( ", "(")
    string = string.replace("\" ", "\"")
    string = string.replace(" \"", "\"")
    string = re.sub(r" (['.,])", r"\1", string)
    return string
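# Illustrative usage sketch (not part of the original file): expected behaviour of
# the helpers above.
#   list(join_iters([[1, 2], [3]]))    ->  [1, 2, 3]
#   list(chunks(range(5), 2))          ->  [[0, 1], [2, 3], [4]]
#   general_detokenize("I do n't know ( yet ) , he said .")
#                                      ->  "I don't know (yet), he said."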
import os
from functools import reduce
import operator
import lm_dataformat as lmd
from tqdm import tqdm
import json


class ExitCodeError(Exception): pass


def sh(x):
    # Run a shell command, raising if it exits non-zero.
    if os.system(x): raise ExitCodeError()


def ls(x):
    # List directory entries as full paths.
    return [x + '/' + fn for fn in os.listdir(x)]


def lsr(x):
    # Recursively list all files under a path.
    if os.path.isdir(x):
        return reduce(operator.add, map(lsr, ls(x)), [])
    else:
        return [x]


def fwrite(fname, content):
    with open(fname, 'w') as fh:
        fh.write(content)


def fread(fname):
    with open(fname) as fh:
        return fh.read()
class each:
    # `xs >> each(f)` maps f over xs and returns a list.
    def __init__(self, f):
        self.f = f

    def __rrshift__(self, other):
        return list(map(self.f, other))


class filt:
    # `xs >> filt(f)` keeps the elements of xs for which f is truthy.
    def __init__(self, f):
        self.f = f

    def __rrshift__(self, other):
        return list(filter(self.f, other))


class apply:
    # `xs >> apply(f)` calls f on the whole pipeline value.
    def __init__(self, f):
        self.f = f

    def __rrshift__(self, other):
        return self.f(other)


class one:
    # `xs >> one()` unwraps a single-element list or takes the next item of an
    # iterator, returning None if that is not possible.
    def __rrshift__(self, other):
        try:
            if isinstance(other, list):
                assert len(other) == 1
                return other[0]
            return next(other)
        except:
            return None


class join:
    # `xs >> join(sep)` joins an iterable of strings, returning None on failure.
    def __init__(self, sep):
        self.sep = sep

    def __rrshift__(self, other):
        if other is None: return
        try:
            return self.sep.join(other)
        except:
            return None
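# Illustrative usage sketch (not part of the original file): the classes above
# overload `>>` via __rrshift__ so small data pipelines read left to right.
#   ["a", "bb", "ccc"] >> each(len)                                ->  [1, 2, 3]
#   ["a", "bb", "ccc"] >> filt(lambda s: len(s) > 1) >> join(",")  ->  'bb,ccc'
#   [42] >> one()                                                  ->  42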
# Sentinel: when used as the other operand (e.g. `X + Y`), the built lambda takes
# two arguments (or a pair) instead of one.
Y = object()


def id(x):
    # Identity function (intentionally shadows the builtin `id`).
    return x


class Reflective:
    # Builds one-argument lambdas from attribute access, indexing and arithmetic,
    # e.g. `X.upper()` is `lambda x: x.upper()` and `X * 2` is `lambda x: x * 2`.
    def __getattribute__(self, f):
        def _fn(*args, **kwargs):
            return lambda x: x.__getattribute__(f)(*args, **kwargs)
        return _fn

    def __getitem__(self, a):
        return lambda x: x[a]

    def __mul__(self, other):
        if other == Y:
            def _f(x, y=None):
                if y == None:
                    x, y = x
                return x * y
            return _f
        return lambda x: x * other

    def __rmul__(self, other):
        if other == Y:
            def _f(x, y=None):
                if y == None:
                    x, y = x
                return y * x
            return _f
        return lambda x: other * x

    def __add__(self, other):
        if other == Y:
            def _f(x, y=None):
                if y == None:
                    x, y = x
                return x + y
            return _f
        return lambda x: x + other

    def __radd__(self, other):
        if other == Y:
            def _f(x, y=None):
                if y == None:
                    x, y = x
                return y + x
            return _f
        return lambda x: other + x
# (b -> a -> b) -> b -> [a] -> b
def foldl(f, init, arr):
    curr = init
    for elem in arr:
        curr = f(curr, elem)
    return curr


# (a -> b -> b) -> b -> [a] -> b
def foldr(f, init, arr):
    curr = init
    for elem in arr[::-1]:
        curr = f(elem, curr)
    return curr


def comp(*fs):
    # Right-to-left function composition: comp(f, g)(x) == f(g(x)).
    if len(fs) == 1:
        return fs[0]

    def _f(x):
        for f in fs[::-1]:
            x = f(x)
        return x

    return _f


# Shared Reflective instance used as the `X` placeholder in pipelines.
X = Reflective()
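# Illustrative usage sketch (not part of the original file):
#   [1, 2, 3] >> each(X * 10)                       ->  [10, 20, 30]
#   ["a", "b"] >> each(X.upper())                   ->  ['A', 'B']
#   foldl(lambda acc, x: acc + x, 0, [1, 2, 3, 4])  ->  10
#   comp(str, len)("hello")                         ->  '5'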