Commit 1f8a8c1d authored by jon-tow

Merge branch 'master' of https://github.com/EleutherAI/lm-evaluation-harness into remove-dataset

parents b4c0275d b0acb337
......@@ -51,17 +51,34 @@ class QuAC(Task):
raise NotImplementedError("QuAC has no test docs.")
def _process_doc(self, doc):
doc["title"] = doc['title'] + ' - ' + doc['section_title']
doc["title"] = doc["title"] + " - " + doc["section_title"]
return doc
def doc_to_text(self, doc):
return 'TITLE: ' + doc['title'] + '\n' + 'PARAGRAPH: ' + doc['paragraph'] + '\n\n' + 'Q: ' + doc['question'] + '\n\n' + 'A: '
return (
"TITLE: "
+ doc["title"]
+ "\n"
+ "PARAGRAPH: "
+ doc["paragraph"]
+ "\n\n"
+ "Q: "
+ doc["question"]
+ "\n\n"
+ "A: "
)
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["paragraph"]
def doc_to_target(self, doc):
return doc['answer']
return doc["answer"]
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
......@@ -72,7 +89,7 @@ class QuAC(Task):
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
raise NotImplementedError("Evaluation not implemented")
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
......@@ -85,7 +102,7 @@ class QuAC(Task):
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
raise NotImplementedError("Evaluation not implemented")
def aggregation(self):
"""
......@@ -94,7 +111,7 @@ class QuAC(Task):
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
raise NotImplementedError("Evaluation not implemented")
def higher_is_better(self):
"""
......@@ -103,4 +120,4 @@ class QuAC(Task):
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
raise NotImplementedError("Evaluation not implemented")
......@@ -40,7 +40,7 @@ class RACE(Task):
DATASET_NAME = "high"
cache = {}
letter_to_num = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
letter_to_num = {"A": 0, "B": 1, "C": 2, "D": 3}
def has_training_docs(self):
return True
......@@ -59,17 +59,27 @@ class RACE(Task):
# is shown that one document is made per passage.
r = collections.defaultdict(list)
for item in datasets.load_dataset(path=self.DATASET_PATH, name=self.DATASET_NAME)[set]:
r[item['article']].append(item)
res = list(r.values() >> each(lambda x: {
'article': x[0]['article'],
'problems': x >> each(lambda y: {
'question': y['question'],
'answer': y['answer'],
'options': y['options'],
})
}))
for item in datasets.load_dataset(
path=self.DATASET_PATH, name=self.DATASET_NAME
)[set]:
r[item["article"]].append(item)
res = list(
r.values()
>> each(
lambda x: {
"article": x[0]["article"],
"problems": x
>> each(
lambda y: {
"question": y["question"],
"answer": y["answer"],
"options": y["options"],
}
),
}
)
)
self.cache[set] = res
return res
......@@ -85,30 +95,38 @@ class RACE(Task):
@classmethod
def get_answer_option(cls, problem):
answer = cls.letter_to_num[problem['answer']]
return problem['options'][answer]
answer = cls.letter_to_num[problem["answer"]]
return problem["options"][answer]
@classmethod
def last_problem(cls, doc):
return doc['problems'][-1]
return doc["problems"][-1]
def doc_to_text(self, doc):
text = 'Article: ' + doc['article'] + '\n\n'
for problem in doc['problems'][:-1]:
if problem['question'][-6:] == ' _ .':
text += problem['question'][-5:] + self.get_answer_option(problem) + '\n'
text = "Article: " + doc["article"] + "\n\n"
for problem in doc["problems"][:-1]:
if problem["question"][-6:] == " _ .":
text += (
problem["question"][-5:] + self.get_answer_option(problem) + "\n"
)
else:
question = 'Question: ' + problem['question'] + '\n'
answer = 'Answer: ' + self.get_answer_option(problem) + '\n'
question = "Question: " + problem["question"] + "\n"
answer = "Answer: " + self.get_answer_option(problem) + "\n"
text += question + answer
text += self.last_problem(doc)['question']
text += self.last_problem(doc)["question"]
return text
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["article"]
def doc_to_target(self, doc):
return " " + self.get_answer_option(self.last_problem(doc))
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
......@@ -120,8 +138,7 @@ class RACE(Task):
"""
problem = self.last_problem(doc)
ll_choices = [
rf.loglikelihood(ctx, " " + problem['options'][i])[0]
for i in range(4)
rf.loglikelihood(ctx, " " + problem["options"][i])[0] for i in range(4)
]
return ll_choices
......@@ -135,11 +152,9 @@ class RACE(Task):
:param results:
The results of the requests created in construct_requests.
"""
gold = self.letter_to_num[self.last_problem(doc)['answer']]
gold = self.letter_to_num[self.last_problem(doc)["answer"]]
pred = np.argmax(results)
return {
"acc": int(pred == gold)
}
return {"acc": int(pred == gold)}
def aggregation(self):
"""
......@@ -147,9 +162,7 @@ class RACE(Task):
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {
"acc": mean
}
return {"acc": mean}
def higher_is_better(self):
"""
......@@ -157,6 +170,4 @@ class RACE(Task):
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {
"acc": True
}
return {"acc": True}
......@@ -59,11 +59,19 @@ class SATAnalogies(MultipleChoiceTask):
def _process_doc(self, doc):
return {
'source': doc['source'],
'query': doc['stem'].split(' ')[:2],
'choices': ["{} is to {}".format(*c.split(' ')[:2]) for c in doc["choices"]],
'gold': ['a', 'b', 'c', 'd', 'e'].index(doc['solution'].strip()),
"source": doc["source"],
"query": doc["stem"].split(" ")[:2],
"choices": [
"{} is to {}".format(*c.split(" ")[:2]) for c in doc["choices"]
],
"gold": ["a", "b", "c", "d", "e"].index(doc["solution"].strip()),
}
def doc_to_text(self, doc):
return "{} is to {} as".format(*doc['query'])
return "{} is to {} as".format(*doc["query"])
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["source"] + "\n" + " ".join(doc["query"])
......@@ -54,10 +54,10 @@ class SciQ(MultipleChoiceTask):
doc["distractor3"],
doc["correct_answer"],
]
src = doc['support']
src = doc["support"]
out_doc = {
"source": src,
"query": doc['question'],
"query": doc["question"],
"choices": choices,
"gold": 3,
}
......@@ -65,3 +65,9 @@ class SciQ(MultipleChoiceTask):
def doc_to_text(self, doc):
return "{}\nQuestion: {}\nAnswer:".format(doc["source"], doc["query"]).strip()
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["source"] + " " + doc["query"]
......@@ -49,7 +49,9 @@ class SQuAD2(Task):
DATASET_NAME = None
# HF changed squad on us so we have to make sure we aren't running the old one
assert version.parse(datasets.__version__) >= version.parse("1.11.0"), "datasets v1.11.0 or later required for SQuAD"
assert version.parse(datasets.__version__) >= version.parse(
"1.11.0"
), "datasets v1.11.0 or later required for SQuAD"
def has_training_docs(self):
return True
......@@ -67,18 +69,35 @@ class SQuAD2(Task):
return self.dataset["validation"]
def doc_to_text(self, doc):
return 'Title: ' + doc['title'] + '\n\n' + 'Background: ' + doc['context'] + '\n\n' + 'Question: ' + doc['question'] + '\n\n' + 'Answer:'
return (
"Title: "
+ doc["title"]
+ "\n\n"
+ "Background: "
+ doc["context"]
+ "\n\n"
+ "Question: "
+ doc["question"]
+ "\n\n"
+ "Answer:"
)
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["context"]
def doc_to_target(self, doc):
answer_list = doc['answers']['text']
answer_list = doc["answers"]["text"]
if len(answer_list) > 0:
answer = answer_list[0]
else:
answer = 'unanswerable'
answer = "unanswerable"
return " " + answer
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
......@@ -88,7 +107,7 @@ class SQuAD2(Task):
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
continuation = rf.greedy_until(ctx, ['\n'])
continuation = rf.greedy_until(ctx, ["\n"])
is_unanswerable = rf.loglikelihood(ctx, " " + "unanswerable")
return continuation, is_unanswerable
......@@ -107,25 +126,46 @@ class SQuAD2(Task):
no_answer_probability = exp(logprob_unanswerable)
predictions = {
'id': doc['id'],
'prediction_text': continuation,
'no_answer_probability': no_answer_probability,
"id": doc["id"],
"prediction_text": continuation,
"no_answer_probability": no_answer_probability,
}
references = {
'id': doc['id'],
'answers': doc['answers'],
"id": doc["id"],
"answers": doc["answers"],
}
return {
'exact': (predictions, references), # Exact match (the normalized answer exactly match the gold answer)
'f1': (predictions, references), # The F-score of predicted tokens versus the gold answer
'HasAns_exact': (predictions, references), # Exact match (the normalized answer exactly match the gold answer)
'HasAns_f1': (predictions, references), # The F-score of predicted tokens versus the gold answer
'NoAns_exact': (predictions, references), # Exact match (the normalized answer exactly match the gold answer)
'NoAns_f1': (predictions, references), # The F-score of predicted tokens versus the gold answer
'best_exact': (predictions, references), # Best exact match (with varying threshold)
'best_f1': (predictions, references), # Best F1 (with varying threshold)
"exact": (
predictions,
references,
), # Exact match (the normalized answer exactly match the gold answer)
"f1": (
predictions,
references,
), # The F-score of predicted tokens versus the gold answer
"HasAns_exact": (
predictions,
references,
), # Exact match (the normalized answer exactly match the gold answer)
"HasAns_f1": (
predictions,
references,
), # The F-score of predicted tokens versus the gold answer
"NoAns_exact": (
predictions,
references,
), # Exact match (the normalized answer exactly match the gold answer)
"NoAns_f1": (
predictions,
references,
), # The F-score of predicted tokens versus the gold answer
"best_exact": (
predictions,
references,
), # Best exact match (with varying threshold)
"best_f1": (predictions, references), # Best F1 (with varying threshold)
}
def aggregation(self):
......@@ -135,14 +175,30 @@ class SQuAD2(Task):
functions that aggregate a list of metrics
"""
return {
'exact': partial(_squad_agg, 'exact'), # Exact match (the normalized answer exactly match the gold answer)
'f1': partial(_squad_agg, 'f1'), # The F-score of predicted tokens versus the gold answer
'HasAns_exact': partial(_squad_agg, 'HasAns_exact'), # Exact match (the normalized answer exactly match the gold answer)
'HasAns_f1': partial(_squad_agg, 'HasAns_f1'), # The F-score of predicted tokens versus the gold answer
'NoAns_exact': partial(_squad_agg, 'NoAns_exact'), # Exact match (the normalized answer exactly match the gold answer)
'NoAns_f1': partial(_squad_agg, 'NoAns_f1'), # The F-score of predicted tokens versus the gold answer
'best_exact': partial(_squad_agg, 'best_exact'), # Best exact match (with varying threshold)
'best_f1': partial(_squad_agg, 'best_f1'), # Best F1 (with varying threshold)
"exact": partial(
_squad_agg, "exact"
), # Exact match (the normalized answer exactly match the gold answer)
"f1": partial(
_squad_agg, "f1"
), # The F-score of predicted tokens versus the gold answer
"HasAns_exact": partial(
_squad_agg, "HasAns_exact"
), # Exact match (the normalized answer exactly match the gold answer)
"HasAns_f1": partial(
_squad_agg, "HasAns_f1"
), # The F-score of predicted tokens versus the gold answer
"NoAns_exact": partial(
_squad_agg, "NoAns_exact"
), # Exact match (the normalized answer exactly match the gold answer)
"NoAns_f1": partial(
_squad_agg, "NoAns_f1"
), # The F-score of predicted tokens versus the gold answer
"best_exact": partial(
_squad_agg, "best_exact"
), # Best exact match (with varying threshold)
"best_f1": partial(
_squad_agg, "best_f1"
), # Best F1 (with varying threshold)
}
def higher_is_better(self):
......@@ -152,12 +208,12 @@ class SQuAD2(Task):
whether a higher value of the submetric is better
"""
return {
'exact': True, # Exact match (the normalized answer exactly match the gold answer)
'f1': True, # The F-score of predicted tokens versus the gold answer
'HasAns_exact': True, # Exact match (the normalized answer exactly match the gold answer)
'HasAns_f1': True, # The F-score of predicted tokens versus the gold answer
'NoAns_exact': True, # Exact match (the normalized answer exactly match the gold answer)
'NoAns_f1': True, # The F-score of predicted tokens versus the gold answer
'best_exact': True, # Best exact match (with varying threshold)
'best_f1': True, # Best F1 (with varying threshold)
"exact": True, # Exact match (the normalized answer exactly match the gold answer)
"f1": True, # The F-score of predicted tokens versus the gold answer
"HasAns_exact": True, # Exact match (the normalized answer exactly match the gold answer)
"HasAns_f1": True, # The F-score of predicted tokens versus the gold answer
"NoAns_exact": True, # Exact match (the normalized answer exactly match the gold answer)
"NoAns_f1": True, # The F-score of predicted tokens versus the gold answer
"best_exact": True, # Best exact match (with varying threshold)
"best_f1": True, # Best F1 (with varying threshold)
}
......@@ -65,12 +65,27 @@ class StoryCloze(Task):
return self.dataset["test"]
def doc_to_text(self, doc):
return ' '.join([
return " ".join(
[
doc["input_sentence_1"],
doc["input_sentence_2"],
doc["input_sentence_3"],
doc["input_sentence_4"],
])
]
)
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return " ".join(
[
doc["input_sentence_1"],
doc["input_sentence_2"],
doc["input_sentence_3"],
doc["input_sentence_4"],
]
)
def doc_to_target(self, doc):
clozes = [doc["sentence_quiz1"], doc["sentence_quiz2"]]
......@@ -78,7 +93,7 @@ class StoryCloze(Task):
return " " + clozes[doc["answer_right_ending"] - 1]
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
......@@ -89,10 +104,7 @@ class StoryCloze(Task):
part of the document for `doc`.
"""
clozes = [doc["sentence_quiz1"], doc["sentence_quiz2"]]
lls = [
rf.loglikelihood(ctx, " {}".format(choice))[0]
for choice in clozes
]
lls = [rf.loglikelihood(ctx, " {}".format(choice))[0] for choice in clozes]
return lls
def process_results(self, doc, results):
......@@ -106,10 +118,8 @@ class StoryCloze(Task):
The results of the requests created in construct_requests.
"""
gold = doc["answer_right_ending"] - 1
acc = 1. if np.argmax(results) == gold else 0.
return {
"acc": acc
}
acc = 1.0 if np.argmax(results) == gold else 0.0
return {"acc": acc}
def aggregation(self):
"""
......@@ -117,9 +127,7 @@ class StoryCloze(Task):
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {
"acc": mean
}
return {"acc": mean}
def higher_is_better(self):
"""
......@@ -127,9 +135,7 @@ class StoryCloze(Task):
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {
"acc": True
}
return {"acc": True}
class StoryCloze2016(StoryCloze):
......
......@@ -57,13 +57,19 @@ class BoolQ(Task):
def doc_to_text(self, doc):
return f"{doc['passage']}\nQuestion: {doc['question']}?\nAnswer:"
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["passage"]
def doc_to_target(self, doc):
return " " + yesno(doc['label'])
return " " + yesno(doc["label"])
def construct_requests(self, doc, ctx):
ll_yes, _ = rf.loglikelihood(ctx, ' yes')
ll_no, _ = rf.loglikelihood(ctx, ' no')
ll_yes, _ = rf.loglikelihood(ctx, " yes")
ll_no, _ = rf.loglikelihood(ctx, " no")
return ll_yes, ll_no
......@@ -71,21 +77,15 @@ class BoolQ(Task):
ll_yes, ll_no = results
gold = doc["label"]
acc = 1. if (ll_yes > ll_no) == gold else 0.
acc = 1.0 if (ll_yes > ll_no) == gold else 0.0
return {
"acc": acc
}
return {"acc": acc}
def higher_is_better(self):
return {
"acc": True
}
return {"acc": True}
def aggregation(self):
return {
"acc": mean
}
return {"acc": mean}
class CommitmentBank(Task):
......@@ -123,27 +123,21 @@ class CommitmentBank(Task):
return " {}".format({0: "True", 1: "False", 2: "Neither"}[doc["label"]])
def construct_requests(self, doc, ctx):
ll_true, _ = rf.loglikelihood(ctx, ' True')
ll_false, _ = rf.loglikelihood(ctx, ' False')
ll_neither, _ = rf.loglikelihood(ctx, ' Neither')
ll_true, _ = rf.loglikelihood(ctx, " True")
ll_false, _ = rf.loglikelihood(ctx, " False")
ll_neither, _ = rf.loglikelihood(ctx, " Neither")
return ll_true, ll_false, ll_neither
def process_results(self, doc, results):
gold = doc["label"]
pred = np.argmax(results)
acc = 1. if pred == gold else 0.
acc = 1.0 if pred == gold else 0.0
return {
"acc": acc,
"f1": (pred, gold)
}
return {"acc": acc, "f1": (pred, gold)}
def higher_is_better(self):
return {
"acc": True,
"f1": True
}
return {"acc": True, "f1": True}
@classmethod
def cb_multi_fi(cls, items):
......@@ -210,21 +204,15 @@ class Copa(Task):
def process_results(self, doc, results):
gold = doc["label"]
pred = np.argmax(results)
acc = 1. if pred == gold else 0.
acc = 1.0 if pred == gold else 0.0
return {
"acc": acc
}
return {"acc": acc}
def higher_is_better(self):
return {
"acc": True
}
return {"acc": True}
def aggregation(self):
return {
"acc": mean
}
return {"acc": mean}
@staticmethod
def convert_choice(choice):
......@@ -268,27 +256,21 @@ class MultiRC(Task):
true_choice = self.format_answer(answer=doc["answer"], label=True)
false_choice = self.format_answer(answer=doc["answer"], label=False)
ll_true_choice, _ = rf.loglikelihood(ctx, f' {true_choice}')
ll_false_choice, _ = rf.loglikelihood(ctx, f' {false_choice}')
ll_true_choice, _ = rf.loglikelihood(ctx, f" {true_choice}")
ll_false_choice, _ = rf.loglikelihood(ctx, f" {false_choice}")
return ll_true_choice, ll_false_choice
def process_results(self, doc, results):
ll_true_choice, ll_false_choice = results
pred = ll_true_choice > ll_false_choice
return {
"acc": (pred, doc)
}
return {"acc": (pred, doc)}
def higher_is_better(self):
return {
"acc": True
}
return {"acc": True}
def aggregation(self):
return {
"acc": acc_all
}
return {"acc": acc_all}
class ReCoRD(Task):
......@@ -337,7 +319,7 @@ class ReCoRD(Task):
@classmethod
def format_answer(cls, query, entity):
return f' - {query}'.replace("@placeholder", entity)
return f" - {query}".replace("@placeholder", entity)
def doc_to_target(self, doc):
# We only output the first correct entity in a doc
......@@ -359,8 +341,12 @@ class ReCoRD(Task):
prediction = doc["entities"][max_idx]
gold_label_set = doc["answers"]
f1 = metric_max_over_ground_truths(squad_metrics.compute_f1, prediction, gold_label_set)
em = metric_max_over_ground_truths(squad_metrics.compute_exact, prediction, gold_label_set)
f1 = metric_max_over_ground_truths(
squad_metrics.compute_f1, prediction, gold_label_set
)
em = metric_max_over_ground_truths(
squad_metrics.compute_exact, prediction, gold_label_set
)
return {
"f1": f1,
......@@ -403,19 +389,21 @@ class WordsInContext(Task):
return self.dataset["validation"]
def doc_to_text(self, doc):
return "Sentence 1: {}\nSentence 2: {}\nQuestion: Is the word '{}' used in the same way in the" \
return (
"Sentence 1: {}\nSentence 2: {}\nQuestion: Is the word '{}' used in the same way in the"
" two sentences above?\nAnswer:".format(
doc["sentence1"],
doc["sentence2"],
doc["sentence1"][doc["start1"]:doc["end1"]],
doc["sentence1"][doc["start1"] : doc["end1"]],
)
)
def doc_to_target(self, doc):
return " {}".format({0: "no", 1: "yes"}[doc["label"]])
def construct_requests(self, doc, ctx):
ll_yes, _ = rf.loglikelihood(ctx, ' yes')
ll_no, _ = rf.loglikelihood(ctx, ' no')
ll_yes, _ = rf.loglikelihood(ctx, " yes")
ll_no, _ = rf.loglikelihood(ctx, " no")
return ll_yes, ll_no
......@@ -423,21 +411,15 @@ class WordsInContext(Task):
ll_yes, ll_no = results
gold = doc["label"]
acc = 1. if (ll_yes > ll_no) == gold else 0.
acc = 1.0 if (ll_yes > ll_no) == gold else 0.0
return {
"acc": acc
}
return {"acc": acc}
def higher_is_better(self):
return {
"acc": True
}
return {"acc": True}
def aggregation(self):
return {
"acc": mean
}
return {"acc": mean}
class SGWinogradSchemaChallenge(Task):
......@@ -461,9 +443,7 @@ class SGWinogradSchemaChallenge(Task):
if self._training_docs is None:
# GPT-3 Paper's format only uses positive examples for fewshot "training"
self._training_docs = [
doc for doc in
self.dataset["train"]
if doc["label"]
doc for doc in self.dataset["train"] if doc["label"]
]
return self._training_docs
......@@ -473,25 +453,25 @@ class SGWinogradSchemaChallenge(Task):
def doc_to_text(self, doc):
raw_passage = doc["text"]
# NOTE: HuggingFace span indices are word-based not character-based.
pre = " ".join(raw_passage.split()[:doc["span2_index"]])
post = raw_passage[len(pre) + len(doc["span2_text"]) + 1:]
passage = general_detokenize(pre + " *{}*".format(doc['span2_text']) + post)
pre = " ".join(raw_passage.split()[: doc["span2_index"]])
post = raw_passage[len(pre) + len(doc["span2_text"]) + 1 :]
passage = general_detokenize(pre + " *{}*".format(doc["span2_text"]) + post)
noun = doc["span1_text"]
pronoun = doc["span2_text"]
text = (
f"Passage: {passage}\n"
+ f"Question: In the passage above, does the pronoun \"*{pronoun}*\" refer to \"*{noun}*\"?\n"
+ f'Question: In the passage above, does the pronoun "*{pronoun}*" refer to "*{noun}*"?\n'
+ "Answer:"
)
return text
def doc_to_target(self, doc):
return " " + yesno(doc['label'])
return " " + yesno(doc["label"])
def construct_requests(self, doc, ctx):
ll_yes, _ = rf.loglikelihood(ctx, ' yes')
ll_no, _ = rf.loglikelihood(ctx, ' no')
ll_yes, _ = rf.loglikelihood(ctx, " yes")
ll_no, _ = rf.loglikelihood(ctx, " no")
return ll_yes, ll_no
......@@ -499,18 +479,12 @@ class SGWinogradSchemaChallenge(Task):
ll_yes, ll_no = results
gold = doc["label"]
acc = 1. if (ll_yes > ll_no) == gold else 0.
acc = 1.0 if (ll_yes > ll_no) == gold else 0.0
return {
"acc": acc
}
return {"acc": acc}
def higher_is_better(self):
return {
"acc": True
}
return {"acc": True}
def aggregation(self):
return {
"acc": mean
}
return {"acc": mean}
......@@ -41,44 +41,57 @@ def create_tasks_from_benchmarks(benchmark_dict):
:return: {task_name: task}
e.g. {wmt14-fr-en: Task, wmt16-de-en: Task}
"""
def version_of(dataset, language_pair):
if language_pair[-2:] in ["zh", "ja"]:
return 1 # changed to use jieba/nagisa
return 0
return {
f"{dataset}-{language_pair}": create_translation_task(dataset, language_pair, version_of(dataset, language_pair))
f"{dataset}-{language_pair}": create_translation_task(
dataset, language_pair, version_of(dataset, language_pair)
)
for dataset, language_pairs in benchmark_dict.items()
for language_pair in language_pairs
}
########################################
# Language Specifics
########################################
def zh_split(zh_text: List[str]) -> List[str]:
"""Chinese splitting"""
import jieba
return [" ".join(jieba.cut(txt.strip())) for txt in zh_text]
def ja_split(ja_text: List[str]) -> List[str]:
"""Japanese splitting"""
import nagisa
return [" ".join(nagisa.tagging(txt.strip()).words) for txt in ja_text]
NO_SPACE_LANG = {"zh": zh_split, "ja": ja_split}
########################################
# Tasks
########################################
def create_translation_task(dataset, language_pair, version=0):
class TranslationTask(GeneralTranslationTask):
VERSION = version
def __init__(self):
super().__init__(dataset, language_pair)
return TranslationTask
class GeneralTranslationTask(Task):
VERSION = 0
......@@ -92,8 +105,9 @@ class GeneralTranslationTask(Task):
def download(self, data_dir=None, cache_dir=None, download_mode=None):
# This caches in the users home dir automatically
self.src_file, self.ref_file = \
sacrebleu.download_test_set(self.sacrebleu_dataset, self.sacrebleu_language_pair)
self.src_file, self.ref_file = sacrebleu.download_test_set(
self.sacrebleu_dataset, self.sacrebleu_language_pair
)
self.src_data, self.ref_data = [
[line.rstrip() for line in sacrebleu.smart_open(file)]
for file in (self.src_file, self.ref_file)
......@@ -117,10 +131,9 @@ class GeneralTranslationTask(Task):
:return: Iterable[obj]
A iterable of any object, that doc_to_text can handle
"""
return [{
"src": src,
"ref": ref
} for src, ref in zip(self.src_data, self.ref_data)]
return [
{"src": src, "ref": ref} for src, ref in zip(self.src_data, self.ref_data)
]
def doc_to_text(self, doc):
language_codes = self.sacrebleu_language_pair.split("-")
......@@ -128,12 +141,18 @@ class GeneralTranslationTask(Task):
tar_lang = code_to_language(language_codes[1])
return f"{src_lang} phrase: " + doc["src"] + f"\n{tar_lang} phrase:"
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["src"]
def doc_to_target(self, doc):
# This shows a single target, though there may be multiple targets in a lang test
return " " + doc["ref"] if isinstance(doc["ref"], str) else doc["ref"][0]
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
......
......@@ -43,10 +43,10 @@ class TriviaQA(Task):
return False
def training_docs(self):
return self.dataset['train']
return self.dataset["train"]
def validation_docs(self):
return self.dataset['validation']
return self.dataset["validation"]
def test_docs(self):
raise NotImplementedError()
......@@ -54,8 +54,14 @@ class TriviaQA(Task):
def doc_to_text(self, doc):
return f"Question: {doc['question']}\nAnswer:"
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["question"]
def doc_to_target(self, doc):
return " " + doc['answer']['value']
return " " + doc["answer"]["value"]
def _remove_prefixes(self, aliases):
# Optimization: Remove any alias that has a strict prefix elsewhere in the list
......@@ -69,15 +75,13 @@ class TriviaQA(Task):
def construct_requests(self, doc, ctx):
ret = []
for alias in self._remove_prefixes(doc['answer']['aliases']):
for alias in self._remove_prefixes(doc["answer"]["aliases"]):
_, is_prediction = rf.loglikelihood(ctx, " " + alias)
ret.append(is_prediction)
return ret
def process_results(self, doc, results):
return {
"acc": float(any(results))
}
return {"acc": float(any(results))}
def aggregation(self):
return {
......@@ -85,6 +89,4 @@ class TriviaQA(Task):
}
def higher_is_better(self):
return {
"acc": True
}
return {"acc": True}
......@@ -80,22 +80,29 @@ class TruthfulQAMultipleChoice(Task):
raise NotImplementedError()
def doc_to_text(self, doc):
return QA_PROMPT + "\n\nQ: " + doc['question'] + "\nA:"
return QA_PROMPT + "\n\nQ: " + doc["question"] + "\nA:"
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["question"]
def doc_to_target(self, doc):
return " "
def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, description=None):
assert num_fewshot == 0, "TruthfulQA is intended only for the zero-shot setting."
def fewshot_context(
self, doc, num_fewshot, provide_description=None, rnd=None, description=None
):
assert (
num_fewshot == 0
), "TruthfulQA is intended only for the zero-shot setting."
return super().fewshot_context(
doc=doc,
num_fewshot=num_fewshot,
rnd=rnd,
description=description
doc=doc, num_fewshot=num_fewshot, rnd=rnd, description=description
)
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
......@@ -105,11 +112,15 @@ class TruthfulQAMultipleChoice(Task):
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
def get_lls(targets):
return [rf.loglikelihood(ctx, " " + t)[0] for t in targets]
# MC1 and MC2 targets are not always the same set of strings so we collect
# likelihoods separately for simpler processing.
return get_lls(doc['mc1_targets']["choices"]) + get_lls(doc['mc2_targets']["choices"])
return get_lls(doc["mc1_targets"]["choices"]) + get_lls(
doc["mc2_targets"]["choices"]
)
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
......@@ -121,37 +132,29 @@ class TruthfulQAMultipleChoice(Task):
:param results:
The results of the requests created in construct_requests.
"""
def mc1(lls):
# The gold answers in `mc1_targets` are always first (index = `0`).
return np.argmax(lls) == 0
def mc2(lls):
# Split on the first `0` as everything before it is true (`1`).
split_idx = list(doc['mc2_targets']["labels"]).index(0)
split_idx = list(doc["mc2_targets"]["labels"]).index(0)
# Compute the normalized probability mass for the correct answer.
ll_true, ll_false = lls[:split_idx], lls[split_idx:]
p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false))
p_true = p_true / (sum(p_true) + sum(p_false))
return sum(p_true)
split_idx = len(doc['mc1_targets']["choices"])
split_idx = len(doc["mc1_targets"]["choices"])
mc1_lls, mc2_lls = results[:split_idx], results[split_idx:]
return {
"mc1": mc1(mc1_lls),
"mc2": mc2(mc2_lls)
}
return {"mc1": mc1(mc1_lls), "mc2": mc2(mc2_lls)}
def aggregation(self):
return {
"mc1": mean,
"mc2": mean
}
return {"mc1": mean, "mc2": mean}
def higher_is_better(self):
return {
"mc1": True,
"mc2": True
}
return {"mc1": True, "mc2": True}
class TruthfulQAGeneration(Task):
......@@ -181,44 +184,45 @@ class TruthfulQAGeneration(Task):
answer = answer.strip()
if len(answer):
# Add a period after all answers.
if answer[-1] != '.':
formatted_answers.append(answer + '.')
if answer[-1] != ".":
formatted_answers.append(answer + ".")
else:
formatted_answers.append(answer)
return formatted_answers
def validation_docs(self):
for doc in self.dataset["validation"]:
incorrect_answers = self._format_answers(doc['incorrect_answers'])
correct_answers = self._format_answers(doc['correct_answers'])
incorrect_answers = self._format_answers(doc["incorrect_answers"])
correct_answers = self._format_answers(doc["correct_answers"])
if "I have no comment." not in correct_answers:
correct_answers.append("I have no comment.")
yield {
'question': doc['question'].strip(),
'correct_answers': correct_answers,
'incorrect_answers': incorrect_answers
"question": doc["question"].strip(),
"correct_answers": correct_answers,
"incorrect_answers": incorrect_answers,
}
def test_docs(self):
raise NotImplementedError()
def doc_to_text(self, doc):
return QA_PROMPT + "\n\nQ: " + doc['question']
return QA_PROMPT + "\n\nQ: " + doc["question"]
def doc_to_target(self, doc):
return " "
def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, description=None):
assert num_fewshot == 0, "TruthfulQA is intended only for the zero-shot setting."
def fewshot_context(
self, doc, num_fewshot, provide_description=None, rnd=None, description=None
):
assert (
num_fewshot == 0
), "TruthfulQA is intended only for the zero-shot setting."
return super().fewshot_context(
doc=doc,
num_fewshot=num_fewshot,
rnd=rnd,
description=description
doc=doc, num_fewshot=num_fewshot, rnd=rnd, description=description
)
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
......@@ -229,7 +233,7 @@ class TruthfulQAGeneration(Task):
part of the document for `doc`.
"""
# TODO: Find a way to cap the number of generated tokens to `50` as in the official implementation.
completion = rf.greedy_until(ctx, ['.'])
completion = rf.greedy_until(ctx, ["."])
return completion
def process_results(self, doc, results):
......@@ -243,18 +247,18 @@ class TruthfulQAGeneration(Task):
The results of the requests created in construct_requests.
"""
completion = results[0].strip()
true_refs, false_refs = doc['correct_answers'], doc['incorrect_answers']
true_refs, false_refs = doc["correct_answers"], doc["incorrect_answers"]
all_refs = true_refs + false_refs
# Process the sentence-level BLEURT, BLEU, and ROUGE for similarity measures.
# BLEURT
bleurt_scores_true = self.bleurt.compute(
predictions=[completion] * len(true_refs),
references=true_refs)['scores']
predictions=[completion] * len(true_refs), references=true_refs
)["scores"]
bleurt_scores_false = self.bleurt.compute(
predictions=[completion] * len(false_refs),
references=false_refs)['scores']
predictions=[completion] * len(false_refs), references=false_refs
)["scores"]
bleurt_correct = max(bleurt_scores_true)
bleurt_incorrect = max(bleurt_scores_false)
bleurt_max = bleurt_correct
......@@ -263,8 +267,8 @@ class TruthfulQAGeneration(Task):
# BLEU
bleu_scores = [self.bleu([[ref]], [completion]) for ref in all_refs]
bleu_correct = np.nanmax(bleu_scores[:len(true_refs)])
bleu_incorrect = np.nanmax(bleu_scores[len(true_refs):])
bleu_correct = np.nanmax(bleu_scores[: len(true_refs)])
bleu_incorrect = np.nanmax(bleu_scores[len(true_refs) :])
bleu_max = bleu_correct
bleu_diff = bleu_correct - bleu_incorrect
bleu_acc = int(bleu_correct > bleu_incorrect)
......@@ -272,23 +276,23 @@ class TruthfulQAGeneration(Task):
# ROUGE-N
rouge_scores = [self.rouge([ref], [completion]) for ref in all_refs]
# ROUGE-1
rouge1_scores = [score['rouge1'] for score in rouge_scores]
rouge1_correct = np.nanmax(rouge1_scores[:len(true_refs)])
rouge1_incorrect = np.nanmax(rouge1_scores[len(true_refs):])
rouge1_scores = [score["rouge1"] for score in rouge_scores]
rouge1_correct = np.nanmax(rouge1_scores[: len(true_refs)])
rouge1_incorrect = np.nanmax(rouge1_scores[len(true_refs) :])
rouge1_max = rouge1_correct
rouge1_diff = rouge1_correct - rouge1_incorrect
rouge1_acc = int(rouge1_correct > rouge1_incorrect)
# ROUGE-2
rouge2_scores = [score['rouge2'] for score in rouge_scores]
rouge2_correct = np.nanmax(rouge2_scores[:len(true_refs)])
rouge2_incorrect = np.nanmax(rouge2_scores[len(true_refs):])
rouge2_scores = [score["rouge2"] for score in rouge_scores]
rouge2_correct = np.nanmax(rouge2_scores[: len(true_refs)])
rouge2_incorrect = np.nanmax(rouge2_scores[len(true_refs) :])
rouge2_max = rouge2_correct
rouge2_diff = rouge2_correct - rouge2_incorrect
rouge2_acc = int(rouge2_correct > rouge2_incorrect)
# ROUGE-L
rougeL_scores = [score['rougeLsum'] for score in rouge_scores]
rougeL_correct = np.nanmax(rougeL_scores[:len(true_refs)])
rougeL_incorrect = np.nanmax(rougeL_scores[len(true_refs):])
rougeL_scores = [score["rougeLsum"] for score in rouge_scores]
rougeL_correct = np.nanmax(rougeL_scores[: len(true_refs)])
rougeL_incorrect = np.nanmax(rougeL_scores[len(true_refs) :])
rougeL_max = rougeL_correct
rougeL_diff = rougeL_correct - rougeL_incorrect
rougeL_acc = int(rougeL_correct > rougeL_incorrect)
......@@ -297,19 +301,15 @@ class TruthfulQAGeneration(Task):
"bleurt_max": bleurt_max,
"bleurt_acc": bleurt_acc,
"bleurt_diff": bleurt_diff,
"bleu_max": bleu_max,
"bleu_acc": bleu_acc,
"bleu_diff": bleu_diff,
"rouge1_max": rouge1_max,
"rouge1_acc": rouge1_acc,
"rouge1_diff": rouge1_diff,
"rouge2_max": rouge2_max,
"rouge2_acc": rouge2_acc,
"rouge2_diff": rouge2_diff,
"rougeL_max": rougeL_max,
"rougeL_acc": rougeL_acc,
"rougeL_diff": rougeL_diff,
......@@ -320,19 +320,15 @@ class TruthfulQAGeneration(Task):
"bleurt_max": mean,
"bleurt_acc": mean,
"bleurt_diff": mean,
"bleu_max": mean,
"bleu_acc": mean,
"bleu_diff": mean,
"rouge1_max": mean,
"rouge1_acc": mean,
"rouge1_diff": mean,
"rouge2_max": mean,
"rouge2_acc": mean,
"rouge2_diff": mean,
"rougeL_max": mean,
"rougeL_acc": mean,
"rougeL_diff": mean,
......@@ -343,19 +339,15 @@ class TruthfulQAGeneration(Task):
"bleurt_max": True,
"bleurt_acc": True,
"bleurt_diff": True,
"bleu_max": True,
"bleu_acc": True,
"bleu_diff": True,
"rouge1_max": True,
"rouge1_acc": True,
"rouge1_diff": True,
"rouge2_max": True,
"rouge2_acc": True,
"rouge2_diff": True,
"rougeL_max": True,
"rougeL_acc": True,
"rougeL_diff": True,
......@@ -379,7 +371,7 @@ class TruthfulQAGeneration(Task):
force=False,
lowercase=False,
tokenize="intl",
use_effective_order=False
use_effective_order=False,
).score
return score
......@@ -396,9 +388,11 @@ class TruthfulQAGeneration(Task):
rouge_types = ["rouge1", "rouge2", "rougeLsum"]
scorer = rouge_scorer.RougeScorer(rouge_types)
# Add newlines between sentences to correctly compute `rougeLsum`.
def _prepare_summary(summary):
summary = summary.replace(" . ", ".\n")
return summary
# Accumulate confidence intervals.
aggregator = scoring.BootstrapAggregator()
for ref, pred in zip(refs, preds):
......@@ -406,4 +400,4 @@ class TruthfulQAGeneration(Task):
pred = _prepare_summary(pred)
aggregator.add_scores(scorer.score(ref, pred))
result = aggregator.aggregate()
return {type: result[type].mid.fmeasure*100 for type in rouge_types}
return {type: result[type].mid.fmeasure * 100 for type in rouge_types}
......@@ -49,6 +49,12 @@ class WordUnscrambleTask(Task):
def doc_to_text(self, doc):
return doc["context"]
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["context"]
def doc_to_target(self, doc):
return doc["completion"]
......@@ -59,19 +65,13 @@ class WordUnscrambleTask(Task):
def process_results(self, doc, results):
pred = results[0]
gold = doc["completion"]
return {
"acc": int(pred == gold)
}
return {"acc": int(pred == gold)}
def aggregation(self):
return {
"acc": mean
}
return {"acc": mean}
def higher_is_better(self):
return {
"acc": True
}
return {"acc": True}
class Anagrams1(WordUnscrambleTask):
......
......@@ -54,13 +54,19 @@ class WebQs(Task):
return self.dataset["test"]
def doc_to_text(self, doc):
return "Question: " + doc['question'] + '\nAnswer:'
return "Question: " + doc["question"] + "\nAnswer:"
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["question"]
def doc_to_target(self, doc):
# this picks one answer to be the "correct" one, despite sometimes
# multiple correct answers being possible.
# TODO: make sure we're actually handling multi-answer correctly
return " " + doc['answers'][0]
return " " + doc["answers"][0]
def _remove_prefixes(self, aliases):
# Optimization: Remove any alias that has a strict prefix elsewhere in the list
......@@ -75,15 +81,13 @@ class WebQs(Task):
def construct_requests(self, doc, ctx):
ret = []
for alias in self._remove_prefixes(doc['answers']):
for alias in self._remove_prefixes(doc["answers"]):
_, is_prediction = rf.loglikelihood(ctx, " " + alias)
ret.append(is_prediction)
return ret
def process_results(self, doc, results):
return {
"acc": float(any(results))
}
return {"acc": float(any(results))}
def aggregation(self):
return {
......@@ -91,6 +95,4 @@ class WebQs(Task):
}
def higher_is_better(self):
return {
"acc": True
}
return {"acc": True}
......@@ -90,6 +90,9 @@ class WikiText(PerplexityTask):
def doc_to_target(self, doc):
return wikitext_detokenizer(doc)
def should_decontaminate(self):
return True
def count_words(self, doc):
# count number of words in *original doc before detokenization*
return len(re.split(r"\s+", doc))
......@@ -34,7 +34,7 @@ class Winogrande(Task):
DATASET_PATH = "winogrande"
DATASET_NAME = "winogrande_xl"
answer_to_num = {'1': 0, '2': 1}
answer_to_num = {"1": 0, "2": 1}
def has_training_docs(self):
return True
......@@ -56,6 +56,12 @@ class Winogrande(Task):
def doc_to_text(self, doc):
return self.partial_context(doc, doc["option" + doc["answer"]])
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["sentence"]
@classmethod
def partial_context(cls, doc, option):
# Substitute the pronoun in the sentence with the specified option
......@@ -107,9 +113,7 @@ class Winogrande(Task):
:param results:
The results of the requests created in construct_requests.
"""
return {
"acc": np.argmax(results) == self.answer_to_num[doc["answer"]]
}
return {"acc": np.argmax(results) == self.answer_to_num[doc["answer"]]}
def aggregation(self):
"""
......@@ -117,9 +121,7 @@ class Winogrande(Task):
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {
"acc": mean
}
return {"acc": mean}
def higher_is_better(self):
"""
......@@ -127,6 +129,4 @@ class Winogrande(Task):
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {
"acc": True
}
return {"acc": True}
......@@ -40,8 +40,19 @@ class WinogradSchemaChallenge273(Task):
DATASET_PATH = "winograd_wsc"
DATASET_NAME = "wsc273"
upper_pronouns = ["A", "An", "The", "She", "He",
"It", "They", "My", "His", "Her", "Their"]
upper_pronouns = [
"A",
"An",
"The",
"She",
"He",
"It",
"They",
"My",
"His",
"Her",
"Their",
]
def has_training_docs(self):
return False
......@@ -68,7 +79,7 @@ class WinogradSchemaChallenge273(Task):
option += "'s"
# Appropriately lowercase the pronoun in the option.
pronoun = option.split()[0]
start_of_sentence = doc["text"][doc['pronoun_loc'] - 2] == '.'
start_of_sentence = doc["text"][doc["pronoun_loc"] - 2] == "."
if not start_of_sentence and pronoun in self.upper_pronouns:
return option.replace(pronoun, pronoun.lower())
return option
......@@ -85,11 +96,17 @@ class WinogradSchemaChallenge273(Task):
def doc_to_text(self, doc):
return self.partial_context(doc, doc["options"][doc["label"]])
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["text"]
@classmethod
def partial_context(cls, doc, option):
# Substitute the pronoun in the original text with the specified
# option and ignore everything after.
return doc["text"][:doc["pronoun_loc"]] + option
return doc["text"][: doc["pronoun_loc"]] + option
def doc_to_target(self, doc):
return self.partial_target(doc)
......@@ -135,9 +152,7 @@ class WinogradSchemaChallenge273(Task):
:param results:
The results of the requests created in construct_requests.
"""
return {
"acc": np.argmax(results) == doc["label"]
}
return {"acc": np.argmax(results) == doc["label"]}
def aggregation(self):
"""
......@@ -145,9 +160,7 @@ class WinogradSchemaChallenge273(Task):
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {
"acc": mean
}
return {"acc": mean}
def higher_is_better(self):
"""
......@@ -155,6 +168,4 @@ class WinogradSchemaChallenge273(Task):
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {
"acc": True
}
return {"acc": True}
......@@ -34,6 +34,7 @@ def simple_parse_args_string(args_string):
args_dict[k] = v
return args_dict
def join_iters(iters):
for iter in iters:
yield from iter
......@@ -47,7 +48,9 @@ def chunks(iter, n):
yield arr
arr = []
if arr: yield arr
if arr:
yield arr
def group(arr, fn):
res = collections.defaultdict(list)
......@@ -57,12 +60,13 @@ def group(arr, fn):
return list(res.values())
def general_detokenize(string):
string = string.replace(" n't", "n't")
string = string.replace(" )", ")")
string = string.replace("( ", "(")
string = string.replace("\" ", "\"")
string = string.replace(" \"", "\"")
string = string.replace('" ', '"')
string = string.replace(' "', '"')
string = re.sub(r" (['.,])", r"\1", string)
return string
......@@ -94,10 +98,7 @@ def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len
# Special handling for first window: predict all tokens
first_seq_len = min(max_seq_len, len(token_list))
yield (
[prefix_token] + token_list[:first_seq_len - 1],
token_list[:first_seq_len]
)
yield ([prefix_token] + token_list[: first_seq_len - 1], token_list[:first_seq_len])
predicted += first_seq_len
while predicted < len(token_list):
......@@ -105,31 +106,30 @@ def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len
window_end = predicted + window_pred_len
yield (
token_list[window_end - max_seq_len - 1:window_end - 1],
token_list[window_end - window_pred_len:window_end],
token_list[window_end - max_seq_len - 1 : window_end - 1],
token_list[window_end - window_pred_len : window_end],
)
predicted += window_pred_len
def make_disjoint_window(pair):
""" Takes output from get_rolling_token_windows and makes the context not overlap with the continuation """
"""Takes output from get_rolling_token_windows and makes the context not overlap with the continuation"""
a, b = pair
return a[:-(len(b) - 1)], b
return a[: -(len(b) - 1)], b
class Reorderer:
def __init__(self, arr, fn):
self.size = len(arr)
arr = list(enumerate(arr))
arr = group(arr, lambda x: fn(x[1]))
arr = [
([y[0] for y in x], x[0][1]) for x in arr
]
arr = [([y[0] for y in x], x[0][1]) for x in arr]
arr.sort(key=lambda x: fn(x[1]))
self.arr = arr
def get_reordered(self):
return [x[1] for x in self.arr]
......@@ -146,20 +146,26 @@ class Reorderer:
return res
def positional_deprecated(fn):
"""
A decorator to nudge users into passing only keyword args (`kwargs`) to the
wrapped function, `fn`.
"""
@functools.wraps(fn)
def _wrapper(*args, **kwargs):
if len(args) != 1 if inspect.ismethod(fn) else 0:
print(f"WARNING: using {fn.__name__} with positional arguments is "
print(
f"WARNING: using {fn.__name__} with positional arguments is "
"deprecated and will be disallowed in a future version of "
"lm-evaluation-harness!")
"lm-evaluation-harness!"
)
return fn(*args, **kwargs)
return _wrapper
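# Illustrative usage (an assumption for clarity, not code from this repository):
# with the decorator applied, positional calls still work but print the
# deprecation warning emitted by `_wrapper` above.
#
#   @positional_deprecated
#   def evaluate(model=None, tasks=None):
#       ...
#
#   evaluate(model="gpt2", tasks=["lambada"])   # fine, no warning
#   evaluate("gpt2", ["lambada"])               # works, but warns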
@positional_deprecated
def find_test_root(start_path: pathlib.Path) -> pathlib.Path:
"""
......@@ -169,12 +175,14 @@ def find_test_root(start_path: pathlib.Path) -> pathlib.Path:
cur_path = start_path.resolve()
max_layers = 3
for _ in range(max_layers):
if (cur_path / 'tests' / 'test_version_stable.py').exists():
if (cur_path / "tests" / "test_version_stable.py").exists():
return cur_path
else:
cur_path = cur_path.parent.resolve()
raise FileNotFoundError(f"Unable to find package root within {max_layers} upwards" +\
f"of {start_path}")
raise FileNotFoundError(
f"Unable to find package root within {max_layers} upwards" + f"of {start_path}"
)
@positional_deprecated
def run_task_tests(task_list: List[str]):
......@@ -182,9 +190,16 @@ def run_task_tests(task_list: List[str]):
Find the package root and run the tests for the given tasks
"""
package_root = find_test_root(start_path=pathlib.Path(__file__))
task_string = ' or '.join(task_list)
args = [f'{package_root}/tests/test_version_stable.py', f'--rootdir={package_root}', '-k', f'{task_string}']
task_string = " or ".join(task_list)
args = [
f"{package_root}/tests/test_version_stable.py",
f"--rootdir={package_root}",
"-k",
f"{task_string}",
]
sys.path.append(str(package_root))
pytest_return_val = pytest.main(args)
if pytest_return_val:
raise ValueError(f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {pytest_return_val}")
\ No newline at end of file
raise ValueError(
f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {pytest_return_val}"
)
import argparse
import json
import logging
import fnmatch
from lm_eval import tasks, evaluator
logging.getLogger("openai").setLevel(logging.WARNING)
class MultiChoice:
def __init__(self, choices):
self.choices = choices
# Simple wildcard support (linux filename patterns)
def __contains__(self, values):
for value in values.split(","):
if len(fnmatch.filter(self.choices, value)) == 0:
return False
return True
def __iter__(self):
for choice in self.choices:
yield choice
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--model', required=True)
parser.add_argument('--model_args', default="")
parser.add_argument('--tasks', default="all_tasks")
parser.add_argument('--provide_description', action="store_true")
parser.add_argument('--num_fewshot', type=int, default=0)
parser.add_argument('--batch_size', type=int, default=None)
parser.add_argument('--device', type=str, default=None)
parser.add_argument('--output_path', default=None)
parser.add_argument('--limit', type=int, default=None)
parser.add_argument('--no_cache', action="store_true")
parser.add_argument('--description_dict_path', default=None)
parser.add_argument('--check_integrity', action="store_true")
parser.add_argument("--model", required=True)
parser.add_argument("--model_args", default="")
parser.add_argument("--tasks", default=None, choices=MultiChoice(tasks.ALL_TASKS))
parser.add_argument("--provide_description", action="store_true")
parser.add_argument("--num_fewshot", type=int, default=0)
parser.add_argument("--batch_size", type=int, default=None)
parser.add_argument("--device", type=str, default=None)
parser.add_argument("--output_path", default=None)
parser.add_argument("--limit", type=int, default=None)
parser.add_argument("--no_cache", action="store_true")
parser.add_argument("--decontamination_ngrams_path", default=None)
parser.add_argument("--description_dict_path", default=None)
parser.add_argument("--check_integrity", action="store_true")
return parser.parse_args()
# Returns a list containing all values of the source_list that
# match at least one of the patterns
def pattern_match(patterns, source_list):
task_names = set()
for pattern in patterns:
for matching in fnmatch.filter(source_list, pattern):
task_names.add(matching)
return list(task_names)
def main():
args = parse_args()
assert not args.provide_description # not implemented
if args.limit:
print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
print(
"WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
)
if args.tasks == "all_tasks":
if args.tasks is None:
task_names = tasks.ALL_TASKS
else:
task_names = args.tasks.split(",")
task_names = pattern_match(args.tasks.split(","), tasks.ALL_TASKS)
print(f"Selected Tasks: {task_names}")
description_dict = {}
if args.description_dict_path:
with open(args.description_dict_path, 'r') as f:
with open(args.description_dict_path, "r") as f:
description_dict = json.load(f)
results = evaluator.simple_evaluate(
......@@ -51,11 +86,11 @@ def main():
no_cache=args.no_cache,
limit=args.limit,
description_dict=description_dict,
check_integrity=args.check_integrity
decontamination_ngrams_path=args.decontamination_ngrams_path,
check_integrity=args.check_integrity,
)
dumped = json.dumps(results, indent=2)
print(dumped)
if args.output_path:
......
{
"Data": "Pile statistics",
"Document Count": 210607728,
"Total Pile Characters": 421215456,
"File Start Offsets": [
0,
7021438,
14042822,
21066113,
28086515,
35106072,
42123306,
49145091,
56165817,
63185587,
70211208,
77234322,
84249267,
91267634,
98285983,
105305110,
112322489,
119342491,
126367373,
133389153,
140412039,
147432373,
154452516,
161470190,
168492733,
175512521,
182526939,
189547478,
196565318,
203583306
]
}
......@@ -5,7 +5,7 @@ It uses the approach described in the [GPT-3 paper](https://arxiv.org/abs/2005.1
1) Collects all contamination text files that are to be removed from training data
2) Filters training data by finding `N`gram matches between the training data
and any contamination
1) `N`grams ignore case and punctation and are split on whitespace.
1) `N`grams ignore case and punctuation and are split on whitespace.
2) Matching `N`gram substrings are removed, as is a `window_to_remove` character window around
the match, splitting the training data into chunks
3) Any chunks less than `minimum_slice_length` are removed
......@@ -20,7 +20,7 @@ minimum_slice_length = 200
too_dirty_cutoff = 10
```
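For intuition, here is a minimal pure-Python sketch of the filtering idea described above. The names (`build_ngrams`, `clean_document`) and the exact windowing details are illustrative only, not the janitor's actual API; the real implementation offloads the `N`gram work to C++ for speed (see the next section).

```python
import re
from typing import List, Set, Tuple

Ngram = Tuple[str, ...]

def normalize(word: str) -> str:
    # N-grams ignore case and punctuation and are split on whitespace.
    return re.sub(r"[^\w]", "", word.lower())

def build_ngrams(text: str, n: int) -> Set[Ngram]:
    words = [w for w in (normalize(t) for t in text.split()) if w]
    return {tuple(words[i : i + n]) for i in range(len(words) - n + 1)}

def clean_document(
    doc: str,
    contamination: Set[Ngram],
    n: int = 13,
    window_to_remove: int = 200,
    minimum_slice_length: int = 200,
    too_dirty_cutoff: int = 10,
) -> List[str]:
    """Return the chunks of `doc` that survive contamination filtering."""
    # Tokenize while keeping each word's character offsets in the original doc.
    tokens = [(m.start(), m.end(), normalize(m.group())) for m in re.finditer(r"\S+", doc)]
    tokens = [t for t in tokens if t[2]]

    # Character spans to cut: each matched n-gram plus a surrounding window.
    spans = []
    for i in range(len(tokens) - n + 1):
        if tuple(t[2] for t in tokens[i : i + n]) in contamination:
            start = max(0, tokens[i][0] - window_to_remove)
            end = min(len(doc), tokens[i + n - 1][1] + window_to_remove)
            spans.append((start, end))

    if len(spans) > too_dirty_cutoff:
        return []  # too many matches: drop the whole document

    # Merge overlapping spans and keep only the text between them.
    spans.sort()
    chunks, cursor = [], 0
    for start, end in spans:
        if start > cursor:
            chunks.append(doc[cursor:start])
        cursor = max(cursor, end)
    chunks.append(doc[cursor:])
    return [c for c in chunks if len(c) >= minimum_slice_length]
```

A matched span and its surrounding window disappear, while any sufficiently long clean slices on either side are kept as separate training documents.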
## Compling
## Compiling
Janitor can be used as a pure python program, but it is much faster if the ngram
code is run in C++. To compile the C++ code, run
......@@ -31,4 +31,3 @@ c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor
```
If your compiler isn't linked to python, you may need to add `-undefined dynamic_lookup` to the above command.
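Because the compiled extension is optional, the Python side typically guards the import and falls back to the slower pure-Python routines when the module is missing. A sketch of that pattern (the module and flag names are assumptions for illustration, not a guarantee of this repository's exact API):

```python
# Prefer the pybind11 extension built with the command above; otherwise fall
# back to the pure-Python n-gram implementation.
try:
    import janitor_util  # hypothetical name of the compiled module

    JANITOR_CPP = True
except ImportError:
    print("WARNING: C++ extension not found, using the pure-Python janitor.")
    JANITOR_CPP = False
```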
import glob
import argparse
import os
import subprocess
import shutil
from tqdm import tqdm
from tqdm_multiprocess import TqdmMultiProcessPool
import logging
from tqdm_multiprocess.logger import setup_logger_tqdm
logger = logging.getLogger(__name__)
def process_task(
working_directory, output_directory, bucket_file_path, tqdm_func, global_tqdm
):
command = f"zstd {bucket_file_path}"
logger.info(command)
subprocess.call(command, shell=True)
compressed_file = bucket_file_path + ".zst"
if output_directory:
shutil.move(compressed_file, output_directory)
os.remove(bucket_file_path)
global_tqdm.update()
def compress_and_move(working_directory, output_directory, process_count):
os.makedirs(output_directory, exist_ok=True)
original_info_file_path = os.path.join(working_directory, "info.json")
assert os.path.exists(original_info_file_path)
tasks = []
bucket_file_paths = glob.glob(
os.path.join(working_directory, "output", f"*.bkt.txt.sorted")
)
for bucket_file_path in bucket_file_paths:
task = (process_task, (working_directory, output_directory, bucket_file_path))
tasks.append(task)
pool = TqdmMultiProcessPool(process_count)
def on_done(_):
return None
def on_error(_):
return None
global_progress = tqdm(
total=len(bucket_file_paths), dynamic_ncols=True, unit="file"
)
_ = pool.map(global_progress, tasks, on_error, on_done)
shutil.copy(original_info_file_path, os.path.join(output_directory, "info.json"))
parser = argparse.ArgumentParser(description="sort 13gram buckets")
parser.add_argument("-dir", "--working_directory", required=True)
parser.add_argument("-output", "--output_directory", required=True)
parser.add_argument("-procs", "--process_count", type=int, default=8)
if __name__ == "__main__":
version = 1.00
print(f"Running version {version}")
logfile_path = "compress_and_package.log"
setup_logger_tqdm(logfile_path)
args = parser.parse_args()
compress_and_move(args.working_directory, args.output_directory, args.process_count)