Unverified commit e0cfeb90 authored by Jonathan Tow, committed by GitHub

Merge branch 'master' into researcher2

parents f9b81151 6caa0afd
import os
import json
import jsonlines
"""
TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension
https://arxiv.org/pdf/1705.03551.pdf
TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence
triples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts
and independently gathered evidence documents, six per question on average, that provide
high quality distant supervision for answering the questions.
Homepage: https://nlp.cs.washington.edu/triviaqa/
"""
import inspect
import lm_eval.datasets.triviaqa.triviaqa
from lm_eval.base import Task, rf
from ..metrics import mean
from ..utils import sh
from best_download import download_file
from lm_eval.metrics import mean
_CITATION = """
@InProceedings{JoshiTriviaQA2017,
author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},
title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},
booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},
month = {July},
year = {2017},
address = {Vancouver, Canada},
publisher = {Association for Computational Linguistics},
}
"""
class TriviaQA(Task):
VERSION = 0
def download(self):
if not os.path.exists('data/triviaqa/unfiltered-web-train.jsonl'):
os.makedirs("data/triviaqa/", exist_ok=True)
download_file("http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz", local_file="data/triviaqa/triviaqa-unfiltered.tar.gz", expected_checksum="adc19b42769062d241a8fbe834c56e58598d9322eb6c614e9f33a68a2cf5523e")
sh("""
cd data/triviaqa/
tar -xf triviaqa-unfiltered.tar.gz
""")
DATASET_PATH = inspect.getfile(lm_eval.datasets.triviaqa.triviaqa)
DATASET_NAME = None
def has_training_docs(self):
return True
@@ -28,16 +43,16 @@ class TriviaQA(Task):
return False
def training_docs(self):
return jsonlines.open('data/triviaqa/unfiltered-web-train.jsonl')
return self.dataset['train']
def validation_docs(self):
return jsonlines.open('data/triviaqa/unfiltered-web-dev.jsonl')
return self.dataset['validation']
def test_docs(self):
raise NotImplementedError()
def doc_to_text(self, doc):
return f"Question: {doc['Question']}\nAnswer:"
return f"Question: {doc['question']}\nAnswer:"
def should_decontaminate(self):
return True
@@ -46,7 +61,7 @@ class TriviaQA(Task):
return doc['Question'] + " " + doc['SearchResults']['Description']
def doc_to_target(self, doc):
return " " + doc['Answer']['Value']
return " " + doc['answer']['value']
def _remove_prefixes(self, aliases):
# Optimization: Remove any alias that has a strict prefix elsewhere in the list
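        # Example (illustrative): with the aliases sorted so that prefixes come
        # first, ["New York", "New York City"] collapses to ["New York"]; if greedy
        # decoding would reproduce the longer alias it necessarily reproduces its
        # strict prefix as well, so only the prefix needs to be scored.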
@@ -56,12 +71,11 @@ class TriviaQA(Task):
for alias in aliases[1:]:
if not alias.startswith(ret[-1]):
ret.append(alias)
return ret
def construct_requests(self, doc, ctx):
ret = []
for alias in self._remove_prefixes(doc['Answer']['Aliases']):
for alias in self._remove_prefixes(doc['answer']['aliases']):
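            # rf.loglikelihood resolves to a (log-probability, exact-match) pair;
            # only the exact-match flag is kept here, i.e. whether greedy decoding
            # from the context reproduces this alias verbatim.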
_, is_prediction = rf.loglikelihood(ctx, " " + alias)
ret.append(is_prediction)
return ret
......
@@ -2,6 +2,13 @@
TruthfulQA: Measuring How Models Mimic Human Falsehoods
https://arxiv.org/pdf/2109.07958.pdf
TruthfulQA is a benchmark to measure whether a language model is truthful in
generating answers to questions. The benchmark comprises 817 questions that
span 38 categories, including health, law, finance and politics. Questions are
crafted so that some humans would answer falsely due to a false belief or
misconception. To perform well, models must avoid generating false answers
learned from imitating human texts.
TODO: Add support for the automatic metrics, 'GPT-judge' and 'GPT-info', which
predict human evaluation of truth and informativeness (respectively) through
a fine-tuned GPT-3 model. NOTE: This requires access keys to the corresponding
@@ -10,25 +17,28 @@ provide the data used to fine-tune GPT-3 into `GPT-judge` and `GPT-info`, see
https://github.com/sylinrl/TruthfulQA#Fine-tuning-GPT-3-for-evaluation. Maybe
we could try this?
@misc{lin2021truthfulqa,
title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
author={Stephanie Lin and Jacob Hilton and Owain Evans},
year={2021},
eprint={2109.07958},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
Homepage: https://github.com/sylinrl/TruthfulQA
"""
import csv
import json
import inspect
import numpy as np
import sacrebleu
import datasets
import lm_eval.datasets.truthfulqa.truthfulqa
from rouge_score import rouge_scorer, scoring
from lm_eval.base import rf, Task
from pathlib import Path
from best_download import download_file
from ..metrics import mean
from datasets import load_metric
from lm_eval.metrics import mean
_CITATION = """
@misc{lin2021truthfulqa,
title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
author={Stephanie Lin and Jacob Hilton and Owain Evans},
year={2021},
eprint={2109.07958},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""
# The default QA preset prompt for all models.
@@ -50,15 +60,8 @@ QA_PROMPT = (
class TruthfulQAMultipleChoice(Task):
VERSION = 1
DATASET_PATH = Path('data/truthfulqa/mc')
def download(self):
if self.DATASET_PATH.exists():
return
Path.mkdir(self.DATASET_PATH, parents=True)
mc_url = "https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/data/mc_task.json"
checksum = "6eb4125d25750c0145c4be2dce00440736684ab6f74ce6bff2139571cc758954"
download_file(mc_url, local_file=str(self.DATASET_PATH / "mc_task.json"), expected_checksum=checksum)
DATASET_PATH = inspect.getfile(lm_eval.datasets.truthfulqa.truthfulqa)
DATASET_NAME = "multiple_choice"
def has_training_docs(self):
return False
@@ -73,8 +76,7 @@ class TruthfulQAMultipleChoice(Task):
raise NotImplementedError()
def validation_docs(self):
with open(self.DATASET_PATH / "mc_task.json") as f:
return json.load(f)
return self.dataset["validation"]
def test_docs(self):
raise NotImplementedError()
@@ -115,7 +117,7 @@ class TruthfulQAMultipleChoice(Task):
return [rf.loglikelihood(ctx, " " + t)[0] for t in targets]
# MC1 and MC2 targets are not always the same set of strings so we collect
# likelihoods separately for simpler processing.
return get_lls(doc['mc1_targets']) + get_lls(doc['mc2_targets'])
return get_lls(doc['mc1_targets']["choices"]) + get_lls(doc['mc2_targets']["choices"])
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
@@ -133,14 +135,14 @@ class TruthfulQAMultipleChoice(Task):
def mc2(lls):
# Split on the first `0` as everything before it is true (`1`).
split_idx = list(doc['mc2_targets'].values()).index(0)
split_idx = list(doc['mc2_targets']["labels"]).index(0)
# Compute the normalized probability mass for the correct answer.
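            # Worked example (illustrative): with labels [1, 1, 0], split_idx is 2;
            # lls = [-1.0, -2.0, -3.0] gives p_true = exp([-1, -2]), p_false = exp([-3]),
            # and mc2 = sum(p_true) / (sum(p_true) + sum(p_false)) ~ 0.91.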
ll_true, ll_false = lls[:split_idx], lls[split_idx:]
p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false))
p_true = p_true / (sum(p_true) + sum(p_false))
return sum(p_true)
split_idx = len(doc['mc1_targets'])
split_idx = len(doc['mc1_targets']["choices"])
mc1_lls, mc2_lls = results[:split_idx], results[split_idx:]
return {
"mc1": mc1(mc1_lls),
@@ -162,19 +164,12 @@ class TruthfulQAMultipleChoice(Task):
class TruthfulQAGeneration(Task):
VERSION = 1
DATASET_PATH = Path('data/truthfulqa/generation')
DATASET_PATH = inspect.getfile(lm_eval.datasets.truthfulqa.truthfulqa)
DATASET_NAME = "generation"
def __init__(self):
super().__init__()
self.bleurt = load_metric("bleurt", cache_dir="lm_cache")
def download(self):
if self.DATASET_PATH.exists():
return
Path.mkdir(self.DATASET_PATH, parents=True)
url = "https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/TruthfulQA.csv"
checksum = "8d7dd15f033196140f032d97d30f037da7a7b1192c3f36f9937c1850925335a2"
download_file(url, local_file=str(self.DATASET_PATH / "TruthfulQA.csv"), expected_checksum=checksum)
self.bleurt = datasets.load_metric("bleurt")
def has_training_docs(self):
return False
@@ -188,36 +183,29 @@ class TruthfulQAGeneration(Task):
def training_docs(self):
raise NotImplementedError()
def _split_multi_answer(self, answers, sep=';'):
answers = answers.strip().split(sep)
split_answers = []
def _format_answers(self, answers):
formatted_answers = []
for answer in answers:
answer = answer.strip()
if len(answer):
# Add a period after all answers.
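                # e.g. "Paris" -> "Paris.", while an answer already ending in "." is kept as-is.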
if answer[-1] != '.':
split_answers.append(answer + '.')
formatted_answers.append(answer + '.')
else:
split_answers.append(answer)
return split_answers
formatted_answers.append(answer)
return formatted_answers
def validation_docs(self):
with open(self.DATASET_PATH / "TruthfulQA.csv", newline='') as csvfile:
doc_reader = csv.DictReader(csvfile)
for doc in doc_reader:
# Ensure that references exist.
if not doc['Correct Answers'] or not doc['Incorrect Answers']:
continue
correct_answers = self._split_multi_answer(doc['Correct Answers'])
if "I have no comment." not in correct_answers:
correct_answers.append("I have no comment.")
incorrect_answers = self._split_multi_answer(doc['Incorrect Answers'])
doc = {
'question': doc['Question'].strip(),
'correct_answers': correct_answers,
'incorrect_answers': incorrect_answers
}
yield doc
for doc in self.dataset["validation"]:
incorrect_answers = self._format_answers(doc['incorrect_answers'])
correct_answers = self._format_answers(doc['correct_answers'])
if "I have no comment." not in correct_answers:
correct_answers.append("I have no comment.")
yield {
'question': doc['question'].strip(),
'correct_answers': correct_answers,
'incorrect_answers': incorrect_answers
}
def test_docs(self):
raise NotImplementedError()
......
import gzip
import json
import shutil
from pathlib import Path
from best_download import download_file
"""
Language Models are Few-Shot Learners
https://arxiv.org/pdf/2005.14165.pdf
Unscramble is a small battery of 5 “character manipulation” tasks. Each task
involves giving the model a word distorted by some combination of scrambling,
addition, or deletion of characters, and asking it to recover the original word.
Homepage: https://github.com/openai/gpt-3/tree/master/data
"""
import inspect
import lm_eval.datasets.unscramble.unscramble
from lm_eval.base import Task, rf
from lm_eval.metrics import mean
def extract_gzip(gz, to):
with gzip.open(gz, 'rb') as fin:
with open(to, 'wb') as fout:
shutil.copyfileobj(fin, fout)
_CITATION = """
@inproceedings{NEURIPS2020_1457c0d6,
author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},
booktitle = {Advances in Neural Information Processing Systems},
editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},
pages = {1877--1901},
publisher = {Curran Associates, Inc.},
title = {Language Models are Few-Shot Learners},
url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},
volume = {33},
year = {2020}
}
"""
class WordUnscrambleTask(Task):
VERSION = 0
BASE_PATH = Path("data/unscramble")
FILENAME = None
CHECKSUM = None # SHA256 Checksum.
def __init__(self):
super().__init__()
def download(self):
if not self.BASE_PATH.exists():
Path.mkdir(self.BASE_PATH, parents=True)
file = self.BASE_PATH / self.FILENAME
if not file.exists():
rawfile = file.parent / (file.name + ".gz")
base_url = "https://raw.githubusercontent.com/openai/gpt-3/master/data"
download_file(f"{base_url}/{self.FILENAME}.gz", local_file=str(rawfile), expected_checksum=self.CHECKSUM)
extract_gzip(gz=rawfile, to=file)
DATASET_PATH = inspect.getfile(lm_eval.datasets.unscramble.unscramble)
DATASET_NAME = None
def has_training_docs(self):
return False
@@ -42,8 +44,7 @@ class WordUnscrambleTask(Task):
return False
def validation_docs(self):
file = self.BASE_PATH / self.FILENAME
return (json.loads(line) for line in open(file).read().splitlines())
return self.dataset["validation"]
def doc_to_text(self, doc):
return doc["context"]
@@ -80,25 +81,20 @@ class WordUnscrambleTask(Task):
class Anagrams1(WordUnscrambleTask):
FILENAME = "mid_word_1_anagrams.jsonl"
CHECKSUM = "6768a86896083199de4815d4964cb2f6f1046476cfd80c2a562784f182905979"
DATASET_NAME = "mid_word_1_anagrams"
class Anagrams2(WordUnscrambleTask):
FILENAME = "mid_word_2_anagrams.jsonl"
CHECKSUM = "c3d839d09a7954b78a27cd2cd75d4ed0488656c56ef4dbd741a005343826cb01"
DATASET_NAME = "mid_word_2_anagrams"
class CycleLetters(WordUnscrambleTask):
FILENAME = "cycle_letters_in_word.jsonl"
CHECKSUM = "1689c9002bb8c5988bf5f05e977c9db92f57932c1b5a38998c29ac0dd71e1d42"
DATASET_NAME = "cycle_letters_in_word"
class RandomInsertion(WordUnscrambleTask):
FILENAME = "random_insertion_in_word.jsonl"
CHECKSUM = "72e65d83da53d15752ee0c47379509de149ddbad32d61184e5991df29616b78a"
DATASET_NAME = "random_insertion_in_word"
class ReversedWords(WordUnscrambleTask):
FILENAME = "reversed_words.jsonl"
CHECKSUM = "133a08f875cd6c1ef8608a3233571a773881cc27b1c707de738cc6543439332a"
DATASET_NAME = "reversed_words"
from . common import HFTask
from lm_eval.base import rf
from ..metrics import mean
"""
Semantic Parsing on Freebase from Question-Answer Pairs
https://cs.stanford.edu/~pliang/papers/freebase-emnlp2013.pdf
WebQuestions is a benchmark for question answering. The dataset consists of 6,642
question/answer pairs. The questions are supposed to be answerable by Freebase, a
large knowledge graph. The questions are mostly centered around a single named entity.
The questions are popular ones asked on the web (at least in 2013).
Homepage: https://worksheets.codalab.org/worksheets/0xba659fe363cb46e7a505c5b6a774dc8a
"""
from lm_eval.base import rf, Task
from lm_eval.metrics import mean
_CITATION = """
@inproceedings{berant-etal-2013-semantic,
title = "Semantic Parsing on {F}reebase from Question-Answer Pairs",
author = "Berant, Jonathan and
Chou, Andrew and
Frostig, Roy and
Liang, Percy",
booktitle = "Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing",
month = oct,
year = "2013",
address = "Seattle, Washington, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/D13-1160",
pages = "1533--1544",
}
"""
class WebQs(Task):
VERSION = 0
DATASET_PATH = "web_questions"
DATASET_NAME = None
@@ -17,6 +45,14 @@ class WebQs(HFTask):
def has_test_docs(self):
return True
def training_docs(self):
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def test_docs(self):
return self.dataset["test"]
def doc_to_text(self, doc):
return "Question: " + doc['question'] + '\nAnswer:'
......
import os
"""
Pointer Sentinel Mixture Models
https://arxiv.org/pdf/1609.07843.pdf
The WikiText language modeling dataset is a collection of over 100 million tokens
extracted from the set of verified Good and Featured articles on Wikipedia.
NOTE: This `Task` is based on WikiText-2.
Homepage: https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/
"""
import re
from lm_eval.base import rf, PerplexityTask
from lm_eval.utils import sh
import inspect
import lm_eval.datasets.wikitext.wikitext
from lm_eval.base import PerplexityTask
from best_download import download_file
_CITATION = """
@misc{merity2016pointer,
title={Pointer Sentinel Mixture Models},
author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},
year={2016},
eprint={1609.07843},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""
def wikitext_detokenizer(string):
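    # Undo the word-level tokenization in the raw WikiText dump (extra spaces
    # around punctuation, "@-@"-style joiners, etc.) so that perplexity is
    # measured over natural-looking text.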
@@ -42,41 +63,29 @@ def wikitext_detokenizer(string):
class WikiText(PerplexityTask):
VERSION = 1
DATASET_PATH = inspect.getfile(lm_eval.datasets.wikitext.wikitext)
DATASET_NAME = "wikitext-2-raw-v1"
def download(self):
if not os.path.exists('data/wikitext/wikitext-2-raw/wiki.valid.raw'):
os.makedirs("data/wikitext/", exist_ok=True)
download_file("https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip", local_file="data/wikitext/wikitext-2-raw-v1.zip", expected_checksum="ef7edb566e3e2b2d31b29c1fdb0c89a4cc683597484c3dc2517919c615435a11")
sh("cd data/wikitext/ && unzip wikitext-2-raw-v1.zip")
def has_validation_docs(self):
def has_training_docs(self):
return True
def has_train_docs(self):
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def docs_for_split(self, split):
ret = []
for line in open(f"data/wikitext/wikitext-2-raw/wiki.{split}.raw").read().split('\n'):
rline = line.replace("= = =", "===").replace("= =", "==").strip()
if rline.startswith('= ') and rline.strip().endswith(' ='):
s = '\n'.join(ret)
if s.strip(): yield s
ret = []
ret.append(line)
yield '\n'.join(ret)
def validation_docs(self):
return self.docs_for_split('valid')
def training_docs(self):
return map(self._load_doc, self.dataset["train"])
def train_docs(self):
return self.docs_for_split('train')
def validation_docs(self):
return map(self._load_doc, self.dataset["validation"])
def test_docs(self):
return self.docs_for_split('test')
return map(self._load_doc, self.dataset["test"])
def _load_doc(self, doc):
return doc["page"]
def doc_to_target(self, doc):
return wikitext_detokenizer(doc)
@@ -86,7 +95,7 @@ class WikiText(PerplexityTask):
def doc_to_decontamination_query(self, doc):
return doc["text"]
def count_words(self, doc):
# count number of words in *original doc before detokenization*
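        # e.g. re.split(r"\s+", "the cat\n sat") -> ["the", "cat", "sat"], i.e. 3 words;
        # any run of whitespace counts as a single separator.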
return len(re.split(r"\s+", doc))
import numpy as np
from . common import HFTask
from lm_eval.base import rf
from ..metrics import mean
"""
WinoGrande: An Adversarial Winograd Schema Challenge at Scale
https://arxiv.org/pdf/1907.10641.pdf
WinoGrande is a collection of 44k problems, inspired by Winograd Schema Challenge
(Levesque, Davis, and Morgenstern 2011), but adjusted to improve the scale and
robustness against the dataset-specific bias. Formulated as a fill-in-a-blank
task with binary options, the goal is to choose the right option for a given
sentence which requires commonsense reasoning.
NOTE: This evaluation of Winogrande uses partial evaluation as described by
Trinh & Le in Simple Method for Commonsense Reasoning (2018).
See: https://arxiv.org/abs/1806.02847
Homepage: https://leaderboard.allenai.org/winogrande/submissions/public
"""
This evaluation of Winogrande uses partial evaluation as described by
Trinh & Le in Simple Method for Commonsense Reasoning (2018).
Reference: https://arxiv.org/abs/1806.02847
import numpy as np
from lm_eval.base import rf, Task
from lm_eval.metrics import mean
_CITATION = """
@article{sakaguchi2019winogrande,
title={WinoGrande: An Adversarial Winograd Schema Challenge at Scale},
author={Sakaguchi, Keisuke and Bras, Ronan Le and Bhagavatula, Chandra and Choi, Yejin},
journal={arXiv preprint arXiv:1907.10641},
year={2019}
}
"""
class Winogrande(HFTask):
class Winogrande(Task):
VERSION = 0
DATASET_PATH = "winogrande"
DATASET_NAME = "winogrande_xl"
@@ -26,6 +45,14 @@ class Winogrande(HFTask):
def has_test_docs(self):
return False
def training_docs(self):
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
return self.dataset["validation"]
def doc_to_text(self, doc):
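        # Partial evaluation (Trinh & Le, 2018): the sentence is cut at the "_"
        # placeholder, the chosen option is substituted for the blank, and the
        # model scores only the remainder of the sentence as the continuation.
        # Illustrative example: for "The trophy didn't fit in the suitcase
        # because _ was too big." with option "the trophy", the context ends
        # "...because the trophy" and the continuation is " was too big."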
return self.partial_context(doc, doc["option" + doc["answer"]])
......
import numpy as np
import random
from lm_eval.base import rf
from ..metrics import mean
from . common import HFTask
"""
The Winograd Schema Challenge
http://commonsensereasoning.org/2011/papers/Levesque.pdf
A Winograd schema is a pair of sentences that differ in only one or two words
and that contain an ambiguity that is resolved in opposite ways in the two
sentences and requires the use of world knowledge and reasoning for its resolution.
The Winograd Schema Challenge 273 is a collection of 273 such Winograd schemas.
NOTE: This evaluation of Winograd Schema Challenge is based on `partial evaluation`
as described by Trinh & Le in Simple Method for Commonsense Reasoning (2018).
See: https://arxiv.org/abs/1806.02847
Homepage: https://cs.nyu.edu/~davise/papers/WinogradSchemas/WS.html
"""
import numpy as np
from lm_eval.base import rf, Task
from lm_eval.metrics import mean
_CITATION = """
@inproceedings{ea01b9c0db064caca6986b925d75f2bb,
title = "The winograd schema challenge",
abstract = "In this paper, we present an alternative to the Turing Test that has some conceptual and practical advantages. A Winograd schema is a pair of sentences that differ only in one or two words and that contain a referential ambiguity that is resolved in opposite directions in the two sentences. We have compiled a collection of Winograd schemas, designed so that the correct answer is obvious to the human reader, but cannot easily be found using selectional restrictions or statistical techniques over text corpora. A contestant in the Winograd Schema Challenge is presented with a collection of one sentence from each pair, and required to achieve human-level accuracy in choosing the correct disambiguation.",
author = "Levesque, {Hector J.} and Ernest Davis and Leora Morgenstern",
year = "2012",
language = "English (US)",
isbn = "9781577355601",
series = "Proceedings of the International Conference on Knowledge Representation and Reasoning",
publisher = "Institute of Electrical and Electronics Engineers Inc.",
pages = "552--561",
booktitle = "13th International Conference on the Principles of Knowledge Representation and Reasoning, KR 2012",
note = "13th International Conference on the Principles of Knowledge Representation and Reasoning, KR 2012 ; Conference date: 10-06-2012 Through 14-06-2012",
}
"""
class WinogradSchemaChallenge273(HFTask):
class WinogradSchemaChallenge273(Task):
VERSION = 0
DATASET_PATH = "winograd_wsc"
DATASET_NAME = "wsc273"
@@ -19,19 +43,24 @@ class WinogradSchemaChallenge273(HFTask):
upper_pronouns = ["A", "An", "The", "She", "He",
"It", "They", "My", "His", "Her", "Their"]
def __init__(self):
super().__init__()
self.data = self.__clean_data()
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def __clean_data(self):
def has_test_docs(self):
return True
def test_docs(self):
return map(self._load_doc, self.dataset["test"])
def _load_doc(self, doc):
# The HF implementation of `wsc273` is not `partial evaluation` friendly.
data = []
for doc in self.data["test"]:
doc["text"] = doc["text"].replace(" ", " ")
doc["options"][0] = self.__normalize_option(doc, doc["options"][0])
doc["options"][1] = self.__normalize_option(doc, doc["options"][1])
data.append(doc)
return {"test": data}
doc["text"] = doc["text"].replace(" ", " ")
doc["options"][0] = self.__normalize_option(doc, doc["options"][0])
doc["options"][1] = self.__normalize_option(doc, doc["options"][1])
return doc
def __normalize_option(self, doc, option):
# Append `'s` to possessive determiner based options.
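        # (Illustrative, assuming the elided logic below: for a schema whose
        # pronoun is possessive, e.g. "his", an option such as "Bob" would be
        # rendered "Bob's" so that it can be substituted for the pronoun.)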
@@ -44,15 +73,6 @@ class WinogradSchemaChallenge273(HFTask):
return option.replace(pronoun, pronoun.lower())
return option
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def fewshot_examples(self, k, rnd):
# NOTE: `super().fewshot_examples` samples from training docs which are
# not available for this test-set-only dataset.
......
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
setuptools.setup(
name="lm_eval",
version="0.1.0",
version="0.2.0",
author="Leo Gao",
author_email="lg@eleuther.ai",
description="A framework for evaluating autoregressive language models",
@@ -21,8 +21,7 @@ setuptools.setup(
python_requires='>=3.6',
install_requires=[
"black",
"best_download==0.0.9",
"datasets==1.15.1",
"datasets==2.0.0",
"click>=7.1",
"scikit-learn>=0.24.1",
"torch>=1.7",
@@ -31,7 +30,6 @@ setuptools.setup(
"pytablewriter==0.58.0",
"sacrebleu==1.5.0",
"rouge-score==0.0.4",
"bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt",
"pycountry==20.7.3",
"numexpr==2.7.2",
"lm_dataformat==0.0.20",
@@ -44,5 +42,9 @@ setuptools.setup(
"openai==0.6.4",
"jieba==0.42.1",
"nagisa==0.2.7",
"bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt"
],
dependency_links=[
"https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt",
]
)