Unverified commit e0cfeb90 authored by Jonathan Tow, committed by GitHub

Merge branch 'master' into researcher2

parents f9b81151 6caa0afd
import os
import json
import jsonlines
"""
TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension
https://arxiv.org/pdf/1705.03551.pdf
TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence
triples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts
and independently gathered evidence documents, six per question on average, that provide
high quality distant supervision for answering the questions.
Homepage: https://nlp.cs.washington.edu/triviaqa/
"""
import inspect
import lm_eval.datasets.triviaqa.triviaqa
from lm_eval.base import Task, rf
from ..metrics import mean
from ..utils import sh
from best_download import download_file
from lm_eval.metrics import mean
_CITATION = """
@InProceedings{JoshiTriviaQA2017,
author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},
title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},
booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},
month = {July},
year = {2017},
address = {Vancouver, Canada},
publisher = {Association for Computational Linguistics},
}
"""
class TriviaQA(Task):
VERSION = 0
def download(self):
if not os.path.exists('data/triviaqa/unfiltered-web-train.jsonl'):
os.makedirs("data/triviaqa/", exist_ok=True)
download_file("http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz", local_file="data/triviaqa/triviaqa-unfiltered.tar.gz", expected_checksum="adc19b42769062d241a8fbe834c56e58598d9322eb6c614e9f33a68a2cf5523e")
sh("""
cd data/triviaqa/
tar -xf triviaqa-unfiltered.tar.gz
""")
DATASET_PATH = inspect.getfile(lm_eval.datasets.triviaqa.triviaqa)
DATASET_NAME = None
def has_training_docs(self):
return True
@@ -28,16 +43,16 @@ class TriviaQA(Task):
return False
def training_docs(self):
return jsonlines.open('data/triviaqa/unfiltered-web-train.jsonl')
return self.dataset['train']
def validation_docs(self):
return jsonlines.open('data/triviaqa/unfiltered-web-dev.jsonl')
return self.dataset['validation']
def test_docs(self):
raise NotImplementedError()
def doc_to_text(self, doc):
return f"Question: {doc['Question']}\nAnswer:"
return f"Question: {doc['question']}\nAnswer:"
def should_decontaminate(self):
return True
@@ -46,7 +61,7 @@ class TriviaQA(Task):
return doc['Question'] + " " + doc['SearchResults']['Description']
def doc_to_target(self, doc):
return " " + doc['Answer']['Value']
return " " + doc['answer']['value']
def _remove_prefixes(self, aliases):
# Optimization: Remove any alias that has a strict prefix elsewhere in the list
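        # Example (illustrative): with the aliases sorted so that prefixes come
        # first, ["New York", "New York City"] collapses to ["New York"]; if greedy
        # decoding would reproduce the longer alias it necessarily reproduces its
        # strict prefix as well, so only the prefix needs to be scored.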
@@ -56,12 +71,11 @@ class TriviaQA(Task):
for alias in aliases[1:]:
if not alias.startswith(ret[-1]):
ret.append(alias)
return ret
def construct_requests(self, doc, ctx):
ret = []
for alias in self._remove_prefixes(doc['Answer']['Aliases']):
for alias in self._remove_prefixes(doc['answer']['aliases']):
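            # rf.loglikelihood resolves to a (log-probability, exact-match) pair;
            # only the exact-match flag is kept here, i.e. whether greedy decoding
            # from the context reproduces this alias verbatim.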
_, is_prediction = rf.loglikelihood(ctx, " " + alias)
ret.append(is_prediction)
return ret
......
@@ -2,6 +2,13 @@
TruthfulQA: Measuring How Models Mimic Human Falsehoods
https://arxiv.org/pdf/2109.07958.pdf
TruthfulQA is a benchmark to measure whether a language model is truthful in
generating answers to questions. The benchmark comprises 817 questions that
span 38 categories, including health, law, finance and politics. Questions are
crafted so that some humans would answer falsely due to a false belief or
misconception. To perform well, models must avoid generating false answers
learned from imitating human texts.
TODO: Add support for the automatic metrics, 'GPT-judge' and 'GPT-info', which
predict human evaluation of truth and informativeness (respectively) through
a fine-tuned GPT-3 model. NOTE: This requires access keys to the corresponding
@@ -10,25 +17,28 @@ provide the data used to fine-tune GPT-3 into `GPT-judge` and `GPT-info`, see
https://github.com/sylinrl/TruthfulQA#Fine-tuning-GPT-3-for-evaluation. Maybe
we could try this?
@misc{lin2021truthfulqa,
title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
author={Stephanie Lin and Jacob Hilton and Owain Evans},
year={2021},
eprint={2109.07958},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
Homepage: https://github.com/sylinrl/TruthfulQA
"""
import csv
import json
import inspect
import numpy as np
import sacrebleu
import datasets
import lm_eval.datasets.truthfulqa.truthfulqa
from rouge_score import rouge_scorer, scoring
from lm_eval.base import rf, Task
from pathlib import Path
from best_download import download_file
from ..metrics import mean
from datasets import load_metric
from lm_eval.metrics import mean
_CITATION = """
@misc{lin2021truthfulqa,
title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
author={Stephanie Lin and Jacob Hilton and Owain Evans},
year={2021},
eprint={2109.07958},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""
# The default QA preset prompt for all models.
@@ -50,15 +60,8 @@ QA_PROMPT = (
class TruthfulQAMultipleChoice(Task):
VERSION = 1
DATASET_PATH = Path('data/truthfulqa/mc')
def download(self):
if self.DATASET_PATH.exists():
return
Path.mkdir(self.DATASET_PATH, parents=True)
mc_url = "https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/data/mc_task.json"
checksum = "6eb4125d25750c0145c4be2dce00440736684ab6f74ce6bff2139571cc758954"
download_file(mc_url, local_file=str(self.DATASET_PATH / "mc_task.json"), expected_checksum=checksum)
DATASET_PATH = inspect.getfile(lm_eval.datasets.truthfulqa.truthfulqa)
DATASET_NAME = "multiple_choice"
def has_training_docs(self):
return False
@@ -73,8 +76,7 @@ class TruthfulQAMultipleChoice(Task):
raise NotImplementedError()
def validation_docs(self):
with open(self.DATASET_PATH / "mc_task.json") as f:
return json.load(f)
return self.dataset["validation"]
def test_docs(self):
raise NotImplementedError()
@@ -115,7 +117,7 @@ class TruthfulQAMultipleChoice(Task):
return [rf.loglikelihood(ctx, " " + t)[0] for t in targets]
# MC1 and MC2 targets are not always the same set of strings so we collect
# likelihoods separately for simpler processing.
return get_lls(doc['mc1_targets']) + get_lls(doc['mc2_targets'])
return get_lls(doc['mc1_targets']["choices"]) + get_lls(doc['mc2_targets']["choices"])
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
@@ -133,14 +135,14 @@ class TruthfulQAMultipleChoice(Task):
def mc2(lls):
# Split on the first `0` as everything before it is true (`1`).
split_idx = list(doc['mc2_targets'].values()).index(0)
split_idx = list(doc['mc2_targets']["labels"]).index(0)
# Compute the normalized probability mass for the correct answer.
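            # Worked example (illustrative): with labels [1, 1, 0], split_idx is 2;
            # lls = [-1.0, -2.0, -3.0] gives p_true = exp([-1, -2]), p_false = exp([-3]),
            # and mc2 = sum(p_true) / (sum(p_true) + sum(p_false)) ~ 0.91.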
ll_true, ll_false = lls[:split_idx], lls[split_idx:]
p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false))
p_true = p_true / (sum(p_true) + sum(p_false))
return sum(p_true)
split_idx = len(doc['mc1_targets'])
split_idx = len(doc['mc1_targets']["choices"])
mc1_lls, mc2_lls = results[:split_idx], results[split_idx:]
return {
"mc1": mc1(mc1_lls),
@@ -162,19 +164,12 @@ class TruthfulQAMultipleChoice(Task):
class TruthfulQAGeneration(Task):
VERSION = 1
DATASET_PATH = Path('data/truthfulqa/generation')
DATASET_PATH = inspect.getfile(lm_eval.datasets.truthfulqa.truthfulqa)
DATASET_NAME = "generation"
def __init__(self):
super().__init__()
self.bleurt = load_metric("bleurt", cache_dir="lm_cache")
def download(self):
if self.DATASET_PATH.exists():
return
Path.mkdir(self.DATASET_PATH, parents=True)
url = "https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/TruthfulQA.csv"
checksum = "8d7dd15f033196140f032d97d30f037da7a7b1192c3f36f9937c1850925335a2"
download_file(url, local_file=str(self.DATASET_PATH / "TruthfulQA.csv"), expected_checksum=checksum)
self.bleurt = datasets.load_metric("bleurt")
def has_training_docs(self):
return False
@@ -188,36 +183,29 @@ class TruthfulQAGeneration(Task):
def training_docs(self):
raise NotImplementedError()
def _split_multi_answer(self, answers, sep=';'):
answers = answers.strip().split(sep)
split_answers = []
def _format_answers(self, answers):
formatted_answers = []
for answer in answers:
answer = answer.strip()
if len(answer):
# Add a period after all answers.
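                # e.g. "Paris" -> "Paris.", while an answer already ending in "." is kept as-is.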
if answer[-1] != '.':
split_answers.append(answer + '.')
formatted_answers.append(answer + '.')
else:
split_answers.append(answer)
return split_answers
formatted_answers.append(answer)
return formatted_answers
def validation_docs(self):
with open(self.DATASET_PATH / "TruthfulQA.csv", newline='') as csvfile:
doc_reader = csv.DictReader(csvfile)
for doc in doc_reader:
# Ensure that references exist.
if not doc['Correct Answers'] or not doc['Incorrect Answers']:
continue
correct_answers = self._split_multi_answer(doc['Correct Answers'])
if "I have no comment." not in correct_answers:
correct_answers.append("I have no comment.")
incorrect_answers = self._split_multi_answer(doc['Incorrect Answers'])
doc = {
'question': doc['Question'].strip(),
'correct_answers': correct_answers,
'incorrect_answers': incorrect_answers
}
yield doc
for doc in self.dataset["validation"]:
incorrect_answers = self._format_answers(doc['incorrect_answers'])
correct_answers = self._format_answers(doc['correct_answers'])
if "I have no comment." not in correct_answers:
correct_answers.append("I have no comment.")
yield {
'question': doc['question'].strip(),
'correct_answers': correct_answers,
'incorrect_answers': incorrect_answers
}
def test_docs(self):
raise NotImplementedError()
......
import gzip
import json
import shutil
from pathlib import Path
from best_download import download_file
"""
Language Models are Few-Shot Learners
https://arxiv.org/pdf/2005.14165.pdf
Unscramble is a small battery of 5 “character manipulation” tasks. Each task
involves giving the model a word distorted by some combination of scrambling,
addition, or deletion of characters, and asking it to recover the original word.
Homepage: https://github.com/openai/gpt-3/tree/master/data
"""
import inspect
import lm_eval.datasets.unscramble.unscramble
from lm_eval.base import Task, rf
from lm_eval.metrics import mean
def extract_gzip(gz, to):
with gzip.open(gz, 'rb') as fin:
with open(to, 'wb') as fout:
shutil.copyfileobj(fin, fout)
_CITATION = """
@inproceedings{NEURIPS2020_1457c0d6,
author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},
booktitle = {Advances in Neural Information Processing Systems},
editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},
pages = {1877--1901},
publisher = {Curran Associates, Inc.},
title = {Language Models are Few-Shot Learners},
url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},
volume = {33},
year = {2020}
}
"""
class WordUnscrambleTask(Task):
VERSION = 0
BASE_PATH = Path("data/unscramble")
FILENAME = None
CHECKSUM = None # SHA256 Checksum.
def __init__(self):
super().__init__()
def download(self):
if not self.BASE_PATH.exists():
Path.mkdir(self.BASE_PATH, parents=True)
file = self.BASE_PATH / self.FILENAME
if not file.exists():
rawfile = file.parent / (file.name + ".gz")
base_url = "https://raw.githubusercontent.com/openai/gpt-3/master/data"
download_file(f"{base_url}/{self.FILENAME}.gz", local_file=str(rawfile), expected_checksum=self.CHECKSUM)
extract_gzip(gz=rawfile, to=file)
DATASET_PATH = inspect.getfile(lm_eval.datasets.unscramble.unscramble)
DATASET_NAME = None
def has_training_docs(self):
return False
@@ -42,8 +44,7 @@ class WordUnscrambleTask(Task):
return False
def validation_docs(self):
file = self.BASE_PATH / self.FILENAME
return (json.loads(line) for line in open(file).read().splitlines())
return self.dataset["validation"]
def doc_to_text(self, doc):
return doc["context"]
@@ -80,25 +81,20 @@ class WordUnscrambleTask(Task):
class Anagrams1(WordUnscrambleTask):
FILENAME = "mid_word_1_anagrams.jsonl"
CHECKSUM = "6768a86896083199de4815d4964cb2f6f1046476cfd80c2a562784f182905979"
DATASET_NAME = "mid_word_1_anagrams"
class Anagrams2(WordUnscrambleTask):
FILENAME = "mid_word_2_anagrams.jsonl"
CHECKSUM = "c3d839d09a7954b78a27cd2cd75d4ed0488656c56ef4dbd741a005343826cb01"
DATASET_NAME = "mid_word_2_anagrams"
class CycleLetters(WordUnscrambleTask):
FILENAME = "cycle_letters_in_word.jsonl"
CHECKSUM = "1689c9002bb8c5988bf5f05e977c9db92f57932c1b5a38998c29ac0dd71e1d42"
DATASET_NAME = "cycle_letters_in_word"
class RandomInsertion(WordUnscrambleTask):
FILENAME = "random_insertion_in_word.jsonl"
CHECKSUM = "72e65d83da53d15752ee0c47379509de149ddbad32d61184e5991df29616b78a"
DATASET_NAME = "random_insertion_in_word"
class ReversedWords(WordUnscrambleTask):
FILENAME = "reversed_words.jsonl"
CHECKSUM = "133a08f875cd6c1ef8608a3233571a773881cc27b1c707de738cc6543439332a"
DATASET_NAME = "reversed_words"
from . common import HFTask
from lm_eval.base import rf
from ..metrics import mean
"""
Semantic Parsing on Freebase from Question-Answer Pairs
https://cs.stanford.edu/~pliang/papers/freebase-emnlp2013.pdf
WebQuestions is a benchmark for question answering. The dataset consists of 6,642
question/answer pairs. The questions are supposed to be answerable by Freebase, a
large knowledge graph. The questions are mostly centered around a single named entity.
The questions are popular ones asked on the web (at least in 2013).
Homepage: https://worksheets.codalab.org/worksheets/0xba659fe363cb46e7a505c5b6a774dc8a
"""
from lm_eval.base import rf, Task
from lm_eval.metrics import mean
_CITATION = """
@inproceedings{berant-etal-2013-semantic,
title = "Semantic Parsing on {F}reebase from Question-Answer Pairs",
author = "Berant, Jonathan and
Chou, Andrew and
Frostig, Roy and
Liang, Percy",
booktitle = "Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing",
month = oct,
year = "2013",
address = "Seattle, Washington, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/D13-1160",
pages = "1533--1544",
}
"""
class WebQs(Task):
VERSION = 0
DATASET_PATH = "web_questions"
DATASET_NAME = None
@@ -17,6 +45,14 @@ class WebQs(HFTask):
def has_test_docs(self):
return True
def training_docs(self):
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def test_docs(self):
return self.dataset["test"]
def doc_to_text(self, doc):
return "Question: " + doc['question'] + '\nAnswer:'
......
import os
"""
Pointer Sentinel Mixture Models
https://arxiv.org/pdf/1609.07843.pdf
The WikiText language modeling dataset is a collection of over 100 million tokens
extracted from the set of verified Good and Featured articles on Wikipedia.
NOTE: This `Task` is based on WikiText-2.
Homepage: https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/
"""
import re
from lm_eval.base import rf, PerplexityTask
from lm_eval.utils import sh
import inspect
import lm_eval.datasets.wikitext.wikitext
from lm_eval.base import PerplexityTask
from best_download import download_file
_CITATION = """
@misc{merity2016pointer,
title={Pointer Sentinel Mixture Models},
author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},
year={2016},
eprint={1609.07843},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""
def wikitext_detokenizer(string):
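    # Undo the word-level tokenization in the raw WikiText dump (extra spaces
    # around punctuation, "@-@"-style joiners, etc.) so that perplexity is
    # measured over natural-looking text.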
@@ -42,41 +63,29 @@ def wikitext_detokenizer(string):
class WikiText(PerplexityTask):
VERSION = 1
DATASET_PATH = inspect.getfile(lm_eval.datasets.wikitext.wikitext)
DATASET_NAME = "wikitext-2-raw-v1"
def download(self):
if not os.path.exists('data/wikitext/wikitext-2-raw/wiki.valid.raw'):
os.makedirs("data/wikitext/", exist_ok=True)
download_file("https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip", local_file="data/wikitext/wikitext-2-raw-v1.zip", expected_checksum="ef7edb566e3e2b2d31b29c1fdb0c89a4cc683597484c3dc2517919c615435a11")
sh("cd data/wikitext/ && unzip wikitext-2-raw-v1.zip")
def has_validation_docs(self):
def has_training_docs(self):
return True
def has_train_docs(self):
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def docs_for_split(self, split):
ret = []
for line in open(f"data/wikitext/wikitext-2-raw/wiki.{split}.raw").read().split('\n'):
rline = line.replace("= = =", "===").replace("= =", "==").strip()
if rline.startswith('= ') and rline.strip().endswith(' ='):
s = '\n'.join(ret)
if s.strip(): yield s
ret = []
ret.append(line)
yield '\n'.join(ret)
def validation_docs(self):
return self.docs_for_split('valid')
def training_docs(self):
return map(self._load_doc, self.dataset["train"])
def train_docs(self):
return self.docs_for_split('train')
def validation_docs(self):
return map(self._load_doc, self.dataset["validation"])
def test_docs(self):
return self.docs_for_split('test')
return map(self._load_doc, self.dataset["test"])
def _load_doc(self, doc):
return doc["page"]
def doc_to_target(self, doc):
return wikitext_detokenizer(doc)
@@ -86,7 +95,7 @@ class WikiText(PerplexityTask):
def doc_to_decontamination_query(self, doc):
return doc["text"]
def count_words(self, doc):
# count number of words in *original doc before detokenization*
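        # e.g. re.split(r"\s+", "the cat\n sat") -> ["the", "cat", "sat"], i.e. 3 words;
        # any run of whitespace counts as a single separator.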
return len(re.split(r"\s+", doc))
import numpy as np
from . common import HFTask
from lm_eval.base import rf
from ..metrics import mean
"""
WinoGrande: An Adversarial Winograd Schema Challenge at Scale
https://arxiv.org/pdf/1907.10641.pdf
WinoGrande is a collection of 44k problems, inspired by Winograd Schema Challenge
(Levesque, Davis, and Morgenstern 2011), but adjusted to improve the scale and
robustness against the dataset-specific bias. Formulated as a fill-in-a-blank
task with binary options, the goal is to choose the right option for a given
sentence which requires commonsense reasoning.
NOTE: This evaluation of Winogrande uses partial evaluation as described by
Trinh & Le in Simple Method for Commonsense Reasoning (2018).
See: https://arxiv.org/abs/1806.02847
Homepage: https://leaderboard.allenai.org/winogrande/submissions/public
"""
This evaluation of Winogrande uses partial evaluation as described by
Trinh & Le in Simple Method for Commonsense Reasoning (2018).
Reference: https://arxiv.org/abs/1806.02847
import numpy as np
from lm_eval.base import rf, Task
from lm_eval.metrics import mean
_CITATION = """
@article{sakaguchi2019winogrande,
title={WinoGrande: An Adversarial Winograd Schema Challenge at Scale},
author={Sakaguchi, Keisuke and Bras, Ronan Le and Bhagavatula, Chandra and Choi, Yejin},
journal={arXiv preprint arXiv:1907.10641},
year={2019}
}
"""
class Winogrande(HFTask):
class Winogrande(Task):
VERSION = 0
DATASET_PATH = "winogrande"
DATASET_NAME = "winogrande_xl"
@@ -26,6 +45,14 @@ class Winogrande(HFTask):
def has_test_docs(self):
return False
def training_docs(self):
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
return self.dataset["validation"]
def doc_to_text(self, doc):
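        # Partial evaluation (Trinh & Le, 2018): the sentence is cut at the "_"
        # placeholder, the chosen option is substituted for the blank, and the
        # model scores only the remainder of the sentence as the continuation.
        # Illustrative example: for "The trophy didn't fit in the suitcase
        # because _ was too big." with option "the trophy", the context ends
        # "...because the trophy" and the continuation is " was too big."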
return self.partial_context(doc, doc["option" + doc["answer"]])
......
import numpy as np
import random
from lm_eval.base import rf
from ..metrics import mean
from . common import HFTask
"""
The Winograd Schema Challenge
http://commonsensereasoning.org/2011/papers/Levesque.pdf
A Winograd schema is a pair of sentences that differ in only one or two words
and that contain an ambiguity that is resolved in opposite ways in the two
sentences and requires the use of world knowledge and reasoning for its resolution.
The Winograd Schema Challenge 273 is a collection of 273 such Winograd schemas.
NOTE: This evaluation of Winograd Schema Challenge is based on `partial evaluation`
as described by Trinh & Le in Simple Method for Commonsense Reasoning (2018).
See: https://arxiv.org/abs/1806.02847
Homepage: https://cs.nyu.edu/~davise/papers/WinogradSchemas/WS.html
"""
import numpy as np
from lm_eval.base import rf, Task
from lm_eval.metrics import mean
_CITATION = """
@inproceedings{ea01b9c0db064caca6986b925d75f2bb,
title = "The winograd schema challenge",
abstract = "In this paper, we present an alternative to the Turing Test that has some conceptual and practical advantages. A Winograd schema is a pair of sentences that differ only in one or two words and that contain a referential ambiguity that is resolved in opposite directions in the two sentences. We have compiled a collection of Winograd schemas, designed so that the correct answer is obvious to the human reader, but cannot easily be found using selectional restrictions or statistical techniques over text corpora. A contestant in the Winograd Schema Challenge is presented with a collection of one sentence from each pair, and required to achieve human-level accuracy in choosing the correct disambiguation.",
author = "Levesque, {Hector J.} and Ernest Davis and Leora Morgenstern",
year = "2012",
language = "English (US)",
isbn = "9781577355601",
series = "Proceedings of the International Conference on Knowledge Representation and Reasoning",
publisher = "Institute of Electrical and Electronics Engineers Inc.",
pages = "552--561",
booktitle = "13th International Conference on the Principles of Knowledge Representation and Reasoning, KR 2012",
note = "13th International Conference on the Principles of Knowledge Representation and Reasoning, KR 2012 ; Conference date: 10-06-2012 Through 14-06-2012",
}
"""
class WinogradSchemaChallenge273(HFTask):
class WinogradSchemaChallenge273(Task):
VERSION = 0
DATASET_PATH = "winograd_wsc"
DATASET_NAME = "wsc273"
@@ -19,19 +43,24 @@ class WinogradSchemaChallenge273(HFTask):
upper_pronouns = ["A", "An", "The", "She", "He",
"It", "They", "My", "His", "Her", "Their"]
def __init__(self):
super().__init__()
self.data = self.__clean_data()
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def __clean_data(self):
def has_test_docs(self):
return True
def test_docs(self):
return map(self._load_doc, self.dataset["test"])
def _load_doc(self, doc):
# The HF implementation of `wsc273` is not `partial evaluation` friendly.
data = []
for doc in self.data["test"]:
doc["text"] = doc["text"].replace(" ", " ")
doc["options"][0] = self.__normalize_option(doc, doc["options"][0])
doc["options"][1] = self.__normalize_option(doc, doc["options"][1])
data.append(doc)
return {"test": data}
doc["text"] = doc["text"].replace(" ", " ")
doc["options"][0] = self.__normalize_option(doc, doc["options"][0])
doc["options"][1] = self.__normalize_option(doc, doc["options"][1])
return doc
def __normalize_option(self, doc, option):
# Append `'s` to possessive determiner based options.
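        # (Illustrative, assuming the elided logic below: for a schema whose
        # pronoun is possessive, e.g. "his", an option such as "Bob" would be
        # rendered "Bob's" so that it can be substituted for the pronoun.)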
@@ -44,15 +73,6 @@ class WinogradSchemaChallenge273(HFTask):
return option.replace(pronoun, pronoun.lower())
return option
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def fewshot_examples(self, k, rnd):
# NOTE: `super().fewshot_examples` samples from training docs which are
# not available for this test-set-only dataset.
......
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
setuptools.setup(
name="lm_eval",
version="0.1.0",
version="0.2.0",
author="Leo Gao",
author_email="lg@eleuther.ai",
description="A framework for evaluating autoregressive language models",
@@ -21,8 +21,7 @@ setuptools.setup(
python_requires='>=3.6',
install_requires=[
"black",
"best_download==0.0.9",
"datasets==1.15.1",
"datasets==2.0.0",
"click>=7.1",
"scikit-learn>=0.24.1",
"torch>=1.7",
@@ -31,7 +30,6 @@ setuptools.setup(
"pytablewriter==0.58.0",
"sacrebleu==1.5.0",
"rouge-score==0.0.4",
"bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt",
"pycountry==20.7.3",
"numexpr==2.7.2",
"lm_dataformat==0.0.20",
@@ -44,5 +42,9 @@ setuptools.setup(
"openai==0.6.4",
"jieba==0.42.1",
"nagisa==0.2.7",
"bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt"
],
dependency_links=[
"https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt",
]
)