Unverified commit 3d432b1a authored by Charles Foster, committed by GitHub

Merge pull request #4 from EleutherAI/master

Update cfoster0 fork
parents 4a294d8a 4d8ed7d5
import re
import numpy as np
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, matthews_corrcoef
from tqdm import auto as tqdm_lib
from . common import HFTask, simple_accuracy_metric, yesno
from ..base import rf, mean
from . common import HFTask
class HellaSwag(HFTask):
DATASET_PATH = "hellaswag"
DATASET_NAME = None
@classmethod
def remove_brackets(cls, text):
""" Removes brackets from HellaSwag documents.
NOTE: The brackets are artifacts of the WikiHow dataset portion underlying
HellaSwag.
"""
text = re.sub(r'\[.*?\]', '', text)
return text
def has_training_docs(self):
return True
@@ -30,25 +39,78 @@ class HellaSwag(HFTask):
return self.data["test"]
def fewshot_description(self):
return "Label for the relevant action: Sentences describing the context, with an incomplete sentence trailing\nanswer that plausibly completes the situation."
return "Label for the relevant action: Sentences describing the " \
"context, with an incomplete sentence trailing\nanswer that " \
"plausibly completes the situation."
def doc_to_text(self, doc, include_target=True):
def doc_to_text(self, doc):
text = doc['activity_label'] + ': ' + doc['ctx'] + '\n'
if include_target:
letter_answer = doc['label']
if letter_answer == '0':
index = 0
elif letter_answer == '1':
index = 1
elif letter_answer == '2':
index = 2
elif letter_answer == '3':
index = 3
else:
raise ValueError("HellaSwag from HF datasets contained an invalid answer key")
text += doc['endings'][index]
return text
return self.remove_brackets(text)
def doc_to_target(self, doc):
letter_answer = doc['label']
if letter_answer == '0':
index = 0
elif letter_answer == '1':
index = 1
elif letter_answer == '2':
index = 2
elif letter_answer == '3':
index = 3
else:
raise ValueError(
"HellaSwag from HF datasets contained an invalid answer key")
target = doc['endings'][index]
return " " + self.remove_brackets(target)
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
ll_answers = []
for i in range(4):
continuation = " " + self.remove_brackets(doc['endings'][i])
ll_answers.append(rf.loglikelihood(ctx, continuation))
return ll_answers
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
gold = int(doc['label'])
pred = np.argmax(results)
acc = 1. if pred == gold else 0.
return {
"acc": acc
}
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {
"acc": mean
}
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Write evaluation function
raise NotImplementedError()
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {
"acc": True
}
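# Illustrative sketch, not part of this diff: the multiple-choice pattern above
# reduces to "pick the ending whose continuation gets the highest log-likelihood".
# The scores and gold index below are made up.
import numpy as np

example_lls = [-41.2, -37.9, -52.3, -44.0]  # hypothetical loglikelihoods for 4 endings
gold = 1                                    # hypothetical gold index
acc = 1. if int(np.argmax(example_lls)) == gold else 0.
assert acc == 1.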
from lm_eval.base import Dataset
from lm_eval.base import Task, rf, mean, perplexity
from lm_eval.utils import sh
import json
import requests
import ftfy
import math
from best_download import download_file
class Lambada(Dataset):
def __init__(self):
self.download()
class LAMBADA(Task):
def download(self):
sh("mkdir -p data/lambada")
with open("data/lambada/lambada_test.json", 'w') as f:
req = requests.get("https://storage.googleapis.com/gpt-2/data/lambada_test.jsonl")
req.raise_for_status()
jsons = [json.loads(l) for l in req.iter_lines()]
texts = [ftfy.fix_text(j['text'], normalization='NFKC') for j in jsons]
json.dump(texts, f)
download_file(
"https://storage.googleapis.com/gpt-2/data/lambada_test.jsonl",
"data/lambada/lambada_test.jsonl",
"4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226"
)
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
return True
def has_test_docs(self):
return True
return False
def training_docs(self):
pass
def validation_docs(self):
with open("data/lambada/lambada_test.jsonl") as fh:
for line in fh:
yield json.loads(line)
def test_docs(self):
pass
def load_doc(self, myjson):
return [doc for doc in myjson]
def doc_to_text(self, doc):
return doc['text'].rsplit(' ', 1)[0]
def test_docs(self):
myjson = json.load(open("data/lambada/lambada_test.json"))
return self.load_doc(myjson)
def doc_to_target(self, doc):
return " " + doc['text'].rsplit(' ', 1)[1]
def fewshot_description(self):
# TODO: figure out description
return ""
def construct_requests(self, doc, ctx):
ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc))
return ll, is_greedy
def process_results(self, doc, results):
ll, is_greedy = results
def doc_to_text(self, doc, include_target=True):
#TODO: check if this is how OA does it
#label = doc[]
return doc
return {
'ppl': ll,
'acc': int(is_greedy)
}
def aggregation(self):
return {
'ppl': perplexity,
'acc': mean
}
def evaluate(self, docs, lm, provide_description, num_fewshot):
pass
\ No newline at end of file
def higher_is_better(self):
return {
'ppl': False,
'acc': True
}
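# Illustrative sketch, not part of this diff: assuming the `perplexity` aggregator
# behaves like exp(-mean(loglikelihoods)), the per-document 'ppl' values recorded
# above combine roughly as follows. `perplexity_sketch` is a hypothetical stand-in,
# not the harness's actual implementation.
import math

def perplexity_sketch(loglikelihoods):
    return math.exp(-sum(loglikelihoods) / len(loglikelihoods))

print(perplexity_sketch([-2.3, -0.7, -1.1]))  # ~3.92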
from . common import HFTask
from itertools import islice
import random
class NaturalQs(HFTask):
# TODO: naturalqs has a *really* large train set that huggingface just
# automatically downloads even if you don't use it. We should try to only
# download the val set and not even bother with the train set.
DATASET_PATH = "natural_questions"
DATASET_NAME = None
@@ -22,24 +28,68 @@ class NaturalQs(HFTask):
# Data is too large to fit in memory.
return self.data["train"]
def doc_to_text(self, doc, include_target=True):
question = doc['question']['text']
text = 'Q: ' + question + '\n\n' + 'A: '
if include_target:
# There's a short answer and a long answer. Based on the paper, I'm using the long answer.
short_answer = doc['annotations']['short_answers'][0]['text']
long_answer_start = doc['annotations']['long_answer'][0]['start_token']
long_answer_end = doc['annotations']['long_answer'][0]['end_token']
long_answer_span = doc['document']['tokens']['token'][long_answer_start:long_answer_end]
long_answer_is_html = doc['document']['tokens']['is_html'][long_answer_start:long_answer_end]
long_answer_chars = [tok for (tok, is_html) in zip(long_answer_span, long_answer_is_html) if not is_html]
long_answer = " ".join(long_answer_chars)
text += long_answer # Replace with short_answer[0] for short answer
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: implement
raise NotImplementedError()
\ No newline at end of file
def fewshot_examples(self, k):
# Data is too large to fit in memory. We just sample from the first bit.
if self._training_docs is None:
self._training_docs = list(islice(self.training_docs(), 0, 100000))
return random.sample(self._training_docs, k)
def doc_to_text(self, doc):
return 'Q: ' + doc['question']['text'] + '\n\n' + 'A: '
def doc_to_target(self, doc):
# There's a short answer and a long answer. Based on the paper, I'm using the long answer.
short_answer = doc['annotations']['short_answers'][0]['text']
long_answer_start = doc['annotations']['long_answer'][0]['start_token']
long_answer_end = doc['annotations']['long_answer'][0]['end_token']
long_answer_span = doc['document']['tokens']['token'][long_answer_start:long_answer_end]
long_answer_is_html = doc['document']['tokens']['is_html'][long_answer_start:long_answer_end]
long_answer_chars = [tok for (tok, is_html) in zip(long_answer_span, long_answer_is_html) if not is_html]
long_answer = " ".join(long_answer_chars)
return long_answer # Replace with short_answer[0] for short answer
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
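# Illustrative sketch, not part of this diff: doc_to_target above rebuilds the long
# answer by dropping HTML tokens from the token span. With made-up token data:
tokens = ["<P>", "The", "answer", "is", "42", "</P>"]
is_html = [True, False, False, False, False, True]
long_answer = " ".join(tok for tok, html in zip(tokens, is_html) if not html)
assert long_answer == "The answer is 42"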
import numpy as np
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, matthews_corrcoef
from tqdm import auto as tqdm_lib
from . common import HFTask, simple_accuracy_metric, yesno
from lm_eval.base import MultipleChoiceTask
from .common import HFTask
class OpenBookQA(HFTask):
class OpenBookQA(HFTask, MultipleChoiceTask):
DATASET_PATH = "openbookqa"
DATASET_NAME = "main"
@@ -17,40 +15,34 @@ class OpenBookQA(HFTask):
def has_test_docs(self):
return True
def _convert_standard(self, doc):
out_doc = {
"id": doc["id"],
"query": doc["question_stem"],
"choices": doc["choices"]["text"],
"gold": ["A", "B", "C", "D"].index(doc["answerKey"].strip()),
}
return out_doc
def _load_docs(self, docs):
for record in docs:
yield self._convert_standard(record)
def training_docs(self):
if self.has_training_docs():
if self._training_docs is None:
self._training_docs = list(self.data["train"])
return self._training_docs
docs = super().training_docs()
return self._load_docs(docs)
def validation_docs(self):
if self.has_validation_docs():
return self.data["validation"]
docs = super().validation_docs()
return self._load_docs(docs)
def test_docs(self):
if self.has_test_docs():
return self.data["test"]
docs = super().test_docs()
return self._load_docs(docs)
def fewshot_description(self):
return "Text of the question prompt\nText of the answer completion"
def doc_to_text(self, doc, include_target=True):
text = doc['question_stem'] + '\n'
if include_target:
letter_answer = doc['answerKey']
if letter_answer == 'A':
index = 0
elif letter_answer == 'B':
index = 1
elif letter_answer == 'C':
index = 2
elif letter_answer == 'D':
index = 3
else:
raise ValueError("OpenBookQA from HF datasets contained an invalid answer key")
text += doc['choices']['text'][index] + '.'
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Write evaluation function
raise NotImplementedError()
# TODO: figure out fewshot description
return ""
def doc_to_text(self, doc):
return doc["query"]
import json
import random
from lm_eval.base import Dataset
from ..utils import sh
class PiQA(Dataset):
def __init__(self):
self.download()
def download(self):
#pass
#TODO: don't download if files already there
sh("""
mkdir -p data/piqa
wget https://yonatanbisk.com/piqa/data/train.jsonl -O data/piqa/piqa-train.jsonl
wget https://yonatanbisk.com/piqa/data/train-labels.lst -O data/piqa/piqa-train-labels.lst
wget https://yonatanbisk.com/piqa/data/valid.jsonl -O data/piqa/piqa-valid.jsonl
wget https://yonatanbisk.com/piqa/data/valid-labels.lst -O data/piqa/piqa-valid-labels.lst
wget https://yonatanbisk.com/piqa/data/tests.jsonl -O data/piqa/piqa-test.jsonl
""")
import numpy as np
from lm_eval.base import rf, mean
from . common import HFTask
class PiQA(HFTask):
DATASET_PATH = "piqa"
DATASET_NAME = None
def has_training_docs(self):
return True
@@ -25,33 +14,35 @@ class PiQA(Dataset):
return True
def has_test_docs(self):
return True
return False
def load_docs(self, textfilename, labelfilename):
if labelfilename != None:
return zip([json.loads(entry) for entry in list(open(textfilename,'r'))],list(open(labelfilename, 'r')))
else:
return [json.loads(entry) for entry in list(open(textfilename,'r'))]
def training_docs(self):
return self.load_docs('data/piqa/piqa-train.jsonl', 'data/piqa/piqa-train-labels.lst')
def validation_docs(self):
return self.load_docs('data/piqa/piqa-valid.jsonl', 'data/piqa/piqa-valid-labels.lst')
def test_docs(self):
return self.load_docs('data/piqa/piqa-test.jsonl', None)
def fewshot_description(self):
pass
def doc_to_text(self, doc, include_target=True):
if include_target:
rightanswer = int(doc[1][0])+1
return ''.join([doc[0]['goal'],' ',doc[0]['sol'+str(rightanswer)]])
#TODO: check if oa uses newline
return doc['goal'] + ' '
def evaluate(self, docs, lm):
pass
# TODO: figure out fewshot description
return ""
def doc_to_text(self, doc):
return "Question: "+doc["goal"] + "\nAnswer:"
def doc_to_target(self, doc):
solutions = [doc["sol1"], doc["sol2"]]
return " " + solutions[doc["label"]]
def construct_requests(self, doc, ctx):
ll_1, _ = rf.loglikelihood(ctx, " " + doc['sol1'])
ll_2, _ = rf.loglikelihood(ctx, " " + doc['sol2'])
return ll_1, ll_2
def process_results(self, doc, results):
return {
'acc': np.argmax(results) == doc["label"]
}
def aggregation(self):
return {
'acc': mean
}
def higher_is_better(self):
return {
'acc': True
}
import numpy as np
import json
import random
from .common import HFTask
from lm_eval.base import rf, mean
class Pubmed_QA(HFTask):
DATASET_PATH = "pubmed_qa"
DATASET_NAME = "pqa_labeled"
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def test_docs(self):
if self.has_test_docs():
# The HF split is labelled "train" but it's really just for testing
return self.data["train"]
def fewshot_description(self):
# Average ctx length in labelled dataset is 238.9
# 2 few-shot examples push it beyond the context window
return ""
def doc_to_text(self, doc):
ctxs = "\n".join(doc["context"]["contexts"])
return "Abstract: {}\nQuestion: {}\nAnswer:".format(
ctxs,
doc["question"],
doc["final_decision"]
)
def doc_to_target(self, doc):
return " {}".format(doc["final_decision"])
def fewshot_examples(self, k):
# Since there are only test docs, sample the few-shot examples from them
if self._training_docs is None:
self._training_docs = list(self.test_docs())
return random.sample(self._training_docs, k)
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns
an iterable of Requests which will be sent to the LM.
"""
ll_yes, _ = rf.loglikelihood(ctx, " yes")
ll_no, _ = rf.loglikelihood(ctx, " no")
ll_maybe, _ = rf.loglikelihood(ctx, " maybe")
return ll_yes, ll_no, ll_maybe
def process_results(self, doc, results):
gold = doc["final_decision"]
ll_yes, ll_no, ll_maybe = results
pred = np.argmax(results)
return {
"acc": ["yes", "no", "maybe"][pred] == gold,
}
def aggregation(self):
return {
"acc" : mean
}
def higher_is_better(self):
return {
"acc" : True
}
import json
import random
import os
from lm_eval.base import Dataset
from lm_eval.base import Task
from ..utils import sh
class QuAC(Dataset):
class QuAC(Task):
def __init__(self):
super().__init__()
@@ -37,13 +37,8 @@ class QuAC(Dataset):
def test_docs(self):
raise NotImplementedError("QuAC has no test docs.")
def fewshot_examples(self, k):
traindocs = list(self.training_docs())
random.shuffle(traindocs)
return traindocs[:k]
def fewshot_description(self):
# TODO: figure out fewshot description
desc = "TITLE: Title of the context passage - subtitle of the passage\nPARAGRAPH: Passage describing the relevant information for answering questions.\n\nQ: Text of a question.\n\nA: Answer to the question, based on the passage. If it cannot be answered based on the passage, write CANNOTANSWER"
return desc
@@ -59,11 +54,53 @@ class QuAC(Dataset):
docs.append(doc)
return docs
def doc_to_text(self, doc, include_target=True):
text = 'TITLE: ' + doc['title'] + '\n' + 'PARAGRAPH: ' + doc['paragraph'] + '\n\n' + 'Q: ' + doc['question'] + '\n\n' + 'A: '
if include_target:
text += doc['answer']
return text
def evaluate(self, docs, lm):
pass
def doc_to_text(self, doc):
return 'TITLE: ' + doc['title'] + '\n' + 'PARAGRAPH: ' + doc['paragraph'] + '\n\n' + 'Q: ' + doc['question'] + '\n\n' + 'A: '
def doc_to_target(self, doc):
return doc['answer']
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
from . common import HFTask
from ..utils_stream import X, each, apply, join, filt, one
import collections
import datasets
import numpy as np
from lm_eval.base import rf, mean
from . common import HFTask
import os
from functools import reduce
import operator
from tqdm import tqdm
import json
class each:
def __init__(self, f):
self.f = f
def __rrshift__(self, other):
return list(map(self.f, other))
class RACE(HFTask):
@@ -9,6 +23,7 @@ class RACE(HFTask):
DATASET_NAME = "high"
cache = {}
letter_to_num = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
def has_training_docs(self):
return True
@@ -20,7 +35,8 @@ class RACE(HFTask):
return True
def _collate_data(self, set):
if set in self.cache: return self.cache[set]
if set in self.cache:
return self.cache[set]
# One big issue with HF's implementation of this dataset: it makes a
# separate document for each question; meanwhile, in the GPT3 paper it
# is shown that one document is made per passage.
@@ -54,17 +70,80 @@ class RACE(HFTask):
# TODO: figure out description
return ""
def doc_to_text(self, doc, include_target=True):
r = "Article:\n" + doc['article'] + '\n\n'
r += doc['problems'] >> apply(enumerate) >> each(
lambda x: 'Q: ' + x[1]['question'] + '\n\nA:'
+ ((' ' + x[1]['options'][['A', 'B', 'C', 'D'].index(x[1]['answer'])]) \
if x[0] != len(doc['problems']) - 1 or include_target else '')) \
>> join('\n\n')
return r
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: implement
raise NotImplementedError()
\ No newline at end of file
@classmethod
def get_answer_option(cls, problem):
answer = cls.letter_to_num[problem['answer']]
return problem['options'][answer]
@classmethod
def last_problem(cls, doc):
return doc['problems'][-1]
def doc_to_text(self, doc):
text = 'Article: ' + doc['article'] + '\n\n'
for problem in doc['problems'][:-1]:
if problem['question'][-6:] == '  _  .':
text += problem['question'][-5:] + self.get_answer_option(problem) + '\n'
else:
question = 'Question: ' + problem['question'] + '\n'
answer = 'Answer: ' + self.get_answer_option(problem) + '\n'
text += question + answer
text += self.last_problem(doc)['question']
return text
def doc_to_target(self, doc):
return " " + self.get_answer_option(self.last_problem(doc))
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
problem = self.last_problem(doc)
ll_choices = [
rf.loglikelihood(ctx, " " + problem['options'][i])[0]
for i in range(4)
]
return ll_choices
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
gold = self.letter_to_num[self.last_problem(doc)['answer']]
pred = np.argmax(results)
return {
"acc": int(pred == gold)
}
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {
"acc": mean
}
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {
"acc": True
}
import json
import random
import os
from lm_eval.base import MultipleChoiceTask, rf, mean
from tqdm import auto as tqdm_lib
from . common import simple_accuracy_metric
import numpy as np
from ..utils import sh
class SATAnalogies(MultipleChoiceTask):
NEEDS_MANUAL_DL = True
def __init__(self):
super().__init__()
def download(self):
# We should be using a checksum here.
# The canonical sha256 hash is below:
# 9dece377d8d57253ef8c78370ff15de0bb1d9e90a82c815a67ba1e621e921bfc
if not os.path.exists('data/sat/SAT-package-V3.txt'):
raise NotImplementedError('SAT Analogies dataset is not provided. Follow instructions on https://aclweb.org/aclwiki/SAT_Analogy_Questions_(State_of_the_art) to locate.')
def has_training_docs(self):
return False
def has_validation_docs(self):
return True
def has_test_docs(self):
return False
def training_docs(self):
return []
def test_docs(self):
return []
def validation_docs(self):
data = []
with open("data/sat/SAT-package-V3.txt", "r") as f:
record = []
for line in f:
line = line.strip()
if len(line) == 0 and record:
data.append(record)
record = []
elif len(line) > 0 and line[0] == '#':
continue
else:
record.append(line)
data.append(record)
for record in data:
source = record[-8]
query = record[-7]
choices = record[-6:-1]
answer_key = record[-1]
doc = {
'source': source,
'query': query.split(' ')[:2],
'choices': ["{} is to {}".format(*c.split(' ')[:2]) for c in choices],
'gold': ['a','b','c','d','e'].index(answer_key.strip()),
}
yield doc
def fewshot_description(self):
# TODO: figure out actual description
return ""
def doc_to_text(self, doc):
return "{} is to {} as".format(*doc['query'])
import os
import json
from ..utils import sh
from lm_eval.base import MultipleChoiceTask, rf, mean
import zipfile
class SciQ(MultipleChoiceTask):
# Multiple languages and multiple years
def download(self):
if not os.path.exists('data/sciq'):
os.mkdir('data/sciq')
sh((
"wget https://ai2-public-datasets.s3.amazonaws.com/sciq/SciQ.zip -O data/sciq/SciQ.zip"
))
with zipfile.ZipFile("data/sciq/SciQ.zip", "r") as zf:
zf.extractall("data/sciq/")
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def _convert_standard(self, doc):
choices = [
doc["distractor1"],
doc["distractor2"],
doc["distractor3"],
doc["correct_answer"],
]
src = doc['support']
out_doc = {
"source" : src,
"query" : doc['question'],
"choices" : choices,
"gold" : 3,
}
return out_doc
def load_docs(self, textfilename):
with open(textfilename, 'r') as j:
docs = json.loads(j.read())
for record in docs:
yield self._convert_standard(record)
def fewshot_description(self):
# Average ctx length in labelled dataset is 238.9
# 2 few-shot examples push it beyond the context window
return ""
def training_docs(self):
return self.load_docs("data/sciq/SciQ dataset-2 3/train.json")
def validation_docs(self):
return self.load_docs("data/sciq/SciQ dataset-2 3/valid.json")
def test_docs(self):
return self.load_docs("data/sciq/SciQ dataset-2 3/test.json")
def doc_to_text(self, doc):
return "{}\nQuestion: {}\nAnswer:".format(doc["source"], doc["query"]).strip()
\ No newline at end of file
@@ -26,19 +26,61 @@ class SQuAD(HFTask):
return self.data["validation"]
def fewshot_description(self):
# TODO: redo description
return "Title: The_Title_of_It\n\nBackground: A text passage as background to answer the question with.\n\nQ: Question about the passage.\n\nA: Answer."
def doc_to_text(self, doc, include_target=True):
text = 'Title: ' + doc['title'] + '\n\n' + 'Background: ' + doc['context'] + '\n\n' + 'Q: ' + doc['question'] + '\n\n' + 'A: '
if include_target:
answer_list = doc['answers']['text']
if len(answer_list) > 0:
answer = answer_list[0]
else:
answer = 'unanswerable'
text += answer
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Write evaluation function
raise NotImplementedError()
\ No newline at end of file
def doc_to_text(self, doc):
return 'Title: ' + doc['title'] + '\n\n' + 'Background: ' + doc['context'] + '\n\n' + 'Q: ' + doc['question'] + '\n\n' + 'A: '
def doc_to_target(self, doc):
answer_list = doc['answers']['text']
if len(answer_list) > 0:
answer = answer_list[0]
else:
answer = 'unanswerable'
return answer
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
import json
import random
from lm_eval.base import Dataset
from lm_eval.base import Task
from ..utils import sh
import csv
class StoryCloze(Dataset):
def __init__(self):
self.download()
class StoryCloze(Task):
NEEDS_MANUAL_DL = True
def download(self):
#TODO: replace with Eye link
pass
@@ -30,21 +30,63 @@ class StoryCloze(Dataset):
def validation_docs(self):
return self.load_doc("data/storycloze/cloze_test_val__winter2018-cloze_test_ALL_val - 1 - 1.csv")
return self.load_doc("data/storycloze/cloze_test_val__winter2018-cloze_test_ALL_val - 1 - 1.csv")
def test_docs(self):
return self.load_doc("data/storycloze/cloze_test_test__winter2018-cloze_test_ALL_test - 1.csv")
def fewshot_description(self):
pass
# TODO: figure out fewshot description
return ""
def doc_to_text(self, doc, include_target=True):
if include_target:
return ' '.join([*doc[1:5],doc[int(doc[-1])-4]])
else:
return ' '.join([*doc[1:5]])
def doc_to_text(self, doc):
return ' '.join([*doc[1:5]])
def evaluate(self, docs, lm):
pass
def doc_to_target(self, doc):
return " " + doc[int(doc[-1]) - 4]
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
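# Illustrative sketch, not part of this diff: a StoryCloze CSV row is assumed to have
# 8 fields (id, 4 story sentences, 2 candidate endings, answer '1' or '2'), so
# doc[int(doc[-1]) - 4] above picks the correct ending via negative indexing.
doc = ["id0", "s1", "s2", "s3", "s4", "ending A", "ending B", "2"]
assert doc[int(doc[-1]) - 4] == "ending B"  # int('2') - 4 == -2 -> second ending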
"""
To-do:
- WSC requires free-form generation
- ReCoRD
"""
import numpy as np
from tqdm import auto as tqdm_lib
from . common import HFTask, simple_accuracy_metric, yesno
from . common import HFTask, yesno
from lm_eval.base import rf, mean, acc_all, metric_max_over_ground_truths
import sklearn
import transformers.data.metrics.squad_metrics as squad_metrics
from ..utils import general_detokenize
class BoolQ(HFTask):
@@ -17,23 +25,41 @@ class BoolQ(HFTask):
return True
def fewshot_description(self):
# TODO: figure out actual description
return "Read the following passages and answer each question with a yes or a no."
def doc_to_text(self, doc, include_target=True):
return f"{doc['passage']}\nquestion: {doc['question']}\nanswer: " \
+ (yesno(doc['label']) if include_target else "")
def doc_to_text(self, doc):
return f"{doc['passage']}\nQuestion: {doc['question']}\nAnswer:"
def doc_to_target(self, doc):
return " " + yesno(doc['label'])
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in docs:
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
preds.append(lm.loglikelihood(ctx, ' yes') > lm.loglikelihood(ctx, ' no'))
return simple_accuracy_metric(preds=preds, golds=golds)
def construct_requests(self, doc, ctx):
ll_yes, _ = rf.loglikelihood(ctx, ' yes')
ll_no, _ = rf.loglikelihood(ctx, ' no')
return ll_yes, ll_no
def process_results(self, doc, results):
ll_yes, ll_no = results
gold = doc["label"]
acc = 1. if (ll_yes > ll_no) == gold else 0.
return {
"acc": acc
}
def higher_is_better(self):
return {
"acc": True
}
def aggregation(self):
return {
"acc": mean
}
class CommitmentBank(HFTask):
@@ -49,34 +75,62 @@ class CommitmentBank(HFTask):
def has_test_docs(self):
return True
def doc_to_text(self, doc, include_target=True):
text = "{}\nquestion:\t{}\ttrue, false or neither?\nanswer:".format(
def fewshot_description(self):
# TODO: figure out actual description
return "Given a premise and a hypothesis, classify whether the author of the premise is committed" \
"to the truth of the hypothesis. The three possible labels are true, false or neither."
def doc_to_text(self, doc):
return "{}\nQuestion: {}. True, False or Neither?\nAnswer:".format(
doc["premise"],
doc["hypothesis"],
)
if include_target:
# True = entailment
# False = contradiction
# Neither = neutral
text += " {}".format({0: "true", 1: "neither", 2: "false"}[doc["label"]])
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
probs = np.array([
lm.loglikelihood(ctx, ' true'),
lm.loglikelihood(ctx, ' neither'),
lm.loglikelihood(ctx, ' false'),
])
preds.append(np.argmax(probs))
return simple_accuracy_metric(preds=preds, golds=golds)
def doc_to_target(self, doc):
# True = entailment
# False = contradiction
# Neither = neutral
return " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])
def construct_requests(self, doc, ctx):
ll_true, _ = rf.loglikelihood(ctx, ' True')
ll_neither, _ = rf.loglikelihood(ctx, ' Neither')
ll_false, _ = rf.loglikelihood(ctx, ' False')
return ll_true, ll_neither, ll_false
def process_results(self, doc, results):
gold = doc["label"]
pred = np.argmax(results)
acc = 1. if pred == gold else 0.
return {
"acc": acc,
"f1": (pred, gold)
}
def higher_is_better(self):
return {
"acc": True,
"f1": True
}
@classmethod
def cb_multi_fi(cls, items):
preds, golds = zip(*items)
preds = np.array(preds)
golds = np.array(golds)
f11 = sklearn.metrics.f1_score(y_true=golds == 0, y_pred=preds == 0)
f12 = sklearn.metrics.f1_score(y_true=golds == 1, y_pred=preds == 1)
f13 = sklearn.metrics.f1_score(y_true=golds == 2, y_pred=preds == 2)
avg_f1 = mean([f11, f12, f13])
return avg_f1
def aggregation(self):
return {
"acc": mean,
"f1": self.cb_multi_fi,
}
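# Illustrative sketch, not part of this diff: cb_multi_fi above macro-averages the
# one-vs-rest F1 scores of the three CB labels. The (pred, gold) pairs are made up.
import numpy as np
import sklearn.metrics

items = [(0, 0), (1, 1), (2, 2), (2, 1)]
preds, golds = (np.array(x) for x in zip(*items))
f1s = [sklearn.metrics.f1_score(y_true=golds == k, y_pred=preds == k) for k in range(3)]
print(sum(f1s) / 3)  # ~0.78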
class Copa(HFTask):
@@ -92,32 +146,51 @@ class Copa(HFTask):
def has_test_docs(self):
return True
def doc_to_text(self, doc, include_target=True):
def fewshot_description(self):
# TODO: figure out actual description
return "Given a premise and one alternative with a causal relation to the premise and another without," \
"choose the more plausible alternative"
def doc_to_text(self, doc):
# Drop the period
connector = {
"cause": "because",
"effect": "therefore",
}[doc["question"]]
text = doc["premise"].strip()[:-1] + f" {connector} "
if include_target:
correct_choice = doc["choice1"] if doc["label"] == 0 else doc["choice2"]
# Connect the sentences
text += self.convert_choice(correct_choice)
return text
return doc["premise"].strip()[:-1] + f" {connector}"
def doc_to_target(self, doc):
correct_choice = doc["choice1"] if doc["label"] == 0 else doc["choice2"]
# Connect the sentences
return " " + self.convert_choice(correct_choice)
def construct_requests(self, doc, ctx):
choice1 = " " + self.convert_choice(doc["choice1"])
choice2 = " " + self.convert_choice(doc["choice2"])
ll_choice1, _ = rf.loglikelihood(ctx, choice1)
ll_choice2, _ = rf.loglikelihood(ctx, choice2)
return ll_choice1, ll_choice2
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
choice1 = " " + self.convert_choice(doc["choice1"])
choice2 = " " + self.convert_choice(doc["choice2"])
preds.append(lm.loglikelihood(ctx, choice2) > lm.loglikelihood(ctx, choice1))
return simple_accuracy_metric(preds=preds, golds=golds)
def process_results(self, doc, results):
gold = doc["label"]
pred = np.argmax(results)
acc = 1. if pred == gold else 0.
return {
"acc": acc
}
def higher_is_better(self):
return {
"acc": True
}
def aggregation(self):
return {
"acc": mean
}
@staticmethod
def convert_choice(choice):
@@ -138,46 +211,139 @@ class MultiRC(HFTask):
return True
def fewshot_description(self):
# TODO: figure out actual description
return "READING COMPREHENSION ANSWER KEY"
def doc_to_text(self, doc, include_target=True):
return f"{doc['paragraph']}\n\n{doc['question']}\n" \
+ (self.format_answer(answer=doc["answer"], label=doc["label"])
if include_target else "")
def doc_to_text(self, doc):
return f"{doc['paragraph']}\nQuestion: {doc['question']}\nAnswer:"
def doc_to_target(self, doc):
return self.format_answer(answer=doc["answer"], label=doc["label"])
@staticmethod
def format_answer(answer, label):
label_str = "True" if label else "False"
return f"[{label_str}] {answer}"
def evaluate(self, docs, lm, provide_description, num_fewshot):
preds = []
for doc in docs:
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
true_choice = self.format_answer(answer=doc["answer"], label=True)
false_choice = self.format_answer(answer=doc["answer"], label=False)
preds.append(
lm.loglikelihood(ctx, f' {true_choice}')
> lm.loglikelihood(ctx, f' {false_choice}')
)
# Only count as correct if all answers are labeled correctly for each question
question_scoring_dict = {}
for doc, pred in zip(docs, preds):
question_id = doc["idx"]["question"]
if question_id not in question_scoring_dict:
question_scoring_dict[question_id] = []
gold_label = doc["label"] == 1
question_scoring_dict[question_id].append(gold_label == pred)
acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
label_str = "yes" if label else "no"
return f"{label_str}, {answer}"
def construct_requests(self, doc, ctx):
true_choice = self.format_answer(answer=doc["answer"], label=True)
false_choice = self.format_answer(answer=doc["answer"], label=False)
ll_true_choice, _ = rf.loglikelihood(ctx, f' {true_choice}')
ll_false_choice, _ = rf.loglikelihood(ctx, f' {false_choice}')
return ll_true_choice, ll_false_choice
def process_results(self, doc, results):
pred = np.argmax(results)
return {
"acc": (pred, doc)
}
def higher_is_better(self):
return {
"acc": True
}
def aggregation(self):
return {
"acc": acc_all
}
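# Illustrative sketch, not part of this diff: acc_all_sketch is a hypothetical
# stand-in for lm_eval.base.acc_all, which only counts a question as correct when
# every one of its answer candidates is classified correctly, mirroring the
# per-question grouping in the old evaluate() above.
import numpy as np

def acc_all_sketch(items):
    # items: (pred, doc) pairs as emitted by process_results above
    per_question = {}
    for pred, doc in items:
        qid = doc["idx"]["question"]
        per_question.setdefault(qid, []).append(pred == doc["label"])
    return float(np.mean([int(all(v)) for v in per_question.values()]))

items = [
    (1, {"idx": {"question": 0}, "label": 1}),
    (0, {"idx": {"question": 0}, "label": 0}),
    (1, {"idx": {"question": 1}, "label": 0}),
]
assert acc_all_sketch(items) == 0.5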
class ReCoRD(HFTask):
DATASET_PATH = "super_glue"
DATASET_NAME = "record"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return False
def fewshot_description(self):
# TODO: figure out actual description
return ""
def training_docs(self):
# In ReCoRD, each doc manifests multiple "examples" in the context of few shot example packing.
# Each doc consists of multiple answer candidates, each of which is scored yes/no.
# Hence, we use one "doc" for each (context + passage, answer) pair.
# Moreover, we only use the correct answers for context packing
# (This is not an issue for evaluation, where we can directly score multiple candidates at once).
if self._training_docs is None:
self._training_docs = []
for doc in self.data["train"]:
for entity in list(set(doc["entities"])):
self._training_docs.append({
"passage": doc["passage"],
"query": doc["query"],
"entity": entity,
"label": entity in doc["answers"],
})
return self._training_docs
def validation_docs(self):
# For evaluation we keep one doc per passage, so that every candidate
# entity can be scored against the same context in process_results,
# which reads doc["entities"] and doc["answers"].
for doc in self.data["validation"]:
yield {
"passage": doc["passage"],
"query": doc["query"],
"entities": list(set(doc["entities"])),
"answers": list(set(doc["answers"])),
}
def doc_to_text(self, doc):
initial_text, *highlights = doc["passage"].strip().split("\n@highlight\n")
text = initial_text + "\n\n"
for highlight in highlights:
text += f" - {highlight}.\n"
return text
@classmethod
def format_answer(cls, query, entity):
return f' - {query}'.replace("@placeholder", entity)
def doc_to_target(self, doc):
return self.format_answer(query=doc["query"], entity=doc["entity"])
def construct_requests(self, doc, ctx):
requests = [
rf.loglikelihood(ctx, self.format_answer(query=doc["query"], entity=entity))
for entity in doc["entity"]
]
return requests
def process_results(self, doc, results):
# ReCoRD's evaluation is actually deceptively simple:
# - Pick the maximum likelihood prediction entity
# - Evaluate the accuracy and token F1 PER EXAMPLE
# - Average over all examples
max_idx = np.argmax(np.array(results))
prediction = doc["entities"][max_idx]
gold_label_set = list(set(doc["answers"]))
f1 = metric_max_over_ground_truths(squad_metrics.compute_f1, prediction, gold_label_set)
em = metric_max_over_ground_truths(squad_metrics.compute_exact, prediction, gold_label_set)
return {
"f1": f1,
"em": em,
}
def higher_is_better(self):
return {
"f1": True,
"em": True,
}
def aggregation(self):
return {
"f1": mean,
"em": mean,
}
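# Illustrative sketch, not part of this diff: metric_max_sketch is a hypothetical
# stand-in for metric_max_over_ground_truths, which scores the predicted entity
# against every gold answer and keeps the best value, e.g. for exact match:
def metric_max_sketch(metric_fn, prediction, ground_truths):
    return max(metric_fn(prediction, gt) for gt in ground_truths)

exact_match = lambda pred, gold: float(pred == gold)
assert metric_max_sketch(exact_match, "Paris", ["paris", "Paris"]) == 1.0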
@@ -194,31 +360,51 @@ class WordsInContext(HFTask):
def has_test_docs(self):
return True
def doc_to_text(self, doc, include_target=True):
text = "{}\n{}\nquestion\tIs the word '{}' used in the same way in the" \
" two sentences above?\nanswer:".format(
def fewshot_description(self):
# TODO: figure out actual description
return ""
def doc_to_text(self, doc):
return "Sentence 1: {}\nSentence 2: {}\nQuestion: Is the word '{}' used in the same way in the" \
" two sentences above?\nAnswer:".format(
doc["sentence1"],
doc["sentence2"],
doc["sentence1"][doc["start1"]:doc["end1"]],
)
if include_target:
text += " {}".format({0: "no", 1: "yes"}[doc["label"]])
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
preds.append(lm.loglikelihood(ctx, ' yes') > lm.loglikelihood(ctx, ' no'))
return simple_accuracy_metric(preds=preds, golds=golds)
def doc_to_target(self, doc):
return " {}".format({0: "no", 1: "yes"}[doc["label"]])
def construct_requests(self, doc, ctx):
ll_yes, _ = rf.loglikelihood(ctx, ' yes')
ll_no, _ = rf.loglikelihood(ctx, ' no')
return ll_yes, ll_no
def process_results(self, doc, results):
ll_yes, ll_no = results
gold = doc["label"]
acc = 1. if (ll_yes > ll_no) == gold else 0.
return {
"acc": acc
}
def higher_is_better(self):
return {
"acc": True
}
def aggregation(self):
return {
"acc": mean
}
class SGWinogradSchemaChallenge(HFTask):
# Note: This implementation differs from Fig G.32 because this is the SuperGLUE,
# binary version of the task.
DATASET_PATH = "super_glue"
DATASET_NAME = "wsc"
@@ -234,10 +420,10 @@ class SGWinogradSchemaChallenge(HFTask):
def training_docs(self):
if self.has_training_docs():
if self._training_docs is None:
# GPT-3 Paper's format only uses positive examples
# GPT-3 Paper's format only uses positive examples for fewshot "training"
self._training_docs = [
doc for doc in
self._load_nlp_dataset()["train"]
self.data["train"]
if doc["label"]
]
return self._training_docs
@@ -248,59 +434,47 @@ class SGWinogradSchemaChallenge(HFTask):
"For each passage, you must identify which noun the pronoun marked in *bold*" \
" refers to.\n====="
def doc_to_text(self, doc, include_target=True):
def doc_to_text(self, doc):
raw_passage = doc["text"]
passage = (
raw_passage[:doc["span2_index"]]
+ "*{}*".format(doc["span2_text"])
+ raw_passage[doc["span2_index"] + len(doc["span2_text"]):]
)
# NOTE: HuggingFace span indices are word-based not character-based.
pre = " ".join(raw_passage.split()[:doc["span2_index"]])
post = raw_passage[len(pre) + len(doc["span2_text"]) + 1:]
passage = general_detokenize(pre + " *{}*".format(doc['span2_text']) + post)
noun = doc["span1_text"]
pronoun = doc["span2_text"]
text = (
f"Passage: {passage}\n"
+ f"Question: In the passage above, what does the pronoun \"*{pronoun}*\" refer to?\n"
+ f"Question: In the passage above, does the pronoun \"*{pronoun}*\" refer to \"*{noun}*\"?\n"
+ "Answer:"
)
if include_target:
text += " {}".format(doc["span1_text"])
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
to_predict = " " + doc["span1_text"]
num_tokens = len(lm.tokenizer.tokenize(to_predict))
generated = lm.generate(
context=ctx,
max_gen_length=num_tokens,
)
preds.append(1 if generated == to_predict else 0)
return simple_accuracy_metric(preds=preds, golds=golds)
class RTE(HFTask):
DATASET_PATH = "super_glue"
DATASET_NAME = "rte"
def doc_to_target(self, doc):
return " " + yesno(doc['label'])
def fewshot_description(self):
#TODO: implement
pass
def doc_to_text(self, doc, include_target=True):
if include_target:
if doc['label'] == 0:
answer = 'True'
else:
answer = 'False'
return ''.join([doc['premise'], '\nquestion: ',doc['hypothesis'], ' True or False?\nanswer: ', answer])
else:
return ''.join([doc['premise'], '\nquestion: ',doc['hypothesis'], ' True or False?\nanswer: '])
def evaluate(self, docs, lm, provide_description, num_fewshot):
#TODO:
pass
def construct_requests(self, doc, ctx):
ll_yes, _ = rf.loglikelihood(ctx, ' yes')
ll_no, _ = rf.loglikelihood(ctx, ' no')
return ll_yes, ll_no
def process_results(self, doc, results):
ll_yes, ll_no = results
gold = doc["label"]
acc = 1. if (ll_yes > ll_no) == gold else 0.
return {
"acc": acc
}
def higher_is_better(self):
return {
"acc": True
}
def aggregation(self):
return {
"acc": mean
}
import os
import json
import random
from lm_eval.base import Dataset
from lm_eval.base import Task, mean, rf
from ..utils import sh
class TriviaQA(Dataset):
def __init__(self):
self.download()
class TriviaQA(Task):
def download(self):
#pass
#TODO: don't download if files already there
sh("""
mkdir -p data/triviaqa
wget http://nlp.cs.washington.edu/triviaqa/data/triviaqa-unfiltered.tar.gz -O data/triviaqa/trivia_qa-unfiltered.tar.gz
tar -xf data/triviaqa/trivia_qa-unfiltered.tar.gz
mv triviaqa-unfiltered/ data/triviaqa/
""")
if not os.path.exists('data/triviaqa'):
sh("""
mkdir -p data/triviaqa
wget http://nlp.cs.washington.edu/triviaqa/data/triviaqa-unfiltered.tar.gz -O data/triviaqa/trivia_qa-unfiltered.tar.gz
tar -xf data/triviaqa/trivia_qa-unfiltered.tar.gz
mv triviaqa-unfiltered/ data/triviaqa/
""")
def has_training_docs(self):
return True
@@ -23,7 +21,7 @@ class TriviaQA(Dataset):
return True
def has_test_docs(self):
return True
return False
def training_docs(self):
return json.load(open('data/triviaqa/triviaqa-unfiltered/unfiltered-web-train.json'))['Data']
@@ -35,13 +33,45 @@ class TriviaQA(Dataset):
return json.load(open('data/triviaqa/triviaqa-unfiltered/unfiltered-web-test.json'))['Data']
def fewshot_description(self):
pass
# TODO: figure out fewshot description
return ""
def doc_to_text(self, doc, include_target=True):
if include_target:
return ''.join(['Q: ', doc['Question'], '\n\n','A: ', doc['Answer']['Aliases'][0]])
else:
return ''.join(['Q: ', doc['Question'], '\n\n','A: '])
def evaluate(self, docs, lm):
pass
def doc_to_text(self, doc):
return ''.join(['Q: ', doc['Question'], '\n\n', 'A:'])
def doc_to_target(self, doc):
return " " + doc['Answer']['Value']
def _remove_prefixes(self, aliases):
# Optimization: Remove any alias that has a strict prefix elsewhere in the list
# We can do this because if the shorter prefix is an acceptable greedy completion, any longer alias starting with it is redundant
aliases.sort()
ret = [aliases[0]]
for alias in aliases[1:]:
if not alias.startswith(ret[-1]):
ret.append(alias)
return ret
def construct_requests(self, doc, ctx):
ret = []
for alias in self._remove_prefixes(doc['Answer']['Aliases']):
_, is_prediction = rf.loglikelihood(ctx, " " + alias)
ret.append(is_prediction)
return ret
def process_results(self, doc, results):
return {
"acc": float(any(results))
}
def aggregation(self):
return {
"acc": mean,
}
def higher_is_better(self):
return {
"acc": True
}
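# Illustrative sketch, not part of this diff: the prefix-removal optimization above
# keeps only aliases that do not extend a shorter alias, since a greedy match on the
# shorter prefix already counts the answer as correct.
def remove_prefixes_sketch(aliases):
    aliases = sorted(aliases)
    kept = [aliases[0]]
    for alias in aliases[1:]:
        if not alias.startswith(kept[-1]):
            kept.append(alias)
    return kept

assert remove_prefixes_sketch(["Mount Everest", "Mount", "Everest"]) == ["Everest", "Mount"]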
from . common import HFTask
from lm_eval.base import mean, rf
class WebQs(HFTask):
DATASET_PATH = "web_questions"
@@ -17,16 +18,45 @@ class WebQs(HFTask):
# TODO: figure out description
return ""
def doc_to_text(self, doc, include_target=True):
print(doc)
q = "Q: " + doc['question'] + '\n'
def doc_to_text(self, doc):
return "Question: " + doc['question'] + '\nAnswer:'
def doc_to_target(self, doc):
# this picks one answer to be the "correct" one, despite sometimes
# multiple correct answers being possible.
# TODO: make sure we're actually handling multi-answer correctly
a = "A:" + ((" " + doc['answers'][0]) if include_target else '')
return q + a
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: implement
raise NotImplementedError()
\ No newline at end of file
return " " + doc['answers'][0]
def _remove_prefixes(self, aliases):
# Optimization: Remove any alias that has a strict prefix elsewhere in the list
# We can do this because if the shorter prefix is an acceptable greedy completion, any longer alias starting with it is redundant
aliases.sort()
ret = [aliases[0]]
for alias in aliases[1:]:
if not alias.startswith(ret[-1]):
ret.append(alias)
return ret
def construct_requests(self, doc, ctx):
ret = []
for alias in self._remove_prefixes(doc['answers']):
_, is_prediction = rf.loglikelihood(ctx, " " + alias)
ret.append(is_prediction)
return ret
def process_results(self, doc, results):
return {
"acc": float(any(results))
}
def aggregation(self):
return {
"acc": mean,
}
def higher_is_better(self):
return {
"acc": True
}
\ No newline at end of file
@@ -9,22 +9,120 @@ class WikiText103(NLP_TASK):
NLP_NAME = "wikitext-103-raw-v1"
def fewshot_description(self):
# TODO: figure out fewshot description
return ""
def doc_to_text(self, doc, include_target=True):
return doc['text']
def evaluate(self, docs, lm, provide_description, num_fewshot):
def doc_to_text(self, doc):
# TODO: implement
pass
def doc_to_target(self, doc):
# TODO: implement
pass
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
class WikiText2(NLP_TASK):
NLP_PATH = "wikitext"
NLP_NAME = "wikitext-2-raw-v1"
def fewshot_description(self):
# TODO: figure out fewshot description
return ""
def doc_to_text(self, doc, include_target=True):
return doc['text']
def evaluate(self, docs, lm, provide_description, num_fewshot):
pass
\ No newline at end of file
def doc_to_text(self, doc):
# TODO: implement
pass
def doc_to_target(self, doc):
# TODO: implement
pass
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
import numpy as np
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, matthews_corrcoef
from tqdm import auto as tqdm_lib
from . common import HFTask, simple_accuracy_metric, yesno
from . common import HFTask
from lm_eval.base import rf, mean
"""
This evaluation of Winogrande uses partial evaluation as described by
Trinh & Le in Simple Method for Commonsense Reasoning (2018).
Reference: https://arxiv.org/abs/1806.02847
"""
class Winogrande(HFTask):
DATASET_PATH = "winogrande"
@@ -17,34 +22,80 @@ class Winogrande(HFTask):
def has_test_docs(self):
return True
def training_docs(self):
if self.has_training_docs():
return self.data["train"]
def fewshot_description(self):
# TODO: redo description
return "Winograd schema sentence including a either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in."
def validation_docs(self):
if self.has_validation_docs():
return self.data["validation"]
@classmethod
def partial_context(cls, doc):
# Substitute the pronoun in the sentence with each candidate choice
# and ignore everything after.
pronoun_loc = doc["sentence"].index("_")
context1 = doc["sentence"][:pronoun_loc] + doc["option1"]
context2 = doc["sentence"][:pronoun_loc] + doc["option2"]
return context1, context2
def test_docs(self):
if self.has_test_docs():
return self.data["test"]
@classmethod
def partial_target(cls, doc):
# The target is everything after the document specified pronoun.
pronoun_loc = doc["sentence"].index("_") + 1
return doc["sentence"][pronoun_loc:].strip()
def fewshot_description(self):
return "Winograd schema sentence including a either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in."
def doc_to_text(self, doc):
context1, context2 = self.partial_context(doc)
return context1 + '\n' + context2 + '\n'
def doc_to_target(self, doc):
return self.partial_target(doc)
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
target = self.partial_target(doc)
context1, context2 = self.partial_context(doc)
ll_context1, _ = rf.loglikelihood(context1, " " + target)
ll_context2, _ = rf.loglikelihood(context2, " " + target)
return ll_context1, ll_context2
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
answer = int(doc["answer"]) - 1 # `- 1` b/c doc["answer"] ∈ {'1', '2'}
return {
"acc": np.argmax(results) == answer
}
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {
"acc": mean
}
def doc_to_text(self, doc, include_target=True):
text = doc['sentence']
if include_target:
answer_n = doc['answer']
if answer_n == '1':
answer = doc['option1']
elif answer_n == '2':
answer = doc['option2']
else:
raise ValueError("Winogrande from HF datasets contained an invalid answer key")
text = text.replace("_", answer)
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Write evaluation function
raise NotImplementedError()
\ No newline at end of file
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {
"acc": True
}
import json
import random
import os
from lm_eval.base import Dataset
from ..utils import sh
class WinogradSchemaChallenge273(Dataset):
def __init__(self):
super().__init__()
def download(self):
if not os.path.exists('data/wsc273'):
sh("""
mkdir -p data/wsc273
wget https://git.cse.msu.edu/bakerb15/nlp-final-project/raw/master/Winogard/reproduce/commonsense_test/wsc273.json -O data/wsc273/wsc273.json
""")
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def training_docs(self):
return []
def validation_docs(self):
return []
def test_docs(self):
myjson = json.load(open('data/wsc273/wsc273.json'))
return self.load_doc(myjson)
def fewshot_description(self):
# This format is ONLY for the purposes of deduplication. For the task evaluation,
# we'll need to find a new strategy that meets the needs of this particular task.
return "Winograd schema sentence with correct continuation. True. Winograd schema sentence with incorrect continuation. False."
def load_doc(self, myjson):
docs = []
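# wsc273.json stores each of the 273 schemas as two consecutive entries,
# one with the correct substitution and one with the incorrect one,
# hence iterating in steps of 2 and pairing item i with item i+1.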
for i in range(0, 273 * 2, 2):
item1 = myjson[i]
item2 = myjson[i+1]
if item1['question_id'] != item2['question_id']:
raise ValueError("WSC273 has missing completion pair.")
question_id = item1['question_id']
if item1['correctness'] == True:
doc = {
'id': question_id,
'completions': {
'T': item1['substitution'],
'F': item2['substitution'],
},
}
if item2['correctness'] == True:
doc = {
'id': question_id,
'completions': {
'F': item1['substitution'],
'T': item2['substitution'],
},
}
docs.append(doc)
return docs
def doc_to_text(self, doc, include_target=True):
# WSC273 is currently only writing out full examples. Partial evaluation needs implementing.
text = doc['completions']['T'] + ' True. ' + doc['completions']['F'] + ' False.'
return text
def evaluate(self, docs, lm):
# TODO: Write evaluation function
raise NotImplementedError()
import numpy as np
import random
from lm_eval.base import rf, mean
from . common import HFTask
"""
NOTE: This evaluation of Winograd Schema Challenge is based on `partial evaluation`
as described by Trinh & Le in A Simple Method for Commonsense Reasoning (2018).
See: https://arxiv.org/abs/1806.02847
"""
class WinogradSchemaChallenge273(HFTask):
DATASET_PATH = "winograd_wsc"
DATASET_NAME = "wsc273"
upper_pronouns = ["A", "An", "The", "She", "He",
"It", "They", "My", "His", "Her", "Their"]
def __init__(self):
super().__init__()
self.data = self.__clean_data()
def __clean_data(self):
# The HF implementation of `wsc273` is not `partial evaluation` friendly.
data = []
for doc in self.data["test"]:
doc["text"] = doc["text"].replace(" ", " ")
doc["options"][0] = self.__normalize_option(doc["options"][0], doc)
doc["options"][1] = self.__normalize_option(doc["options"][1], doc)
data.append(doc)
return {"test": data}
def __normalize_option(self, option, doc):
# Append `'s` to possessive determiner based options.
if doc["pronoun"].lower() in ["my", "his", "her", "our", "their"]:
option += "'s"
# Appropriately lowercase the pronoun in the option.
pronoun = option.split()[0]
start_of_sentence = doc["text"][doc['pronoun_loc'] - 2] == '.'
if not start_of_sentence and pronoun in self.upper_pronouns:
return option.replace(pronoun, pronoun.lower())
return option
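# Net effect (hypothetical example): with doc["pronoun"] == "his", a
# mid-sentence option "The drawing" is normalized to "the drawing's".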
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def fewshot_examples(self, k):
# NOTE: `super().fewshot_examples` samples from training docs which are
# not available for this test-set-only dataset.
return random.sample(list(self.test_docs()), k)
def fewshot_description(self):
# TODO: redo description
return "Winograd schema sentence with correct continuation. True. Winograd schema sentence with incorrect continuation. False."
@classmethod
def partial_context(cls, doc):
# Substitute the pronoun in the original text with each candidate
# choice and ignore everything after.
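# Same partial-evaluation scheme as Winogrande above: the continuation
# after the pronoun is shared and is scored under each candidate context.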
context1 = doc["text"][:doc["pronoun_loc"]] + doc["options"][0]
context2 = doc["text"][:doc["pronoun_loc"]] + doc["options"][1]
return context1, context2
@classmethod
def partial_target(cls, doc):
# The target is everything after the document specified pronoun.
start_index = doc["pronoun_loc"] + len(doc["pronoun"])
return doc["text"][start_index:].strip()
def doc_to_text(self, doc):
context1, context2 = self.partial_context(doc)
return context1 + '\n' + context2 + '\n'
def doc_to_target(self, doc):
return self.partial_target(doc)
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
target = self.partial_target(doc)
context1, context2 = self.partial_context(doc)
ll_context1, _ = rf.loglikelihood(context1, " " + target)
ll_context2, _ = rf.loglikelihood(context2, " " + target)
return ll_context1, ll_context2
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
return {
"acc": np.argmax(results) == doc["label"]
}
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {
"acc": mean
}
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {
"acc": True
}
import os
import re
class ExitCodeError(Exception):
......@@ -25,3 +26,27 @@ def simple_parse_args_string(args_string):
k, v = arg.split("=")
args_dict[k] = v
return args_dict
def join_iters(iters):
for iter in iters:
yield from iter
def chunks(iter, n):
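# Yields successive lists of at most n items from iter; the final chunk may
# be shorter, e.g. list(chunks(range(5), 2)) -> [[0, 1], [2, 3], [4]].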
arr = []
for x in iter:
arr.append(x)
if len(arr) == n:
yield arr
arr = []
if arr: yield arr
def general_detokenize(string):
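# Undoes common tokenization artifacts, e.g. "Do n't worry , it 's fine ."
# becomes "Don't worry, it's fine."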
string = string.replace(" n't", "n't")
string = string.replace(" )", ")")
string = string.replace("( ", "(")
string = string.replace("\" ", "\"")
string = string.replace(" \"", "\"")
string = re.sub(r" (['.,])", r"\1", string)
return string
\ No newline at end of file
import os
from functools import reduce
import operator
import lm_dataformat as lmd
from tqdm import tqdm
import json
class ExitCodeError(Exception): pass
def sh(x):
if os.system(x): raise ExitCodeError()
def ls(x):
return [x + '/' + fn for fn in os.listdir(x)]
def lsr(x):
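# Recursively collect every file path under x (directories are expanded,
# only file paths are returned).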
if os.path.isdir(x):
return reduce(operator.add, map(lsr, ls(x)), [])
else:
return [x]
def fwrite(fname, content):
with open(fname, 'w') as fh:
fh.write(content)
def fread(fname):
with open(fname) as fh:
return fh.read()
class each:
def __init__(self, f):
self.f = f
def __rrshift__(self, other):
return list(map(self.f, other))
class filt:
def __init__(self, f):
self.f = f
def __rrshift__(self, other):
return list(filter(self.f, other))
class apply:
def __init__(self, f):
self.f = f
def __rrshift__(self, other):
return self.f(other)
class one:
def __rrshift__(self, other):
try:
if isinstance(other, list):
assert len(other) == 1
return other[0]
return next(other)
except:
return None
class join:
def __init__(self, sep):
self.sep = sep
def __rrshift__(self, other):
if other is None: return
try:
return self.sep.join(other)
except:
return None
Y = object()
def id(x):
return x
class Reflective:
def __getattribute__(self, f):
def _fn(*args, **kwargs):
return lambda x: x.__getattribute__(f)(*args, **kwargs)
return _fn
def __getitem__(self, a):
return lambda x: x[a]
def __mul__(self, other):
if other == Y:
def _f(x, y=None):
if y is None:
x, y = x
return x * y
return _f
return lambda x: x * other
def __rmul__(self, other):
if other == Y:
def _f(x, y=None):
if y is None:
x, y = x
return y * x
return _f
return lambda x: other * x
def __add__(self, other):
if other == Y:
def _f(x, y=None):
if y is None:
x, y = x
return x + y
return _f
return lambda x: x + other
def __radd__(self, other):
if other == Y:
def _f(x, y=None):
if y is None:
x, y = x
return y + x
return _f
return lambda x: other + x
# (b -> a -> b) -> b -> [a] -> b
def foldl(f, init, arr):
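# e.g. foldl(lambda acc, x: acc + x, 0, [1, 2, 3]) == 6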
curr = init
for elem in arr:
curr = f(curr, elem)
return curr
# (a -> b -> b) -> b -> [a] -> b
def foldr(f, init, arr):
curr = init
for elem in arr[::-1]:
curr = f(elem, curr)
return curr
def comp(*fs):
if len(fs) == 1:
return fs[0]
def _f(x):
for f in fs[::-1]:
x = f(x)
return x
return _f
X = Reflective()
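# Usage sketch for the pipeline helpers above (illustrative only):
#   [1, 2, 3] >> each(X * 2)            -> [2, 4, 6]
#   [1, 2, 3] >> filt(lambda v: v > 1)  -> [2, 3]
#   ["hello"] >> one                    -> "hello"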
\ No newline at end of file