Unverified Commit 538be6da authored by Charles Foster, committed by GitHub

Merge pull request #7 from cfoster0/greedyuntil

Fork update and long-overdue SQuAD fixes
parents eb4c8407 5be42b4d
from lm_eval.base import MultipleChoiceTask
from best_download import download_file
from pathlib import Path
class LogiQA(MultipleChoiceTask):
DATASET_PATH = Path("data/logiqa")
def download(self):
if self.DATASET_PATH.exists():
return
Path.mkdir(self.DATASET_PATH)
base_url = "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master"
splits = [
{"name": "Train", "checksum": "7d5bb1f58278e33b395744cd2ad8d7600faa0b3c4d615c659a44ec1181d759fa"},
{"name": "Eval", "checksum": "4c49e6753b7262c001506b9151135abf722247035ab075dad93acdea5789c01f"},
{"name": "Test", "checksum": "359acb78c37802208f7fde9e2f6574b8526527c63d6a336f90a53f1932cb4701"}
]
for split in splits:
file = self.DATASET_PATH / f"{split['name']}.txt"
download_file(f"{base_url}/{split['name']}.txt", str(file), split["checksum"])
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def _convert_standard(self, doc):
return {
"query": "Passage: " + doc["passage"] + "\nQuestion: " + doc["question"] + "\nAnswer:",
"choices": doc["options"],
"gold": ["a", "b", "c", "d"].index(doc["answerKey"])
}
def _load_docs(self, filename):
def normalize(text):
return text.replace(".", ". ").strip()
with open(filename, 'r') as f:
docs = f.read().strip().split("\n\n")
for rawdoc in docs:
rawdoc = rawdoc.split("\n")
doc = {
"answerKey": rawdoc[0].strip(),
"passage": normalize(rawdoc[1]),
"question": normalize(rawdoc[2]),
"options": [normalize(option[2:]) for option in rawdoc[3:]]
}
yield self._convert_standard(doc)
def training_docs(self):
return self._load_docs(self.DATASET_PATH / "Train.txt")
def validation_docs(self):
return self._load_docs(self.DATASET_PATH / "Eval.txt")
def test_docs(self):
return self._load_docs(self.DATASET_PATH / "Test.txt")
def fewshot_description(self):
# TODO: figure out actual description
return ""
def doc_to_text(self, doc):
return doc["query"]
from . common import HFTask
from lm_eval.base import mean, rf, MultipleChoiceTask
import re
class MathQA(HFTask, MultipleChoiceTask):
DATASET_PATH = "math_qa"
DATASET_NAME = None
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def _convert_standard(self, doc):
answer_idx = ['a', 'b', 'c', 'd', 'e'].index(doc['correct'])
choices = [c[4:].rstrip(" ,") for c in re.findall(r"[abcd] \) .*?, |e \) .*?$", doc['options'])]
out_doc = {
"query": "Question: " + doc['Problem'] +"\nAnswer:",
"choices": choices,
"gold": answer_idx,
}
return out_doc
def _load_docs(self, docs):
for record in docs:
yield self._convert_standard(record)
def training_docs(self):
docs = super().training_docs()
return self._load_docs(docs)
def validation_docs(self):
docs = super().validation_docs()
return self._load_docs(docs)
def test_docs(self):
docs = super().test_docs()
return self._load_docs(docs)
def fewshot_description(self):
# TODO: figure out description
return ""
def doc_to_text(self, doc):
return doc["query"]
import numpy as np
from lm_eval.base import rf, mean
from lm_eval.base import rf
from ..metrics import mean
from . common import HFTask
......
......@@ -2,7 +2,8 @@ import numpy as np
import json
import random
from .common import HFTask
from lm_eval.base import rf, mean
from lm_eval.base import rf
from ..metrics import mean
class Pubmed_QA(HFTask):
......
import os
import numpy as np
from best_download import download_file
from lm_eval.base import MultipleChoiceTask, rf
from lm_eval.metrics import mean
import xml.etree.ElementTree as ET
import random
class QA4MRE(MultipleChoiceTask):
YEAR = None
def download(self):
year = self.YEAR
lang = "EN"
base_path = (
"http://nlp.uned.es/clef-qa/repository/js/scripts/downloadFile.php?"
"file=/var/www/html/nlp/clef-qa/repository/resources/QA4MRE/"
)
# TODO: add side tasks?
variable_year_path = {
2011: '2011/Training_Data/Goldstandard/',
2012: '2012/Main_Task/Training_Data/Goldstandard/Used_in_Evaluation/',
2013: '2013/Main_Task/Training_Data/Goldstandard/'
}
sha256sums = {
2011 : "6d2524952a3a015f2a82df785b85b5578681e3602ec276b4e72c01f4ebc50034",
2012 : "f9edaf408f8ac93f89a643a0d0b19263a1bb5ce64f19b2af10df279a656dfb24",
2013 : "c60e5aa4ec77e0493ef0b11d46bd1d74d58a499a3a2f871b8cf3af9536f0f094",
}
vpath = variable_year_path[year]
url_path = f"{base_path}{vpath}QA4MRE-{year}-{lang}_GS.xml"
if not os.path.exists("data/qa4mre"):
os.mkdir("data/qa4mre")
if not os.path.isfile(f"data/qa4mre/QA4MRE-{year}-{lang}"):
download_file(
url_path,
f"data/qa4mre/QA4MRE-{year}-{lang}_GS.xml",
checksum=sha256sums[year],
)
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def fewshot_examples(self, k):
# Since this task only has test docs, sample few-shot examples from them
if self._training_docs is None:
self._training_docs = list(self.test_docs())
return random.sample(self._training_docs, k)
def _convert_standard(self, question):
choices = [i.text for i in question.iter('answer')]
out_doc = {
"query" : question.find('q_str').text,
"choices": choices,
"gold" : int(question.find("./answer[@correct='Yes']").attrib["a_id"]) - 1,
}
return out_doc
def load_docs(self, textfilename, tfds=False):
tree = ET.parse(textfilename)
root = tree.getroot()
# TODO: the source passage is sometimes much larger than the context window;
# at the moment it just gets left-truncated by the LM automatically, and maybe that's good enough?
for reading_test in root.iter('reading-test'):
src = reading_test[0].text
src = src.strip().replace("\'", "'")
for qid, question in enumerate(reading_test.iter('q')):
out_doc = self._convert_standard(question)
out_doc['source'] = src
yield out_doc
def fewshot_description(self):
return ""
def test_docs(self):
return self.load_docs(f"data/qa4mre/QA4MRE-{self.YEAR}-EN_GS.xml")
def doc_to_text(self, doc):
return "{}\nQuestion: {}\nAnswer:".format(doc["source"], doc["query"])
class QA4MRE_2011(QA4MRE):
YEAR = 2011
class QA4MRE_2012(QA4MRE):
YEAR = 2012
class QA4MRE_2013(QA4MRE):
YEAR = 2013
import collections
import datasets
import numpy as np
from lm_eval.base import rf, mean
from lm_eval.base import rf
from ..metrics import mean
from . common import HFTask
import os
......
import json
import random
import os
from lm_eval.base import MultipleChoiceTask, rf, mean
from lm_eval.base import MultipleChoiceTask, rf
from ..metrics import mean
from tqdm import auto as tqdm_lib
from . common import simple_accuracy_metric
import numpy as np
......
import os
import json
from ..utils import sh
from lm_eval.base import MultipleChoiceTask, rf, mean
from lm_eval.base import MultipleChoiceTask, rf
from ..metrics import mean
import zipfile
from best_download import download_file
class SciQ(MultipleChoiceTask):
......@@ -10,9 +12,11 @@ class SciQ(MultipleChoiceTask):
def download(self):
if not os.path.exists('data/sciq'):
os.mkdir('data/sciq')
sh((
"wget https://ai2-public-datasets.s3.amazonaws.com/sciq/SciQ.zip -O data/sciq/SciQ.zip"
))
download_file(
'https://ai2-public-datasets.s3.amazonaws.com/sciq/SciQ.zip',
'data/sciq/SciQ.zip',
'7f3312f6ac6b09970b32942d106a8c44ec0dad46a0369f17d635aff8e348a87c',
)
with zipfile.ZipFile("data/sciq/SciQ.zip", "r") as zf:
zf.extractall("data/sciq/")
......@@ -48,8 +52,6 @@ class SciQ(MultipleChoiceTask):
yield self._convert_standard(record)
def fewshot_description(self):
# Average ctx length in labelled dataset is 238.9
# 2 few-shot examples push it beyond the context window
return ""
def training_docs(self):
......
import datasets
from lm_eval.base import rf, f1_score, mean
from math import exp
from lm_eval.base import rf
from lm_eval.metrics import f1_score, mean
from . common import HFTask
class SQuAD(HFTask):
......@@ -26,7 +28,7 @@ class SQuAD(HFTask):
return ""
def doc_to_text(self, doc):
return 'Title: ' + doc['title'] + '\n\n' + 'Background: ' + doc['context'] + '\n\n' + 'Q: ' + doc['question'] + '\n\n' + 'A:'
return 'Title: ' + doc['title'] + '\n\n' + 'Background: ' + doc['context'] + '\n\n' + 'Question: ' + doc['question'] + '\n\n' + 'Answer:'
def doc_to_target(self, doc):
answer_list = doc['answers']['text']
......@@ -48,7 +50,8 @@ class SQuAD(HFTask):
part of the document for `doc`.
"""
continuation = rf.greedy_until(ctx, ['\n'])
return continuation
is_unanswerable = rf.loglikelihood(ctx, [' unanswerable'])
return continuation, is_unanswerable
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
......@@ -62,15 +65,22 @@ class SQuAD(HFTask):
"""
squad_metric = datasets.load_metric("squad_v2")
predictions = {
continuation, is_unanswerable = results
logprob_unanswerable, is_greedy = is_unanswerable
no_answer_probability = exp(logprob_unanswerable)
predictions = [{
'id': doc['id'],
'prediction_text': results[0],
}
'prediction_text': continuation,
'no_answer_probability': no_answer_probability,
}]
references = {
references = [{
'id': doc['id'],
'answers': doc['answers'],
}
}]
metrics = squad_metric.compute(predictions=predictions, references=references)
......
......@@ -5,7 +5,8 @@ To-do:
"""
import numpy as np
from . common import HFTask, yesno
from lm_eval.base import rf, mean, acc_all, metric_max_over_ground_truths
from lm_eval.base import rf
from ..metrics import mean, acc_all, metric_max_over_ground_truths
import sklearn
import transformers.data.metrics.squad_metrics as squad_metrics
from ..utils import general_detokenize
......@@ -218,7 +219,7 @@ class MultiRC(HFTask):
return f"{doc['paragraph']}\nQuestion: {doc['question']}\nAnswer:"
def doc_to_target(self, doc):
return self.format_answer(answer=doc["answer"], label=doc["label"])
return " " + self.format_answer(answer=doc["answer"], label=doc["label"])
@staticmethod
def format_answer(answer, label):
......@@ -271,30 +272,25 @@ class ReCoRD(HFTask):
def training_docs(self):
# In ReCoRD, each doc manifests multiple "examples" in the context of few shot example packing.
# Each doc consists of multiple answer candidates, each of which is scored yes/no.
# Hence, we one "doc" for each (context + passage, answer) pair.
# Moreover, we only use the correct answers for context packing
# (This is not an issue for evaluation, where we can directly score multiple candidates at once).
if self._training_docs is None:
self._training_docs = []
for doc in self.data["train"]:
for entity in list(set(doc["entities"])):
self._training_docs.append({
"passage": doc["passage"],
"query": doc["query"],
"entity": entity,
"label": entity in doc["answers"],
})
self._training_docs.append(self._process_doc(doc))
return self._training_docs
def validation_docs(self):
# See: training_docs
for doc in self.data["validation"]:
for entity in list(set(doc["entities"])):
yield {
"passage": doc["passage"],
"query": doc["query"],
"entity": entity,
"label": entity in doc["answers"],
}
yield self._process_doc(doc)
@classmethod
def _process_doc(cls, doc):
return {
"passage": doc["passage"],
"query": doc["query"],
"entities": sorted(list(set(doc["entities"]))),
"answers": sorted(list(set(doc["answers"]))),
}
def doc_to_text(self, doc):
initial_text, *highlights = doc["passage"].strip().split("\n@highlight\n")
......@@ -308,12 +304,13 @@ class ReCoRD(HFTask):
return f' - {query}'.replace("@placeholder", entity)
def doc_to_target(self, doc):
return self.format_answer(query=doc["query"], entity=doc["entity"])
# We only output the first correct entity in a doc
return self.format_answer(query=doc["query"], entity=doc["answers"][0])
def construct_requests(self, doc, ctx):
requests = [
rf.loglikelihood(ctx, self.format_answer(query=doc["query"], entity=entity))
for entity in doc["entity"]
for entity in doc["entities"]
]
return requests
......@@ -322,10 +319,10 @@ class ReCoRD(HFTask):
# - Pick the maximum likelihood prediction entity
# - Evaluate the accuracy and token F1 PER EXAMPLE
# - Average over all examples
max_idx = np.argmax(np.array(results))
max_idx = np.argmax(np.array([result[0] for result in results]))
prediction = doc["entities"][max_idx]
gold_label_set = list(set(doc["answers"]))
gold_label_set = doc["answers"]
f1 = metric_max_over_ground_truths(squad_metrics.compute_f1, prediction, gold_label_set)
em = metric_max_over_ground_truths(squad_metrics.compute_exact, prediction, gold_label_set)
......
import abc
import json
import random
import os
from collections import Iterable
from pprint import pprint
import pycountry
from sacrebleu import sacrebleu
import logging
from lm_eval import metrics
from lm_eval.base import Task, rf
"""
This file implements translation tasks using datasets from WMT conferences, provided by sacrebleu.
Traditionally they are evaluated with BLEU scores. TER and CHRF are other options.
See sacrebleu.DATASETS for all available datasets. There are a lot!
"""
sacrebleu_datasets = sacrebleu.DATASETS
def create_tasks_from_benchmarks(benchmark_dict):
"""Creates a dictionary of tasks from a dict
:param benchmark_dict: { dataset: [lang_pair, ...], }
:return: {task_name: task}
e.g. {wmt14-fr-en: Task, wmt16-de-en: Task}
"""
return {
f"{dataset}-{language_pair}": create_translation_task(dataset, language_pair)
for dataset, language_pairs in benchmark_dict.items()
for language_pair in language_pairs
}
########################################
# Tasks
########################################
def create_translation_task(dataset, language_pair):
class TranslationTask(GeneralTranslationTask):
def __init__(self):
super().__init__(dataset, language_pair)
return TranslationTask
class GeneralTranslationTask(Task):
# e.g. ("wmt14", "fr-en")
def __init__(self, sacrebleu_dataset, sacrebleu_language_pair=None):
self.sacrebleu_dataset = sacrebleu_dataset
self.sacrebleu_language_pair = sacrebleu_language_pair
self.src_file = self.ref_file = self.src_data = self.ref_data = None
super().__init__()
def download(self):
# This caches in the user's home dir automatically
self.src_file, self.ref_file = \
sacrebleu.download_test_set(self.sacrebleu_dataset, self.sacrebleu_language_pair)
self.src_data, self.ref_data = [
[line.rstrip() for line in sacrebleu.smart_open(file)]
for file in (self.src_file, self.ref_file)
]
def has_training_docs(self):
"""Whether the task has a training set"""
# TODO In the future we could be more discerning. Some more recent tests have train and dev sets
return False
def has_validation_docs(self):
"""Whether the task has a validation set"""
return False
def has_test_docs(self):
"""Whether the task has a test set"""
return True
def test_docs(self):
"""
:return: Iterable[obj]
An iterable of any object that doc_to_text can handle
"""
return [{
"src": src,
"ref": ref
} for src, ref in zip(self.src_data, self.ref_data)]
def doc_to_text(self, doc):
return doc["src"]
def doc_to_target(self, doc):
# This shows a single target, though there may be multiple targets in a lang test
return doc["ref"] if isinstance(doc["ref"], str) else doc["ref"][0]
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
return rf.greedy_until(ctx, ["\n"])
def process_results(self, doc, results):
# These metrics are corpus-level not sentence level, so we'll hide the
# results in this dict and compute the corpus score in the aggregate method
ref_pred = (doc["ref"], results)
return {
"bleu": ref_pred,
"chrf": ref_pred,
"ter": ref_pred,
}
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {
"bleu": metrics.bleu,
"chrf": metrics.chrf,
"ter": metrics.ter,
}
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {
"bleu": True,
"chrf": True,
"ter": False,
}
def fewshot_description(self):
language_codes = self.sacrebleu_language_pair.split("-")
src_lang = code_to_language(language_codes[0])
tar_lang = code_to_language(language_codes[1])
return f"Translate these {src_lang} phrases to {tar_lang}."
def __str__(self):
language_codes = self.sacrebleu_language_pair.split("-")
src_lang = code_to_language(language_codes[0])
tar_lang = code_to_language(language_codes[1])
return f"{self.sacrebleu_dataset.upper()} {src_lang} to {tar_lang} Task"
########################################
# Util
########################################
def code_to_language(code):
# key is alpha_2 or alpha_3 depending on the code length
language_tuple = pycountry.languages.get(**{f"alpha_{len(code)}": code})
return language_tuple.name
def print_available_tests():
pprint({ts: sacrebleu.get_langpairs_for_testset(ts) for ts in sacrebleu.get_available_testsets()})
def print_available_pairs():
list_of_pairs = [sacrebleu.get_langpairs_for_testset(ts) for ts in sacrebleu.get_available_testsets()]
pairs = set([item for sublist in list_of_pairs for item in sublist])
pairs = sorted(["-".join(map(code_to_language, pair.split("-"))) for pair in pairs])
pprint(pairs)
print(len(pairs))
def main():
# print(sacrebleu.download_test_set("wmt14", "en-fr"))
# print_available_tests()
# sacrebleu.print_test_set("wmt14", "fr-en", "src")
# # Print number of benchmarks
# print(sum([
# len(sacrebleu.get_langpairs_for_testset(ts))
# for ts in sacrebleu.get_available_testsets()
# ]))
# Test task dictionary
# for task, task_class in create_tasks_from_benchmarks(selected_benchmarks).items():
# print(task, task_class())
print_available_pairs()
pass
if __name__ == "__main__":
main()
########################################
# Don't mind me...!
########################################
# Available tests as of 2020/02/11
"""
{'iwslt17': ['en-fr',
'fr-en',
'en-de',
'de-en',
'en-zh',
'zh-en',
'en-ar',
'ar-en',
'en-ja',
'ja-en',
'en-ko',
'ko-en'],
'iwslt17/dev2010': ['en-fr', 'fr-en', 'en-de', 'de-en', 'en-zh', 'zh-en'],
'iwslt17/tst2010': ['en-fr', 'fr-en', 'en-de', 'de-en', 'en-zh', 'zh-en'],
'iwslt17/tst2011': ['en-fr', 'fr-en', 'en-de', 'de-en', 'en-zh', 'zh-en'],
'iwslt17/tst2012': ['en-fr', 'fr-en', 'en-de', 'de-en', 'en-zh', 'zh-en'],
'iwslt17/tst2013': ['en-fr', 'fr-en', 'en-de', 'de-en', 'en-zh', 'zh-en'],
'iwslt17/tst2014': ['en-fr', 'fr-en', 'en-de', 'de-en', 'en-zh', 'zh-en'],
'iwslt17/tst2015': ['en-fr', 'fr-en', 'en-de', 'de-en', 'en-zh', 'zh-en'],
'iwslt17/tst2016': ['en-fr', 'fr-en', 'en-de', 'de-en', 'en-zh', 'zh-en'],
'mtnt1.1/test': ['en-fr', 'fr-en', 'en-ja', 'ja-en'],
'mtnt1.1/train': ['en-fr', 'fr-en', 'en-ja', 'ja-en'],
'mtnt1.1/valid': ['en-fr', 'fr-en', 'en-ja', 'ja-en'],
'mtnt2019': ['en-fr', 'fr-en', 'en-ja', 'ja-en'],
'multi30k/2016': ['en-fr', 'en-de', 'en-cs'],
'multi30k/2017': ['en-fr', 'en-de'],
'multi30k/2018': ['en-fr', 'en-de'],
'wmt08': ['cs-en',
'en-cs',
'de-en',
'en-de',
'es-en',
'en-es',
'fr-en',
'en-fr',
'hu-en',
'en-hu'],
'wmt08/europarl': ['de-en', 'en-de', 'es-en', 'en-es', 'fr-en', 'en-fr'],
'wmt08/nc': ['cs-en', 'en-cs'],
'wmt09': ['cs-en',
'en-cs',
'de-en',
'en-de',
'es-en',
'en-es',
'fr-en',
'en-fr',
'hu-en',
'en-hu',
'it-en',
'en-it'],
'wmt10': ['cs-en',
'en-cs',
'de-en',
'en-de',
'es-en',
'en-es',
'fr-en',
'en-fr'],
'wmt11': ['cs-en',
'en-cs',
'de-en',
'en-de',
'fr-en',
'en-fr',
'es-en',
'en-es'],
'wmt12': ['cs-en',
'en-cs',
'de-en',
'en-de',
'es-en',
'en-es',
'fr-en',
'en-fr'],
'wmt13': ['cs-en',
'en-cs',
'de-en',
'en-de',
'es-en',
'en-es',
'fr-en',
'en-fr',
'ru-en',
'en-ru'],
'wmt14': ['cs-en',
'en-cs',
'de-en',
'en-de',
'en-fr',
'fr-en',
'en-hi',
'hi-en',
'en-ru',
'ru-en'],
'wmt14/full': ['cs-en',
'en-cs',
'de-en',
'en-de',
'en-fr',
'fr-en',
'en-hi',
'hi-en',
'en-ru',
'ru-en'],
'wmt15': ['en-fr',
'fr-en',
'cs-en',
'de-en',
'en-cs',
'en-de',
'en-fi',
'en-ru',
'fi-en',
'ru-en'],
'wmt16': ['cs-en',
'de-en',
'en-cs',
'en-de',
'en-fi',
'en-ro',
'en-ru',
'en-tr',
'fi-en',
'ro-en',
'ru-en',
'tr-en'],
'wmt16/B': ['en-fi'],
'wmt16/dev': ['en-ro', 'en-tr', 'ro-en', 'tr-en'],
'wmt16/tworefs': ['en-fi'],
'wmt17': ['cs-en',
'de-en',
'en-cs',
'en-de',
'en-fi',
'en-lv',
'en-ru',
'en-tr',
'en-zh',
'fi-en',
'lv-en',
'ru-en',
'tr-en',
'zh-en'],
'wmt17/B': ['en-fi'],
'wmt17/dev': ['en-lv', 'en-zh', 'lv-en', 'zh-en'],
'wmt17/improved': ['en-zh', 'zh-en'],
'wmt17/ms': ['zh-en'],
'wmt17/tworefs': ['en-fi'],
'wmt18': ['cs-en',
'de-en',
'en-cs',
'en-de',
'en-et',
'en-fi',
'en-ru',
'et-en',
'fi-en',
'ru-en',
'en-tr',
'tr-en',
'en-zh',
'zh-en'],
'wmt18/dev': ['et-en', 'en-et'],
'wmt18/test-ts': ['cs-en',
'de-en',
'en-cs',
'en-de',
'en-et',
'en-fi',
'en-ru',
'et-en',
'fi-en',
'ru-en',
'en-tr',
'tr-en',
'en-zh',
'zh-en'],
'wmt19': ['cs-de',
'de-cs',
'de-en',
'de-fr',
'en-cs',
'en-de',
'en-fi',
'en-gu',
'en-kk',
'en-lt',
'en-ru',
'en-zh',
'fi-en',
'fr-de',
'gu-en',
'kk-en',
'lt-en',
'ru-en',
'zh-en'],
'wmt19/dev': ['lt-en', 'en-lt', 'gu-en', 'en-gu', 'kk-en', 'en-kk'],
'wmt19/google/ar': ['en-de'],
'wmt19/google/arp': ['en-de'],
'wmt19/google/hqall': ['en-de'],
'wmt19/google/hqp': ['en-de'],
'wmt19/google/hqr': ['en-de'],
'wmt19/google/wmtp': ['en-de'],
'wmt20': ['cs-en',
'de-en',
'de-fr',
'en-cs',
'en-de',
'en-iu',
'en-ja',
'en-km',
'en-pl',
'en-ps',
'en-ru',
'en-ta',
'en-zh',
'fr-de',
'iu-en',
'ja-en',
'km-en',
'pl-en',
'ps-en',
'ru-en',
'ta-en',
'zh-en'],
'wmt20/dev': ['iu-en',
'en-iu',
'ja-en',
'en-ja',
'pl-en',
'en-pl',
'ta-en',
'en-ta'],
'wmt20/robust/set1': ['en-ja', 'en-de'],
'wmt20/robust/set2': ['en-ja', 'ja-en'],
'wmt20/robust/set3': ['de-en'],
'wmt20/tworefs': ['de-en', 'en-de', 'en-zh', 'ru-en', 'zh-en']}
"""
\ No newline at end of file
import os
import json
import random
from lm_eval.base import Task, mean, rf
from lm_eval.base import Task, rf
from ..metrics import mean
from ..utils import sh
class TriviaQA(Task):
......
import gzip
import json
import random
import shutil
from pathlib import Path
from best_download import download_file
from lm_eval.base import Task, rf
from lm_eval.metrics import mean
def extract_gzip(gz, to):
with gzip.open(gz, 'rb') as fin:
with open(to, 'wb') as fout:
shutil.copyfileobj(fin, fout)
class WordUnscrambleTask(Task):
BASE_PATH = Path("data/unscramble")
FILENAME = None
CHECKSUM = None # SHA256 Checksum.
def __init__(self):
super().__init__()
def download(self):
if not self.BASE_PATH.exists():
Path.mkdir(self.BASE_PATH)
file = self.BASE_PATH / self.FILENAME
if not file.exists():
rawfile = file.parent / (file.name + ".gz")
base_url = "https://raw.githubusercontent.com/openai/gpt-3/master/data"
download_file(f"{base_url}/{self.FILENAME}.gz", str(rawfile), self.CHECKSUM)
extract_gzip(gz=rawfile, to=file)
def has_training_docs(self):
return False
def has_validation_docs(self):
return True
def has_test_docs(self):
return False
def validation_docs(self):
file = self.BASE_PATH / self.FILENAME
return (json.loads(line) for line in open(file).read().splitlines())
def fewshot_description(self):
return "Please unscramble the letters into a word, and write that word:"
def doc_to_text(self, doc):
return doc["context"]
def doc_to_target(self, doc):
return doc["completion"]
def construct_requests(self, doc, ctx):
completion = rf.greedy_until(ctx, ["\n"])
return completion
def process_results(self, doc, results):
pred = results[0]
gold = doc["completion"]
return {
"acc": int(pred == gold)
}
def aggregation(self):
return {
"acc": mean
}
def higher_is_better(self):
return {
"acc": True
}
class Anagrams1(WordUnscrambleTask):
FILENAME = "mid_word_1_anagrams.jsonl"
CHECKSUM = "6768a86896083199de4815d4964cb2f6f1046476cfd80c2a562784f182905979"
class Anagrams2(WordUnscrambleTask):
FILENAME = "mid_word_2_anagrams.jsonl"
CHECKSUM = "c3d839d09a7954b78a27cd2cd75d4ed0488656c56ef4dbd741a005343826cb01"
class CycleLetters(WordUnscrambleTask):
FILENAME = "cycle_letters_in_word.jsonl"
CHECKSUM = "1689c9002bb8c5988bf5f05e977c9db92f57932c1b5a38998c29ac0dd71e1d42"
class RandomInsertion(WordUnscrambleTask):
FILENAME = "random_insertion_in_word.jsonl"
CHECKSUM = "72e65d83da53d15752ee0c47379509de149ddbad32d61184e5991df29616b78a"
class ReversedWords(WordUnscrambleTask):
FILENAME = "reversed_words.jsonl"
CHECKSUM = "133a08f875cd6c1ef8608a3233571a773881cc27b1c707de738cc6543439332a"
from . common import HFTask
from lm_eval.base import mean, rf
from lm_eval.base import rf
from ..metrics import mean
class WebQs(HFTask):
DATASET_PATH = "web_questions"
......
import numpy as np
from . common import HFTask
from lm_eval.base import rf, mean
from lm_eval.base import rf
from ..metrics import mean
"""
This evaluation of Winogrande uses partial evaluation as described by
......@@ -13,6 +14,8 @@ class Winogrande(HFTask):
DATASET_PATH = "winogrande"
DATASET_NAME = "winogrande_xl"
answer_to_num = {'1': 0, '2': 1}
def has_training_docs(self):
return True
......@@ -20,54 +23,59 @@ class Winogrande(HFTask):
return True
def has_test_docs(self):
return True
return False
def doc_to_text(self, doc):
return self.partial_context(doc, doc["option" + doc["answer"]])
def fewshot_description(self):
# TODO: redo description
return "Winograd schema sentence including a either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in."
@classmethod
def partial_context(cls, doc):
# Substitute the pronoun in the sentence with each candidate choice
def partial_context(cls, doc, option):
# Substitute the pronoun in the sentence with the specified option
# and ignore everything after.
pronoun_loc = doc["sentence"].index("_")
context1 = doc["sentence"][:pronoun_loc] + doc["option1"]
context2 = doc["sentence"][:pronoun_loc] + doc["option2"]
return context1, context2
return doc["sentence"][:pronoun_loc] + option
def doc_to_target(self, doc):
return self.partial_target(doc)
@classmethod
def partial_target(cls, doc):
# The target is everything after the document specified pronoun.
pronoun_loc = doc["sentence"].index("_") + 1
return doc["sentence"][pronoun_loc:].strip()
def doc_to_text(self, doc):
context1, context2 = self.partial_context(doc)
return context1 + '\n' + context2 + '\n'
def doc_to_target(self, doc):
return self.partial_target(doc)
return " " + doc["sentence"][pronoun_loc:].strip()
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
part of the document for `doc`.
"""
target = self.partial_target(doc)
context1, context2 = self.partial_context(doc)
ll_context1, _ = rf.loglikelihood(context1, " " + target)
ll_context2, _ = rf.loglikelihood(context2, " " + target)
return ll_context1, ll_context2
lls = []
for option in [doc["option1"], doc["option2"]]:
partial_ctx = self.partial_context(doc, option)
full_ctx = self.append_context(ctx, partial_ctx)
lls.append(rf.loglikelihood(full_ctx, target)[0])
return lls
@classmethod
def append_context(cls, ctx, partial_ctx):
ctx = ctx.split("\n\n") # Each fewshot context is on its own new line.
ctx.pop() # Remove the correct context put in by `doc_to_text`.
return "\n\n".join([*ctx, partial_ctx]) if ctx else partial_ctx
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
......@@ -75,15 +83,14 @@ class Winogrande(HFTask):
:param results:
The results of the requests created in construct_requests.
"""
answer = int(doc["answer"]) - 1 # `- 1` b/c doc["answer"] ∈ {'1', '2'}
return {
"acc": np.argmax(results) == answer
"acc": np.argmax(results) == self.answer_to_num[doc["answer"]]
}
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {
......@@ -93,7 +100,7 @@ class Winogrande(HFTask):
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {
......
import numpy as np
import random
from lm_eval.base import rf, mean
from lm_eval.base import rf
from ..metrics import mean
from . common import HFTask
"""
......@@ -26,14 +27,14 @@ class WinogradSchemaChallenge273(HFTask):
data = []
for doc in self.data["test"]:
doc["text"] = doc["text"].replace(" ", " ")
doc["options"][0] = self.__normalize_option(doc["options"][0], doc)
doc["options"][1] = self.__normalize_option(doc["options"][1], doc)
doc["options"][0] = self.__normalize_option(doc, doc["options"][0])
doc["options"][1] = self.__normalize_option(doc, doc["options"][1])
data.append(doc)
return {"test": data}
def __normalize_option(self, option, doc):
def __normalize_option(self, doc, option):
# Append `'s` to possessive determiner based options.
if doc["pronoun"].lower() in ["my", "his", "her", "our", "their"]:
if doc["pronoun"].lower() in ["my", "his", "her", "our", "their"]:
option += "'s"
# Appropriately lowercase the pronoun in the option.
pronoun = option.split()[0]
......@@ -51,56 +52,61 @@ class WinogradSchemaChallenge273(HFTask):
def has_test_docs(self):
return True
def fewshot_description(self):
# TODO: redo description
return "Winograd schema sentence with correct continuation. True. Winograd schema sentence with incorrect continuation. False."
def fewshot_examples(self, k):
# NOTE: `super().fewshot_examples` samples from training docs which are
# not available for this test-set-only dataset.
return random.sample(list(self.test_docs()), k)
def fewshot_description(self):
# TODO: redo description
return "Winograd schema sentence with correct continuation. True. Winograd schema sentence with incorrect continuation. False."
def doc_to_text(self, doc):
return self.partial_context(doc, doc["options"][doc["label"]])
@classmethod
def partial_context(cls, doc):
# Substitute the pronoun in the original text with each candidate
# choice and ignore everything after.
context1 = doc["text"][:doc["pronoun_loc"]] + doc["options"][0]
context2 = doc["text"][:doc["pronoun_loc"]] + doc["options"][1]
return context1, context2
def partial_context(cls, doc, option):
# Substitute the pronoun in the original text with the specified
# option and ignore everything after.
return doc["text"][:doc["pronoun_loc"]] + option
def doc_to_target(self, doc):
return self.partial_target(doc)
@classmethod
def partial_target(cls, doc):
# The target is everything after the document specified pronoun.
start_index = doc["pronoun_loc"] + len(doc["pronoun"])
return doc["text"][start_index:].strip()
def doc_to_text(self, doc):
context1, context2 = self.partial_context(doc)
return context1 + '\n' + context2 + '\n'
def doc_to_target(self, doc):
return self.partial_target(doc)
return " " + doc["text"][start_index:].strip()
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
part of the document for `doc`.
"""
target = self.partial_target(doc)
context1, context2 = self.partial_context(doc)
ll_context1, _ = rf.loglikelihood(context1, " " + target)
ll_context2, _ = rf.loglikelihood(context2, " " + target)
return ll_context1, ll_context2
lls = []
for option in doc["options"]:
partial_ctx = self.partial_context(doc, option)
full_ctx = self.append_context(ctx, partial_ctx)
lls.append(rf.loglikelihood(full_ctx, target)[0])
return lls
@classmethod
def append_context(cls, ctx, partial_ctx):
ctx = ctx.split("\n\n") # Each fewshot context is on its own new line.
ctx.pop() # Remove the correct context put in by `doc_to_text`.
return "\n\n".join([*ctx, partial_ctx]) if ctx else partial_ctx
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
......@@ -115,7 +121,7 @@ class WinogradSchemaChallenge273(HFTask):
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {
......@@ -125,7 +131,7 @@ class WinogradSchemaChallenge273(HFTask):
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {
......
......@@ -20,7 +20,7 @@ def parse_args():
parser.add_argument('--seed', type=int, default=1234)
parser.add_argument('--output_path', default=None)
parser.add_argument('--limit', type=int, default=None)
parser.add_argument('--cache', action="store_true")
parser.add_argument('--no_cache', action="store_true")
return parser.parse_args()
def main():
......@@ -30,8 +30,11 @@ def main():
np.random.seed(args.seed)
lm = models.get_model(args.model).create_from_arg_string(args.model_args)
if args.limit:
print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
if args.cache:
if not args.no_cache:
lm = base.CachingLM(lm, 'lm_cache/' + args.model + '_' + args.model_args.replace('=', '-').replace(',', '_') + '.db')
if args.tasks == "all_tasks":
task_names = tasks.ALL_TASKS
......
......@@ -6,4 +6,7 @@ scikit-learn>=0.24.1
torch>=1.7
transformers>=4.1
sqlitedict==1.6.0
pytablewriter
\ No newline at end of file
pytablewriter==0.58.0
sacrebleu==1.5.0
pycountry==20.7.3
numexpr==2.7.2
\ No newline at end of file
janitor.py contains a script to remove benchmark data contamination from training data sets.
It uses the approach described in the [GPT-3 paper](https://arxiv.org/abs/2005.14165).
## Algorithm
1) Collects all contamination text files that are to be removed from training data
2) Filters training data by finding `N`gram matches between the training data
   and any contamination
   1) `N`grams ignore case and punctuation and are split on whitespace.
   2) Matching `N`gram substrings are removed, as is a `window_to_remove` character window around
      the match, splitting the training data into chunks
   3) Any chunks shorter than `minimum_slice_length` characters are removed
   4) Training data sets split into more than `too_dirty_cutoff` chunks are considered
      completely contaminated and removed
OpenAI used:
```
ngram_n = 13
window_to_remove = 200
minimum_slice_length = 200
too_dirty_cutoff = 10
```
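For orientation, here is a minimal pure-Python sketch of the filtering step described above. It is not the actual janitor.py implementation (which tracks n-gram positions while tokenizing and offloads the hot loop to C++); the names `make_ngrams` and `clean_document` are illustrative only.
```
# Hypothetical sketch of the decontamination filter; not the actual janitor.py API.
import re
import string

PUNCT = f"[{re.escape(string.punctuation)}]"

def make_ngrams(text, n=13):
    """Lowercase, strip punctuation, split on whitespace, and yield n-grams as strings."""
    words = re.sub(PUNCT, "", text.lower()).split()
    for i in range(len(words) - n + 1):
        yield " ".join(words[i:i + n])

def clean_document(doc, contamination_ngrams, n=13, window_to_remove=200,
                   minimum_slice_length=200, too_dirty_cutoff=10):
    """Return the clean chunks of `doc`, or [] if the document is too contaminated."""
    # Tokenize while remembering character offsets, normalizing tokens as in make_ngrams.
    tokens = [(m, re.sub(PUNCT, "", m.group().lower())) for m in re.finditer(r"\S+", doc)]
    tokens = [(m, norm) for m, norm in tokens if norm]  # drop punctuation-only tokens

    # Collect character spans of any n-gram that appears in the contamination set.
    dirty_spans = []
    for i in range(len(tokens) - n + 1):
        if " ".join(norm for _, norm in tokens[i:i + n]) in contamination_ngrams:
            dirty_spans.append((tokens[i][0].start(), tokens[i + n - 1][0].end()))

    # Remove each match plus a surrounding character window, splitting the doc into chunks.
    chunks, cursor = [], 0
    for start, end in dirty_spans:
        chunks.append(doc[cursor:max(cursor, start - window_to_remove)])
        cursor = max(cursor, end + window_to_remove)
    chunks.append(doc[cursor:])

    # Drop short chunks; discard the whole document if it was split into too many pieces.
    chunks = [c for c in chunks if len(c) >= minimum_slice_length]
    return [] if len(chunks) > too_dirty_cutoff else chunks
```
Contamination n-grams would be built once from the benchmark files, e.g. `contamination = set(make_ngrams(benchmark_text, 13))`, and `clean_document` then applied to every training document.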
## Compiling
Janitor can be used as a pure Python program, but it is much faster if the ngram
code is run in C++. To compile the C++ code, run
```
pip install pybind11
c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix)
```
If your compiler isn't linked to Python, you may need to append `-undefined dynamic_lookup` to the command above.
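Once compiled, the extension imports like any other Python module. A minimal sanity check, assuming the build above produced a `janitor_util` shared object in the working directory (the two functions are the ones exported by `PYBIND11_MODULE` in `janitor_util.cpp` below):
```
import string
import janitor_util  # the compiled extension produced by the command above

text = "The quick brown fox, it jumped over the lazy dog."
# 3-grams of lowercased words, ignoring ASCII punctuation characters.
print(janitor_util.clean_ngram(text, string.punctuation, 3))
# The same n-grams, each paired with (start, end) character indices into `text`.
print(janitor_util.clean_ngram_with_indices(text, string.punctuation, 3))
```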
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <utility>
#include <string>
#include <vector>
#include <tuple>
#include <queue>
bool is_whitespace(char ch) noexcept {
// " \t\n\r\x0b\x0c" (python string.whitespace)
return ch == 32 or (9 <= ch and ch <= 13);
// return ch <= 32; // arguably too general, but slightly faster
}
bool is_punctuation(char c) noexcept {
// '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~' ascii values: 33-47, 58-64, 91-96, 123-126
return (33 <= c and c <= 47) or (58 <= c and c <= 64) or (91 <= c and c <= 96) or (123 <= c and c <= 126);
}
// Takes a string and makes ngrams of length N, splitting grams on whitespace and ignoring ignored characters
// Returns a LARGE array of ngrams
std::vector<std::string> clean_ngram(
std::string const & input, std::string const & ignore, size_t ngram_n
) noexcept {
size_t num_grams = 0;
std::vector<std::string> ngram_list;
std::vector<uint8_t> gram_lengths;
std::string current_ngram;
// Max gram length is set to 10 below.
current_ngram.reserve(11*ngram_n);
gram_lengths.reserve(ngram_n);
bool started_gram = false;
gram_lengths.push_back(0);
//for (size_t i=0; i<input.length(); i++) {
// this is slightly faster, and we don't need the index in this one
for (auto iter = input.begin(); iter != input.end(); iter++) {
// If whitespace, end the current ngram and start the next
// alternatively, (perhaps marginally) faster: if (is_whitespace(ch)) { ... }
if (is_whitespace(*iter) || gram_lengths.back() > 10) {
// Skip all whitespace
while (++iter != input.end() && is_whitespace(*iter));
iter--;
if (started_gram){
num_grams += 1;
// Building 1grams is a special case
if (ngram_n == 1){
ngram_list.push_back(current_ngram);
current_ngram = current_ngram.substr(gram_lengths.front());
gram_lengths.back() = 0;
// If there are enough grams to form an ngram, save
} else if (num_grams >= ngram_n){
// Save the current ngram
ngram_list.push_back(current_ngram);
// Start the next ngram by dropping the first gram and its space from the ngram
current_ngram = current_ngram.substr(gram_lengths.front() + 1);
current_ngram += ' ';
// Drop the length of the first gram and prepare to record the length of the new gram
gram_lengths.erase(gram_lengths.begin());
gram_lengths.push_back(0);
// Otherwise, continue building
} else {
current_ngram += ' ';
gram_lengths.push_back(0);
}
started_gram = false;
}
// Skip ignored characters
// alternatively, (perhaps marginally) faster: if (is_punctuation(ch)) continue;
} else if (ignore.find(*iter) != std::string::npos) {
continue;
}
// If it is a non-ignored character, add it to the ngram and update the last gram's length
else {
current_ngram += tolower(*iter);
gram_lengths.back() += 1;
started_gram = true;
}
}
return ngram_list;
}
// Takes a string and makes ngrams of length N, splitting grams on whitespace and ignoring ignored characters
// Returns a LARGE array of tuples of (ngram, start_idx, end_idx)
std::vector<std::tuple<std::string, size_t, size_t> > clean_ngram_with_indices(
std::string const & input, std::string const & ignore, size_t ngram_n
) noexcept {
size_t num_grams = 0;
std::vector<std::tuple<std::string, size_t, size_t> > ngram_list;
std::vector<uint8_t> gram_lengths;
std::vector<size_t> gram_start_indices;
std::string current_ngram;
// Max gram length is set to 10 below.
current_ngram.reserve(11*ngram_n);
bool started_gram = false;
gram_lengths.push_back(0);
gram_start_indices.push_back(0);
for (size_t i=0; i<input.length(); i++) {
char ch = input[i];
// If whitespace, end the current ngram and start the next
if (is_whitespace(ch) || gram_lengths.back() > 10) {
// Skip all whitespace
while (++i < input.length() && is_whitespace(input[i]));
i--;
if (started_gram){
num_grams += 1;
// Building 1grams is a special case
if (ngram_n == 1){
ngram_list.push_back(std::make_tuple(current_ngram, gram_start_indices.front(), i));
current_ngram = current_ngram.substr(gram_lengths.front());
gram_lengths.back() = 0;
gram_start_indices.back() = i+1;
// If there are enough grams to form an ngram, save
} else if (num_grams >= ngram_n){
// Save the current ngram
ngram_list.push_back(
std::make_tuple(current_ngram, gram_start_indices.front(), i)
);
// Start the next ngram by dropping the first gram and its space from the ngram
current_ngram = current_ngram.substr(gram_lengths.front() + 1);
current_ngram += ' ';
// Drop the length of the first gram and prepare to record the length of the new gram
gram_lengths.erase(gram_lengths.begin());
gram_lengths.push_back(0);
gram_start_indices.erase(gram_start_indices.begin());
gram_start_indices.push_back(i+1);
// Otherwise, continue building
} else {
current_ngram += ' ';
gram_lengths.push_back(0);
gram_start_indices.push_back(i+1);
}
started_gram = false;
}
// Skip ignored characters
} else if (ignore.find(ch) != std::string::npos) {
continue;
// If it is a non-ignored character, add it to the ngram and update the last gram's length
} else {
current_ngram += tolower(ch);
gram_lengths.back() += 1;
started_gram = true;
}
}
return ngram_list;
}
PYBIND11_MODULE(janitor_util, m) {
m.doc() = "pybind11 example plugin"; // optional module docstring
// m.def("add", &add, "A function which adds two numbers"); // example function
m.def("clean_ngram", &clean_ngram, "Create ngrams of words, ignoring some characters");
m.def("clean_ngram_with_indices", &clean_ngram_with_indices, "Create ngrams of words with indices, ignoring some characters");
}
// Example compile
// c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix)
// If python and gcc aren't linked, append to the above: -undefined dynamic_lookup
\ No newline at end of file