Merge branch 'master' of https://github.com/EleutherAI/lm-evaluation-harness into task-guide

4d147bdd · Jonathan Tow · 011cc891 · dc937d4b · 4d147bdd · 4d147bdd
Commit 4d147bdd authored Sep 17, 2021 by Jonathan Tow
20 changed files
--- a/scripts/clean_training_data/generate_13_grams.py
+++ b/scripts/clean_training_data/generate_13_grams.py
+"""
+Outputs all 13-grams found in The Pile.
+
+Loops through all documents and uses the logic found in janitor.py to extract 13-grams. 
+We bucket each 13-gram by hash into separate file buckets to allow easy parallel processing in the 
+next stage. We also include the current pile document_id with each ngram instance to allow the 
+filtering to exclude 13-grams that match more than 10 unique documents (done further down the pipeline).
+
+We didn't use lm_dataformat to output as it increases time 4x (slow jsonify) and makes
+resuming hard (and we had the storage).
+
+Arguments
+---------
+--working_directory (-dir)
+    Directory containing the pile distribution. An "output" subdirectory will be created underneath
+    to store the bucketed 13-grams, checkpoint and done files. Default: current directory
+--n_value (-n)
+    n value in n-gram, added for later use if ever needed. Default: 13
+--bucket_count (-buckets)
+    Number of file buckets to use when generating 13grams. Default: 500
+"""
+
+import argparse
+import pickle
+import os
+from pathlib import Path
+import glob
+import signal
+from signal import SIGINT
+
+from tqdm import tqdm
+
+from scripts.clean_training_data.janitor import Janitor, word_ngrams
+from scripts.clean_training_data.archiver import TextArchive, Reader
+
+import logging
+from tqdm_multiprocess.logger import setup_logger_tqdm
+logger = logging.getLogger(__name__)
+
+pile_document_count = 210607728
+
+terminate = False
+def handler(signal_received, frame):
+    """Signal handler that records a termination request in the global ``terminate`` flag."""
+    global terminate
+    # Only set the flag; the bucketing loop checks it at its own safe points.
+    terminate = True
+
+def get_pile(directory):
+    """Yield every document from the Pile archives found in ``directory``.
+
+    Archives matching ``*.jsonl.zst*`` are visited in sorted filename order.
+    ``glob`` gives no ordering guarantee, and the checkpoint/resume logic in
+    this script identifies a document by its position in this stream, so the
+    order has to be reproducible from one run to the next.
+    """
+    reader = Reader()
+    for file in sorted(glob.glob(os.path.join(directory, "*.jsonl.zst*"))):
+        yield from reader.read(file)
+
+def close_buckets(buckets):
+    """Commit each bucket archive in ``buckets``, in order."""
+    for archive in buckets:
+        archive.commit()
+
+def do_ngrams_in_buckets(n_value, working_directory, bucket_count):
+    """Extract every n-gram from the Pile and append it to a hash-selected bucket file.
+
+    Parameters
+    ----------
+    n_value : int
+        Size of the n-grams to generate.
+    working_directory : str
+        Directory holding the Pile archives. Buckets, the checkpoint file and
+        the done file are written to an "output" subdirectory.
+    bucket_count : int
+        Number of bucket files to spread the n-grams over.
+
+    A checkpoint holding the next document index is written every
+    ``batch_size`` documents. A termination request is only honoured right
+    after a checkpoint, or while already-processed documents are being skipped.
+    """
+    # Local import so this function does not depend on the module's import list.
+    import zlib
+
+    output_directory = os.path.join(working_directory, "output")
+    os.makedirs(output_directory, exist_ok=True)
+
+    logger.info(f"Generating {n_value}-grams and bucketing.")
+
+    # Done file
+    done_file = os.path.join(output_directory, "ngram_buckets.done")
+    if os.path.exists(done_file):
+        logger.info("ngrams already generated and bucketed, skipping")
+        return
+
+    # Checkpoint
+    checkpoint_file = os.path.join(output_directory, "ngram_buckets.ckpt")
+    if os.path.exists(checkpoint_file):
+        # NOTE(review): unpickling is acceptable only because this script writes the file itself.
+        with open(checkpoint_file, "rb") as checkpoint_handle:
+            start_id = pickle.load(checkpoint_handle)
+    else:
+        start_id = 0
+
+    logger.info(f"Starting at pile document index {start_id}")
+    bucket_files = [os.path.join(output_directory, f"ngrams_{i}.bkt.txt") for i in range(bucket_count)]
+    buckets = list(map(TextArchive, bucket_files))
+
+    janitor = Janitor()
+    current_id = 0
+    batch_size = 1000
+    batch_counter = 0
+    with tqdm(total=pile_document_count, dynamic_ncols=True, unit="docs") as progress:
+        for document in get_pile(working_directory):
+            # Fast-forward over documents handled before the last checkpoint.
+            if current_id < start_id:
+                if terminate:
+                    close_buckets(buckets)
+                    return
+
+                current_id += 1
+                progress.update()
+                continue
+
+            # Save checkpoint every "batch_size", only allow terminate after checkpoint
+            if batch_counter == batch_size:
+                progress.update(batch_size)
+                batch_counter = 0
+                with open(checkpoint_file, "wb") as checkpoint_handle:
+                    pickle.dump(current_id, checkpoint_handle)
+                if terminate:
+                    close_buckets(buckets)
+                    return
+
+            ngrams = word_ngrams(janitor.normalize_string(document), n_value)
+            for ngram in ngrams:
+                # The built-in hash() of a str is salted per interpreter process, so a
+                # resumed run would send the same n-gram to a different bucket.
+                # CRC32 gives the same bucket in every run.
+                bucket = zlib.crc32(ngram.encode("utf-8", "surrogatepass")) % len(buckets)
+                buckets[bucket].add_data(f"{ngram} {current_id}")
+
+            batch_counter += 1
+            current_id += 1
+
+        # Report the final, partial batch so the progress bar accounts for every document.
+        progress.update(batch_counter)
+
+    close_buckets(buckets)
+    Path(done_file).touch()
+
+
+# Command-line interface; the meaning of each argument is described in the module docstring.
+parser = argparse.ArgumentParser(description='Generate 13 grams from Pile.')
+parser.add_argument("-dir", "--working_directory", default="")
+parser.add_argument("-n", "--n_value", type=int, default=13)
+parser.add_argument("-buckets", "--bucket_count", type=int, default=500)
+
+if __name__ == '__main__':
+
+    # Handle sigint (ctrl-c) cleanly
+    # The previous handler is returned by signal.signal() but is not restored anywhere in this script.
+    previous_signal_int = signal.signal(SIGINT, handler)
+
+    logfile_path = "ngrams.log"
+    setup_logger_tqdm(logfile_path)
+
+    args = parser.parse_args()
+    do_ngrams_in_buckets(args.n_value, args.working_directory, args.bucket_count)
\ No newline at end of file
--- a/scripts/clean_training_data/janitor.py
+++ b/scripts/clean_training_data/janitor.py
@@ -41,6 +41,29 @@ def word_ngrams(s, n):
    ngram_seqs = form_ngrams(iter(tokens), n)
    return (" ".join(ngram) for ngram in ngram_seqs)

+# Does character sequences only - combined faster function to play around with later
+# def word_ngrams_indices_combined(sequence, n):
+#     current_word = ""
+#     history = []
+#     gap = False;
+#     start = 0
+#     end = 0
+#     for character in sequence:
+#         if character == " ":
+#             if not gap:
+#                 gap = True
+#                 history.append(current_word)
+#                 end += len(current_word) - 1
+#                 current_word = ""
+#                 if len(history) == n:
+#                     yield (tuple(history), start, end)
+#                     del history[0]
+#                     start = end + 1
+#                     end = start
+#         else:
+#             gap = False
+#             current_word += character
+

 # https://stackoverflow.com/questions/13734451/string-split-with-indices-in-python
 def split_indices(s):
@@ -140,8 +163,9 @@ class Janitor:
    def _split_chunks(self, dirty_string, dirty_parts):
        clean_chunks = []
        splice_idx = 0
+        end = -1
        for i, (ngram, start, end) in enumerate(dirty_parts):
-            if i > self.too_dirty_cutoff:
+            if i >= self.too_dirty_cutoff:
                return []
            start = max(0, start - self.window_to_remove)
            end = min(len(dirty_string), end + self.window_to_remove)
@@ -150,6 +174,9 @@ class Janitor:
                clean_chunks.append(dirty_string[splice_idx: start])
            splice_idx = end

+        if end < len(dirty_string) - self.minimum_slice_length:
+            clean_chunks.append(dirty_string[end+1:])
+
        return clean_chunks

    ##############
@@ -186,101 +213,101 @@ class Janitor:
 # Tests
 #################################################################

-def print_cpp():
-    source = """   ,, I'm a very !dirty,, ,,  dirty boy. Clean me daddy. \n\nhe he he hehe heh.  lastword  """ * 2
-
-    for i in range(1, 10, 2):
-        pprint(janitor_util.clean_ngram(source, string.punctuation, i))
-        for ngram, start, end in \
-                janitor_util.clean_ngram_with_indices(source, string.punctuation, i):
-            print(ngram, "\t", start, end, source[start:end].replace("\n", "\\n"))
-
-
-def test_cpp():
-    source = """   ,, I'm a very !dirty,, ,,  dirty boy. Clean me daddy. \n\nhe he he hehe heh.  lastword  """ * 2
-    contaminant = "dirty boy. Clean he he"
-
-    jan_python = Janitor()
-    jan_cpp = Janitor()
-
-    jan_python.register_contaminant_python(contaminant)
-    jan_cpp.register_contaminant(contaminant)
-
-    assert jan_python.dirt_ngrams == jan_cpp.dirt_ngrams, (jan_python.dirt_ngrams, jan_cpp.dirt_ngrams)
-
-    assert jan_python.clean_python(source) == jan_cpp.clean(source), \
-        (jan_python.clean_python(source), jan_cpp.clean(source))
-
-    print("Passed test, python==cpp")
-
-
-def benchmark():
-    # Download and put in data folder: enwik8 (100 MB) from https://cs.fit.edu/~mmahoney/compression/textdata.html
-    setup = \
-        """
-        with open("data/enwik8", "r") as f:
-            data = f.read()
-        jan = Janitor(too_dirty_cutoff=1000)
-        jan.register_contaminant('''
-        theories is that there is a connection between &quot;geekdom&quot; and autism.  
-        This is hinted, for instance, by a ''Wired Magazine'' article in 2001 entitled &quot;
-        The [[Geek]] Syndrome&quot;, which is a point argued by many in the autism rights 
-        movement{{ref|Wired}}.  This article, many professionals assert, is just one example of 
-        the media's application of mental disease labels to what is actually variant normal behavior
-        &amp;mdash;they argue that shyness, lack of athletic ability or social skills, and intellectual
-        interests, even when they seem unusual to others, are not in themselves signs of autism or 
-        Asperger's syndrome. Others assert that it is actually the medical profession which is applying
-        mental disease labels to children who in the past would have simply been accepted as a little
-        different or even labeled 'gifted'. See [[clinomorphism]] for further discussion of this issue.
-        Due to the recent publicity surrounding autism and autis
-        ultan Al Nahyan]] granted [[Petroleum]] concessions, and oil was first found in 1958.  At first,
-        oil money had a marginal impact.  A few lowrise concete buildings were erected, and the first 
-        paved road was completed in 1961, but Sheikh Shakbut, uncertain whether the new oil royalties 
-        would last, took a cautious approach, prefering to save the revenue rather than investing it in 
-        development.  His brother, [[Zayed bin Sultan Al Nahayan]], saw that oil wealth had the potential 
-        to transform Abu Dhabi.  The ruling Al Nahayan family decided that Sheikh Zayed should replace his 
-        brother as Ruler and carry out his vision of developing the country.  On [[August 6]], [[1966]], 
-        with the assistance of the British, Sheikh Zayed became the new ruler.  See generally, Al-Fahim, M, 
-        ''From Rags to Riches: A Story of Abu Dhabi'', Chapter Six (London Centre of Arab Studies, 1995), 
-        ISBN 1 900404 00 1. With the announcement by Britain in 1968 that it would withdraw from the 
-        Gulf area by 1971, Sheikh Zayed became the main driving force behind the formation of the 
-        [[United Arab Emirates]]. After the Emirates gained independence in 1971, 
-        ''')
-        """
-
-    n = 1
-    print(f"Timing {n} run on 100 MB")
-    print("Register contaminant")
-    # print("\tPython", timeit.timeit("jan.register_contaminant_python(data)", setup=setup, globals=globals(), number=n))
-    print("\tCpp", timeit.timeit("jan.register_contaminant(data)", setup=setup, globals=globals(), number=n))
-
-    print("Clean")
-    # print("\tPython", timeit.timeit("jan.clean_python(data)", setup=setup, globals=globals(), number=n))
-    print("\tCpp", timeit.timeit("jan.clean(data)", setup=setup, globals=globals(), number=n))
-
-
-def test():
-    source = """   ,, I'm a very !dirty,, ,,  dirty boy. Clean me daddy. \n\nhe he he hehe heh.  lastword  """ * 2
-    contaminant = "dirty boy. Clean he he"
-
-    jan = Janitor(ngram_n=3)
-    jan.register_contaminant(contaminant)
-    cleaned = " ".join(jan.clean(source))
-    for contam in jan.dirt_ngrams:
-        assert contam not in cleaned, contam
-
-    filename = "data/saved_contam"
-    jan.save_contamination_ngrams(filename)
-
-    jan = Janitor(ngram_n=3)
-    jan.load_contamination_ngrams(filename)
-    cleaned = " ".join(jan.clean(source))
-    for contam in jan.dirt_ngrams:
-        assert contam not in cleaned, contam
-
-
-if __name__ == "__main__":
-    test()
-    # print_cpp()
-    # test_cpp()
-    # benchmark()
+# def print_cpp():
+#     source = """   ,, I'm a very !dirty,, ,,  dirty boy. Clean me daddy. \n\nhe he he hehe heh.  lastword  """ * 2
+
+#     for i in range(1, 10, 2):
+#         pprint(janitor_util.clean_ngram(source, string.punctuation, i))
+#         for ngram, start, end in \
+#                 janitor_util.clean_ngram_with_indices(source, string.punctuation, i):
+#             print(ngram, "\t", start, end, source[start:end].replace("\n", "\\n"))
+
+
+# def test_cpp():
+#     source = """   ,, I'm a very !dirty,, ,,  dirty boy. Clean me daddy. \n\nhe he he hehe heh.  lastword  """ * 2
+#     contaminant = "dirty boy. Clean he he"
+
+#     jan_python = Janitor()
+#     jan_cpp = Janitor()
+
+#     jan_python.register_contaminant_python(contaminant)
+#     jan_cpp.register_contaminant(contaminant)
+
+#     assert jan_python.dirt_ngrams == jan_cpp.dirt_ngrams, (jan_python.dirt_ngrams, jan_cpp.dirt_ngrams)
+
+#     assert jan_python.clean_python(source) == jan_cpp.clean(source), \
+#         (jan_python.clean_python(source), jan_cpp.clean(source))
+
+#     print("Passed test, python==cpp")
+
+
+# def benchmark():
+#     # Download and put in data folder: enwik8 (100 MB) from https://cs.fit.edu/~mmahoney/compression/textdata.html
+#     setup = \
+#         """
+#         with open("data/enwik8", "r") as f:
+#             data = f.read()
+#         jan = Janitor(too_dirty_cutoff=1000)
+#         jan.register_contaminant('''
+#         theories is that there is a connection between &quot;geekdom&quot; and autism.  
+#         This is hinted, for instance, by a ''Wired Magazine'' article in 2001 entitled &quot;
+#         The [[Geek]] Syndrome&quot;, which is a point argued by many in the autism rights 
+#         movement{{ref|Wired}}.  This article, many professionals assert, is just one example of 
+#         the media's application of mental disease labels to what is actually variant normal behavior
+#         &amp;mdash;they argue that shyness, lack of athletic ability or social skills, and intellectual
+#         interests, even when they seem unusual to others, are not in themselves signs of autism or 
+#         Asperger's syndrome. Others assert that it is actually the medical profession which is applying
+#         mental disease labels to children who in the past would have simply been accepted as a little
+#         different or even labeled 'gifted'. See [[clinomorphism]] for further discussion of this issue.
+#         Due to the recent publicity surrounding autism and autis
+#         ultan Al Nahyan]] granted [[Petroleum]] concessions, and oil was first found in 1958.  At first,
+#         oil money had a marginal impact.  A few lowrise concete buildings were erected, and the first 
+#         paved road was completed in 1961, but Sheikh Shakbut, uncertain whether the new oil royalties 
+#         would last, took a cautious approach, prefering to save the revenue rather than investing it in 
+#         development.  His brother, [[Zayed bin Sultan Al Nahayan]], saw that oil wealth had the potential 
+#         to transform Abu Dhabi.  The ruling Al Nahayan family decided that Sheikh Zayed should replace his 
+#         brother as Ruler and carry out his vision of developing the country.  On [[August 6]], [[1966]], 
+#         with the assistance of the British, Sheikh Zayed became the new ruler.  See generally, Al-Fahim, M, 
+#         ''From Rags to Riches: A Story of Abu Dhabi'', Chapter Six (London Centre of Arab Studies, 1995), 
+#         ISBN 1 900404 00 1. With the announcement by Britain in 1968 that it would withdraw from the 
+#         Gulf area by 1971, Sheikh Zayed became the main driving force behind the formation of the 
+#         [[United Arab Emirates]]. After the Emirates gained independence in 1971, 
+#         ''')
+#         """
+
+#     n = 1
+#     print(f"Timing {n} run on 100 MB")
+#     print("Register contaminant")
+#     # print("\tPython", timeit.timeit("jan.register_contaminant_python(data)", setup=setup, globals=globals(), number=n))
+#     print("\tCpp", timeit.timeit("jan.register_contaminant(data)", setup=setup, globals=globals(), number=n))
+
+#     print("Clean")
+#     # print("\tPython", timeit.timeit("jan.clean_python(data)", setup=setup, globals=globals(), number=n))
+#     print("\tCpp", timeit.timeit("jan.clean(data)", setup=setup, globals=globals(), number=n))
+
+
+# def test_janitor_general():
+#     source = """   ,, I'm a very !dirty,, ,,  dirty boy. Clean me daddy. \n\nhe he he hehe heh.  lastword  """ * 2
+#     contaminant = "dirty boy. Clean he he"
+
+#     jan = Janitor(ngram_n=3)
+#     jan.register_contaminant(contaminant)
+#     cleaned = " ".join(jan.clean(source))
+#     for contam in jan.dirt_ngrams:
+#         assert contam not in cleaned, contam
+
+#     filename = "data/saved_contam"
+#     jan.save_contamination_ngrams(filename)
+
+#     jan = Janitor(ngram_n=3)
+#     jan.load_contamination_ngrams(filename)
+#     cleaned = " ".join(jan.clean(source))
+#     for contam in jan.dirt_ngrams:
+#         assert contam not in cleaned, contam
+
+
+# if __name__ == "__main__":
+#     test()
+#     # print_cpp()
+#     # test_cpp()
+#     # benchmark()
--- a/scripts/clean_training_data/janitor.cpp
+++ b/scripts/clean_training_data/janitor.cpp
--- a/scripts/clean_training_data/process_sorted_buckets.py
+++ b/scripts/clean_training_data/process_sorted_buckets.py
+"""
+Processes each sorted bucket, creating a new file listing all ngrams that matched more than 10
+unique documents with their unique document counts. Uses multiprocessing and very little memory
+as we stream from presorted buckets. Will use a lot of disk though.
+
+Arguments
+---------
+--working_directory (-dir)
+    Directory containing the sorted buckets, processed files will be deposited here. Default: current directory
+--move_dir (-move)
+    Directory to move processed 13grams to. Default: Do nothing
+--process_count (-procs)
+    Number of processes to use. Default: 4
+"""
+
+import argparse
+import glob
+import os
+from pathlib import Path
+import re
+import shutil
+
+from tqdm import tqdm
+from tqdm_multiprocess import TqdmMultiProcessPool
+
+from scripts.clean_training_data.archiver import TextReader, TextArchive
+
+import logging
+from tqdm_multiprocess.logger import setup_logger_tqdm
+logger = logging.getLogger(__name__)
+
+# Multiprocessed
+def process_bucket(bucket_file_path, processed_directory, move_dir, tqdm_func, global_tqdm):
+    """Write out the n-grams of one sorted bucket that occur in more than 10 unique documents.
+
+    Parameters
+    ----------
+    bucket_file_path : str
+        Path of the sorted bucket. Each line is split on its last space into
+        an n-gram and a document id, and lines of the same n-gram must be adjacent.
+    processed_directory : str
+        Directory where the per-bucket done marker is created.
+    move_dir : str
+        If non-empty, the ".processed" output file is moved here when finished.
+    tqdm_func, global_tqdm
+        Progress-bar factory and shared progress bar. The task tuples built by
+        the caller carry only the first three arguments, so these are
+        presumably appended by TqdmMultiProcessPool (confirm against that library).
+    """
+
+    # Raw string: "\D" in a normal string literal is an invalid escape sequence.
+    bucket_id = re.sub(r"\D", "", os.path.basename(bucket_file_path))
+    done_file = os.path.join(processed_directory, f"ngram_bucket_processing_{bucket_id}.done")
+    if os.path.exists(done_file):
+        logger.info(f"bucket {bucket_id} already processed, skipping")
+        # Still count the bucket, otherwise the overall bar never reaches its total on a resumed run.
+        global_tqdm.update()
+        return
+
+    # For managing tqdm
+    file_size = os.path.getsize(bucket_file_path)
+    bucket_progress = tqdm_func(total=file_size, dynamic_ncols=True, unit="byte", unit_scale=1)
+    current_file_position = 0
+    update_frequency = 100 * 1000000 # 100mb
+    update_counter = 0
+
+    # Iterate through and output ngrams which occur in more than 10 documents
+    bucket = TextReader(bucket_file_path)
+
+    output_file_path = bucket_file_path + ".processed"
+    output_archive = TextArchive(output_file_path, mode="wb")
+
+    current_ngram = ""
+    current_ngram_document_ids = set()
+    for line in bucket.read():
+        ngram, document_id = line.rsplit(" ", 1)
+
+        # Write ngram if more than 10 unique document occurrences
+        if ngram != current_ngram:
+            if len(current_ngram_document_ids) > 10:
+                output_archive.add_data(f"{current_ngram} {len(current_ngram_document_ids)}")
+            current_ngram = ngram
+            current_ngram_document_ids = set()
+
+        current_ngram_document_ids.add(document_id)
+
+        # Update tqdm
+        new_file_position = bucket.fh.tell()
+        update_counter += new_file_position - current_file_position
+        current_file_position = new_file_position
+        if update_counter > update_frequency:
+            bucket_progress.update(update_counter)
+            update_counter = 0
+
+    # Remainder
+    if len(current_ngram_document_ids) > 10:
+        output_archive.add_data(f"{current_ngram} {len(current_ngram_document_ids)}")
+
+    # Flush the bytes counted since the last periodic progress update.
+    if update_counter:
+        bucket_progress.update(update_counter)
+
+    output_archive.commit()
+    Path(done_file).touch()
+
+    if move_dir:
+        shutil.move(output_file_path, move_dir)
+
+    global_tqdm.update()
+
+def process_sorted_buckets(working_directory, move_dir, process_count):
+    """Run ``process_bucket`` over every ``*.bkt.txt.sorted`` file in ``working_directory``.
+
+    The work is spread over ``process_count`` processes; done markers go into a
+    "processed" subdirectory that is created if missing.
+    """
+    sorted_pattern = os.path.join(working_directory, f"*.bkt.txt.sorted")
+    bucket_file_paths = glob.glob(sorted_pattern)
+
+    processed_directory = os.path.join(working_directory, "processed")
+    os.makedirs(processed_directory, exist_ok=True)
+
+    pool = TqdmMultiProcessPool(process_count)
+
+    # One task per bucket file: (callable, positional arguments).
+    tasks = []
+    for bucket_file in bucket_file_paths:
+        tasks.append((process_bucket, (bucket_file, processed_directory, move_dir)))
+
+    def on_done(_):
+        return None
+
+    def on_error(_):
+        return None
+
+    global_tqdm = tqdm(total=len(bucket_file_paths), dynamic_ncols=True, unit="bucket")
+    _ = pool.map(global_tqdm, tasks, on_error, on_done)
+
+# Command-line interface; the meaning of each argument is described in the module docstring.
+parser = argparse.ArgumentParser(description='Process 13 grams from sorted buckets.')
+parser.add_argument("-dir", "--working_directory", default="")
+parser.add_argument("-move", "--move_dir", default="")
+parser.add_argument("-procs", "--process_count", type=int, default=4)
+
+if __name__ == '__main__':
+
+    # The log file name is relative, so it is created in the current working directory.
+    logfile_path = "process13grams.log"
+    setup_logger_tqdm(logfile_path)
+
+    args = parser.parse_args()
+    process_sorted_buckets(args.working_directory, args.move_dir, args.process_count)
\ No newline at end of file
--- a/scripts/clean_training_data/sort_13_gram_buckets.py
+++ b/scripts/clean_training_data/sort_13_gram_buckets.py
+"""
+Iteratively runs gnu sort on each bucket, gnu handles the multiprocessing.
+
+Arguments
+---------
+--working_directory (-dir)
+    Directory containing the bucketed 13-grams. Sorted buckets will be deposited in the same
+    directory and the unsorted buckets are removed after.
+"""
+
+import glob
+import argparse
+import os
+from pathlib import Path
+import signal
+from signal import SIGINT
+import re
+import subprocess
+
+from tqdm import tqdm
+
+import logging
+from tqdm_multiprocess.logger import setup_logger_tqdm
+logger = logging.getLogger(__name__)
+
+terminate = False
+def handler(signal_received, frame):
+    """Signal handler that records a termination request in the global ``terminate`` flag."""
+    global terminate
+    # Only set the flag; the sorting loop checks it after each bucket's sort finishes.
+    terminate = True
+
+def sort_13_gram_buckets(working_directory):
+    """Sort every ``*.bkt.txt`` bucket in ``working_directory`` with the external ``sort`` command.
+
+    Each bucket is sorted into "<bucket>.sorted". Once that succeeds a done
+    marker is written and the unsorted bucket is deleted. Buckets whose done
+    marker already exists are skipped, so an interrupted run can be resumed.
+    A bucket whose sort fails is logged and left in place.
+    """
+    bucket_file_paths = glob.glob(os.path.join(working_directory, "*.bkt.txt"))
+
+    # Byte-wise collation: locale-aware ordering may give spaces and punctuation
+    # little or no weight. Presumably the downstream step needs lines of the same
+    # n-gram to be adjacent (confirm), which byte order guarantees.
+    sort_environment = dict(os.environ, LC_ALL="C")
+
+    for bucket_file_path in tqdm(bucket_file_paths, dynamic_ncols=True):
+        # Raw string: "\D" in a normal string literal is an invalid escape sequence.
+        bucket_id = re.sub(r"\D", "", os.path.basename(bucket_file_path))
+        done_file = os.path.join(working_directory, f"ngram_bucket_sorting_{bucket_id}.done")
+        if os.path.exists(done_file):
+            logger.info(f"bucket {bucket_id} already processed, skipping")
+            # Skip only this bucket; returning here would abandon all remaining buckets.
+            continue
+
+        sorted_file_path = bucket_file_path + ".sorted"
+        logger.info(f"sort {bucket_file_path} > {sorted_file_path}")
+        # Argument list plus explicit stdout instead of a shell string, so paths
+        # containing spaces or shell metacharacters are passed through verbatim.
+        with open(sorted_file_path, "wb") as sorted_file:
+            return_code = subprocess.call(
+                ["sort", bucket_file_path], stdout=sorted_file, env=sort_environment
+            )
+
+        if terminate:
+            return
+
+        if return_code != 0:
+            # Keep the unsorted bucket: deleting it after a failed sort would lose data.
+            logger.error(f"sort exited with code {return_code} for {bucket_file_path}")
+            continue
+
+        Path(done_file).touch()
+        os.remove(bucket_file_path)
+
+# Command-line interface; the argument is described in the module docstring.
+parser = argparse.ArgumentParser(description='sort 13gram buckets')
+parser.add_argument("-dir", "--working_directory", default="")
+
+if __name__ == '__main__':
+
+    # Handle sigint (ctrl-c) cleanly
+    # The previous handler is returned by signal.signal() but is not restored anywhere in this script.
+    previous_signal_int = signal.signal(SIGINT, handler)
+
+    logfile_path = "sort13grambuckets.log"
+    setup_logger_tqdm(logfile_path)
+
+    args = parser.parse_args()
+    sort_13_gram_buckets(args.working_directory)
\ No newline at end of file
--- a/scripts/make_gpt2_test_cases.py
+++ b/scripts/make_gpt2_test_cases.py
+import transformers
+
+import torch
+import torch.nn.functional as F
+import random
+
+random.seed(42)
+
+
+data = [
+    "A multilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)",
+    "The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons (with threshold activation); see § Terminology",
+    "Multilayer perceptrons are sometimes colloquially referred to as \"vanilla\" neural networks, especially when they have a single hidden layer.[1]",
+    "An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear activation function.",
+    "MLP utilizes a supervised learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]",
+    "Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. ",
+    "Specifically, we train GPT-3, an autoregressive language model with 175 billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general.",
+    "A multilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)",
+    "Hello World",
+]
+
+
+model = transformers.GPT2LMHeadModel.from_pretrained('gpt2')
+tok = transformers.GPT2Tokenizer.from_pretrained('gpt2')
+
+# Summed continuation log-probabilities, one per entry of `data`.
+tgs = []
+
+for dat in data:
+    # Seed from the text itself so the split point chosen for each sample is reproducible.
+    random.seed(dat)
+    #print(model(tok.encode(dat, return_tensors="pt"))[0][0])
+
+    toks = tok.encode(dat, return_tensors="pt")
+    # Index of the last context token; every token after it forms the continuation.
+    ind = random.randrange(len(toks[0])-1)
+    # Inference only: no_grad avoids building an autograd graph for the forward pass.
+    with torch.no_grad():
+        logits = F.log_softmax(model(toks)[0], dim=-1)[:, :-1]  # [batch, seq, vocab]
+
+    # Log-probability assigned to each actual next token (position i scores token i+1).
+    res = torch.gather(logits, 2, toks[:, 1:].unsqueeze(-1)).squeeze(-1)[0]
+
+    tgs.append( float(res[ind:].sum()))
+    print(r'("""' + tok.decode(toks[0, :ind+1]) + r'""", """' + tok.decode(toks[0, ind+1:]) + r'"""), ')
+
+print(tgs)
\ No newline at end of file
--- a/scripts/make_table_tasks.py
+++ b/scripts/make_table_tasks.py
@@ -2,7 +2,7 @@ from lm_eval import tasks
 from pytablewriter import MarkdownTableWriter

 writer = MarkdownTableWriter()
-writer.headers = ["Task Name", "Train", "Val", "Test", "Metrics"]
+writer.headers = ["Task Name", "Train", "Val", "Test","Val/Test Docs", "Metrics"]

 values = []

@@ -15,7 +15,9 @@ def chk(tf):
 for tname, Task in tasks.TASK_REGISTRY.items():
    task = Task()

-    values.append([tname,chk(task.has_training_docs()),chk(task.has_validation_docs()),chk(task.has_test_docs()),', '.join(task.aggregation().keys())])
+    v = [tname,chk(task.has_training_docs()),chk(task.has_validation_docs()),chk(task.has_test_docs()), len(list(task.test_docs() if task.has_test_docs() else task.validation_docs())),', '.join(task.aggregation().keys())]
+    print(v)
+    values.append(v)

 writer.value_matrix = values


--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

 setuptools.setup(
-    name="lm_eval_harness",
+    name="lm_eval",
    version="0.0.1",
    author="Leo Gao",
    author_email="lg@eleuther.ai",
@@ -19,4 +19,28 @@ setuptools.setup(
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.6',
+    install_requires=[
+        "black==20.8b1",
+        "best_download>=0.0.6",
+        "datasets>=1.2.1",
+        "click>=7.1",
+        "scikit-learn>=0.24.1",
+        "torch>=1.7",
+        "transformers>=4.1",
+        "sqlitedict==1.6.0",
+        "pytablewriter==0.58.0",
+        "sacrebleu==1.5.0",
+        "pycountry==20.7.3",
+        "numexpr==2.7.2",
+        "lm_dataformat==0.0.19",
+        "pytest==6.2.3",
+        "pybind11==2.6.2",
+        "tqdm-multiprocess==0.0.11",
+        "zstandard==0.15.2",
+        "jsonlines==2.0.0",
+        "mock==4.0.3",
+        "openai==0.6.4",
+        "jieba==0.42.1",
+        "nagisa==0.2.7"
+    ]
 )
--- a/tests/test_evaluator.py
+++ b/tests/test_evaluator.py
+import os
+import lm_eval.base as base
 import lm_eval.tasks as tasks
 import lm_eval.models as models
 import lm_eval.evaluator as evaluator
@@ -11,10 +13,13 @@ import pytest
 @pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
 def test_evaluator(taskname, Task):
    task_dict = tasks.get_task_dict([taskname])
-    lm = models.get_model('dummy')()
+
+    os.system("rm test_cache.db")
+    lm = base.CachingLM(models.get_model('dummy')(), "test_cache.db")

    def ll_fn(reqs):
        for ctx, cont in reqs:
+            if len(ctx) == 0: continue
            # space convention
            assert ctx[-1] != ' '
            assert cont[0] == ' ' or ctx[-1] == '\n'
@@ -26,7 +31,24 @@ def test_evaluator(taskname, Task):
            res.append((-random.random(), False))

        return res
-        
+
+    def ll_perp_fn(reqs):
+        for string, in reqs:
+            assert isinstance(string, str)
+
+        res = []
+        random.seed(42)
+        for _ in reqs:
+            res.append(-random.random())
+
+        return res

    lm.loglikelihood = ll_fn
-    evaluator.evaluate(lm, task_dict, False, 0, 10)
\ No newline at end of file
+    lm.loglikelihood_rolling = ll_perp_fn
+
+    limit = 10
+    e1 = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
+    e2 = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
+
+    # check that caching is working
+    assert e1 == e2
--- a/tests/test_generate_13_grams.py
+++ b/tests/test_generate_13_grams.py
+import os
+from collections import Counter
+import shutil
+import glob
+
+from scripts.clean_training_data.janitor import *
+from scripts.clean_training_data.generate_13_grams import do_ngrams_in_buckets
+from scripts.clean_training_data.archiver import Archive, TextReader
+
+
+def test_generate_13_grams_1():
+    # End-to-end check: the n-grams that do_ngrams_in_buckets writes into bucket files must
+    # match, with multiplicity, the n-grams produced directly by word_ngrams on the same text.
+    data = """A goose (plural geese) is a bird of any of several waterfowl species in the family Anatidae. 
+    This group comprises the genera Anser (the grey geese and white geese) and Branta (the black geese). 
+    Some other birds, mostly related to the shelducks, have "goose" as part of their names. 
+    More distantly related members of the family Anatidae are swans, most of which are larger 
+    than true geese, and ducks, which are smaller. The term "goose" may refer to either a male 
+    or female bird, but when paired with "gander", refers specifically to a female one (the latter referring 
+    to a male). Young birds before fledging are called goslings. The collective noun for a group of 
+    geese on the ground is a gaggle; when in flight, they are called a skein, a team, or a wedge; when 
+    flying close together, they are called a plump."""
+
+    # Doubling the text guarantees that some n-grams occur more than once.
+    data = data + data
+
+    # Reference result, generated in memory.
+    n = 13
+    janitor = Janitor()
+    comparison = list(word_ngrams(janitor.normalize_string(data), n))
+    comparison_counter = Counter(comparison)
+    print(len(comparison))
+
+    # Bucketed generation: start from a clean output directory, archive the document
+    # and let do_ngrams_in_buckets process the working directory.
+    test_working_directory = "test_generate_13_grams"
+    output_directory = os.path.join(test_working_directory, "output")
+    try:
+        shutil.rmtree(output_directory)
+    except FileNotFoundError:
+        pass
+    os.makedirs(test_working_directory, exist_ok=True)
+    archive = Archive(os.path.join(test_working_directory, "test.jsonl.zst"))
+    archive.add_data(data)
+    archive.commit()
+    bucket_count = 4
+    do_ngrams_in_buckets(n, test_working_directory, bucket_count)
+
+    # Read every bucket back; each line is "<ngram> <document_id>", so split off the last field.
+    rebuilt_ngrams = []
+    bucket_pattern = os.path.join(test_working_directory, "output", "*.bkt.txt")
+    for bucket_file_path in glob.glob(bucket_pattern):
+        for line in TextReader(bucket_file_path).read():
+            ngram, _ = line.rsplit(" ", 1)
+            rebuilt_ngrams.append(ngram)
+
+    # Compare distinct n-gram counts first, then the full multisets.
+    result_counter = Counter(rebuilt_ngrams)
+    assert len(result_counter) == len(comparison_counter)
+    assert comparison_counter == result_counter
\ No newline at end of file
--- a/tests/test_gpt3.py
+++ b/tests/test_gpt3.py
+import lm_eval.tasks as tasks
+import lm_eval.models as models
+import lm_eval.evaluator as evaluator
+import random
+import pytest
+import os
+import json
+import openai
+import mock
+import pickle
+import hashlib
+
+# Set the API key variable to an empty string when this module is imported.
+# NOTE(review): presumably a placeholder so the gpt3 model code finds the variable defined
+# while requests are answered from the pickled fixtures used by `completion` — confirm.
+os.environ['OPENAI_API_SECRET_KEY'] = ""
+
+
+def completion(**kwargs):
+    """Replacement for the OpenAI completion call that replays pickled responses.
+
+    The keyword arguments are serialised to JSON with sorted keys and hashed with
+    SHA-256; the hex digest names a pickle file under tests/testdata. When that file
+    exists its unpickled content is returned. Otherwise openai.Completion.create is
+    called with the same arguments and its result is pickled to the file before
+    being returned.
+    """
+    payload = json.dumps(kwargs, sort_keys=True).encode('utf-8')
+    digest = hashlib.sha256(payload).hexdigest()
+    fname = f"tests/testdata/gpt3_test_{digest}.pkl"
+
+    # Cache miss: perform the real request and record the response for later runs.
+    if not os.path.exists(fname):
+        ret = openai.Completion.create(**kwargs)
+        with open(fname, 'wb') as fh:
+            pickle.dump(ret, fh)
+        return ret
+
+    # Cache hit: replay the recorded response (the pickle is a local test fixture).
+    with open(fname, 'rb') as fh:
+        return pickle.load(fh)
+
+
+# Make sure the fixture directory exists at import time; `completion` reads and writes
+# its pickle files under this path.
+os.makedirs("tests/testdata", exist_ok=True)
+
+
+@mock.patch("lm_eval.models.gpt3.oa_completion", new=completion)
+def test_gpt3():
+    # Exercises loglikelihood and greedy_until of the 'gpt3' model with oa_completion
+    # patched to the fixture-backed `completion` helper. The fixture file name is derived
+    # from a hash of the exact request arguments, so the request literals below must not
+    # be altered without regenerating the recorded responses.
+    gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada")
+    # The first five results are unpacked as pairs; the remaining nine are collected in
+    # `vals` and compared with `targets` further down.
+    # NOTE(review): ll_max_0..2 bind the second element of their pairs (the same position
+    # as ig_dog / ig_cat), not the first — the names look misleading; confirm.
+    (ll_dog, ig_dog), (ll_cat, ig_cat), (_, ll_max_0), (_, ll_max_1), (_, ll_max_2), *vals = gpt3.loglikelihood([
+        ('The quick brown fox jumps over the lazy', ' dog'),
+        ('The quick brown fox jumps over the lazy', ' cat'),
+        ('The quick brown fox jumps over the lazy', ', lazy dog'),
+        ('The quick brown fox jumps over the lazy', ', lazy fox'),
+        ('The quick brown fox jumps over the lazy', ', lazy fox and they both fall to the ground'),
+        
+        ("""A mult""", """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)"""), 
+        ("""The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons""", """ (with threshold activation); see § Terminology"""), 
+        ("""Multilayer perceptrons are sometimes coll""", """oquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]"""), 
+        ("""An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear""", """ activation function."""), 
+        ("""MLP utilizes a supervised""", """ learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]"""), 
+        ("""Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic""", """ in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. """), 
+        ("""Specifically, we train GPT-3, an autoregressive language model with 175""", """ billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general."""), 
+        ("""A mult""", """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)"""), 
+        ("""Hello""", """ World"""), 
+    ])
+
+    # ' dog' must score higher than ' cat' as the continuation of the same context.
+    assert ll_dog > ll_cat
+    assert not ig_cat
+
+    assert ig_dog
+    assert not ll_max_0
+    assert not ll_max_1
+    assert not ll_max_2
+
+    # test empty context
+    gpt3.loglikelihood([('', 'test')])
+
+    # Generation stops at the first '.' or newline; a single request yields a single result.
+    gen, = gpt3.greedy_until([
+        ('The quick brown fox jumps over the lazy', ['.', '\n'])
+    ])
+
+    assert gen == ' dog'
+
+    print([x[0] for x in vals])
+
+    # Recorded first elements for the nine requests collected in `vals`, in request order.
+    targets = [-34.85833048, -47.114367866, -45.43520782100001, -5.289627985, -133.96879783896998, -321.30299892039994, -658.0542459504098, -34.85833048, -7.5162964]
+
+    # Compare with a relative tolerance rather than exact float equality.
+    for (pred, _), tgt in zip(vals, targets):
+        assert pred == pytest.approx(tgt, rel=1e-3)
+
+
+
+@mock.patch("lm_eval.models.gpt3.oa_completion", new=completion)
+def test_gpt3_perplexity():
+    # Checks loglikelihood_rolling of the 'gpt3' model against recorded values, first with
+    # the model's own MAX_LENGTH and then with MAX_LENGTH forced down to 5.
+    model = models.get_model('gpt3').create_from_arg_string("engine=ada")
+    test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss."
+    requests = [(test_string,)]
+
+    whole_text_score = model.loglikelihood_rolling(requests)[0]
+    assert whole_text_score == pytest.approx(-84.38819608, rel=1e-3)
+
+    # Hack: modify gpt3 to have shorter context length to induce rolling windows
+    model.MAX_LENGTH = 5
+    windowed_score = model.loglikelihood_rolling(requests)[0]
+    assert windowed_score == pytest.approx(-101.93490880000002, rel=1e-3)
--- a/tests/test_janitor.py
+++ b/tests/test_janitor.py
+import re
+from collections import defaultdict
+
+from scripts.clean_training_data.janitor import *
+
+def simple_ngram(sequence, n):
+    """Return every run of n consecutive items of `sequence` as a list of tuples.
+
+    Straightforward sliding-window reference used to check the janitor's n-gram helpers.
+    An input with fewer than n items yields an empty list.
+    """
+    window = []
+    collected = []
+    for item in sequence:
+        window.append(item)
+        if len(window) != n:
+            continue
+        collected.append(tuple(window))
+        # Slide forward by dropping the oldest item.
+        window.pop(0)
+
+    return collected
+
+
+def test_form_ngrams():
+    # form_ngrams over the characters of the text must produce the same windows as the
+    # reference simple_ngram for several window sizes.
+    sequence = ("Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
+                " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much.")
+
+    for n in (1, 2, 3, 5, 13):
+        expected = simple_ngram(sequence, n)
+        actual = list(form_ngrams(iter(sequence), n))
+        assert len(expected) == len(actual)
+        assert expected == actual
+
+def test_word_ngrams():
+    # word_ngrams must agree with a sliding window over the whitespace-split words, each
+    # window joined back together with single spaces.
+    sequence = ("Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
+                " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much.")
+
+    tokens = sequence.split()
+
+    for n in (1, 2, 3, 5, 13):
+        expected = [" ".join(window) for window in simple_ngram(tokens, n)]
+        actual = list(word_ngrams(sequence, n))
+        assert len(expected) == len(actual)
+        assert actual == expected
+
+def test_split_indices():
+    # split_indices must yield each space-separated word together with the inclusive
+    # (start, end) character positions of that word in the text.
+    sequence = ("Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
+                " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much.")
+
+    # Reference: remember where the current word started and emit it at the next space.
+    expected = []
+    word_start = None
+    for pos, char in enumerate(sequence):
+        if char == " ":
+            if word_start is not None:
+                expected.append((sequence[word_start:pos], (word_start, pos - 1)))
+                word_start = None
+        elif word_start is None:
+            word_start = pos
+
+    # A word that runs to the end of the text has no closing space.
+    if word_start is not None:
+        expected.append((sequence[word_start:], (word_start, len(sequence) - 1)))
+
+    actual = list(split_indices(sequence))
+    assert len(expected) == len(actual)
+    assert expected == actual
+
+def test_word_ngrams_indices():
+    # word_ngrams_indices must yield each word n-gram together with the inclusive
+    # (start, end) character positions of that occurrence in the text.
+    sequence = ("Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
+                " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much.")
+    last_index = len(sequence) - 1
+
+    for n in (1, 2, 3, 5, 13):
+        ngrams = [" ".join(window) for window in simple_ngram(sequence.split(), n)]
+        # Per n-gram text: where the next search starts, so repeated n-grams map to
+        # successive occurrences.
+        search_from = defaultdict(int)
+        comparison = []
+        for ngram in ngrams:
+            while True:
+                start = sequence.find(ngram, search_from[ngram])
+                assert start != -1 # testing the test
+
+                end = start + len(ngram) - 1
+                search_from[ngram] = end + 1
+
+                # Accept only matches delimited by the text edge or a space on both
+                # sides; anything else is a partial word match and the search goes on.
+                starts_clean = start == 0 or sequence[start - 1] == " "
+                ends_clean = end == last_index or sequence[end + 1] == " "
+                if starts_clean and ends_clean:
+                    break
+
+            comparison.append((ngram, (start, end)))
+
+        result_to_test = list(word_ngrams_indices(sequence, n))
+        assert len(result_to_test) == len(comparison)
+        assert result_to_test == comparison
+
+# Assumptions from GPT3 Paper:
+# the 200 characters to remove include punctuation and is actually a half-window
+
+# All tests below initially test without any registered contaminants, expecting the same sequence back.
+def test_janitor1():
+
+    # 1-gram contaminant. Part of the block before the filth is expected to survive,
+    # while the block after the filth is expected to be removed completely.
+
+    line = "This is a @line #containing a certain number of characters, 76 to be exact. "
+    sequence = line * 6 + "FILTH. " + line * 5
+
+    filth = "filth"
+
+    expected_result = line * 3 + "This is a @line #containing "
+
+    janitor = Janitor(ngram_n=1, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200)
+
+    # Nothing registered yet: cleaning must hand the text back unchanged.
+    assert "".join(janitor.clean_python(sequence)) == sequence
+
+    janitor.register_contaminant(filth)
+    assert janitor.dirt_ngrams == {filth}
+
+    assert "".join(janitor.clean_python(sequence)) == expected_result
+
+def test_janitor2():
+
+    # 1-gram contaminant again, but the block after the filth is one line longer, so part
+    # of it is expected to survive as well as part of the block before the filth.
+
+    line = "This is a @line #containing a certain number of characters, 76 to be exact. "
+    sequence = line * 6 + "FILTH. " + line * 6
+
+    filth = "filth"
+
+    expected_result = (line * 3 + "This is a @line #containing "
+                       + " characters, 76 to be exact. " + line * 3)
+
+    janitor = Janitor(ngram_n=1, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200)
+
+    # Nothing registered yet: cleaning must hand the text back unchanged.
+    assert "".join(janitor.clean_python(sequence)) == sequence
+
+    janitor.register_contaminant(filth)
+    assert janitor.dirt_ngrams == {filth}
+
+    assert "".join(janitor.clean_python(sequence)) == expected_result
+
+def test_janitor3():
+
+    # Same layout as test_janitor2, but the contaminant is a 6-gram.
+
+    line = "This is a @line #containing a certain number of characters, 76 to be exact. "
+    sequence = line * 6 + "FILTH. lots of dirty filtHy FIlTh " + line * 6
+
+    filth = "filth lots of dirty filthy filth"
+
+    expected_result = (line * 3 + "This is a @line #containing "
+                       + " characters, 76 to be exact. " + line * 3)
+
+    janitor = Janitor(ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200)
+
+    # Nothing registered yet: cleaning must hand the text back unchanged.
+    assert "".join(janitor.clean_python(sequence)) == sequence
+
+    janitor.register_contaminant(filth)
+    assert janitor.dirt_ngrams == {filth}
+
+    assert "".join(janitor.clean_python(sequence)) == expected_result
+
+def test_janitor4():
+
+    # Adds a second filth span and a third block to the previous layout. The middle block
+    # is expected to vanish entirely, since 200 characters are removed on each side of it.
+
+    line = "This is a @line #containing a certain number of characters, 76 to be exact. "
+    dirty_span = "FILTH. lots of dirty filtHy FIlTh "
+    sequence = line * 6 + dirty_span + line * 6 + dirty_span + line * 6
+
+    filth = "filth lots of dirty filthy filth"
+
+    expected_result = (line * 3 + "This is a @line #containing "
+                       + " characters, 76 to be exact. " + line * 3)
+
+    janitor = Janitor(ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200)
+
+    # Nothing registered yet: cleaning must hand the text back unchanged.
+    assert "".join(janitor.clean_python(sequence)) == sequence
+
+    janitor.register_contaminant(filth)
+    assert janitor.dirt_ngrams == {filth}
+
+    assert "".join(janitor.clean_python(sequence)) == expected_result
+
+def test_janitor5():
+
+    # Same layout as test_janitor4, but the two filth spans are different 6-grams and
+    # both are registered as contaminants.
+
+    line = "This is a @line #containing a certain number of characters, 76 to be exact. "
+    sequence = (line * 6 + "FILTH. lots of dirty filtHy FIlTh "
+                + line * 6 + "FILTH. lots of filtHy dirty FIlTh "
+                + line * 6)
+
+    filths = ["filth lots of dirty filthy filth", "filth lots of filthy dirty filth"]
+
+    expected_result = (line * 3 + "This is a @line #containing "
+                       + " characters, 76 to be exact. " + line * 3)
+
+    janitor = Janitor(ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200)
+
+    # Nothing registered yet: cleaning must hand the text back unchanged.
+    assert "".join(janitor.clean_python(sequence)) == sequence
+
+    for filth in filths:
+        janitor.register_contaminant(filth)
+    assert janitor.dirt_ngrams == set(filths)
+
+    assert "".join(janitor.clean_python(sequence)) == expected_result
+
+def test_janitor6():
+
+    # Same as test_janitor5 but with 10 filth spans in total (8 + 2); the result is
+    # expected to be the same. The following test uses 11.
+
+    line = "This is a @line #containing a certain number of characters, 76 to be exact. "
+    sequence = (line * 6 + "FILTH. lots of dirty filtHy FIlTh " * 8
+                + line * 6 + "FILTH. lots of filtHy dirty FIlTh " * 2
+                + line * 6)
+
+    filths = ["filth lots of dirty filthy filth", "filth lots of filthy dirty filth"]
+
+    expected_result = (line * 3 + "This is a @line #containing "
+                       + " characters, 76 to be exact. " + line * 3)
+
+    janitor = Janitor(ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200)
+
+    # Nothing registered yet: cleaning must hand the text back unchanged.
+    assert "".join(janitor.clean_python(sequence)) == sequence
+
+    for filth in filths:
+        janitor.register_contaminant(filth)
+    assert janitor.dirt_ngrams == set(filths)
+
+    assert "".join(janitor.clean_python(sequence)) == expected_result
+
+def test_janitor7():
+
+    # Same layout as test_janitor6 but with one more filth span: 8 + 3 = 11 in total.
+    # Here the expected result is empty, i.e. nothing of the document is kept.
+    # NOTE(review): presumably because 11 spans exceed too_dirty_cutoff=10 — confirm
+    # against Janitor.
+
+    line = "This is a @line #containing a certain number of characters, 76 to be exact. "
+    sequence = (line * 6 + "FILTH. lots of dirty filtHy FIlTh " * 8
+                + line * 6 + "FILTH. lots of filtHy dirty FIlTh " * 3
+                + line * 6)
+
+    filths = ["filth lots of dirty filthy filth", "filth lots of filthy dirty filth"]
+
+    expected_result = ""
+
+    janitor = Janitor(ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200)
+
+    # Nothing registered yet: cleaning must hand the text back unchanged.
+    assert "".join(janitor.clean_python(sequence)) == sequence
+
+    for filth in filths:
+        janitor.register_contaminant(filth)
+    assert janitor.dirt_ngrams == set(filths)
+
+    assert "".join(janitor.clean_python(sequence)) == expected_result
+
+
+def test_janitor8():
+    # Placeholder: intended to test saving and loading of contamination n-grams.
+    # The test currently does nothing; the draft below is disabled.
+    # TODO: re-enable once the save/load round trip is ready to be exercised.
+    pass
+    # source = """   ,, I'm a very !dirty,, ,,  dirty boy. Clean me daddy. \n\nhe he he hehe heh.  lastword  """ * 2
+    # contaminant = "dirty boy. Clean he he"
+
+    # jan = Janitor(ngram_n=3)
+    # jan.register_contaminant(contaminant)
+    # cleaned = " ".join(jan.clean(source))
+    # for contam in jan.dirt_ngrams:
+    #     assert contam not in cleaned, contam
+
+    # filename = "data/saved_contam"
+    # jan.save_contamination_ngrams(filename)
+
+    # jan = Janitor(ngram_n=3)
+    # jan.load_contamination_ngrams(filename)
+    # cleaned = " ".join(jan.clean(source))
+    # for contam in jan.dirt_ngrams:
+    #     assert contam not in cleaned, contam
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/tests/test_misc.py
+++ b/tests/test_misc.py
+import pytest
+import lm_eval.metrics as metrics
+import random
+
+
+def test_bootstrapping():
+    """bootstrap_stderr of the mean must match mean_stderr to within 1e-4."""
+    # Fixed seed so the 1000 samples are reproducible.
+    random.seed(42)
+    samples = [random.random() for _ in range(1000)]
+
+    reference = metrics.mean_stderr(samples)
+    resampled = metrics.bootstrap_stderr(metrics.mean, samples, iters=100000)
+
+    assert resampled == pytest.approx(reference, abs=1e-4)
--- a/tests/test_models.py
+++ b/tests/test_models.py
+import pytest
 import lm_eval.models as models


 def test_gpt2():
    gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")
-    (ll_dog, ig_dog), (ll_cat, ig_cat) = gpt2.loglikelihood([
+    (ll_dog, ig_dog), (ll_cat, ig_cat), (_, ll_max_0), (_, ll_max_1), (_, ll_max_2), *vals = gpt2.loglikelihood([
        ('The quick brown fox jumps over the lazy', ' dog'),
        ('The quick brown fox jumps over the lazy', ' cat'),
+        ('The quick brown fox jumps over the lazy', ', lazy dog'),
+        ('The quick brown fox jumps over the lazy', ', lazy fox'),
+        ('The quick brown fox jumps over the lazy', ', lazy fox and they both fall to the ground'),
+        
+        ("""A mult""", """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)"""), 
+        ("""The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons""", """ (with threshold activation); see § Terminology"""), 
+        ("""Multilayer perceptrons are sometimes coll""", """oquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]"""), 
+        ("""An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear""", """ activation function."""), 
+        ("""MLP utilizes a supervised""", """ learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]"""), 
+        ("""Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic""", """ in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. """), 
+        ("""Specifically, we train GPT-3, an autoregressive language model with 175""", """ billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general."""), 
+        ("""A mult""", """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)"""), 
+        ("""Hello""", """ World"""), 
    ])

    assert ll_dog > ll_cat
    assert not ig_cat

+    assert not ll_max_0
+    assert ll_max_1
+    assert ll_max_2
+
    # test empty context
    gpt2.loglikelihood([('', 'test')])

@@ -18,4 +36,24 @@ def test_gpt2():
        ('The quick brown fox jumps over the lazy', ['.', '\n'])
    ])

-    assert gen == ', lazy fox and they both fall to the ground'
\ No newline at end of file
+    assert gen == ', lazy fox and they both fall to the ground'
+
+    targets = [-61.60536193847656, -56.57843780517578, -62.131004333496094, -9.799489974975586, -153.96334838867188, -341.222900390625, -731.1475830078125, -61.60536193847656, -8.682319641113281]
+
+    for (pred, _), tgt in zip(vals, targets):
+        assert pred == pytest.approx(tgt, rel=1e-3)
+
+
+
+def test_gpt2_perplexity():
+    """Compare gpt2.loglikelihood_rolling on one sentence with summed reference values.
+
+    The check runs twice: once with the model as created, and once after
+    shrinking ``max_length`` to 5 so that rolling windows are induced.
+    """
+    gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")
+    test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss."
+
+    def rolling_score():
+        # A single one-element request; take the only result.
+        return gpt2.loglikelihood_rolling([(test_string,)])[0]
+
+    default_terms = [-4.9599953, -8.069298, -8.308624, -10.178513, -8.906924, -1.9318912, -7.745445, -7.146077, -5.2072, -3.5882986, -1.9957212, -8.044922, -0.20841774, -5.1096807, -0.099879116, -8.888423, -4.6180487]
+    assert rolling_score() == pytest.approx(sum(default_terms), rel=1e-3)
+
+    # Hack: modify gpt2 to have shorter context length to induce rolling windows
+    gpt2.max_length = 5
+    short_context_terms = [-4.96001, -8.069275, -8.308612, -10.178482, -8.90691, -4.037338, -8.09261, -11.662385, -10.206891, -4.425003, -2.2563353, -7.909143, -1.9304147, -7.3610134, -2.3120654, -7.3229, -2.1643813]
+    assert rolling_score() == pytest.approx(sum(short_context_terms), rel=1e-3)
--- a/tests/test_tasks.py
+++ b/tests/test_tasks.py
@@ -22,13 +22,19 @@ def test_basic_interface(taskname, Task):

    for v in task.higher_is_better().values(): assert v in [True, False]

+    assert isinstance(task.VERSION, int)
+
    # test deterministic docs
    # (don't test train because it's slow)

    task2 = Task()
+
+    limit = None
+
+    if taskname in ["triviaqa"]: limit = 10000
    if task.has_validation_docs():
-        arr = list(islice(task.validation_docs(), 100))
-        arr2 = list(islice(task2.validation_docs(), 100))
+        arr = list(islice(task.validation_docs(), limit))
+        arr2 = list(islice(task2.validation_docs(), limit))

        assert arr == arr2

@@ -38,8 +44,8 @@ def test_basic_interface(taskname, Task):
        assert reqs == reqs2

    if task.has_test_docs():
-        arr = list(islice(task.test_docs(), 100))
-        arr2 = list(islice(task2.test_docs(), 100))
+        arr = list(islice(task.test_docs(), limit))
+        arr2 = list(islice(task2.test_docs(), limit))

        assert arr == arr2

@@ -48,6 +54,16 @@ def test_basic_interface(taskname, Task):
        
        assert reqs == reqs2

+    if task.has_training_docs():
+        arr = list(islice(task.training_docs(), limit))
+        arr2 = list(islice(task2.training_docs(), limit))
+
+        assert arr == arr2
+
+        reqs = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
+        reqs2 = [task2.construct_requests(doc, task2.doc_to_text(doc)) for doc in arr2]
+        
+        assert reqs == reqs2


 @pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
@@ -57,7 +73,7 @@ def test_documents_and_requests(taskname, Task):
    fns = []
    if task.has_training_docs(): fns.append(task.training_docs)
    if task.has_validation_docs(): fns.append(task.validation_docs)
-    # test doce might not have labels
+    # test doc might not have labels
    #if task.has_test_docs(): fns.append(task.test_docs)

    for fn in fns:
@@ -71,8 +87,10 @@ def test_documents_and_requests(taskname, Task):
            assert isinstance(tgt, str)
            
            # space convention
-            assert txt[-1] != ' '
-            assert tgt[0] == ' ' or txt[-1] == '\n'
+            # allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
+            if len(txt) != 0:
+                assert txt[-1] != ' '
+                assert tgt[0] == ' ' or txt[-1] == '\n'

            reqs = task.construct_requests(doc, txt)
            

--- a/tests/test_utils.py
+++ b/tests/test_utils.py
+from lm_eval.utils import get_rolling_token_windows, make_disjoint_window
+
+
+# noinspection DuplicatedCode
+def test_get_rolling_token_windows_v1():
+    """34 tokens, max_seq_len=10, context_len=1: three full windows plus a 4-token tail."""
+    tokens = list(range(34))
+    expected = [
+        ([-100, 0, 1, 2, 3, 4, 5, 6, 7, 8], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
+        ([9, 10, 11, 12, 13, 14, 15, 16, 17, 18], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]),
+        ([19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]),
+        ([23, 24, 25, 26, 27, 28, 29, 30, 31, 32], [30, 31, 32, 33]),
+    ]
+    windows = [
+        (input_tokens, pred_tokens)
+        for input_tokens, pred_tokens in get_rolling_token_windows(
+            token_list=tokens,
+            prefix_token=-100,
+            max_seq_len=10,
+            context_len=1,
+        )
+    ]
+    # The total number of predicted tokens must equal the input length.
+    assert sum(len(pred_tokens) for _, pred_tokens in windows) == len(tokens)
+    assert expected == windows
+
+
+# noinspection DuplicatedCode
+def test_get_rolling_token_windows_v2():
+    """34 tokens, max_seq_len=10, context_len=8: after the first window, 3 tokens are predicted per step."""
+    tokens = list(range(34))
+    # First window is prefixed with the prefix token and predicts tokens 0..9.
+    expected = [([-100, 0, 1, 2, 3, 4, 5, 6, 7, 8], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])]
+    # Every later window starts 3 tokens further on (2, 5, ..., 23), spans 10
+    # tokens and predicts the 3 tokens starting 8 positions after its start.
+    expected += [
+        (list(range(start, start + 10)), [start + 8, start + 9, start + 10])
+        for start in range(2, 24, 3)
+    ]
+    windows = [
+        (input_tokens, pred_tokens)
+        for input_tokens, pred_tokens in get_rolling_token_windows(
+            token_list=tokens,
+            prefix_token=-100,
+            max_seq_len=10,
+            context_len=8,
+        )
+    ]
+    # The total number of predicted tokens must equal the input length.
+    assert sum(len(pred_tokens) for _, pred_tokens in windows) == len(tokens)
+    assert expected == windows
+
+
+# noinspection DuplicatedCode
+def test_get_rolling_token_windows_v3():
+    """34 tokens, max_seq_len=10, context_len=10: after the first window, one token is predicted per step."""
+    tokens = list(range(34))
+    # First window is prefixed with the prefix token and predicts tokens 0..9.
+    expected = [([-100, 0, 1, 2, 3, 4, 5, 6, 7, 8], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])]
+    # Each later window holds 10 consecutive tokens starting at 0, 1, ..., 23
+    # and predicts only the single token that follows it (10, 11, ..., 33).
+    expected += [
+        (list(range(start, start + 10)), [start + 10])
+        for start in range(24)
+    ]
+    windows = [
+        (input_tokens, pred_tokens)
+        for input_tokens, pred_tokens in get_rolling_token_windows(
+            token_list=tokens,
+            prefix_token=-100,
+            max_seq_len=10,
+            context_len=10,
+        )
+    ]
+    # The total number of predicted tokens must equal the input length.
+    assert sum(len(pred_tokens) for _, pred_tokens in windows) == len(tokens)
+    assert expected == windows
+
+
+# noinspection DuplicatedCode
+def test_get_rolling_token_windows_v4():
+    """30 tokens, max_seq_len=10, context_len=10: after the first window, one token is predicted per step."""
+    tokens = list(range(30))
+    # First window is prefixed with the prefix token and predicts tokens 0..9.
+    expected = [([-100, 0, 1, 2, 3, 4, 5, 6, 7, 8], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])]
+    # Each later window holds 10 consecutive tokens starting at 0, 1, ..., 19
+    # and predicts only the single token that follows it (10, 11, ..., 29).
+    expected += [
+        (list(range(start, start + 10)), [start + 10])
+        for start in range(20)
+    ]
+    windows = [
+        (input_tokens, pred_tokens)
+        for input_tokens, pred_tokens in get_rolling_token_windows(
+            token_list=tokens,
+            prefix_token=-100,
+            max_seq_len=10,
+            context_len=10,
+        )
+    ]
+    # The total number of predicted tokens must equal the input length.
+    assert sum(len(pred_tokens) for _, pred_tokens in windows) == len(tokens)
+    assert expected == windows
+
+
+# noinspection DuplicatedCode
+def test_get_rolling_token_windows_v5():
+    """30 tokens, max_seq_len=10, context_len=1: exactly three full windows, no tail."""
+    tokens = list(range(30))
+    expected = [
+        ([-100, 0, 1, 2, 3, 4, 5, 6, 7, 8], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
+        ([9, 10, 11, 12, 13, 14, 15, 16, 17, 18], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]),
+        ([19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]),
+    ]
+    windows = [
+        (input_tokens, pred_tokens)
+        for input_tokens, pred_tokens in get_rolling_token_windows(
+            token_list=tokens,
+            prefix_token=-100,
+            max_seq_len=10,
+            context_len=1,
+        )
+    ]
+    # The total number of predicted tokens must equal the input length.
+    assert sum(len(pred_tokens) for _, pred_tokens in windows) == len(tokens)
+    assert expected == windows
+
+
+# noinspection DuplicatedCode
+def test_get_rolling_token_windows_v6():
+    """9 tokens, max_seq_len=2, context_len=1: four 2-token windows plus a 1-token tail."""
+    tokens = list(range(9))
+    expected = [
+        ([-100, 0], [0, 1]),
+        ([1, 2], [2, 3]),
+        ([3, 4], [4, 5]),
+        ([5, 6], [6, 7]),
+        ([6, 7], [8]),
+    ]
+    windows = [
+        (input_tokens, pred_tokens)
+        for input_tokens, pred_tokens in get_rolling_token_windows(
+            token_list=tokens,
+            prefix_token=-100,
+            max_seq_len=2,
+            context_len=1,
+        )
+    ]
+    # The total number of predicted tokens must equal the input length.
+    assert sum(len(pred_tokens) for _, pred_tokens in windows) == len(tokens)
+    assert expected == windows
+
+
+def test_get_rolling_token_windows_empty():
+    """An empty token list must yield no windows at all."""
+    windows = list(
+        get_rolling_token_windows(
+            token_list=[],
+            prefix_token=-100,
+            max_seq_len=2,
+            context_len=1,
+        )
+    )
+    assert len(windows) == 0
+
+
+def test_make_disjoint_window():
+    """Check make_disjoint_window on two pairs whose first list overlaps the second."""
+    # Each case is (input pair, expected pair); in both, the second list is
+    # returned unchanged and the first list loses its overlapping suffix.
+    cases = [
+        (([1, 2, 3, 4, 5], [2, 3, 4, 5, 6]), ([1], [2, 3, 4, 5, 6])),
+        (([1, 2, 3, 4, 5], [4, 5, 6]), ([1, 2, 3], [4, 5, 6])),
+    ]
+    for window, disjoint in cases:
+        assert make_disjoint_window(window) == disjoint
\ No newline at end of file
--- a/tests/test_version_stable.py
+++ b/tests/test_version_stable.py
+import lm_eval.tasks as tasks
+import lm_eval.models as models
+import lm_eval.evaluator as evaluator
+import random
+import pytest
+import os
+import json
+import hashlib
+
+
+os.makedirs("tests/testdata", exist_ok=True)
+
+
+def assert_target(name, ob):
+    """Check ``ob`` against the stored JSON target for ``name``, creating it on first use.
+
+    The target lives at ``tests/testdata/{name}.json``. If that file exists,
+    its parsed contents must equal ``ob`` after a JSON round trip (so tuples
+    compare equal to lists, etc.). If it does not exist, ``ob`` is written
+    there with sorted keys and no comparison takes place.
+
+    Raises:
+        AssertionError: the stored target differs from ``ob``.
+        TypeError: ``ob`` is not JSON-serializable.
+    """
+    fname = f"tests/testdata/{name}.json"
+    # Serialize before touching the filesystem: if ``ob`` cannot be encoded we
+    # must not leave a truncated target file behind to break later runs.
+    serialized = json.dumps(ob, sort_keys=True)
+    if os.path.exists(fname):
+        with open(fname) as fh:
+            assert json.load(fh) == json.loads(serialized)
+    else:
+        with open(fname, 'w') as fh:
+            fh.write(serialized)
+
+def assert_target_hashed(name, ob):
+    """Check the SHA-256 of ``ob``'s JSON form against the stored hash for ``name``.
+
+    The hash lives at ``tests/testdata/{name}`` as a hex digest of
+    ``json.dumps(ob, sort_keys=True)`` encoded as UTF-8. If the file exists,
+    its contents must equal the freshly computed digest; otherwise the digest
+    is written there and no comparison takes place.
+
+    Raises:
+        AssertionError: the stored hash differs from the computed one.
+        TypeError: ``ob`` is not JSON-serializable.
+    """
+    fname = f"tests/testdata/{name}"
+    # Compute the digest once, before any file access: if ``ob`` cannot be
+    # encoded we must not create an empty hash file that fails every later run.
+    digest = hashlib.sha256(json.dumps(ob, sort_keys=True).encode('utf-8')).hexdigest()
+    if os.path.exists(fname):
+        with open(fname) as fh:
+            assert fh.read() == digest
+    else:
+        with open(fname, 'w') as fh:
+            fh.write(digest)
+
+
+# make sure eval results for a task version are stable
+
+@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
+def test_versions_stable(taskname, Task):
+    """Pin the requests and results of every registered task version to stored targets.
+
+    The dummy model's three request handlers are replaced with stubs that
+    record a hash of the incoming requests and return deterministic values,
+    then the evaluation result itself is compared against a stored JSON target.
+    """
+    task_dict = tasks.get_task_dict([taskname])
+    lm = models.get_model('dummy')()
+
+    def target(kind):
+        # Target names are keyed by task name, task version and request kind.
+        return f"{taskname}-v{Task.VERSION}-{kind}"
+
+    def ll_fn(reqs):
+        for ctx, cont in reqs:
+            # Empty contexts are exempt from the space convention.
+            if len(ctx) != 0:
+                # space convention
+                assert ctx[-1] != ' '
+                assert cont[0] == ' ' or ctx[-1] == '\n'
+
+        assert_target_hashed(target("loglikelihood"), reqs)
+
+        # Reseed on every call so the fake log-likelihoods are reproducible.
+        random.seed(42)
+        return [(-random.random(), False) for _ in reqs]
+
+    def ll_perp_fn(reqs):
+        # Each request must be a 1-tuple holding a string.
+        for string, in reqs:
+            assert isinstance(string, str)
+
+        assert_target_hashed(target("loglikelihood_rolling"), reqs)
+
+        random.seed(42)
+        return [-random.random() for _ in reqs]
+
+    def greedy_until(reqs):
+        assert_target_hashed(target("greedy_until"), reqs)
+
+        # Contexts must contain something other than whitespace.
+        for ctx, _ in reqs:
+            assert ctx.strip() != ''
+
+        return ["lol" for _ in reqs]
+
+    lm.loglikelihood = ll_fn
+    lm.loglikelihood_rolling = ll_perp_fn
+    lm.greedy_until = greedy_until
+
+    limit = None
+    outcome = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
+    assert_target(target("res"), outcome)
--- a/tests/testdata/anagrams1-v0-greedy_until
+++ b/tests/testdata/anagrams1-v0-greedy_until
+7c0c5246d3f751f39119a5629ac1d4b2c6fd2a315f78d6de9b2c387e24e3fef1
\ No newline at end of file
--- a/tests/testdata/anagrams1-v0-res.json
+++ b/tests/testdata/anagrams1-v0-res.json
+{"results": {"anagrams1": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"anagrams1": 0}}
\ No newline at end of file
--- a/tests/testdata/anagrams2-v0-greedy_until
+++ b/tests/testdata/anagrams2-v0-greedy_until
+6700a3c44e48abe8337238dcbe3b54cf4abafe0c204c52d921e590872fbd05e7
\ No newline at end of file