Merge pull request #317 from EleutherAI/Mistobaan/add-pre-commit

Add pre-commit

Merge pull request #317 from EleutherAI/Mistobaan/add-pre-commit
Add pre-commit
a2cada5d · Jonathan Tow · GitHub · 7a038118 · 83507c4b · a2cada5d
Unverified Commit a2cada5d authored May 03, 2022 by Jonathan Tow Committed by GitHub May 03, 2022
20 changed files
--- a/scripts/clean_training_data/README.md
+++ b/scripts/clean_training_data/README.md
@@ -5,7 +5,7 @@ It uses the approach described in the [GPT-3 paper](https://arxiv.org/abs/2005.1
 1) Collects all contamination text files that are to be removed from training data
 2) Filters training data by finding `N`gram matches between the training data
   and any contamination
-   1) `N`grams ignore case and punctation and are split on whitespace.  
+   1) `N`grams ignore case and punctuation and are split on whitespace.
   2) Matching `N`gram substrings are removed, as is a `window_to_remove` character window around
    the match, splitting the training data into chunks
   3) Any chunks less than `minimum_slice_length` are removed
@@ -20,7 +20,7 @@ minimum_slice_length = 200
 too_dirty_cutoff = 10
 ```

-## Compling
+## Compiling

 Janitor can be used as a pure python program, but it is much faster if the ngram
 code is run in C++. To compile the C++ code, run
@@ -31,4 +31,3 @@ c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor
 ```

 If your compiler isn't linked to python, you may need to append `-undefined dynamic_lookup` to the command above
-
--- a/scripts/clean_training_data/compress_and_package.py
+++ b/scripts/clean_training_data/compress_and_package.py
@@ -9,9 +9,13 @@ from tqdm_multiprocess import TqdmMultiProcessPool

 import logging
 from tqdm_multiprocess.logger import setup_logger_tqdm
+
 logger = logging.getLogger(__name__)

-def process_task(working_directory, output_directory, bucket_file_path, tqdm_func, global_tqdm):
+
+def process_task(
+    working_directory, output_directory, bucket_file_path, tqdm_func, global_tqdm
+):
    command = f"zstd {bucket_file_path}"
    logger.info(command)
    subprocess.call(command, shell=True)
@@ -23,32 +27,42 @@ def process_task(working_directory, output_directory, bucket_file_path, tqdm_fun
    os.remove(bucket_file_path)
    global_tqdm.update()

+
 def compress_and_move(working_directory, output_directory, process_count):
    os.makedirs(output_directory, exist_ok=True)
    original_info_file_path = os.path.join(working_directory, "info.json")
-    assert(os.path.exists(original_info_file_path))
+    assert os.path.exists(original_info_file_path)

    tasks = []
-    bucket_file_paths = glob.glob(os.path.join(working_directory, "output", f"*.bkt.txt.sorted")) 
+    bucket_file_paths = glob.glob(
+        os.path.join(working_directory, "output", f"*.bkt.txt.sorted")
+    )
    for bucket_file_path in bucket_file_paths:
        task = (process_task, (working_directory, output_directory, bucket_file_path))
        tasks.append(task)

    pool = TqdmMultiProcessPool(process_count)
-    on_done = lambda _ : None
-    on_error = lambda _ : None

-    global_progress = tqdm(total=len(bucket_file_paths), dynamic_ncols=True, unit="file")
+    def on_done(_):
+        return None
+
+    def on_error(_):
+        return None
+
+    global_progress = tqdm(
+        total=len(bucket_file_paths), dynamic_ncols=True, unit="file"
+    )
    _ = pool.map(global_progress, tasks, on_error, on_done)

    shutil.copy(original_info_file_path, os.path.join(output_directory, "info.json"))

-parser = argparse.ArgumentParser(description='sort 13gram buckets')
+
+parser = argparse.ArgumentParser(description="sort 13gram buckets")
 parser.add_argument("-dir", "--working_directory", required=True)
 parser.add_argument("-output", "--output_directory", required=True)
 parser.add_argument("-procs", "--process_count", type=int, default=8)

-if __name__ == '__main__':
+if __name__ == "__main__":
    version = 1.00
    print(f"Running version {version}")


--- a/scripts/clean_training_data/generate_13_grams.py
+++ b/scripts/clean_training_data/generate_13_grams.py
@@ -37,18 +37,24 @@ from lm_eval.decontamination.archiver import TextArchive, Reader

 import logging
 from tqdm_multiprocess.logger import setup_logger_tqdm
+
 logger = logging.getLogger(__name__)

 terminate = False
+
+
 def handler(signal_received, frame):
    global terminate
    terminate = True

+
 def yield_pile(start_offsets=None, checkpoint_offset=None):
    directory = "pile"

    if not os.path.exists(directory):
-        print("We expect the pile archives to be in the 'pile' directory, but this was not found.")
+        print(
+            "We expect the pile archives to be in the 'pile' directory, but this was not found."
+        )
        raise Exception("Pile directory not found.")

    files = list(sorted(glob.glob(os.path.join(directory, "*.jsonl.zst*"))))
@@ -63,7 +69,6 @@ def yield_pile(start_offsets=None, checkpoint_offset=None):
            start_file = file_i
            pile_global_offset = start_offset

-    
    for file_i, file in enumerate(files):
        if file_i < start_file:
            logger.info(f"Skipping file {file}")
@@ -74,12 +79,15 @@ def yield_pile(start_offsets=None, checkpoint_offset=None):
            yield (pile_global_offset, document)
            pile_global_offset += 1

+
 # Hash buckets > disk backed files. Supports file position checkpointing and resuming
 # Allows you to write continuously and checkpoint intermittently. If a failure occurs
 # the buckets are simply truncated at your last checkpoint.
 class Buckets:
    def __init__(self, directory, num_buckets):
-        self.bucket_files = [os.path.join(directory, f"ngrams_{i}.bkt.txt") for i in range(num_buckets)]
+        self.bucket_files = [
+            os.path.join(directory, f"ngrams_{i}.bkt.txt") for i in range(num_buckets)
+        ]
        self.buckets = list(map(TextArchive, self.bucket_files))
        self.checkpoint_file = os.path.join(directory, f"bucket_offsets.ckpt")

@@ -109,6 +117,7 @@ class Buckets:
        for bucket in self.buckets:
            bucket.commit()

+
 def do_ngrams_in_buckets(n_value, working_directory, bucket_count):

    pile_statistics = json.load(open("pile_statistics.json", "r"))
@@ -129,7 +138,7 @@ def do_ngrams_in_buckets(n_value, working_directory, bucket_count):
    # Checkpoint
    checkpoint_file = os.path.join(working_directory, f"pile_offset.ckpt")
    if os.path.exists(checkpoint_file):
-        checkpoint_offset = pickle.load(open(checkpoint_file,"rb"))
+        checkpoint_offset = pickle.load(open(checkpoint_file, "rb"))
        iterate = True
    else:
        checkpoint_offset = 0
@@ -165,7 +174,7 @@ def do_ngrams_in_buckets(n_value, working_directory, bucket_count):
                progress.update(batch_size)
                batch_counter = 0
                buckets.save_checkpoint()
-                pickle.dump(offset, open(checkpoint_file,"wb"))
+                pickle.dump(offset, open(checkpoint_file, "wb"))
                if terminate:
                    buckets.close_buckets()
                    return
@@ -180,12 +189,12 @@ def do_ngrams_in_buckets(n_value, working_directory, bucket_count):
    Path(done_file).touch()


-parser = argparse.ArgumentParser(description='Generate 13 grams from Pile.')
+parser = argparse.ArgumentParser(description="Generate 13 grams from Pile.")
 parser.add_argument("-dir", "--working_directory", default="")
 parser.add_argument("-n", "--n_value", type=int, default=13)
 parser.add_argument("-buckets", "--bucket_count", type=int, default=500)

-if __name__ == '__main__':
+if __name__ == "__main__":
    version = 1.00
    print(f"Running version {version}")


--- a/scripts/clean_training_data/investigate_pile.py
+++ b/scripts/clean_training_data/investigate_pile.py
@@ -7,6 +7,7 @@ import tqdm

 from tqdm_multiprocess import TqdmMultiProcessPool

+
 def get_file_stats(file_path, tqdm_func, global_tqdm):
    reader = Reader()
    total_documents = 0
@@ -14,7 +15,9 @@ def get_file_stats(file_path, tqdm_func, global_tqdm):
    update_frequency = 10000
    current_file_position = 0

-    with tqdm_func(total=os.path.getsize(file_path), dynamic_ncols=True, unit="byte", unit_scale=1) as progress:
+    with tqdm_func(
+        total=os.path.getsize(file_path), dynamic_ncols=True, unit="byte", unit_scale=1
+    ) as progress:
        for document in reader.read(file_path, get_meta=True):
            total_size += len(document)
            total_documents += 1
@@ -28,27 +31,37 @@ def get_file_stats(file_path, tqdm_func, global_tqdm):

    return (total_documents, total_size)

+
 def get_files():
    directory = "pile"
    files = list(sorted(glob.glob(os.path.join(directory, "*.jsonl.zst*"))))
    print(files)
    return files

+
 def get_stats():
    files = get_files()
    total_size_bytes = sum(map(lambda x: os.path.getsize(x), files))

    pool = TqdmMultiProcessPool(4)
-    global_tqdm = tqdm.tqdm(total=total_size_bytes, dynamic_ncols=True, unit="byte", unit_scale=1)
+    global_tqdm = tqdm.tqdm(
+        total=total_size_bytes, dynamic_ncols=True, unit="byte", unit_scale=1
+    )

    # Generate minhashes with pool
    tasks = [(get_file_stats, (file,)) for file in files]

-    on_done = lambda _ : None
-    on_error = lambda _ : None
+    def on_done(_):
+        return None
+
+    def on_error(_):
+        return None
+
    results = pool.map(global_tqdm, tasks, on_error, on_done)

-    total_documents, total_size = reduce(lambda x, y: (x[0]+y[0],x[1]+y[1]), results)
+    total_documents, total_size = reduce(
+        lambda x, y: (x[0] + y[0], x[1] + y[1]), results
+    )

    start_offsets = []
    current_offset = 0
@@ -58,7 +71,8 @@ def get_stats():

    return (total_documents, total_size, start_offsets)

-if __name__ == '__main__':
+
+if __name__ == "__main__":
    version = 1.01
    print(f"Running version {version}")

@@ -67,10 +81,11 @@ if __name__ == '__main__':
        stats = json.load(open(stats_file_path, "r"))
    else:
        document_count, total_document_size_chars, start_offsets = get_stats()
-        stats = {"Data": "Pile statistics",
+        stats = {
+            "Data": "Pile statistics",
            "Document Count": document_count,
            "Total Pile Characters": total_document_size_chars,
-                 "File Start Offsets": start_offsets
+            "File Start Offsets": start_offsets,
        }
        json.dump(stats, open(stats_file_path, "w"), indent=4)


--- a/scripts/clean_training_data/janitor_util.cpp
+++ b/scripts/clean_training_data/janitor_util.cpp
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
-#include <utility>
+#include <queue>
 #include <string>
-#include <vector>
 #include <tuple>
-#include <queue>
+#include <utility>
+#include <vector>

 bool is_whitespace(char ch) noexcept {
  // " \t\n\r\x0b\x0c" (python string.whitespace)
  return ch == 32 or (9 <= ch and ch <= 13);
-//    return ch <= 32; // arguably too general, but slightly faster
+  //    return ch <= 32; // arguably too general, but slightly faster
 }

 bool is_punctuation(char c) noexcept {
-    // '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'      ascii values:    33-47,  58-64,  91-96,  123-126
-    return (33 <= c and c <= 47) or (58 <= c and c <= 64) or (91 <= c and c <= 96) or (123 <= c and c <= 126);
+  // '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'      ascii values:    33-47,  58-64,
+  // 91-96,  123-126
+  return (33 <= c and c <= 47) or (58 <= c and c <= 64) or
+         (91 <= c and c <= 96) or (123 <= c and c <= 126);
 }

-// Takes a string and makes ngrams of length N, splitting grams on whitespace and ignoring ignored characters
-// Returns a LARGE array of ngrams
-std::vector<std::string> clean_ngram(
-    std::string const & input, std::string const & ignore, size_t ngram_n
-) noexcept {
+// Takes a string and makes ngrams of length N, splitting grams on whitespace
+// and ignoring ignored characters Returns a LARGE array of ngrams
+std::vector<std::string> clean_ngram(std::string const &input,
+                                     std::string const &ignore,
+                                     size_t ngram_n) noexcept {

  size_t num_grams = 0;
  std::vector<std::string> ngram_list;
@@ -29,43 +31,47 @@ std::vector<std::string> clean_ngram(
  std::string current_ngram;

  // Max gram length is set to 10 below.
-    current_ngram.reserve(11*ngram_n);
+  current_ngram.reserve(11 * ngram_n);
  gram_lengths.reserve(ngram_n);

  bool started_gram = false;
  gram_lengths.push_back(0);

-    //for (size_t i=0; i<input.length(); i++) {
+  // for (size_t i=0; i<input.length(); i++) {
  //  this is slightly faster, and we don't need the index in this one
  for (auto iter = input.begin(); iter != input.end(); iter++) {

    // If whitespace, end the current ngram and start the next
-        // alternatively, (perhaps marginally) faster: if (is_whitespace(ch)) { ... }
+    // alternatively, (perhaps marginally) faster: if (is_whitespace(ch)) { ...
+    // }
    if (is_whitespace(*iter) || gram_lengths.back() > 10) {

      // Skip all whitespace
-            while (++iter != input.end() && is_whitespace(*iter));
+      while (++iter != input.end() && is_whitespace(*iter))
+        ;
      iter--;

-            if (started_gram){
+      if (started_gram) {
        num_grams += 1;

        // Building 1grams is a special case
-                if (ngram_n == 1){
+        if (ngram_n == 1) {
          ngram_list.push_back(current_ngram);
          current_ngram = current_ngram.substr(gram_lengths.front());
          gram_lengths.back() = 0;

          // If there are enough grams to form an ngram, save
-                } else if (num_grams >= ngram_n){
+        } else if (num_grams >= ngram_n) {
          // Save the current ngram
          ngram_list.push_back(current_ngram);

-                    // Start the next ngram by dropping the first gram and its space from the ngram
+          // Start the next ngram by dropping the first gram and its space from
+          // the ngram
          current_ngram = current_ngram.substr(gram_lengths.front() + 1);
          current_ngram += ' ';

-                    // Drop the length of the first gram and prepare to record the length of the new gram
+          // Drop the length of the first gram and prepare to record the length
+          // of the new gram
          gram_lengths.erase(gram_lengths.begin());
          gram_lengths.push_back(0);

@@ -78,14 +84,15 @@ std::vector<std::string> clean_ngram(
        started_gram = false;
      }

-
      // Skip ignored characters
-        // alternatively, (perhaps marginally) faster: if (is_punctuation(ch)) continue;
+      // alternatively, (perhaps marginally) faster: if (is_punctuation(ch))
+      // continue;
    } else if (ignore.find(*iter) != std::string::npos) {
      continue;
    }

-        // If it is a non-ignored character, add it to the ngram and update the last gram's length
+    // If it is a non-ignored character, add it to the ngram and update the last
+    // gram's length
    else {
      current_ngram += tolower(*iter);
      gram_lengths.back() += 1;
@@ -96,70 +103,73 @@ std::vector<std::string> clean_ngram(
  return ngram_list;
 }

-
-// Takes a string and makes ngrams of length N, splitting grams on whitespace and ignoring ignored characters
-// Returns a LARGE array of tuples of (ngram, start_idx, end_idx)
-std::vector<std::tuple<std::string, size_t, size_t> > clean_ngram_with_indices(
-    std::string const & input, std::string const & ignore, size_t ngram_n
-) noexcept {
+// Takes a string and makes ngrams of length N, splitting grams on whitespace
+// and ignoring ignored characters Returns a LARGE array of tuples of (ngram,
+// start_idx, end_idx)
+std::vector<std::tuple<std::string, size_t, size_t>>
+clean_ngram_with_indices(std::string const &input, std::string const &ignore,
+                         size_t ngram_n) noexcept {

  size_t num_grams = 0;
-    std::vector<std::tuple<std::string, size_t, size_t> > ngram_list;
+  std::vector<std::tuple<std::string, size_t, size_t>> ngram_list;
  std::vector<uint8_t> gram_lengths;
  std::vector<size_t> gram_start_indices;
  std::string current_ngram;

  // Max gram length is set to 10 below.
-    current_ngram.reserve(11*ngram_n);
+  current_ngram.reserve(11 * ngram_n);

  bool started_gram = false;
  gram_lengths.push_back(0);
  gram_start_indices.push_back(0);

-    for (size_t i=0; i<input.length(); i++) {
+  for (size_t i = 0; i < input.length(); i++) {
    char ch = input[i];

    // If whitespace, end the current ngram and start the next
    if (is_whitespace(ch) || gram_lengths.back() > 10) {

      // Skip all whitespace
-            while (++i < input.length() && is_whitespace(input[i]));
+      while (++i < input.length() && is_whitespace(input[i]))
+        ;
      i--;

-            if (started_gram){
+      if (started_gram) {
        num_grams += 1;

        // Building 1grams is a special case
-                if (ngram_n == 1){
-                    ngram_list.push_back(std::make_tuple(current_ngram, gram_start_indices.front(), i));
+        if (ngram_n == 1) {
+          ngram_list.push_back(
+              std::make_tuple(current_ngram, gram_start_indices.front(), i));
          current_ngram = current_ngram.substr(gram_lengths.front());
          gram_lengths.back() = 0;
-                    gram_start_indices.back() = i+1;
+          gram_start_indices.back() = i + 1;

          // If there are enough grams to form an ngram, save
-                } else if (num_grams >= ngram_n){
+        } else if (num_grams >= ngram_n) {

          // Save the current ngram
          ngram_list.push_back(
-                        std::make_tuple(current_ngram, gram_start_indices.front(), i)
-                    );
+              std::make_tuple(current_ngram, gram_start_indices.front(), i));

-                    // Start the next ngram by dropping the first gram and its space from the ngram
+          // Start the next ngram by dropping the first gram and its space from
+          // the ngram
          current_ngram = current_ngram.substr(gram_lengths.front() + 1);
          current_ngram += ' ';

-                    // Drop the length of the first gram and prepare to record the length of the new gram
+          // Drop the length of the first gram and prepare to record the length
+          // of the new gram
          gram_lengths.erase(gram_lengths.begin());
          gram_lengths.push_back(0);

          gram_start_indices.erase(gram_start_indices.begin());
-                    gram_start_indices.push_back(i+1);
+          gram_start_indices.push_back(i + 1);

          // Otherwise, continue building
        } else {
          current_ngram += ' ';
          gram_lengths.push_back(0);
-                    gram_start_indices.push_back(i+1);
+          gram_start_indices.push_back(i + 1);
        }

        started_gram = false;
@@ -169,7 +179,8 @@ std::vector<std::tuple<std::string, size_t, size_t> > clean_ngram_with_indices(
    } else if (ignore.find(*iter) != std::string::npos) {
      continue;

-        // If it is a non-ignored character, add it to the ngram and update the last gram's length
+      // If it is a non-ignored character, add it to the ngram and update the
+      // last gram's length
    } else {
      current_ngram += tolower(ch);
      gram_lengths.back() += 1;
@@ -180,14 +191,18 @@ std::vector<std::tuple<std::string, size_t, size_t> > clean_ngram_with_indices(
  return ngram_list;
 }

-
 PYBIND11_MODULE(janitor_util, m) {
  m.doc() = "pybind11 example plugin"; // optional module docstring
-//    m.def("add", &add, "A function which adds two numbers");  // example function
-    m.def("clean_ngram", &clean_ngram, "Create ngrams of words, ignoring some characters");
-    m.def("clean_ngram_with_indices", &clean_ngram_with_indices, "Create ngrams of words with indices, ignoring some characters");
+  //    m.def("add", &add, "A function which adds two numbers");  // example
+  //    function
+  m.def("clean_ngram", &clean_ngram,
+        "Create ngrams of words, ignoring some characters");
+  m.def("clean_ngram_with_indices", &clean_ngram_with_indices,
+        "Create ngrams of words with indices, ignoring some characters");
 }

 // Example compile
-// c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix)
-// If python and gcc aren't linked, append to the above:    -undefined dynamic_lookup
\ No newline at end of file
+// c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes)
+// janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) If
+// python and gcc aren't linked, append to the above:    -undefined
+// dynamic_lookup
--- a/scripts/clean_training_data/process_sorted_buckets.py
+++ b/scripts/clean_training_data/process_sorted_buckets.py
@@ -27,20 +27,28 @@ from scripts.clean_training_data.archiver import TextReader, TextArchive

 import logging
 from tqdm_multiprocess.logger import setup_logger_tqdm
+
 logger = logging.getLogger(__name__)

-# Multiprocessed
-def process_bucket(bucket_file_path, processed_directory, move_dir, tqdm_func, global_tqdm):  

-    bucket_id = re.sub("\D", "", os.path.basename(bucket_file_path))
-    done_file = os.path.join(processed_directory, f"ngram_bucket_processing_{bucket_id}.done")
+# Multiprocessed
+def process_bucket(
+    bucket_file_path, processed_directory, move_dir, tqdm_func, global_tqdm
+):
+
+    bucket_id = re.sub("\D", "", os.path.basename(bucket_file_path))  # noqa: W605
+    done_file = os.path.join(
+        processed_directory, f"ngram_bucket_processing_{bucket_id}.done"
+    )
    if os.path.exists(done_file):
        logger.info(f"bucket {bucket_id} already processed, skipping")
        return

    # For managing tqdm
    file_size = os.path.getsize(bucket_file_path)
-    bucket_progress = tqdm_func(total=file_size, dynamic_ncols=True, unit="byte", unit_scale=1)
+    bucket_progress = tqdm_func(
+        total=file_size, dynamic_ncols=True, unit="byte", unit_scale=1
+    )
    current_file_position = 0
    update_frequency = 100 * 1000000  # 100mb
    update_counter = 0
@@ -56,10 +64,12 @@ def process_bucket(bucket_file_path, processed_directory, move_dir, tqdm_func, g
    for line in bucket.read():
        [ngram, document_id] = line.rsplit(" ", 1)

-        # Write ngram if more then 10 unique document occurences
+        # Write ngram if more than 10 unique document occurrences
        if ngram != current_ngram:
            if len(current_ngram_document_ids) > 10:
-                output_archive.add_data(f"{current_ngram} {len(current_ngram_document_ids)}")
+                output_archive.add_data(
+                    f"{current_ngram} {len(current_ngram_document_ids)}"
+                )
            current_ngram = ngram
            current_ngram_document_ids = set()

@@ -84,25 +94,35 @@ def process_bucket(bucket_file_path, processed_directory, move_dir, tqdm_func, g

    global_tqdm.update()

+
 def process_sorted_buckets(working_directory, move_dir, process_count):
    bucket_file_paths = glob.glob(os.path.join(working_directory, f"*.bkt.txt.sorted"))
    processed_directory = os.path.join(working_directory, "processed")
    os.makedirs(processed_directory, exist_ok=True)

    pool = TqdmMultiProcessPool(process_count)
-    tasks = [(process_bucket, (bucket_file, processed_directory, move_dir)) for bucket_file in bucket_file_paths]
+    tasks = [
+        (process_bucket, (bucket_file, processed_directory, move_dir))
+        for bucket_file in bucket_file_paths
+    ]

    global_tqdm = tqdm(total=len(bucket_file_paths), dynamic_ncols=True, unit="bucket")
-    on_done = lambda _ : None
-    on_error = lambda _ : None
+
+    def on_done(_):
+        return None
+
+    def on_error(_):
+        return None
+
    _ = pool.map(global_tqdm, tasks, on_error, on_done)

-parser = argparse.ArgumentParser(description='Process 13 grams from sorted buckets.')
+
+parser = argparse.ArgumentParser(description="Process 13 grams from sorted buckets.")
 parser.add_argument("-dir", "--working_directory", default="")
 parser.add_argument("-move", "--move_dir", default="")
 parser.add_argument("-procs", "--process_count", type=int, default=4)

-if __name__ == '__main__':
+if __name__ == "__main__":

    logfile_path = "process13grams.log"
    setup_logger_tqdm(logfile_path)

--- a/scripts/clean_training_data/sort_13_gram_buckets.py
+++ b/scripts/clean_training_data/sort_13_gram_buckets.py
@@ -19,13 +19,17 @@ from tqdm import tqdm

 import logging
 from tqdm_multiprocess.logger import setup_logger_tqdm
+
 logger = logging.getLogger(__name__)

 terminate = False
+
+
 def handler(signal_received, frame):
    global terminate
    terminate = True

+
 def sort_13_gram_buckets(working_directory):
    bucket_file_paths = glob.glob(os.path.join(working_directory, f"*.bkt.txt"))

@@ -40,10 +44,11 @@ def sort_13_gram_buckets(working_directory):

        os.remove(bucket_file_path)

-parser = argparse.ArgumentParser(description='sort 13gram buckets')
+
+parser = argparse.ArgumentParser(description="sort 13gram buckets")
 parser.add_argument("-dir", "--working_directory", default="")

-if __name__ == '__main__':
+if __name__ == "__main__":

    version = 1.00
    print(f"Running version {version}")

--- a/scripts/cost_estimate.py
+++ b/scripts/cost_estimate.py
@@ -7,7 +7,7 @@ from lm_eval.base import LM
 class DryrunLM(LM):
    def __init__(self):
        self.tokencost = 0
-        self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2')
+        self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2")
        self.tokenizer.pad_token = "<|endoftext|>"

    @classmethod
@@ -37,7 +37,7 @@ class DryrunLM(LM):
    def loglikelihood_rolling(self, requests):
        res = []

-        for s, in requests:
+        for (s,) in requests:
            # assume worst case: extra full context
            self.tokencost += len(self.tokenizer.tokenize(s)) + 2048

@@ -57,11 +57,20 @@ def main():
            num_fewshot=0,
            limit=None,
            bootstrap_iters=10,
-            description_dict=None
+            description_dict=None,
        )

        print(taskname, lm.tokencost)
-        values.append([taskname, lm.tokencost, lm.tokencost / 1000 * 0.0008, lm.tokencost / 1000 * 0.0012, lm.tokencost / 1000 * 0.006, lm.tokencost / 1000 * 0.06])
+        values.append(
+            [
+                taskname,
+                lm.tokencost,
+                lm.tokencost / 1000 * 0.0008,
+                lm.tokencost / 1000 * 0.0012,
+                lm.tokencost / 1000 * 0.006,
+                lm.tokencost / 1000 * 0.06,
+            ]
+        )
    from pytablewriter import MarkdownTableWriter

    writer = MarkdownTableWriter()
@@ -69,10 +78,21 @@ def main():

    values.sort(key=lambda x: -x[1])
    totcost = sum([x[1] for x in values])
-    values.append(["**Total**", totcost, totcost / 1000 * 0.0008, totcost / 1000 * 0.0012, totcost / 1000 * 0.006, totcost / 1000 * 0.06])
+    values.append(
+        [
+            "**Total**",
+            totcost,
+            totcost / 1000 * 0.0008,
+            totcost / 1000 * 0.0012,
+            totcost / 1000 * 0.006,
+            totcost / 1000 * 0.06,
+        ]
+    )

    writer.value_matrix = values

    print(writer.dumps())
+
+
 if __name__ == "__main__":
    main()
--- a/scripts/get_prompts.py
+++ b/scripts/get_prompts.py
@@ -3,16 +3,21 @@ from itertools import islice

 ct = 3

-for tname, Task in tasks.TASK_REGISTRY.items():#[('record', tasks.superglue.ReCoRD)]:#
+for (
+    tname,
+    Task,
+) in tasks.TASK_REGISTRY.items():  # [('record', tasks.superglue.ReCoRD)]:#
    task = Task()

-    print('#', tname)
-    docs = islice(task.validation_docs() if task.has_validation_docs() else task.test_docs(), ct)
+    print("#", tname)
+    docs = islice(
+        task.validation_docs() if task.has_validation_docs() else task.test_docs(), ct
+    )
    print()
    for i in range(ct):
        print()
        doc = next(docs)
        print("**Context**:", "\n```\n" + task.doc_to_text(doc) + "\n```\n")
        print()
-        print('**Target**:', "\n```\n" + task.doc_to_target(doc) + "\n```\n")
+        print("**Target**:", "\n```\n" + task.doc_to_target(doc) + "\n```\n")
        print()
--- a/scripts/make_gpt2_test_cases.py
+++ b/scripts/make_gpt2_test_cases.py
@@ -10,7 +10,7 @@ random.seed(42)
 data = [
    "A multilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)",
    "The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons (with threshold activation); see § Terminology",
-    "Multilayer perceptrons are sometimes colloquially referred to as \"vanilla\" neural networks, especially when they have a single hidden layer.[1]",
+    'Multilayer perceptrons are sometimes colloquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]',
    "An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear activation function.",
    "MLP utilizes a supervised learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]",
    "Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. ",
@@ -20,22 +20,28 @@ data = [
 ]


-model = transformers.GPT2LMHeadModel.from_pretrained('gpt2')
-tok = transformers.GPT2Tokenizer.from_pretrained('gpt2')
+model = transformers.GPT2LMHeadModel.from_pretrained("gpt2")
+tok = transformers.GPT2Tokenizer.from_pretrained("gpt2")

 tgs = []

 for dat in data:
    random.seed(dat)
-    #print(model(tok.encode(dat, return_tensors="pt"))[0][0])
+    # print(model(tok.encode(dat, return_tensors="pt"))[0][0])

    toks = tok.encode(dat, return_tensors="pt")
-    ind = random.randrange(len(toks[0])-1)
+    ind = random.randrange(len(toks[0]) - 1)
    logits = F.log_softmax(model(toks)[0], dim=-1)[:, :-1]  # [batch, seq, vocab]

    res = torch.gather(logits, 2, toks[:, 1:].unsqueeze(-1)).squeeze(-1)[0]

-    tgs.append( float(res[ind:].sum()))
-    print(r'("""' + tok.decode(toks[0, :ind+1]) + r'""", """' + tok.decode(toks[0, ind+1:]) + r'"""), ')
+    tgs.append(float(res[ind:].sum()))
+    print(
+        r'("""'
+        + tok.decode(toks[0, : ind + 1])
+        + r'""", """'
+        + tok.decode(toks[0, ind + 1 :])
+        + r'"""), '
+    )

 print(tgs)
--- a/scripts/make_table_tasks.py
+++ b/scripts/make_table_tasks.py
@@ -2,20 +2,29 @@ from lm_eval import tasks
 from pytablewriter import MarkdownTableWriter

 writer = MarkdownTableWriter()
-writer.headers = ["Task Name", "Train", "Val", "Test","Val/Test Docs", "Metrics"]
+writer.headers = ["Task Name", "Train", "Val", "Test", "Val/Test Docs", "Metrics"]

 values = []

+
 def chk(tf):
    if tf:
-        return '✓'
+        return "✓"
    else:
-        return ' '
+        return " "
+

 for tname, Task in tasks.TASK_REGISTRY.items():
    task = Task()

-    v = [tname,chk(task.has_training_docs()),chk(task.has_validation_docs()),chk(task.has_test_docs()), len(list(task.test_docs() if task.has_test_docs() else task.validation_docs())),', '.join(task.aggregation().keys())]
+    v = [
+        tname,
+        chk(task.has_training_docs()),
+        chk(task.has_validation_docs()),
+        chk(task.has_test_docs()),
+        len(list(task.test_docs() if task.has_test_docs() else task.validation_docs())),
+        ", ".join(task.aggregation().keys()),
+    ]
    print(v)
    values.append(v)


--- a/scripts/write_out.py
+++ b/scripts/write_out.py
@@ -11,14 +11,14 @@ EXAMPLE_DIVIDER = "!!@@##@@!! -- Example {i}\n"

 def parse_args():
    parser = argparse.ArgumentParser()
-    parser.add_argument('--output_base_path', required=True)
-    parser.add_argument('--tasks', default="all_tasks")
-    parser.add_argument('--provide_description', action="store_true")
-    parser.add_argument('--sets', type=str, default="val") # example: val,test
-    parser.add_argument('--num_fewshot', type=int, default=1)
-    parser.add_argument('--seed', type=int, default=42)
-    parser.add_argument('--num_examples', type=int, default=1)
-    parser.add_argument('--description_dict_path', default=None)
+    parser.add_argument("--output_base_path", required=True)
+    parser.add_argument("--tasks", default="all_tasks")
+    parser.add_argument("--provide_description", action="store_true")
+    parser.add_argument("--sets", type=str, default="val")  # example: val,test
+    parser.add_argument("--num_fewshot", type=int, default=1)
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--num_examples", type=int, default=1)
+    parser.add_argument("--description_dict_path", default=None)
    return parser.parse_args()


@@ -34,7 +34,7 @@ def main():

    description_dict = {}
    if args.description_dict_path:
-        with open(args.description_dict_path, 'r') as f:
+        with open(args.description_dict_path, "r") as f:
            description_dict = json.load(f)

    os.makedirs(args.output_base_path, exist_ok=True)
@@ -45,26 +45,34 @@ def main():
        iters = []

        for set in args.sets.split(","):
-            if set == 'train' and task.has_training_docs():
+            if set == "train" and task.has_training_docs():
                docs = task.training_docs()
-            if set == 'val' and task.has_validation_docs():
+            if set == "val" and task.has_validation_docs():
                docs = task.validation_docs()
-            if set == 'test' and task.has_test_docs():
+            if set == "test" and task.has_test_docs():
                docs = task.test_docs()
            iters.append(docs)

        docs = join_iters(iters)

-        description = description_dict[task_name] if description_dict and task_name in description_dict else ""
+        description = (
+            description_dict[task_name]
+            if description_dict and task_name in description_dict
+            else ""
+        )

        with open(os.path.join(args.output_base_path, task_name), "w") as f:
-            for i, doc in zip(range(args.num_examples), docs) if args.num_examples > 0 else enumerate(docs):
+            for i, doc in (
+                zip(range(args.num_examples), docs)
+                if args.num_examples > 0
+                else enumerate(docs)
+            ):
                f.write(EXAMPLE_DIVIDER.format(i=i))
                ctx = task.fewshot_context(
                    doc=doc,
                    num_fewshot=args.num_fewshot,
                    rnd=rnd,
-                    description=description
+                    description=description,
                )
                f.write(ctx + "\n")


--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,7 @@ setuptools.setup(
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
-    python_requires='>=3.6',
+    python_requires=">=3.6",
    install_requires=[
        "datasets>=2.0.0",
        "click>=7.1",
@@ -40,10 +40,10 @@ setuptools.setup(
        "openai==0.6.4",
        "jieba==0.42.1",
        "nagisa==0.2.7",
-        "bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt"
+        "bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt",
    ],
    dependency_links=[
        "https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt",
    ],
-    extras_require={'dev': [ 'pytest', 'black' ]}
+    extras_require={"dev": ["pytest", "black", "pre-commit"]},
 )
--- a/templates/new_multiple_choice_task.py
+++ b/templates/new_multiple_choice_task.py
 # TODO: Remove all TODO comments once the implementation is complete.
 """
 TODO: Add the Paper Title on this line.
-TODO: Add the paper's PDF URL (preferrably from arXiv) on this line.
+TODO: Add the paper's PDF URL (preferably from arXiv) on this line.

 TODO: Write a Short Description of the task.


--- a/templates/new_task.py
+++ b/templates/new_task.py
 # TODO: Remove all TODO comments once the implementation is complete.
 """
 TODO: Add the Paper Title on this line.
-TODO: Add the paper's PDF URL (preferrably from arXiv) on this line.
+TODO: Add the paper's PDF URL (preferably from arXiv) on this line.

 TODO: Write a Short Description of the task.

@@ -45,7 +45,7 @@ class NewTask(Task):
            if self._training_docs is None:
                # TODO: Return the training document generator from `self.dataset`.
                # If you need to process the data, `map` over the documents with
-                # the custom procesing function, `self._process_doc`. E.g.
+                # the custom processing function, `self._process_doc`. E.g.
                # `map(self._process_doc, self.dataset["validation"])`
                # In most case you can leave this as is unless the dataset split is
                # named differently than the default `"train"`.
@@ -56,7 +56,7 @@ class NewTask(Task):
        if self.has_validation_docs():
            # TODO: Return the validation document generator from `self.dataset`.
            # If you need to process the data, `map` over the documents with the
-            # custom procesing function, `self._process_doc`. E.g.
+            # custom processing function, `self._process_doc`. E.g.
            # `map(self._process_doc, self.dataset["validation"])`
            # In most case you can leave this as is unless the dataset split is
            # named differently than the default `"validation"`.

--- a/tests/test_evaluator.py
+++ b/tests/test_evaluator.py
@@ -10,20 +10,21 @@ import pytest
 # TODO: more fine grained unit tests rather than this big honking integration
 # test once we break evaluator into smaller, more manageable pieces

+
 @pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
 def test_evaluator(taskname, task_class):
    task_dict = tasks.get_task_dict([taskname])

    os.system("rm test_cache.db")
-    lm = base.CachingLM(models.get_model('dummy')(), "test_cache.db")
+    lm = base.CachingLM(models.get_model("dummy")(), "test_cache.db")

    def ll_fn(reqs):
        for ctx, cont in reqs:
            if len(ctx) == 0:
                continue
            # space convention
-            assert ctx[-1] != ' '
-            assert cont[0] == ' ' or ctx[-1] == '\n'
+            assert ctx[-1] != " "
+            assert cont[0] == " " or ctx[-1] == "\n"

        res = []

@@ -34,7 +35,7 @@ def test_evaluator(taskname, task_class):
        return res

    def ll_perp_fn(reqs):
-        for string, in reqs:
+        for (string,) in reqs:
            assert isinstance(string, str)

        res = []
@@ -54,7 +55,7 @@ def test_evaluator(taskname, task_class):
        num_fewshot=0,
        limit=limit,
        bootstrap_iters=10,
-            description_dict=None
+        description_dict=None,
    )
    e2 = evaluator.evaluate(
        lm=lm,
@@ -62,7 +63,7 @@ def test_evaluator(taskname, task_class):
        num_fewshot=0,
        limit=limit,
        bootstrap_iters=10,
-            description_dict=None
+        description_dict=None,
    )

    # check that caching is working

--- a/tests/test_generate_13_grams.py
+++ b/tests/test_generate_13_grams.py
@@ -3,13 +3,15 @@ from collections import Counter
 import shutil
 import glob

-from lm_eval.decontamination.janitor import *
+from lm_eval.decontamination.janitor import Janitor, word_ngrams
 from scripts.clean_training_data.generate_13_grams import do_ngrams_in_buckets
 from lm_eval.decontamination.archiver import Archive, TextReader

 import logging
+
 logger = logging.getLogger(__name__)

+
 def test_generate_13_grams_1(caplog):
    data = """A goose (plural geese) is a bird of any of several waterfowl species in the family Anatidae.
    This group comprises the genera Anser (the grey geese and white geese) and Branta (the black geese).
@@ -42,7 +44,7 @@ def test_generate_13_grams_1(caplog):
        pass
    os.makedirs(test_working_directory)

-    assert(not os.path.exists("pile"))
+    assert not os.path.exists("pile")
    os.makedirs("pile")
    archive = Archive(os.path.join("pile", "test.jsonl.zst"))
    archive.add_data(data)
@@ -54,7 +56,9 @@ def test_generate_13_grams_1(caplog):
    # Rebuild from buckets
    print("rebuild")
    rebuilt_ngrams = []
-    bucket_file_paths = glob.glob(os.path.join(test_working_directory, "output", f"*.bkt.txt")) 
+    bucket_file_paths = glob.glob(
+        os.path.join(test_working_directory, "output", f"*.bkt.txt")
+    )
    for bucket_file_path in bucket_file_paths:
        reader = TextReader(bucket_file_path)

@@ -67,7 +71,7 @@ def test_generate_13_grams_1(caplog):
    result_counter = Counter(rebuilt_ngrams)
    # print(len(result_counter))
    # print(len(comparison_counter))
-    assert(len(result_counter) == len(comparison_counter))
+    assert len(result_counter) == len(comparison_counter)
    # print(result_counter)
    # print(comparison_counter)
-    assert(comparison_counter == result_counter)
\ No newline at end of file
+    assert comparison_counter == result_counter
--- a/tests/test_gpt3.py
+++ b/tests/test_gpt3.py
@@ -12,40 +12,78 @@ def mock_completion(**kwargs):
    # Mock completion function
    # Loads from a cached+pickled response if it exists, otherwise it will actually try to ping
    os.makedirs("tests/testdata", exist_ok=True)
-    hash = hashlib.sha256(json.dumps(kwargs, sort_keys=True).encode('utf-8')).hexdigest()
+    hash = hashlib.sha256(
+        json.dumps(kwargs, sort_keys=True).encode("utf-8")
+    ).hexdigest()
    fname = f"tests/testdata/gpt3_test_{hash}.pkl"

    if os.path.exists(fname):
-        with open(fname, 'rb') as fh:
+        with open(fname, "rb") as fh:
            return pickle.load(fh)
    ret = openai.Completion.create(**kwargs)
    ret.api_key = ""
-    with open(fname, 'wb') as fh:
+    with open(fname, "wb") as fh:
        pickle.dump(ret, fh)
    return ret


 @mock.patch("lm_eval.models.gpt3.oa_completion", new=mock_completion)
 def test_gpt3():
-    if "OPENAI_API_SECRET_KEY" not in os.environ: os.environ["OPENAI_API_SECRET_KEY"] = ""
-    gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada")
-    (ll_dog, ig_dog), (ll_cat, ig_cat), (_, ll_max_0), (_, ll_max_1), (_, ll_max_2), *vals = gpt3.loglikelihood([
-        ('The quick brown fox jumps over the lazy', ' dog'),
-        ('The quick brown fox jumps over the lazy', ' cat'),
-        ('The quick brown fox jumps over the lazy', ', lazy dog'),
-        ('The quick brown fox jumps over the lazy', ', lazy fox'),
-        ('The quick brown fox jumps over the lazy', ', lazy fox and they both fall to the ground'),
-        
-        ("""A mult""", """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)"""),
-        ("""The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons""", """ (with threshold activation); see § Terminology"""),
-        ("""Multilayer perceptrons are sometimes coll""", """oquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]"""), 
-        ("""An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear""", """ activation function."""), 
-        ("""MLP utilizes a supervised""", """ learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]"""), 
-        ("""Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic""", """ in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. """), 
-        ("""Specifically, we train GPT-3, an autoregressive language model with 175""", """ billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general."""), 
-        ("""A mult""", """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)"""), 
+    if "OPENAI_API_SECRET_KEY" not in os.environ:
+        os.environ["OPENAI_API_SECRET_KEY"] = ""
+    gpt3 = models.get_model("gpt3").create_from_arg_string("engine=ada")
+    (
+        (ll_dog, ig_dog),
+        (ll_cat, ig_cat),
+        (_, ll_max_0),
+        (_, ll_max_1),
+        (_, ll_max_2),
+        *vals,
+    ) = gpt3.loglikelihood(
+        [
+            ("The quick brown fox jumps over the lazy", " dog"),
+            ("The quick brown fox jumps over the lazy", " cat"),
+            ("The quick brown fox jumps over the lazy", ", lazy dog"),
+            ("The quick brown fox jumps over the lazy", ", lazy fox"),
+            (
+                "The quick brown fox jumps over the lazy",
+                ", lazy fox and they both fall to the ground",
+            ),
+            (
+                """A mult""",
+                """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)""",
+            ),
+            (
+                """The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons""",
+                """ (with threshold activation); see § Terminology""",
+            ),
+            (
+                """Multilayer perceptrons are sometimes coll""",
+                """oquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]""",
+            ),
+            (
+                """An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear""",
+                """ activation function.""",
+            ),
+            (
+                """MLP utilizes a supervised""",
+                """ learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]""",
+            ),
+            (
+                """Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic""",
+                """ in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. """,
+            ),
+            (
+                """Specifically, we train GPT-3, an autoregressive language model with 175""",
+                """ billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general.""",
+            ),
+            (
+                """A mult""",
+                """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)""",
+            ),
            ("""Hello""", """ World"""),
-    ])
+        ]
+    )

    assert ll_dog > ll_cat
    assert not ig_cat
@@ -56,19 +94,26 @@ def test_gpt3():
    assert not ll_max_2

    # test empty context
-    gpt3.loglikelihood([('', 'test')])
+    gpt3.loglikelihood([("", "test")])

-    gen, = gpt3.greedy_until([
-        ('The quick brown fox jumps over the lazy', ['.', '\n'])
-    ])
+    (gen,) = gpt3.greedy_until(
+        [("The quick brown fox jumps over the lazy", [".", "\n"])]
+    )

-    assert gen == ' dog'
+    assert gen == " dog"

    print([x[0] for x in vals])

    targets = [
-        -34.848301606999996, -47.148329679999996, -45.44380149599999, -5.285246016, -133.97821690686004,
-        -321.2616693239001, -658.0299524401041, -34.848301606999996, -7.525115,
+        -34.848301606999996,
+        -47.148329679999996,
+        -45.44380149599999,
+        -5.285246016,
+        -133.97821690686004,
+        -321.2616693239001,
+        -658.0299524401041,
+        -34.848301606999996,
+        -7.525115,
    ]

    for (pred, _), tgt in zip(vals, targets):
@@ -77,17 +122,20 @@ def test_gpt3():

 @mock.patch("lm_eval.models.gpt3.oa_completion", new=mock_completion)
 def test_gpt3_perplexity():
-    if "OPENAI_API_SECRET_KEY" not in os.environ: os.environ["OPENAI_API_SECRET_KEY"] = ""
-    gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada")
+    if "OPENAI_API_SECRET_KEY" not in os.environ:
+        os.environ["OPENAI_API_SECRET_KEY"] = ""
+    gpt3 = models.get_model("gpt3").create_from_arg_string("engine=ada")
    test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss."
    perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
    tgt = -84.38819608
    assert perplexity == pytest.approx(tgt, rel=1e-3)

    # Hack: modify gpt3 to have shorter context length to induce rolling windows
-    with mock.patch.object(models.gpt3.GPT3LM, 'max_length', new_callable=mock.PropertyMock) as mock_max_length:
+    with mock.patch.object(
+        models.gpt3.GPT3LM, "max_length", new_callable=mock.PropertyMock
+    ) as mock_max_length:
        mock_max_length.return_value = 5
-        gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada")
+        gpt3 = models.get_model("gpt3").create_from_arg_string("engine=ada")
        perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
    tgt = -101.81967209999999
    assert perplexity == pytest.approx(tgt, rel=1e-3)
--- a/tests/test_janitor.py
+++ b/tests/test_janitor.py
 import re
 from collections import defaultdict

-from lm_eval.decontamination.janitor import *
+from lm_eval.decontamination.janitor import (
+    Janitor,
+    form_ngrams,
+    word_ngrams,
+    split_indices,
+    word_ngrams_indices,
+)
+

 def simple_ngram(sequence, n):
    ngrams = list()
@@ -16,8 +23,10 @@ def simple_ngram(sequence, n):


 def test_form_ngrams():
-    sequence = "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some" \
+    sequence = (
+        "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
        " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."
+    )

    n_values = [1, 2, 3, 5, 13]
    for n in n_values:
@@ -26,9 +35,12 @@ def test_form_ngrams():
        assert len(comparison) == len(result_to_test)
        assert comparison == result_to_test

+
 def test_word_ngrams():
-    sequence = "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some" \
+    sequence = (
+        "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
        " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."
+    )

    words = sequence.split()

@@ -40,9 +52,12 @@ def test_word_ngrams():
        assert len(comparison) == len(result_to_test)
        assert result_to_test == comparison

+
 def test_split_indices():
-    sequence = "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some" \
+    sequence = (
+        "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
        " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."
+    )

    comparison = []
    current_word = ""
@@ -55,17 +70,22 @@ def test_split_indices():
                current_word = ""

    if current_word:
-        comparison.append((current_word, (len(sequence) - len(current_word), len(sequence) - 1)))
+        comparison.append(
+            (current_word, (len(sequence) - len(current_word), len(sequence) - 1))
+        )
        current_word = ""

    result_to_test = list(split_indices(sequence))
    assert len(comparison) == len(result_to_test)
-    assert(comparison == result_to_test)
+    assert comparison == result_to_test
+

 def test_word_ngrams_indices():

-    sequence = "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some" \
+    sequence = (
+        "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
        " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."
+    )

    n_values = [1, 2, 3, 5, 13]

@@ -82,8 +102,9 @@ def test_word_ngrams_indices():
                tracker[ngram] = end + 1

                # ignore partial word matches
-                if (start != 0 and sequence[start - 1] != " ") or \
-                   (end != len(sequence) - 1 and sequence[end + 1] != " "):
+                if (start != 0 and sequence[start - 1] != " ") or (
+                    end != len(sequence) - 1 and sequence[end + 1] != " "
+                ):
                    pass
                else:
                    break
@@ -94,6 +115,7 @@ def test_word_ngrams_indices():
        assert len(result_to_test) == len(comparison)
        assert result_to_test == comparison

+
 # Assumptions from GPT3 Paper:
 # the 200 characters to remove include punctuation and is actually a half-window

@@ -103,28 +125,33 @@ def test_janitor1():
    # First test using a 1gram and expected the first block before the filth to have some remaining
    # characters, but the second block should be completely removed.

-    sequence = "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "FILTH. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
+    sequence = (
        "This is a @line #containing a certain number of characters, 76 to be exact. "
-
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "FILTH. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+    )

    filth = "filth"

-    expected_result = "This is a @line #containing a certain number of characters, 76 to be exact. " \
-                      "This is a @line #containing a certain number of characters, 76 to be exact. " \
-                      "This is a @line #containing a certain number of characters, 76 to be exact. " \
+    expected_result = (
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing "
+    )

-    janitor = Janitor(ngram_n=1, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200)
+    janitor = Janitor(
+        ngram_n=1, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200
+    )
    result = janitor.clean_python(sequence)
    result = "".join(result)
    assert result == sequence
@@ -136,39 +163,44 @@ def test_janitor1():
    result = "".join(result)
    assert result == expected_result

+
 def test_janitor2():

    # Second test using a 1gram and expected the first block before the filth to have some remaining
    # characters, and the second block is longer then 200 characters so should also have some remaining.

-    sequence = "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "FILTH. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
+    sequence = (
        "This is a @line #containing a certain number of characters, 76 to be exact. "
-
-
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "FILTH. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+    )

    filth = "filth"

-    expected_result = "This is a @line #containing a certain number of characters, 76 to be exact. " \
-                      "This is a @line #containing a certain number of characters, 76 to be exact. " \
-                      "This is a @line #containing a certain number of characters, 76 to be exact. " \
-                      "This is a @line #containing " \
-                      " characters, 76 to be exact. " \
-                      "This is a @line #containing a certain number of characters, 76 to be exact. " \
-                      "This is a @line #containing a certain number of characters, 76 to be exact. " \
+    expected_result = (
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing "
+        " characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+    )

-    janitor = Janitor(ngram_n=1, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200)
+    janitor = Janitor(
+        ngram_n=1, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200
+    )
    result = janitor.clean_python(sequence)
    result = "".join(result)
    assert result == sequence
@@ -180,37 +212,43 @@ def test_janitor2():
    result = "".join(result)
    assert result == expected_result

+
 def test_janitor3():

    # Same test as above but with a 6gram.

-    sequence = "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "FILTH. lots of dirty filtHy FIlTh " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
+    sequence = (
        "This is a @line #containing a certain number of characters, 76 to be exact. "
-
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "FILTH. lots of dirty filtHy FIlTh "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+    )

    filth = "filth lots of dirty filthy filth"

-    expected_result = "This is a @line #containing a certain number of characters, 76 to be exact. " \
-                      "This is a @line #containing a certain number of characters, 76 to be exact. " \
-                      "This is a @line #containing a certain number of characters, 76 to be exact. " \
-                      "This is a @line #containing " \
-                      " characters, 76 to be exact. " \
-                      "This is a @line #containing a certain number of characters, 76 to be exact. " \
-                      "This is a @line #containing a certain number of characters, 76 to be exact. " \
+    expected_result = (
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing "
+        " characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+    )

-    janitor = Janitor(ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200)
+    janitor = Janitor(
+        ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200
+    )
    result = janitor.clean_python(sequence)
    result = "".join(result)
    assert result == sequence
@@ -222,45 +260,51 @@ def test_janitor3():
    result = "".join(result)
    assert result == expected_result

+
 def test_janitor4():

    # This test adds another block to that from the previous. The middle block should be entirely
    # removed as the 200 characters are removed from each side.

-    sequence = "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "FILTH. lots of dirty filtHy FIlTh " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "FILTH. lots of dirty filtHy FIlTh " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
+    sequence = (
        "This is a @line #containing a certain number of characters, 76 to be exact. "
-
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "FILTH. lots of dirty filtHy FIlTh "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "FILTH. lots of dirty filtHy FIlTh "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+    )

    filth = "filth lots of dirty filthy filth"

-    expected_result = "This is a @line #containing a certain number of characters, 76 to be exact. " \
-                      "This is a @line #containing a certain number of characters, 76 to be exact. " \
-                      "This is a @line #containing a certain number of characters, 76 to be exact. " \
-                      "This is a @line #containing " \
-                      " characters, 76 to be exact. " \
-                      "This is a @line #containing a certain number of characters, 76 to be exact. " \
-                      "This is a @line #containing a certain number of characters, 76 to be exact. " \
+    expected_result = (
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing "
+        " characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+    )

-    janitor = Janitor(ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200)
+    janitor = Janitor(
+        ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200
+    )
    result = janitor.clean_python(sequence)
    result = "".join(result)
    assert result == sequence
@@ -272,44 +316,50 @@ def test_janitor4():
    result = "".join(result)
    assert result == expected_result

+
 def test_janitor5():

    # Same as above but using multiple different filth 6grams.

-    sequence = "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "FILTH. lots of dirty filtHy FIlTh " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "FILTH. lots of filtHy dirty FIlTh " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
+    sequence = (
        "This is a @line #containing a certain number of characters, 76 to be exact. "
-
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "FILTH. lots of dirty filtHy FIlTh "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "FILTH. lots of filtHy dirty FIlTh "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+    )

    filths = ["filth lots of dirty filthy filth", "filth lots of filthy dirty filth"]

-    expected_result = "This is a @line #containing a certain number of characters, 76 to be exact. " \
-                      "This is a @line #containing a certain number of characters, 76 to be exact. " \
-                      "This is a @line #containing a certain number of characters, 76 to be exact. " \
-                      "This is a @line #containing " \
-                      " characters, 76 to be exact. " \
-                      "This is a @line #containing a certain number of characters, 76 to be exact. " \
-                      "This is a @line #containing a certain number of characters, 76 to be exact. " \
+    expected_result = (
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing "
+        " characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
+    )

-    janitor = Janitor(ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200)
+    janitor = Janitor(
+        ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200
+    )
    result = janitor.clean_python(sequence)
    result = "".join(result)
    assert result == sequence
@@ -322,52 +372,58 @@ def test_janitor5():
    result = "".join(result)
    assert result == expected_result

+
 def test_janitor6():

    # Same as above but now we add 10 filths and expect the same result, the following test does 11.

-    sequence = "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "FILTH. lots of dirty filtHy FIlTh " \
-               "FILTH. lots of dirty filtHy FIlTh " \
-               "FILTH. lots of dirty filtHy FIlTh " \
-               "FILTH. lots of dirty filtHy FIlTh " \
-               "FILTH. lots of dirty filtHy FIlTh " \
-               "FILTH. lots of dirty filtHy FIlTh " \
-               "FILTH. lots of dirty filtHy FIlTh " \
-               "FILTH. lots of dirty filtHy FIlTh " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "FILTH. lots of filtHy dirty FIlTh " \
-               "FILTH. lots of filtHy dirty FIlTh " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
+    sequence = (
        "This is a @line #containing a certain number of characters, 76 to be exact. "
-
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "FILTH. lots of dirty filtHy FIlTh "
+        "FILTH. lots of dirty filtHy FIlTh "
+        "FILTH. lots of dirty filtHy FIlTh "
+        "FILTH. lots of dirty filtHy FIlTh "
+        "FILTH. lots of dirty filtHy FIlTh "
+        "FILTH. lots of dirty filtHy FIlTh "
+        "FILTH. lots of dirty filtHy FIlTh "
+        "FILTH. lots of dirty filtHy FIlTh "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "FILTH. lots of filtHy dirty FIlTh "
+        "FILTH. lots of filtHy dirty FIlTh "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+    )

    filths = ["filth lots of dirty filthy filth", "filth lots of filthy dirty filth"]

-    expected_result = "This is a @line #containing a certain number of characters, 76 to be exact. " \
-                      "This is a @line #containing a certain number of characters, 76 to be exact. " \
-                      "This is a @line #containing a certain number of characters, 76 to be exact. " \
-                      "This is a @line #containing " \
-                      " characters, 76 to be exact. " \
-                      "This is a @line #containing a certain number of characters, 76 to be exact. " \
-                      "This is a @line #containing a certain number of characters, 76 to be exact. " \
+    expected_result = (
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing "
+        " characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+    )

-    janitor = Janitor(ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200)
+    janitor = Janitor(
+        ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200
+    )
    result = janitor.clean_python(sequence)
    result = "".join(result)
    assert result == sequence
@@ -380,46 +436,50 @@ def test_janitor6():
    result = "".join(result)
    assert result == expected_result

+
 def test_janitor7():

    # Same as above but now we add 11 filths and expect an empty result.

-    sequence = "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "FILTH. lots of dirty filtHy FIlTh " \
-               "FILTH. lots of dirty filtHy FIlTh " \
-               "FILTH. lots of dirty filtHy FIlTh " \
-               "FILTH. lots of dirty filtHy FIlTh " \
-               "FILTH. lots of dirty filtHy FIlTh " \
-               "FILTH. lots of dirty filtHy FIlTh " \
-               "FILTH. lots of dirty filtHy FIlTh " \
-               "FILTH. lots of dirty filtHy FIlTh " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "FILTH. lots of filtHy dirty FIlTh " \
-               "FILTH. lots of filtHy dirty FIlTh " \
-               "FILTH. lots of filtHy dirty FIlTh " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
-               "This is a @line #containing a certain number of characters, 76 to be exact. " \
+    sequence = (
        "This is a @line #containing a certain number of characters, 76 to be exact. "
-
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "FILTH. lots of dirty filtHy FIlTh "
+        "FILTH. lots of dirty filtHy FIlTh "
+        "FILTH. lots of dirty filtHy FIlTh "
+        "FILTH. lots of dirty filtHy FIlTh "
+        "FILTH. lots of dirty filtHy FIlTh "
+        "FILTH. lots of dirty filtHy FIlTh "
+        "FILTH. lots of dirty filtHy FIlTh "
+        "FILTH. lots of dirty filtHy FIlTh "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "FILTH. lots of filtHy dirty FIlTh "
+        "FILTH. lots of filtHy dirty FIlTh "
+        "FILTH. lots of filtHy dirty FIlTh "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+        "This is a @line #containing a certain number of characters, 76 to be exact. "
+    )

    filths = ["filth lots of dirty filthy filth", "filth lots of filthy dirty filth"]

    expected_result = ""

-    janitor = Janitor(ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200)
+    janitor = Janitor(
+        ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200
+    )
    result = janitor.clean_python(sequence)
    result = "".join(result)
    assert result == sequence
@@ -453,23 +513,3 @@ def test_janitor8():
    # cleaned = " ".join(jan.clean(source))
    # for contam in jan.dirt_ngrams:
    #     assert contam not in cleaned, contam
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -4,24 +4,59 @@ import lm_eval.models as models


 def test_gpt2():
-    gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")
-    (ll_dog, ig_dog), (ll_cat, ig_cat), (_, ll_max_0), (_, ll_max_1), (_, ll_max_2), *vals = gpt2.loglikelihood([
-        ('The quick brown fox jumps over the lazy', ' dog'),
-        ('The quick brown fox jumps over the lazy', ' cat'),
-        ('The quick brown fox jumps over the lazy', ', lazy dog'),
-        ('The quick brown fox jumps over the lazy', ', lazy fox'),
-        ('The quick brown fox jumps over the lazy', ', lazy fox and they both fall to the ground'),
-        
-        ("""A mult""", """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)"""), 
-        ("""The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons""", """ (with threshold activation); see § Terminology"""), 
-        ("""Multilayer perceptrons are sometimes coll""", """oquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]"""), 
-        ("""An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear""", """ activation function."""), 
-        ("""MLP utilizes a supervised""", """ learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]"""), 
-        ("""Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic""", """ in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. """), 
-        ("""Specifically, we train GPT-3, an autoregressive language model with 175""", """ billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general."""), 
-        ("""A mult""", """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)"""), 
+    gpt2 = models.get_model("gpt2").create_from_arg_string("device=cpu")
+    (
+        (ll_dog, ig_dog),
+        (ll_cat, ig_cat),
+        (_, ll_max_0),
+        (_, ll_max_1),
+        (_, ll_max_2),
+        *vals,
+    ) = gpt2.loglikelihood(
+        [
+            ("The quick brown fox jumps over the lazy", " dog"),
+            ("The quick brown fox jumps over the lazy", " cat"),
+            ("The quick brown fox jumps over the lazy", ", lazy dog"),
+            ("The quick brown fox jumps over the lazy", ", lazy fox"),
+            (
+                "The quick brown fox jumps over the lazy",
+                ", lazy fox and they both fall to the ground",
+            ),
+            (
+                """A mult""",
+                """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)""",
+            ),
+            (
+                """The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons""",
+                """ (with threshold activation); see § Terminology""",
+            ),
+            (
+                """Multilayer perceptrons are sometimes coll""",
+                """oquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]""",
+            ),
+            (
+                """An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear""",
+                """ activation function.""",
+            ),
+            (
+                """MLP utilizes a supervised""",
+                """ learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]""",
+            ),
+            (
+                """Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic""",
+                """ in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. """,
+            ),
+            (
+                """Specifically, we train GPT-3, an autoregressive language model with 175""",
+                """ billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general.""",
+            ),
+            (
+                """A mult""",
+                """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)""",
+            ),
            ("""Hello""", """ World"""),
-    ])
+        ]
+    )

    assert ll_dog > ll_cat
    assert not ig_cat
@@ -31,17 +66,24 @@ def test_gpt2():
    assert ll_max_2

    # test empty context
-    gpt2.loglikelihood([('', 'test')])
+    gpt2.loglikelihood([("", "test")])

-    gen, = gpt2.greedy_until([
-        ('The quick brown fox jumps over the lazy', ['.', '\n'])
-    ])
+    (gen,) = gpt2.greedy_until(
+        [("The quick brown fox jumps over the lazy", [".", "\n"])]
+    )

-    assert gen == ', lazy fox and they both fall to the ground'
+    assert gen == ", lazy fox and they both fall to the ground"

    targets = [
-        -61.60536193847656, -56.57843780517578, -62.131004333496094, -9.799489974975586, -153.96334838867188,
-        -341.222900390625, -731.1475830078125, -61.60536193847656, -8.682319641113281
+        -61.60536193847656,
+        -56.57843780517578,
+        -62.131004333496094,
+        -9.799489974975586,
+        -153.96334838867188,
+        -341.222900390625,
+        -731.1475830078125,
+        -61.60536193847656,
+        -8.682319641113281,
    ]

    for (pred, _), tgt in zip(vals, targets):
@@ -49,21 +91,57 @@ def test_gpt2():


 def test_gpt2_perplexity():
-    gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")
+    gpt2 = models.get_model("gpt2").create_from_arg_string("device=cpu")
    test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss."
    perplexity = gpt2.loglikelihood_rolling([(test_string,)])[0]
-    tgt = sum([
-        -4.9599953, -8.069298, -8.308624, -10.178513, -8.906924, -1.9318912, -7.745445, -7.146077, -5.2072,
-        -3.5882986, -1.9957212, -8.044922, -0.20841774, -5.1096807, -0.099879116, -8.888423, -4.6180487,
-    ])
+    tgt = sum(
+        [
+            -4.9599953,
+            -8.069298,
+            -8.308624,
+            -10.178513,
+            -8.906924,
+            -1.9318912,
+            -7.745445,
+            -7.146077,
+            -5.2072,
+            -3.5882986,
+            -1.9957212,
+            -8.044922,
+            -0.20841774,
+            -5.1096807,
+            -0.099879116,
+            -8.888423,
+            -4.6180487,
+        ]
+    )
    assert perplexity == pytest.approx(tgt, rel=1e-3)

-    with mock.patch.object(models.gpt2.HFLM, 'max_length', new_callable=mock.PropertyMock) as mock_max_length:
+    with mock.patch.object(
+        models.gpt2.HFLM, "max_length", new_callable=mock.PropertyMock
+    ) as mock_max_length:
        mock_max_length.return_value = 5
-        gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")
+        gpt2 = models.get_model("gpt2").create_from_arg_string("device=cpu")
        perplexity = gpt2.loglikelihood_rolling([(test_string,)])[0]
-    tgt = sum([
-        -4.96001, -8.069275, -8.308612, -10.178482, -8.90691, -4.037338, -8.09261, -11.662385, -10.206891,
-        -4.425003, -2.2563353, -7.909143, -1.9304147, -7.3610134, -2.3120654, -7.3229, -2.1643813,
-    ])
+    tgt = sum(
+        [
+            -4.96001,
+            -8.069275,
+            -8.308612,
+            -10.178482,
+            -8.90691,
+            -4.037338,
+            -8.09261,
+            -11.662385,
+            -10.206891,
+            -4.425003,
+            -2.2563353,
+            -7.909143,
+            -1.9304147,
+            -7.3610134,
+            -2.3120654,
+            -7.3229,
+            -2.1643813,
+        ]
+    )
    assert perplexity == pytest.approx(tgt, rel=1e-3)