Commit 8c997e53 authored by jon-tow

Revert `tests/testdata` changes and address flake8 issues

parent d95a4333
+# Ignore test linting to avoid conflicting changes to version stability.
+exclude: ^tests/testdata/
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
   rev: v4.1.0
...
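For context on the new `exclude` key: pre-commit treats it as a Python regular expression matched against each file's repo-relative path, so every hook skips anything under tests/testdata/. A rough sketch of that matching rule (the paths below are made up; this is not pre-commit's actual code):

import re

# Top-level `exclude` pattern from the config above.
EXCLUDE = re.compile(r"^tests/testdata/")

# Hypothetical repo-relative paths, purely for illustration.
for path in ["tests/testdata/some_task-v0-res.json", "lm_eval/tasks/lambada.py"]:
    print(path, "-> skipped" if EXCLUDE.search(path) else "-> checked by hooks")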
@@ -9,6 +9,7 @@ import collections
 from .janitor import Janitor, word_ngrams
 from .archiver import ZStdTextReader

+
 # Was used for testing the evaluator decoupled from the full logic below
 def get_train_overlap_stub(docs, ngrams_path, ngrams_n_size):
     simulated_overlap = 0.1
...
@@ -11,7 +11,7 @@ try:
     import janitor_util

     JANITOR_CPP = True
-except Exception as e:
+except Exception:
     print("WARNING: C++ module could not be loaded. Janitor running in python mode")
     traceback.print_exc()
     JANITOR_CPP = False
...
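The `as e` binding removed above was never read, which flake8 reports as an unused variable (F841); the fallback behavior is unchanged. A small self-contained sketch of the same optional-extension pattern, with hypothetical module and function names:

import traceback

try:
    import fast_janitor_ext  # hypothetical compiled extension, not a real package
    HAS_CPP = True
except Exception:  # no unused `as e` binding; traceback still shows the details
    print("WARNING: C++ module could not be loaded. Falling back to pure Python.")
    traceback.print_exc()
    HAS_CPP = False


def ngram_count(text, n=13):
    # Dispatch to the compiled path when available, otherwise a simple fallback.
    if HAS_CPP:
        return fast_janitor_ext.ngram_count(text, n)  # hypothetical API
    tokens = text.split()
    return max(len(tokens) - n + 1, 0)


print(ngram_count("a b c d e", n=3))  # 3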
@@ -22,14 +22,12 @@ from . import naturalqs
 from . import sat
 from . import arithmetic
 from . import lambada
-from . import race
 from . import piqa
 from . import prost
 from . import mc_taco
 from . import triviaqa
 from . import pubmedqa
 from . import sciq
-from . import webqs
 from . import qasper
 from . import qa4mre
 from . import translation
@@ -294,7 +292,7 @@ ALL_TASKS = sorted(list(TASK_REGISTRY))
 def get_task(task_name):
     try:
         return TASK_REGISTRY[task_name]
-    except KeyError as e:
+    except KeyError:
         print("Available tasks:")
         pprint(TASK_REGISTRY)
         raise KeyError(f"Missing task {task_name}")
...
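The `get_task` change above is the same unused-binding cleanup. As a minimal sketch of the lookup pattern itself (a plain dict registry with a made-up entry; the real TASK_REGISTRY maps task names to Task classes):

from pprint import pprint

TASK_REGISTRY = {"demo_task": object}  # hypothetical registry contents


def get_task(task_name):
    try:
        return TASK_REGISTRY[task_name]
    except KeyError:
        # Print the known tasks before re-raising with a clearer message.
        print("Available tasks:")
        pprint(TASK_REGISTRY)
        raise KeyError(f"Missing task {task_name}")


print(get_task("demo_task"))      # <class 'object'>
# get_task("nope")                # would print the registry and raise KeyError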
@@ -28,7 +28,7 @@ _CITATION = """
     eprint = {https://doi.org/10.1162/tacl_a_00321},
     abstract = { We introduce The Benchmark of Linguistic Minimal Pairs (BLiMP),1 a challenge set for evaluating the linguistic knowledge of language models (LMs) on major grammatical phenomena in English. BLiMP consists of 67 individual datasets, each containing 1,000 minimal pairs—that is, pairs of minimally different sentences that contrast in grammatical acceptability and isolate specific phenomenon in syntax, morphology, or semantics. We generate the data according to linguist-crafted grammar templates, and human aggregate agreement with the labels is 96.4\%. We evaluate n-gram, LSTM, and Transformer (GPT-2 and Transformer-XL) LMs by observing whether they assign a higher probability to the acceptable sentence in each minimal pair. We find that state-of-the-art models identify morphological contrasts related to agreement reliably, but they struggle with some subtle semantic and syntactic phenomena, such as negative polarity items and extraction islands. }
 }
-"""
+"""  # noqa: W605


 class BlimpTask(Task):
...
@@ -98,7 +98,7 @@ class Math(Task):
             if verbose:
                 print(ss1, ss2)
             return ss1 == ss2
-        except:
+        except Exception:
             return str1 == str2

     def remove_boxed(self, s):
@@ -246,7 +246,7 @@ class Math(Task):
         # remove percentage
         string = string.replace("\\%", "")
-        string = string.replace("\%", "")
+        string = string.replace("\%", "")  # noqa: W605

         # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
         string = string.replace(" .", " 0.")
...
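The `# noqa: W605` comments added throughout this commit all silence the same warning: sequences like `\%` and `\D` are not recognized Python escapes, so the backslash is kept literally but flake8 flags the string. Raw strings express the same characters without the warning; a quick sketch:

# "\%" is two characters (backslash, percent); flake8 only warns that the
# escape is not a defined one. Raw and doubled-backslash spellings are identical.
assert "\\%" == r"\%"

latex = "agreement is 96.4\\%"
print(latex.replace(r"\%", "%"))  # -> agreement is 96.4%

import re
# Same story for the "\D" regex later in this commit: r"\D" avoids W605 entirely.
print(re.sub(r"\D", "", "ngram_bucket_07.bkt.txt"))  # -> 07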
@@ -71,7 +71,7 @@ class NaturalQs(Task):
     def doc_to_target(self, doc):
         # There's a short answer and a long answer. Based on the paper, I'm using the long answer.
-        short_answer = doc["annotations"]["short_answers"][0]["text"]
+        # short_answer = doc["annotations"]["short_answers"][0]["text"]
         long_answer_start = doc["annotations"]["long_answer"][0]["start_token"]
         long_answer_end = doc["annotations"]["long_answer"][0]["end_token"]
         long_answer_span = doc["document"]["tokens"]["token"][
...
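For orientation, the surviving lines above build the target from the long answer's token span. A toy sketch of that construction (field names follow the visible code; the final join step is assumed, since the slice expression is cut off in this view):

# Made-up miniature NaturalQuestions-style document.
doc = {
    "annotations": {"long_answer": [{"start_token": 1, "end_token": 4}]},
    "document": {"tokens": {"token": ["<P>", "Paris", "is", "lovely", "</P>"]}},
}

start = doc["annotations"]["long_answer"][0]["start_token"]
end = doc["annotations"]["long_answer"][0]["end_token"]
span = doc["document"]["tokens"]["token"][start:end]
print(" ".join(span))  # -> Paris is lovely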
@@ -23,7 +23,7 @@ _CITATION = """
     booktitle={CLEF},
     year={2013}
 }
-"""
+"""  # noqa: W605


 class QA4MRE(MultipleChoiceTask):
...
@@ -390,6 +390,7 @@ class TruthfulQAGeneration(Task):
         rouge_types = ["rouge1", "rouge2", "rougeLsum"]
         scorer = rouge_scorer.RougeScorer(rouge_types)
+
         # Add newlines between sentences to correctly compute `rougeLsum`.
         def _prepare_summary(summary):
             summary = summary.replace(" . ", ".\n")
             return summary
...
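The blank line added above only fixes spacing; the nested helper itself exists because `rougeLsum` treats newlines as sentence boundaries, as the original comment notes. A quick sketch of what the replacement does to a generated answer:

def _prepare_summary(summary):
    # Turn " . " sentence separators into ".\n" so rougeLsum sees sentence splits.
    return summary.replace(" . ", ".\n")

print(_prepare_summary("The sky is blue . Water is wet . "))
# The sky is blue.
# Water is wet.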
@@ -42,8 +42,12 @@ def compress_and_move(working_directory, output_directory, process_count):
         tasks.append(task)

     pool = TqdmMultiProcessPool(process_count)
-    on_done = lambda _: None
-    on_error = lambda _: None
+
+    def on_done(_):
+        return None
+
+    def on_error(_):
+        return None

     global_progress = tqdm(
         total=len(bucket_file_paths), dynamic_ncols=True, unit="file"
...
@@ -51,8 +51,12 @@ def get_stats():
     # Generate minhashes with pool
     tasks = [(get_file_stats, (file,)) for file in files]
-    on_done = lambda _: None
-    on_error = lambda _: None
+
+    def on_done(_):
+        return None
+
+    def on_error(_):
+        return None

     results = pool.map(global_tqdm, tasks, on_error, on_done)
     total_documents, total_size = reduce(
...
@@ -30,12 +30,13 @@ from tqdm_multiprocess.logger import setup_logger_tqdm
 logger = logging.getLogger(__name__)

+
 # Multiprocessed
 def process_bucket(
     bucket_file_path, processed_directory, move_dir, tqdm_func, global_tqdm
 ):
-    bucket_id = re.sub("\D", "", os.path.basename(bucket_file_path))
+    bucket_id = re.sub("\D", "", os.path.basename(bucket_file_path))  # noqa: W605
     done_file = os.path.join(
         processed_directory, f"ngram_bucket_processing_{bucket_id}.done"
     )
@@ -106,8 +107,13 @@ def process_sorted_buckets(working_directory, move_dir, process_count):
     ]
     global_tqdm = tqdm(total=len(bucket_file_paths), dynamic_ncols=True, unit="bucket")
-    on_done = lambda _: None
-    on_error = lambda _: None
+
+    def on_done(_):
+        return None
+
+    def on_error(_):
+        return None
+
     _ = pool.map(global_tqdm, tasks, on_error, on_done)
...
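The three on_done/on_error rewrites in this commit all address flake8's E731 ("do not assign a lambda expression, use a def"); the callbacks stay no-ops. A self-contained sketch with a stand-in task runner (hypothetical, not TqdmMultiProcessPool's real implementation) that accepts the same style of callbacks:

def run_tasks(tasks, on_error, on_done):
    # Stand-in for a pool: run each zero-argument task, routing outcomes
    # to the supplied callbacks.
    results = []
    for task in tasks:
        try:
            results.append(task())
        except Exception as exc:
            on_error(exc)
        else:
            on_done(task)
    return results


# E731-friendly: named no-op callbacks instead of `on_done = lambda _: None`.
def on_done(_):
    return None


def on_error(_):
    return None


print(run_tasks([lambda: 1, lambda: 2], on_error, on_done))  # [1, 2]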
@@ -3,7 +3,7 @@ from collections import Counter
 import shutil
 import glob

-from lm_eval.decontamination.janitor import *
+from lm_eval.decontamination.janitor import Janitor, word_ngrams
 from scripts.clean_training_data.generate_13_grams import do_ngrams_in_buckets
 from lm_eval.decontamination.archiver import Archive, TextReader
...
 import re
 from collections import defaultdict

-from lm_eval.decontamination.janitor import *
+from lm_eval.decontamination.janitor import (
+    Janitor,
+    form_ngrams,
+    word_ngrams,
+    split_indices,
+    word_ngrams_indices,
+)


 def simple_ngram(sequence, n):
...
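The last two hunks replace wildcard imports in the tests, which flake8 flags (F403, and F405 for every name whose origin it then cannot verify). Explicit, optionally parenthesized imports keep the provenance of each name visible; a generic sketch using a standard-library module as a stand-in:

# from collections import *          # F403: unable to detect undefined names
from collections import (  # explicit names; one per line scales to long lists
    Counter,
    defaultdict,
)

counts = Counter("decontamination")
buckets = defaultdict(list)
for letter, n in counts.items():
    buckets[n].append(letter)
print(dict(buckets))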