Commit 8c997e53 authored by jon-tow

Revert `tests/testdata` changes and address flake8 issues

parent d95a4333
+# Ignore test linting to avoid conflicting changes to version stability.
+exclude: ^tests/testdata/
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
   rev: v4.1.0
...
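For context on the new `exclude` key: pre-commit treats it as a Python regular expression matched against each file's repo-relative path, so every hook skips anything under tests/testdata/. A rough sketch of that matching rule (the paths below are made up; this is not pre-commit's actual code):

import re

# Top-level `exclude` pattern from the config above.
EXCLUDE = re.compile(r"^tests/testdata/")

# Hypothetical repo-relative paths, purely for illustration.
for path in ["tests/testdata/some_task-v0-res.json", "lm_eval/tasks/lambada.py"]:
    print(path, "-> skipped" if EXCLUDE.search(path) else "-> checked by hooks")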
@@ -9,6 +9,7 @@ import collections
 from .janitor import Janitor, word_ngrams
 from .archiver import ZStdTextReader

+
 # Was used for testing the evaluator decoupled from the full logic below
 def get_train_overlap_stub(docs, ngrams_path, ngrams_n_size):
     simulated_overlap = 0.1
...
@@ -11,7 +11,7 @@ try:
     import janitor_util

     JANITOR_CPP = True
-except Exception as e:
+except Exception:
     print("WARNING: C++ module could not be loaded. Janitor running in python mode")
     traceback.print_exc()
     JANITOR_CPP = False
...
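The `as e` binding removed above was never read, which flake8 reports as an unused variable (F841); the fallback behavior is unchanged. A small self-contained sketch of the same optional-extension pattern, with hypothetical module and function names:

import traceback

try:
    import fast_janitor_ext  # hypothetical compiled extension, not a real package
    HAS_CPP = True
except Exception:  # no unused `as e` binding; traceback still shows the details
    print("WARNING: C++ module could not be loaded. Falling back to pure Python.")
    traceback.print_exc()
    HAS_CPP = False


def ngram_count(text, n=13):
    # Dispatch to the compiled path when available, otherwise a simple fallback.
    if HAS_CPP:
        return fast_janitor_ext.ngram_count(text, n)  # hypothetical API
    tokens = text.split()
    return max(len(tokens) - n + 1, 0)


print(ngram_count("a b c d e", n=3))  # 3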
@@ -22,14 +22,12 @@ from . import naturalqs
 from . import sat
 from . import arithmetic
 from . import lambada
-from . import race
 from . import piqa
 from . import prost
 from . import mc_taco
 from . import triviaqa
 from . import pubmedqa
 from . import sciq
-from . import webqs
 from . import qasper
 from . import qa4mre
 from . import translation
@@ -294,7 +292,7 @@ ALL_TASKS = sorted(list(TASK_REGISTRY))
 def get_task(task_name):
     try:
         return TASK_REGISTRY[task_name]
-    except KeyError as e:
+    except KeyError:
         print("Available tasks:")
         pprint(TASK_REGISTRY)
         raise KeyError(f"Missing task {task_name}")
...
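The `get_task` change above is the same unused-binding cleanup. As a minimal sketch of the lookup pattern itself (a plain dict registry with a made-up entry; the real TASK_REGISTRY maps task names to Task classes):

from pprint import pprint

TASK_REGISTRY = {"demo_task": object}  # hypothetical registry contents


def get_task(task_name):
    try:
        return TASK_REGISTRY[task_name]
    except KeyError:
        # Print the known tasks before re-raising with a clearer message.
        print("Available tasks:")
        pprint(TASK_REGISTRY)
        raise KeyError(f"Missing task {task_name}")


print(get_task("demo_task"))      # <class 'object'>
# get_task("nope")                # would print the registry and raise KeyError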
@@ -28,7 +28,7 @@ _CITATION = """
     eprint = {https://doi.org/10.1162/tacl_a_00321},
     abstract = { We introduce The Benchmark of Linguistic Minimal Pairs (BLiMP),1 a challenge set for evaluating the linguistic knowledge of language models (LMs) on major grammatical phenomena in English. BLiMP consists of 67 individual datasets, each containing 1,000 minimal pairs—that is, pairs of minimally different sentences that contrast in grammatical acceptability and isolate specific phenomenon in syntax, morphology, or semantics. We generate the data according to linguist-crafted grammar templates, and human aggregate agreement with the labels is 96.4\%. We evaluate n-gram, LSTM, and Transformer (GPT-2 and Transformer-XL) LMs by observing whether they assign a higher probability to the acceptable sentence in each minimal pair. We find that state-of-the-art models identify morphological contrasts related to agreement reliably, but they struggle with some subtle semantic and syntactic phenomena, such as negative polarity items and extraction islands. }
 }
-"""
+"""  # noqa: W605


 class BlimpTask(Task):
...
@@ -98,7 +98,7 @@ class Math(Task):
             if verbose:
                 print(ss1, ss2)
             return ss1 == ss2
-        except:
+        except Exception:
             return str1 == str2

     def remove_boxed(self, s):
@@ -246,7 +246,7 @@ class Math(Task):
         # remove percentage
         string = string.replace("\\%", "")
-        string = string.replace("\%", "")
+        string = string.replace("\%", "")  # noqa: W605

         # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
         string = string.replace(" .", " 0.")
...
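The `# noqa: W605` comments added throughout this commit all silence the same warning: sequences like `\%` and `\D` are not recognized Python escapes, so the backslash is kept literally but flake8 flags the string. Raw strings express the same characters without the warning; a quick sketch:

# "\%" is two characters (backslash, percent); flake8 only warns that the
# escape is not a defined one. Raw and doubled-backslash spellings are identical.
assert "\\%" == r"\%"

latex = "agreement is 96.4\\%"
print(latex.replace(r"\%", "%"))  # -> agreement is 96.4%

import re
# Same story for the "\D" regex later in this commit: r"\D" avoids W605 entirely.
print(re.sub(r"\D", "", "ngram_bucket_07.bkt.txt"))  # -> 07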
@@ -71,7 +71,7 @@ class NaturalQs(Task):
     def doc_to_target(self, doc):
         # There's a short answer and a long answer. Based on the paper, I'm using the long answer.
-        short_answer = doc["annotations"]["short_answers"][0]["text"]
+        # short_answer = doc["annotations"]["short_answers"][0]["text"]
         long_answer_start = doc["annotations"]["long_answer"][0]["start_token"]
         long_answer_end = doc["annotations"]["long_answer"][0]["end_token"]
         long_answer_span = doc["document"]["tokens"]["token"][
...
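For orientation, the surviving lines above build the target from the long answer's token span. A toy sketch of that construction (field names follow the visible code; the final join step is assumed, since the slice expression is cut off in this view):

# Made-up miniature NaturalQuestions-style document.
doc = {
    "annotations": {"long_answer": [{"start_token": 1, "end_token": 4}]},
    "document": {"tokens": {"token": ["<P>", "Paris", "is", "lovely", "</P>"]}},
}

start = doc["annotations"]["long_answer"][0]["start_token"]
end = doc["annotations"]["long_answer"][0]["end_token"]
span = doc["document"]["tokens"]["token"][start:end]
print(" ".join(span))  # -> Paris is lovely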
@@ -23,7 +23,7 @@ _CITATION = """
     booktitle={CLEF},
     year={2013}
 }
-"""
+"""  # noqa: W605


 class QA4MRE(MultipleChoiceTask):
...
@@ -390,6 +390,7 @@ class TruthfulQAGeneration(Task):
         rouge_types = ["rouge1", "rouge2", "rougeLsum"]
         scorer = rouge_scorer.RougeScorer(rouge_types)
+
         # Add newlines between sentences to correctly compute `rougeLsum`.
         def _prepare_summary(summary):
             summary = summary.replace(" . ", ".\n")
             return summary
...
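The blank line added above only fixes spacing; the nested helper itself exists because `rougeLsum` treats newlines as sentence boundaries, as the original comment notes. A quick sketch of what the replacement does to a generated answer:

def _prepare_summary(summary):
    # Turn " . " sentence separators into ".\n" so rougeLsum sees sentence splits.
    return summary.replace(" . ", ".\n")

print(_prepare_summary("The sky is blue . Water is wet . "))
# The sky is blue.
# Water is wet.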
@@ -42,8 +42,12 @@ def compress_and_move(working_directory, output_directory, process_count):
         tasks.append(task)

     pool = TqdmMultiProcessPool(process_count)
-    on_done = lambda _: None
-    on_error = lambda _: None
+
+    def on_done(_):
+        return None
+
+    def on_error(_):
+        return None

     global_progress = tqdm(
         total=len(bucket_file_paths), dynamic_ncols=True, unit="file"
...
@@ -51,8 +51,12 @@ def get_stats():
     # Generate minhashes with pool
     tasks = [(get_file_stats, (file,)) for file in files]
-    on_done = lambda _: None
-    on_error = lambda _: None
+
+    def on_done(_):
+        return None
+
+    def on_error(_):
+        return None

     results = pool.map(global_tqdm, tasks, on_error, on_done)
     total_documents, total_size = reduce(
...
@@ -30,12 +30,13 @@ from tqdm_multiprocess.logger import setup_logger_tqdm
 logger = logging.getLogger(__name__)

+
 # Multiprocessed
 def process_bucket(
     bucket_file_path, processed_directory, move_dir, tqdm_func, global_tqdm
 ):
-    bucket_id = re.sub("\D", "", os.path.basename(bucket_file_path))
+    bucket_id = re.sub("\D", "", os.path.basename(bucket_file_path))  # noqa: W605
     done_file = os.path.join(
         processed_directory, f"ngram_bucket_processing_{bucket_id}.done"
     )
@@ -106,8 +107,13 @@ def process_sorted_buckets(working_directory, move_dir, process_count):
     ]
     global_tqdm = tqdm(total=len(bucket_file_paths), dynamic_ncols=True, unit="bucket")
-    on_done = lambda _: None
-    on_error = lambda _: None
+
+    def on_done(_):
+        return None
+
+    def on_error(_):
+        return None
+
     _ = pool.map(global_tqdm, tasks, on_error, on_done)
...
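The three on_done/on_error rewrites in this commit all address flake8's E731 ("do not assign a lambda expression, use a def"); the callbacks stay no-ops. A self-contained sketch with a stand-in task runner (hypothetical, not TqdmMultiProcessPool's real implementation) that accepts the same style of callbacks:

def run_tasks(tasks, on_error, on_done):
    # Stand-in for a pool: run each zero-argument task, routing outcomes
    # to the supplied callbacks.
    results = []
    for task in tasks:
        try:
            results.append(task())
        except Exception as exc:
            on_error(exc)
        else:
            on_done(task)
    return results


# E731-friendly: named no-op callbacks instead of `on_done = lambda _: None`.
def on_done(_):
    return None


def on_error(_):
    return None


print(run_tasks([lambda: 1, lambda: 2], on_error, on_done))  # [1, 2]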
@@ -3,7 +3,7 @@ from collections import Counter
 import shutil
 import glob

-from lm_eval.decontamination.janitor import *
+from lm_eval.decontamination.janitor import Janitor, word_ngrams
 from scripts.clean_training_data.generate_13_grams import do_ngrams_in_buckets
 from lm_eval.decontamination.archiver import Archive, TextReader
...
 import re
 from collections import defaultdict

-from lm_eval.decontamination.janitor import *
+from lm_eval.decontamination.janitor import (
+    Janitor,
+    form_ngrams,
+    word_ngrams,
+    split_indices,
+    word_ngrams_indices,
+)


 def simple_ngram(sequence, n):
...
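The last two hunks replace wildcard imports in the tests, which flake8 flags (F403, and F405 for every name whose origin it then cannot verify). Explicit, optionally parenthesized imports keep the provenance of each name visible; a generic sketch using a standard-library module as a stand-in:

# from collections import *          # F403: unable to detect undefined names
from collections import (  # explicit names; one per line scales to long lists
    Counter,
    defaultdict,
)

counts = Counter("decontamination")
buckets = defaultdict(list)
for letter, n in counts.items():
    buckets[n].append(letter)
print(dict(buckets))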