Commit 4d147bdd authored by Jonathan Tow
Browse files

Merge branch 'master' of https://github.com/EleutherAI/lm-evaluation-harness into task-guide

parents 011cc891 dc937d4b
"""
Outputs all 13-grams found in The Pile.
Loops through all documents and uses the logic found in janitor.py to extract 13-grams.
We bucket each 13-gram by hash into separate file buckets to allow easy parallel processing in the
next stage. We also include the current pile document_id with each ngram instance to allow the
filtering to exclude 13-grams that match more than 10 unique documents (done further down the pipeline).
We didn't use lm_dataformat to output as it increases time 4x (slow jsonify) and makes
resuming hard (and we had the storage).
Arguments
---------
--working_directory (-dir)
Directory containing the pile distribution. An "output" subdirectory will be created underneath
to store the bucketed 13-grams, checkpoint and done files. Default: current directory
--n_value (-n)
n value in n-gram, added for later use if ever needed. Default: 13
--bucket_count (-buckets)
Number of file buckets to use when generating 13grams. Default: 500
"""
import argparse
import pickle
import os
from pathlib import Path
import glob
import signal
from signal import SIGINT
from tqdm import tqdm
from scripts.clean_training_data.janitor import Janitor, word_ngrams
from scripts.clean_training_data.archiver import TextArchive, Reader
import logging
from tqdm_multiprocess.logger import setup_logger_tqdm
logger = logging.getLogger(__name__)
# Expected number of documents in The Pile; used as the total for the progress bar.
pile_document_count = 210607728
# Flipped to True by the SIGINT handler; the bucketing loop checks it to stop cleanly.
terminate = False
def handler(signal_received, frame):
    """Signal callback that requests a graceful shutdown.

    Neither argument is inspected; the callback only raises the module-level
    ``terminate`` flag, which the bucketing loop checks at its safe stopping
    points.
    """
    global terminate
    terminate = True
def get_pile(directory):
    """Yield every document stored in the ``*.jsonl.zst*`` archives of ``directory``.

    Archives are visited in sorted path order. ``glob.glob`` makes no promise
    about the order of its results, while the caller identifies documents by
    their position in this stream (both for the ids written next to each
    n-gram and for resuming from a checkpoint), so the order has to be
    reproducible from one run to the next.

    Arguments
    ---------
    directory
        Directory that contains the pile archive files.
    """
    reader = Reader()
    archive_paths = sorted(glob.glob(os.path.join(directory, "*.jsonl.zst*")))
    for archive_path in archive_paths:
        yield from reader.read(archive_path)
def close_buckets(buckets):
    """Commit each bucket archive in ``buckets``, in the order given."""
    for archive in buckets:
        archive.commit()
def do_ngrams_in_buckets(n_value, working_directory, bucket_count):
    """Extract all n-grams from the pile and append them to hash-selected bucket files.

    Every n-gram is written as the line ``"<ngram> <document index>"`` to the
    bucket chosen by a hash of the n-gram text. The document index is the
    position of the document in the stream produced by ``get_pile``. A ``.done``
    file marks a finished run and a checkpoint file stores the index to resume
    from after an interrupted run.

    Arguments
    ---------
    n_value
        n value of the n-grams to generate.
    working_directory
        Directory containing the pile archives. Buckets, checkpoint and done
        file are written to its "output" subdirectory.
    bucket_count
        Number of bucket files to distribute the n-grams over.
    """
    # Imported here so the function does not depend on a new module-level import.
    import zlib

    output_directory = os.path.join(working_directory, "output")
    os.makedirs(output_directory, exist_ok=True)

    logger.info(f"Generating {n_value}-grams and bucketing.")

    # Done file
    done_file = os.path.join(output_directory, "ngram_buckets.done")
    if os.path.exists(done_file):
        logger.info("ngrams already generated and bucketed, skipping")
        return

    # Checkpoint (written by this function itself, hence loading the pickle is acceptable)
    checkpoint_file = os.path.join(output_directory, "ngram_buckets.ckpt")
    if os.path.exists(checkpoint_file):
        with open(checkpoint_file, "rb") as checkpoint:
            start_id = pickle.load(checkpoint)
    else:
        start_id = 0

    logger.info(f"Starting at pile document index {start_id}")
    bucket_files = [os.path.join(output_directory, f"ngrams_{i}.bkt.txt") for i in range(bucket_count)]
    buckets = list(map(TextArchive, bucket_files))

    janitor = Janitor()
    current_id = 0
    batch_size = 1000
    batch_counter = 0
    with tqdm(total=pile_document_count, dynamic_ncols=True, unit="docs") as progress:
        for document in get_pile(working_directory):
            # Fast-forward over documents that were handled before the checkpoint
            if current_id < start_id:
                if terminate:
                    close_buckets(buckets)
                    return
                current_id += 1
                progress.update()
                continue

            # Save checkpoint every "batch_size", only allow terminate after checkpoint
            if batch_counter == batch_size:
                progress.update(batch_size)
                batch_counter = 0
                # Close the handle right away so the checkpoint is completely on disk
                with open(checkpoint_file, "wb") as checkpoint:
                    pickle.dump(current_id, checkpoint)
                if terminate:
                    close_buckets(buckets)
                    return

            ngrams = word_ngrams(janitor.normalize_string(document), n_value)
            for ngram in ngrams:
                # The built-in hash() of a str is salted per interpreter process, so it
                # would send the same n-gram to a different bucket after a resume.
                # crc32 is stable across runs; "surrogatepass" keeps it total over any str.
                bucket = zlib.crc32(ngram.encode("utf-8", "surrogatepass")) % len(buckets)
                buckets[bucket].add_data(f"{ngram} {current_id}")

            batch_counter += 1
            current_id += 1

    close_buckets(buckets)
    Path(done_file).touch()
# Command line interface; the options are described in the module docstring.
parser = argparse.ArgumentParser(
    description="Generate 13 grams from Pile.",
)
parser.add_argument(
    "-dir",
    "--working_directory",
    default="",
)
parser.add_argument(
    "-n",
    "--n_value",
    type=int,
    default=13,
)
parser.add_argument(
    "-buckets",
    "--bucket_count",
    type=int,
    default=500,
)
if __name__ == '__main__':
    # Route ctrl-c through the handler so the run stops at a safe point
    # instead of being killed in the middle of a write.
    previous_signal_int = signal.signal(SIGINT, handler)

    # Log to file alongside the tqdm progress output
    logfile_path = "ngrams.log"
    setup_logger_tqdm(logfile_path)

    args = parser.parse_args()
    do_ngrams_in_buckets(
        args.n_value,
        args.working_directory,
        args.bucket_count,
    )
\ No newline at end of file
......@@ -41,6 +41,29 @@ def word_ngrams(s, n):
ngram_seqs = form_ngrams(iter(tokens), n)
return (" ".join(ngram) for ngram in ngram_seqs)
# Does character sequences only - combined faster function to play around with later
# def word_ngrams_indices_combined(sequence, n):
# current_word = ""
# history = []
# gap = False;
# start = 0
# end = 0
# for character in sequence:
# if character == " ":
# if not gap:
# gap = True
# history.append(current_word)
# end += len(current_word) - 1
# current_word = ""
# if len(history) == n:
# yield (tuple(history), start, end)
# del history[0]
# start = end + 1
# end = start
# else:
# gap = False
# current_word += character
# https://stackoverflow.com/questions/13734451/string-split-with-indices-in-python
def split_indices(s):
......@@ -140,8 +163,9 @@ class Janitor:
def _split_chunks(self, dirty_string, dirty_parts):
clean_chunks = []
splice_idx = 0
end = -1
for i, (ngram, start, end) in enumerate(dirty_parts):
if i > self.too_dirty_cutoff:
if i >= self.too_dirty_cutoff:
return []
start = max(0, start - self.window_to_remove)
end = min(len(dirty_string), end + self.window_to_remove)
......@@ -150,6 +174,9 @@ class Janitor:
clean_chunks.append(dirty_string[splice_idx: start])
splice_idx = end
if end < len(dirty_string) - self.minimum_slice_length:
clean_chunks.append(dirty_string[end+1:])
return clean_chunks
##############
......@@ -186,101 +213,101 @@ class Janitor:
# Tests
#################################################################
def print_cpp():
    """Print what the ``janitor_util`` n-gram helpers return for a sample string.

    Runs for n = 1, 3, 5, 7, 9. For the indexed variant every n-gram is printed
    together with its start/end offsets and the matching slice of the source.
    """
    source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2

    for n in range(1, 10, 2):
        pprint(janitor_util.clean_ngram(source, string.punctuation, n))

        indexed_ngrams = janitor_util.clean_ngram_with_indices(source, string.punctuation, n)
        for ngram, start, end in indexed_ngrams:
            snippet = source[start:end].replace("\n", "\\n")
            print(ngram, "\t", start, end, snippet)
def test_cpp():
    """Check that the ``*_python`` janitor methods agree with the default ones.

    Registers the same contaminant through both code paths, compares the
    resulting dirt n-grams and then compares the cleaned output of a sample
    string.
    """
    source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2
    contaminant = "dirty boy. Clean he he"

    python_janitor = Janitor()
    python_janitor.register_contaminant_python(contaminant)

    cpp_janitor = Janitor()
    cpp_janitor.register_contaminant(contaminant)

    python_dirt = python_janitor.dirt_ngrams
    cpp_dirt = cpp_janitor.dirt_ngrams
    assert python_dirt == cpp_dirt, (python_dirt, cpp_dirt)

    python_cleaned = python_janitor.clean_python(source)
    cpp_cleaned = cpp_janitor.clean(source)
    assert python_cleaned == cpp_cleaned, (python_cleaned, cpp_cleaned)

    print("Passed test, python==cpp")
def benchmark():
    # Times Janitor.register_contaminant and Janitor.clean with timeit and prints
    # the results. The timeit setup code below reads data/enwik8 and registers a
    # fixed contaminant text on a fresh Janitor.
    # Download and put in data folder: enwik8 (100 MB) from https://cs.fit.edu/~mmahoney/compression/textdata.html
    # NOTE(review): the whitespace inside the setup string is reproduced exactly as
    # it appears here; confirm the indentation of the `with` body against the real file.
    setup = \
"""
with open("data/enwik8", "r") as f:
data = f.read()
jan = Janitor(too_dirty_cutoff=1000)
jan.register_contaminant('''
theories is that there is a connection between &quot;geekdom&quot; and autism.
This is hinted, for instance, by a ''Wired Magazine'' article in 2001 entitled &quot;
The [[Geek]] Syndrome&quot;, which is a point argued by many in the autism rights
movement{{ref|Wired}}. This article, many professionals assert, is just one example of
the media's application of mental disease labels to what is actually variant normal behavior
&amp;mdash;they argue that shyness, lack of athletic ability or social skills, and intellectual
interests, even when they seem unusual to others, are not in themselves signs of autism or
Asperger's syndrome. Others assert that it is actually the medical profession which is applying
mental disease labels to children who in the past would have simply been accepted as a little
different or even labeled 'gifted'. See [[clinomorphism]] for further discussion of this issue.
Due to the recent publicity surrounding autism and autis
ultan Al Nahyan]] granted [[Petroleum]] concessions, and oil was first found in 1958. At first,
oil money had a marginal impact. A few lowrise concete buildings were erected, and the first
paved road was completed in 1961, but Sheikh Shakbut, uncertain whether the new oil royalties
would last, took a cautious approach, prefering to save the revenue rather than investing it in
development. His brother, [[Zayed bin Sultan Al Nahayan]], saw that oil wealth had the potential
to transform Abu Dhabi. The ruling Al Nahayan family decided that Sheikh Zayed should replace his
brother as Ruler and carry out his vision of developing the country. On [[August 6]], [[1966]],
with the assistance of the British, Sheikh Zayed became the new ruler. See generally, Al-Fahim, M,
''From Rags to Riches: A Story of Abu Dhabi'', Chapter Six (London Centre of Arab Studies, 1995),
ISBN 1 900404 00 1. With the announcement by Britain in 1968 that it would withdraw from the
Gulf area by 1971, Sheikh Zayed became the main driving force behind the formation of the
[[United Arab Emirates]]. After the Emirates gained independence in 1971,
''')
"""
    # Number of timed executions per statement
    n = 1
    print(f"Timing {n} run on 100 MB")
    print("Register contaminant")
    # The *_python variants are disabled; only the default methods are timed.
    # print("\tPython", timeit.timeit("jan.register_contaminant_python(data)", setup=setup, globals=globals(), number=n))
    print("\tCpp", timeit.timeit("jan.register_contaminant(data)", setup=setup, globals=globals(), number=n))
    print("Clean")
    # print("\tPython", timeit.timeit("jan.clean_python(data)", setup=setup, globals=globals(), number=n))
    print("\tCpp", timeit.timeit("jan.clean(data)", setup=setup, globals=globals(), number=n))
def test():
    """Check that cleaning removes every registered dirt n-gram.

    The check runs twice: once on a janitor that registered the contaminant
    directly, and once on a fresh janitor that loaded the n-grams saved by the
    first one.
    """
    source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2
    contaminant = "dirty boy. Clean he he"

    def check_no_contamination(janitor):
        # None of the janitor's dirt n-grams may survive in the joined clean chunks
        cleaned = " ".join(janitor.clean(source))
        for contam in janitor.dirt_ngrams:
            assert contam not in cleaned, contam

    jan = Janitor(ngram_n=3)
    jan.register_contaminant(contaminant)
    check_no_contamination(jan)

    # Round trip through the save file
    filename = "data/saved_contam"
    jan.save_contamination_ngrams(filename)

    jan = Janitor(ngram_n=3)
    jan.load_contamination_ngrams(filename)
    check_no_contamination(jan)
if __name__ == "__main__":
    # Only the general test runs by default; enable the others by hand.
    test()
    # print_cpp()
    # test_cpp()
    # benchmark()
# def print_cpp():
# source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2
# for i in range(1, 10, 2):
# pprint(janitor_util.clean_ngram(source, string.punctuation, i))
# for ngram, start, end in \
# janitor_util.clean_ngram_with_indices(source, string.punctuation, i):
# print(ngram, "\t", start, end, source[start:end].replace("\n", "\\n"))
# def test_cpp():
# source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2
# contaminant = "dirty boy. Clean he he"
# jan_python = Janitor()
# jan_cpp = Janitor()
# jan_python.register_contaminant_python(contaminant)
# jan_cpp.register_contaminant(contaminant)
# assert jan_python.dirt_ngrams == jan_cpp.dirt_ngrams, (jan_python.dirt_ngrams, jan_cpp.dirt_ngrams)
# assert jan_python.clean_python(source) == jan_cpp.clean(source), \
# (jan_python.clean_python(source), jan_cpp.clean(source))
# print("Passed test, python==cpp")
# def benchmark():
# # Download and put in data folder: enwik8 (100 MB) from https://cs.fit.edu/~mmahoney/compression/textdata.html
# setup = \
# """
# with open("data/enwik8", "r") as f:
# data = f.read()
# jan = Janitor(too_dirty_cutoff=1000)
# jan.register_contaminant('''
# theories is that there is a connection between &quot;geekdom&quot; and autism.
# This is hinted, for instance, by a ''Wired Magazine'' article in 2001 entitled &quot;
# The [[Geek]] Syndrome&quot;, which is a point argued by many in the autism rights
# movement{{ref|Wired}}. This article, many professionals assert, is just one example of
# the media's application of mental disease labels to what is actually variant normal behavior
# &amp;mdash;they argue that shyness, lack of athletic ability or social skills, and intellectual
# interests, even when they seem unusual to others, are not in themselves signs of autism or
# Asperger's syndrome. Others assert that it is actually the medical profession which is applying
# mental disease labels to children who in the past would have simply been accepted as a little
# different or even labeled 'gifted'. See [[clinomorphism]] for further discussion of this issue.
# Due to the recent publicity surrounding autism and autis
# ultan Al Nahyan]] granted [[Petroleum]] concessions, and oil was first found in 1958. At first,
# oil money had a marginal impact. A few lowrise concete buildings were erected, and the first
# paved road was completed in 1961, but Sheikh Shakbut, uncertain whether the new oil royalties
# would last, took a cautious approach, prefering to save the revenue rather than investing it in
# development. His brother, [[Zayed bin Sultan Al Nahayan]], saw that oil wealth had the potential
# to transform Abu Dhabi. The ruling Al Nahayan family decided that Sheikh Zayed should replace his
# brother as Ruler and carry out his vision of developing the country. On [[August 6]], [[1966]],
# with the assistance of the British, Sheikh Zayed became the new ruler. See generally, Al-Fahim, M,
# ''From Rags to Riches: A Story of Abu Dhabi'', Chapter Six (London Centre of Arab Studies, 1995),
# ISBN 1 900404 00 1. With the announcement by Britain in 1968 that it would withdraw from the
# Gulf area by 1971, Sheikh Zayed became the main driving force behind the formation of the
# [[United Arab Emirates]]. After the Emirates gained independence in 1971,
# ''')
# """
# n = 1
# print(f"Timing {n} run on 100 MB")
# print("Register contaminant")
# # print("\tPython", timeit.timeit("jan.register_contaminant_python(data)", setup=setup, globals=globals(), number=n))
# print("\tCpp", timeit.timeit("jan.register_contaminant(data)", setup=setup, globals=globals(), number=n))
# print("Clean")
# # print("\tPython", timeit.timeit("jan.clean_python(data)", setup=setup, globals=globals(), number=n))
# print("\tCpp", timeit.timeit("jan.clean(data)", setup=setup, globals=globals(), number=n))
# def test_janitor_general():
# source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2
# contaminant = "dirty boy. Clean he he"
# jan = Janitor(ngram_n=3)
# jan.register_contaminant(contaminant)
# cleaned = " ".join(jan.clean(source))
# for contam in jan.dirt_ngrams:
# assert contam not in cleaned, contam
# filename = "data/saved_contam"
# jan.save_contamination_ngrams(filename)
# jan = Janitor(ngram_n=3)
# jan.load_contamination_ngrams(filename)
# cleaned = " ".join(jan.clean(source))
# for contam in jan.dirt_ngrams:
# assert contam not in cleaned, contam
# if __name__ == "__main__":
# test()
# # print_cpp()
# # test_cpp()
# # benchmark()
"""
Processes each sorted bucket, creating a new file listing all ngrams that matched more than 10
unique documents with their unique document counts. Uses multiprocessing and very little memory
as we stream from presorted buckets. Will use a lot of disk though.
Arguments
---------
--working_directory (-dir)
Directory containing the sorted buckets, processed files will be deposited here. Default: current directory
--move_dir (-move)
Directory to move processed 13grams to. Default: Do nothing
--process_count (-procs)
Number of processes to use. Default: 4
"""
import argparse
import glob
import os
from pathlib import Path
import re
import shutil
from tqdm import tqdm
from tqdm_multiprocess import TqdmMultiProcessPool
from scripts.clean_training_data.archiver import TextReader, TextArchive
import logging
from tqdm_multiprocess.logger import setup_logger_tqdm
logger = logging.getLogger(__name__)
# Multiprocessed
def process_bucket(bucket_file_path, processed_directory, move_dir, tqdm_func, global_tqdm):
    """Write the n-grams of one sorted bucket that occur in more than 10 unique documents.

    The bucket is streamed line by line; each line is ``"<ngram> <document id>"``
    and equal n-grams are expected on adjacent lines (the bucket is sorted).
    For every n-gram seen in more than 10 distinct documents the line
    ``"<ngram> <unique document count>"`` is added to ``<bucket>.processed``.
    A done file in ``processed_directory`` marks the bucket as finished.

    Arguments
    ---------
    bucket_file_path
        Path of the sorted bucket file to process.
    processed_directory
        Directory holding the per-bucket done files.
    move_dir
        If non-empty, the processed file is moved there when finished.
    tqdm_func
        Callable that creates the progress bar of this bucket.
    global_tqdm
        Progress bar counting buckets; updated once per bucket.
    """
    # Raw string: "\D" in a normal string literal is an invalid escape sequence
    bucket_id = re.sub(r"\D", "", os.path.basename(bucket_file_path))
    done_file = os.path.join(processed_directory, f"ngram_bucket_processing_{bucket_id}.done")
    if os.path.exists(done_file):
        logger.info(f"bucket {bucket_id} already processed, skipping")
        # A skipped bucket is still one of the buckets the global bar was sized for
        global_tqdm.update()
        return

    # For managing tqdm
    file_size = os.path.getsize(bucket_file_path)
    bucket_progress = tqdm_func(total=file_size, dynamic_ncols=True, unit="byte", unit_scale=1)
    current_file_position = 0
    update_frequency = 100 * 1000000  # 100mb
    update_counter = 0

    # Iterate through and output ngrams which occur in more than 10 documents
    bucket = TextReader(bucket_file_path)
    output_file_path = bucket_file_path + ".processed"
    output_archive = TextArchive(output_file_path, mode="wb")
    current_ngram = ""
    current_ngram_document_ids = set()
    for line in bucket.read():
        # The document id is the last space separated field, the n-gram itself contains spaces
        ngram, document_id = line.rsplit(" ", 1)

        # Write ngram if more than 10 unique document occurrences
        if ngram != current_ngram:
            if len(current_ngram_document_ids) > 10:
                output_archive.add_data(f"{current_ngram} {len(current_ngram_document_ids)}")
            current_ngram = ngram
            current_ngram_document_ids = set()

        current_ngram_document_ids.add(document_id)

        # Update tqdm, batched so the progress bar is not touched for every line
        new_file_position = bucket.fh.tell()
        update_counter += new_file_position - current_file_position
        current_file_position = new_file_position
        if update_counter > update_frequency:
            bucket_progress.update(update_counter)
            update_counter = 0

    # Remainder
    if len(current_ngram_document_ids) > 10:
        output_archive.add_data(f"{current_ngram} {len(current_ngram_document_ids)}")

    # Report the bytes read since the last batched progress update
    if update_counter:
        bucket_progress.update(update_counter)

    output_archive.commit()
    Path(done_file).touch()

    if move_dir:
        shutil.move(output_file_path, move_dir)

    global_tqdm.update()
def process_sorted_buckets(working_directory, move_dir, process_count):
    """Run ``process_bucket`` over every ``*.bkt.txt.sorted`` file using a process pool.

    Arguments
    ---------
    working_directory
        Directory containing the sorted buckets; a "processed" subdirectory
        is created in it and handed to each task.
    move_dir
        Passed through to ``process_bucket``.
    process_count
        Number of processes in the pool.
    """
    sorted_bucket_pattern = os.path.join(working_directory, f"*.bkt.txt.sorted")
    bucket_file_paths = glob.glob(sorted_bucket_pattern)

    processed_directory = os.path.join(working_directory, "processed")
    os.makedirs(processed_directory, exist_ok=True)

    # The pool requires both callbacks; neither has anything to do here
    def on_done(_):
        return None

    def on_error(_):
        return None

    pool = TqdmMultiProcessPool(process_count)
    tasks = []
    for bucket_file in bucket_file_paths:
        task_arguments = (bucket_file, processed_directory, move_dir)
        tasks.append((process_bucket, task_arguments))

    global_tqdm = tqdm(total=len(bucket_file_paths), dynamic_ncols=True, unit="bucket")
    pool.map(global_tqdm, tasks, on_error, on_done)
# Command line interface; the options are described in the module docstring.
parser = argparse.ArgumentParser(
    description="Process 13 grams from sorted buckets.",
)
parser.add_argument(
    "-dir",
    "--working_directory",
    default="",
)
parser.add_argument(
    "-move",
    "--move_dir",
    default="",
)
parser.add_argument(
    "-procs",
    "--process_count",
    type=int,
    default=4,
)
if __name__ == '__main__':
    # Log to file alongside the tqdm progress output
    logfile_path = "process13grams.log"
    setup_logger_tqdm(logfile_path)

    args = parser.parse_args()
    process_sorted_buckets(
        args.working_directory,
        args.move_dir,
        args.process_count,
    )
\ No newline at end of file
"""
Iteratively runs gnu sort on each bucket, gnu handles the multiprocessing.
Arguments
---------
--working_directory (-dir)
Directory containing the bucketed 13-grams. Sorted buckets will be deposited in the same
directory and the unsorted buckets are removed after.
"""
import glob
import argparse
import os
from pathlib import Path
import signal
from signal import SIGINT
import re
import subprocess
from tqdm import tqdm
import logging
from tqdm_multiprocess.logger import setup_logger_tqdm
logger = logging.getLogger(__name__)
terminate = False
def handler(signal_received, frame):
    """Signal callback that requests a graceful shutdown.

    Neither argument is inspected; the callback only raises the module-level
    ``terminate`` flag, which the sorting loop checks after each sort.
    """
    global terminate
    terminate = True
def sort_13_gram_buckets(working_directory):
    """Sort every ``*.bkt.txt`` bucket in ``working_directory`` with the external ``sort``.

    The sorted output is written to ``<bucket>.sorted``. After a successful
    sort a done file is created and the unsorted bucket is removed. Buckets
    that already have a done file are skipped. The loop stops early when the
    ``terminate`` flag has been set.

    Arguments
    ---------
    working_directory
        Directory containing the bucketed n-grams.
    """
    bucket_file_paths = glob.glob(os.path.join(working_directory, "*.bkt.txt"))

    for bucket_file_path in tqdm(bucket_file_paths, dynamic_ncols=True):
        # Raw string: "\D" in a normal string literal is an invalid escape sequence
        bucket_id = re.sub(r"\D", "", os.path.basename(bucket_file_path))
        done_file = os.path.join(working_directory, f"ngram_bucket_sorting_{bucket_id}.done")
        if os.path.exists(done_file):
            logger.info(f"bucket {bucket_id} already processed, skipping")
            # Skip only this bucket; returning here would drop all remaining buckets
            continue

        sorted_file_path = bucket_file_path + ".sorted"
        logger.info(f"sort {bucket_file_path} > {sorted_file_path}")
        # Argument list plus explicit redirection instead of a shell string, so
        # paths containing spaces or shell metacharacters are handled correctly.
        with open(sorted_file_path, "wb") as sorted_file:
            return_code = subprocess.call(["sort", bucket_file_path], stdout=sorted_file)

        if terminate:
            return

        if return_code != 0:
            # Keep the unsorted bucket: deleting it after a failed sort loses the data
            logger.error(f"sort failed with exit code {return_code} for {bucket_file_path}")
            continue

        Path(done_file).touch()
        os.remove(bucket_file_path)
# Command line interface; the option is described in the module docstring.
parser = argparse.ArgumentParser(
    description="sort 13gram buckets",
)
parser.add_argument(
    "-dir",
    "--working_directory",
    default="",
)
if __name__ == '__main__':
    # Route ctrl-c through the handler so the loop can stop between buckets
    previous_signal_int = signal.signal(SIGINT, handler)

    # Log to file alongside the tqdm progress output
    logfile_path = "sort13grambuckets.log"
    setup_logger_tqdm(logfile_path)

    args = parser.parse_args()
    sort_13_gram_buckets(args.working_directory)
\ No newline at end of file
import transformers
import torch
import torch.nn.functional as F
import random
random.seed(42)
data = [
"A multilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)",
"The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons (with threshold activation); see § Terminology",
"Multilayer perceptrons are sometimes colloquially referred to as \"vanilla\" neural networks, especially when they have a single hidden layer.[1]",
"An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear activation function.",
"MLP utilizes a supervised learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]",
"Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. ",
"Specifically, we train GPT-3, an autoregressive language model with 175 billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general.",
"A multilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)",
"Hello World",
]
# Produce (context, continuation) pairs together with the GPT-2 log-likelihood
# of each continuation, printed in a form that can be pasted into a test.
model = transformers.GPT2LMHeadModel.from_pretrained('gpt2')
tok = transformers.GPT2Tokenizer.from_pretrained('gpt2')

tgs = []

for dat in data:
    # Seed with the text itself so the split position of a string is reproducible
    random.seed(dat)

    toks = tok.encode(dat, return_tensors="pt")
    # Index of the last context token; at least one token is left for the continuation
    ind = random.randrange(len(toks[0])-1)

    # Only values are needed, so skip building the autograd graph
    with torch.no_grad():
        logits = F.log_softmax(model(toks)[0], dim=-1)[:, :-1]  # [batch, seq, vocab]

    # Log-probability assigned to each actual next token
    res = torch.gather(logits, 2, toks[:, 1:].unsqueeze(-1)).squeeze(-1)[0]

    # Sum over the continuation tokens only
    tgs.append(float(res[ind:].sum()))
    print(r'("""' + tok.decode(toks[0, :ind+1]) + r'""", """' + tok.decode(toks[0, ind+1:]) + r'"""), ')

print(tgs)
\ No newline at end of file
......@@ -2,7 +2,7 @@ from lm_eval import tasks
from pytablewriter import MarkdownTableWriter
writer = MarkdownTableWriter()
writer.headers = ["Task Name", "Train", "Val", "Test", "Metrics"]
writer.headers = ["Task Name", "Train", "Val", "Test","Val/Test Docs", "Metrics"]
values = []
......@@ -15,7 +15,9 @@ def chk(tf):
for tname, Task in tasks.TASK_REGISTRY.items():
task = Task()
values.append([tname,chk(task.has_training_docs()),chk(task.has_validation_docs()),chk(task.has_test_docs()),', '.join(task.aggregation().keys())])
v = [tname,chk(task.has_training_docs()),chk(task.has_validation_docs()),chk(task.has_test_docs()), len(list(task.test_docs() if task.has_test_docs() else task.validation_docs())),', '.join(task.aggregation().keys())]
print(v)
values.append(v)
writer.value_matrix = values
......
......@@ -4,7 +4,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
long_description = fh.read()
setuptools.setup(
name="lm_eval_harness",
name="lm_eval",
version="0.0.1",
author="Leo Gao",
author_email="lg@eleuther.ai",
......@@ -19,4 +19,28 @@ setuptools.setup(
"Operating System :: OS Independent",
],
python_requires='>=3.6',
install_requires=[
"black==20.8b1",
"best_download>=0.0.6",
"datasets>=1.2.1",
"click>=7.1",
"scikit-learn>=0.24.1",
"torch>=1.7",
"transformers>=4.1",
"sqlitedict==1.6.0",
"pytablewriter==0.58.0",
"sacrebleu==1.5.0",
"pycountry==20.7.3",
"numexpr==2.7.2",
"lm_dataformat==0.0.19",
"pytest==6.2.3",
"pybind11==2.6.2",
"tqdm-multiprocess==0.0.11",
"zstandard==0.15.2",
"jsonlines==2.0.0",
"mock==4.0.3",
"openai==0.6.4",
"jieba==0.42.1",
"nagisa==0.2.7"
]
)
import os
import lm_eval.base as base
import lm_eval.tasks as tasks
import lm_eval.models as models
import lm_eval.evaluator as evaluator
......@@ -11,10 +13,13 @@ import pytest
@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
def test_evaluator(taskname, Task):
task_dict = tasks.get_task_dict([taskname])
lm = models.get_model('dummy')()
os.system("rm test_cache.db")
lm = base.CachingLM(models.get_model('dummy')(), "test_cache.db")
def ll_fn(reqs):
for ctx, cont in reqs:
if len(ctx) == 0: continue
# space convention
assert ctx[-1] != ' '
assert cont[0] == ' ' or ctx[-1] == '\n'
......@@ -26,7 +31,24 @@ def test_evaluator(taskname, Task):
res.append((-random.random(), False))
return res
def ll_perp_fn(reqs):
for string, in reqs:
assert isinstance(string, str)
res = []
random.seed(42)
for _ in reqs:
res.append(-random.random())
return res
lm.loglikelihood = ll_fn
evaluator.evaluate(lm, task_dict, False, 0, 10)
\ No newline at end of file
lm.loglikelihood_rolling = ll_perp_fn
limit = 10
e1 = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
e2 = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
# check that caching is working
assert e1 == e2
import os
from collections import Counter
import shutil
import glob
from scripts.clean_training_data.janitor import *
from scripts.clean_training_data.generate_13_grams import do_ngrams_in_buckets
from scripts.clean_training_data.archiver import Archive, TextReader
def test_generate_13_grams_1():
    """Compare bucketed 13-gram generation against direct in-memory generation.

    The same text is turned into n-grams twice: directly with ``word_ngrams``
    and through ``do_ngrams_in_buckets`` on an archive holding that text.
    The n-gram counts rebuilt from the bucket files must equal the direct ones.
    """
    data = """A goose (plural geese) is a bird of any of several waterfowl species in the family Anatidae.
This group comprises the genera Anser (the grey geese and white geese) and Branta (the black geese).
Some other birds, mostly related to the shelducks, have "goose" as part of their names.
More distantly related members of the family Anatidae are swans, most of which are larger
than true geese, and ducks, which are smaller. The term "goose" may refer to either a male
or female bird, but when paired with "gander", refers specifically to a female one (the latter referring
to a male). Young birds before fledging are called goslings. The collective noun for a group of
geese on the ground is a gaggle; when in flight, they are called a skein, a team, or a wedge; when
flying close together, they are called a plump."""
    # Doubling the text makes some n-grams occur more than once
    data = data + data

    # Reference result, generated directly
    n = 13
    janitor = Janitor()
    comparison = list(word_ngrams(janitor.normalize_string(data), n))
    comparison_counter = Counter(comparison)
    print(len(comparison))

    # Start from a clean output directory so no done file or checkpoint is picked up
    test_working_directory = "test_generate_13_grams"
    output_directory = os.path.join(test_working_directory, "output")
    try:
        shutil.rmtree(output_directory)
    except FileNotFoundError:
        pass
    os.makedirs(test_working_directory, exist_ok=True)

    # Single-document archive used as the input of the bucketing step
    archive = Archive(os.path.join(test_working_directory, "test.jsonl.zst"))
    archive.add_data(data)
    archive.commit()

    bucket_count = 4
    do_ngrams_in_buckets(n, test_working_directory, bucket_count)

    # Collect the n-grams back from all bucket files, dropping the document ids
    rebuilt_ngrams = []
    bucket_pattern = os.path.join(test_working_directory, "output", f"*.bkt.txt")
    for bucket_file_path in glob.glob(bucket_pattern):
        for line in TextReader(bucket_file_path).read():
            ngram, _document_id = line.rsplit(" ", 1)
            rebuilt_ngrams.append(ngram)

    # Both the number of distinct n-grams and their counts have to match
    result_counter = Counter(rebuilt_ngrams)
    assert len(result_counter) == len(comparison_counter)
    assert comparison_counter == result_counter
\ No newline at end of file
import lm_eval.tasks as tasks
import lm_eval.models as models
import lm_eval.evaluator as evaluator
import random
import pytest
import os
import json
import openai
import mock
import pickle
import hashlib
os.environ['OPENAI_API_SECRET_KEY'] = ""
def completion(**kwargs):
    """Return the OpenAI completion for ``kwargs``, cached on disk.

    The cache file name is derived from the SHA-256 of the JSON-serialised,
    key-sorted keyword arguments. On a cache hit the pickled response is
    returned without calling the API; otherwise the API is called and the
    response is pickled for the next run.
    """
    request_blob = json.dumps(kwargs, sort_keys=True).encode('utf-8')
    digest = hashlib.sha256(request_blob).hexdigest()
    fname = f"tests/testdata/gpt3_test_{digest}.pkl"

    if not os.path.exists(fname):
        ret = openai.Completion.create(**kwargs)
        with open(fname, 'wb') as fh:
            pickle.dump(ret, fh)
        return ret

    # Cache files are fixtures written by this function itself
    with open(fname, 'rb') as fh:
        return pickle.load(fh)
os.makedirs("tests/testdata", exist_ok=True)
@mock.patch("lm_eval.models.gpt3.oa_completion", new=completion)
def test_gpt3():
    """Check gpt3 loglikelihood / greedy_until results against recorded values.

    ``oa_completion`` is patched with ``completion`` so responses are replayed
    from the pickles under tests/testdata when they exist.
    """
    gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada")

    requests = [
        ('The quick brown fox jumps over the lazy', ' dog'),
        ('The quick brown fox jumps over the lazy', ' cat'),
        ('The quick brown fox jumps over the lazy', ', lazy dog'),
        ('The quick brown fox jumps over the lazy', ', lazy fox'),
        ('The quick brown fox jumps over the lazy', ', lazy fox and they both fall to the ground'),

        ("""A mult""", """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)"""),
        ("""The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons""", """ (with threshold activation); see § Terminology"""),
        ("""Multilayer perceptrons are sometimes coll""", """oquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]"""),
        ("""An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear""", """ activation function."""),
        ("""MLP utilizes a supervised""", """ learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]"""),
        ("""Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic""", """ in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. """),
        ("""Specifically, we train GPT-3, an autoregressive language model with 175""", """ billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general."""),
        ("""A mult""", """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)"""),
        ("""Hello""", """ World"""),
    ]

    # The first five results are inspected individually; the rest are compared
    # against recorded log-likelihoods further down.
    dog, cat, max_0, max_1, max_2, *vals = gpt3.loglikelihood(requests)
    ll_dog, ig_dog = dog
    ll_cat, ig_cat = cat
    # Second element of each pair (presumably the "is greedy" flag, as for ig_dog/ig_cat).
    _, ig_max_0 = max_0
    _, ig_max_1 = max_1
    _, ig_max_2 = max_2

    assert ll_dog > ll_cat
    assert not ig_cat
    assert ig_dog
    assert not ig_max_0
    assert not ig_max_1
    assert not ig_max_2

    # test empty context
    gpt3.loglikelihood([('', 'test')])

    until_request = ('The quick brown fox jumps over the lazy', ['.', '\n'])
    gen, = gpt3.greedy_until([until_request])
    assert gen == ' dog'

    print([x[0] for x in vals])

    targets = [-34.85833048, -47.114367866, -45.43520782100001, -5.289627985, -133.96879783896998, -321.30299892039994, -658.0542459504098, -34.85833048, -7.5162964]
    for (logprob, _), target in zip(vals, targets):
        assert logprob == pytest.approx(target, rel=1e-3)
@mock.patch("lm_eval.models.gpt3.oa_completion", new=completion)
def test_gpt3_perplexity():
    """Check loglikelihood_rolling on one sentence, with the default and a tiny context length."""
    gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada")
    test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss."

    full_context = gpt3.loglikelihood_rolling([(test_string,)])[0]
    assert full_context == pytest.approx(-84.38819608, rel=1e-3)

    # Hack: modify gpt3 to have shorter context length to induce rolling windows
    gpt3.MAX_LENGTH = 5
    short_context = gpt3.loglikelihood_rolling([(test_string,)])[0]
    assert short_context == pytest.approx(-101.93490880000002, rel=1e-3)
import re
from collections import defaultdict
from scripts.clean_training_data.janitor import *
def simple_ngram(sequence, n):
    """Reference n-gram generator used to cross-check the janitor helpers.

    Walks ``sequence`` once and returns a list with every run of ``n``
    consecutive items, each as a tuple, in order of appearance. Returns an
    empty list when the sequence holds fewer than ``n`` items.
    """
    collected = []
    window = ()
    for item in sequence:
        window += (item,)
        if len(window) == n:
            collected.append(window)
            # Slide forward by dropping the oldest item.
            window = window[1:]
    return collected
def test_form_ngrams():
    """form_ngrams over the characters of a sentence must match simple_ngram for several n."""
    text = (
        "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
        " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."
    )

    for n in (1, 2, 3, 5, 13):
        expected = simple_ngram(text, n)
        actual = list(form_ngrams(iter(text), n))
        assert len(expected) == len(actual)
        assert expected == actual
def test_word_ngrams():
    """word_ngrams must yield the space-joined word n-grams that simple_ngram produces."""
    text = (
        "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
        " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."
    )
    tokens = text.split()

    for n in (1, 2, 3, 5, 13):
        expected = [" ".join(gram) for gram in simple_ngram(tokens, n)]
        actual = list(word_ngrams(text, n))
        assert len(expected) == len(actual)
        assert actual == expected
def test_split_indices():
    """split_indices must yield each space-delimited word with its inclusive (start, end) offsets."""
    text = (
        "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
        " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."
    )

    # Hand-rolled reference: remember where the current word began and emit it
    # when a space (or the end of the text) is reached.
    expected = []
    word_start = None
    for pos, char in enumerate(text):
        if char == " ":
            if word_start is not None:
                expected.append((text[word_start:pos], (word_start, pos - 1)))
                word_start = None
        elif word_start is None:
            word_start = pos

    if word_start is not None:
        expected.append((text[word_start:], (word_start, len(text) - 1)))

    actual = list(split_indices(text))
    assert len(expected) == len(actual)
    assert expected == actual
def test_word_ngrams_indices():
    """word_ngrams_indices must yield each word n-gram with its inclusive (start, end) offsets."""
    text = (
        "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
        " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."
    )
    last_index = len(text) - 1

    for n in (1, 2, 3, 5, 13):
        grams = [" ".join(words) for words in simple_ngram(text.split(), n)]

        # Per n-gram offset from which the next search resumes, so repeated
        # n-grams are matched to successive occurrences.
        search_from = defaultdict(int)
        expected = []
        for gram in grams:
            while True:
                start = text.find(gram, search_from[gram])
                assert start != -1  # testing the test
                end = start + len(gram) - 1
                search_from[gram] = end + 1

                # Accept only matches aligned on word boundaries; partial word
                # matches are skipped and the search continues.
                left_ok = start == 0 or text[start - 1] == " "
                right_ok = end == last_index or text[end + 1] == " "
                if left_ok and right_ok:
                    break
            expected.append((gram, (start, end)))

        actual = list(word_ngrams_indices(text, n))
        assert len(actual) == len(expected)
        assert actual == expected
# Assumptions from GPT3 Paper:
# the 200-character removal window includes punctuation and is actually a half-window (removed on each side of a match)
# All tests below initially test without any registered contaminants, expecting the same sequence back.
def test_janitor1():
    """1-gram contaminant: part of the text before the filth survives, the text after it is dropped."""
    clean_line = "This is a @line #containing a certain number of characters, 76 to be exact. "
    contaminant = "filth"

    # Six clean lines, the contaminant, then five clean lines.
    document = clean_line * 6 + "FILTH. " + clean_line * 5

    # Only the start of the leading block is expected back; nothing from the trailing block.
    expected = clean_line * 3 + "This is a @line #containing "

    janitor = Janitor(ngram_n=1, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200)

    # With no contaminant registered the document must come back untouched.
    assert "".join(janitor.clean_python(document)) == document

    janitor.register_contaminant(contaminant)
    assert janitor.dirt_ngrams == {contaminant}

    assert "".join(janitor.clean_python(document)) == expected
def test_janitor2():
    """1-gram contaminant with a longer trailing block: text survives on both sides of the filth."""
    clean_line = "This is a @line #containing a certain number of characters, 76 to be exact. "
    contaminant = "filth"

    # Six clean lines on each side of the contaminant.
    document = clean_line * 6 + "FILTH. " + clean_line * 6

    # Leading block is truncated at its end, trailing block at its start.
    expected = (
        clean_line * 3
        + "This is a @line #containing "
        + " characters, 76 to be exact. "
        + clean_line * 3
    )

    janitor = Janitor(ngram_n=1, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200)

    # With no contaminant registered the document must come back untouched.
    assert "".join(janitor.clean_python(document)) == document

    janitor.register_contaminant(contaminant)
    assert janitor.dirt_ngrams == {contaminant}

    assert "".join(janitor.clean_python(document)) == expected
def test_janitor3():
    """Same layout as test_janitor2, but the contaminant is a 6-gram."""
    clean_line = "This is a @line #containing a certain number of characters, 76 to be exact. "
    contaminant = "filth lots of dirty filthy filth"

    # Six clean lines on each side of the (differently cased/punctuated) contaminant.
    document = clean_line * 6 + "FILTH. lots of dirty filtHy FIlTh " + clean_line * 6

    expected = (
        clean_line * 3
        + "This is a @line #containing "
        + " characters, 76 to be exact. "
        + clean_line * 3
    )

    janitor = Janitor(ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200)

    # With no contaminant registered the document must come back untouched.
    assert "".join(janitor.clean_python(document)) == document

    janitor.register_contaminant(contaminant)
    assert janitor.dirt_ngrams == {contaminant}

    assert "".join(janitor.clean_python(document)) == expected
def test_janitor4():
    """Two occurrences of a 6-gram contaminant: the clean block between them is dropped entirely."""
    clean_line = "This is a @line #containing a certain number of characters, 76 to be exact. "
    dirty_phrase = "FILTH. lots of dirty filtHy FIlTh "
    contaminant = "filth lots of dirty filthy filth"

    # clean block / filth / clean block / filth / clean block
    document = (
        clean_line * 6
        + dirty_phrase
        + clean_line * 6
        + dirty_phrase
        + clean_line * 6
    )

    # Only the outer blocks contribute; the middle block loses 200 characters
    # from each side and nothing of it is expected back.
    expected = (
        clean_line * 3
        + "This is a @line #containing "
        + " characters, 76 to be exact. "
        + clean_line * 3
    )

    janitor = Janitor(ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200)

    # With no contaminant registered the document must come back untouched.
    assert "".join(janitor.clean_python(document)) == document

    janitor.register_contaminant(contaminant)
    assert janitor.dirt_ngrams == {contaminant}

    assert "".join(janitor.clean_python(document)) == expected
def test_janitor5():
    """Same layout as test_janitor4, but the two occurrences are different 6-gram contaminants."""
    clean_line = "This is a @line #containing a certain number of characters, 76 to be exact. "
    filths = ["filth lots of dirty filthy filth", "filth lots of filthy dirty filth"]

    # clean block / first filth / clean block / second filth / clean block
    document = (
        clean_line * 6
        + "FILTH. lots of dirty filtHy FIlTh "
        + clean_line * 6
        + "FILTH. lots of filtHy dirty FIlTh "
        + clean_line * 6
    )

    expected = (
        clean_line * 3
        + "This is a @line #containing "
        + " characters, 76 to be exact. "
        + clean_line * 3
    )

    janitor = Janitor(ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200)

    # With no contaminant registered the document must come back untouched.
    assert "".join(janitor.clean_python(document)) == document

    for filth in filths:
        janitor.register_contaminant(filth)
    assert janitor.dirt_ngrams == set(filths)

    assert "".join(janitor.clean_python(document)) == expected
def test_janitor6():
    """Ten contaminant occurrences (8 + 2) with too_dirty_cutoff=10: same result as test_janitor5.

    test_janitor7 repeats this with eleven occurrences.
    """
    clean_line = "This is a @line #containing a certain number of characters, 76 to be exact. "
    filths = ["filth lots of dirty filthy filth", "filth lots of filthy dirty filth"]

    # clean block / 8x first filth / clean block / 2x second filth / clean block
    document = (
        clean_line * 6
        + "FILTH. lots of dirty filtHy FIlTh " * 8
        + clean_line * 6
        + "FILTH. lots of filtHy dirty FIlTh " * 2
        + clean_line * 6
    )

    expected = (
        clean_line * 3
        + "This is a @line #containing "
        + " characters, 76 to be exact. "
        + clean_line * 3
    )

    janitor = Janitor(ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200)

    # With no contaminant registered the document must come back untouched.
    assert "".join(janitor.clean_python(document)) == document

    for filth in filths:
        janitor.register_contaminant(filth)
    assert janitor.dirt_ngrams == set(filths)

    assert "".join(janitor.clean_python(document)) == expected
def test_janitor7():
    """Eleven contaminant occurrences (8 + 3) with too_dirty_cutoff=10: nothing is expected back.

    One more occurrence than test_janitor6; presumably the document is now
    over the too_dirty_cutoff and is discarded as a whole.
    """
    clean_line = "This is a @line #containing a certain number of characters, 76 to be exact. "
    filths = ["filth lots of dirty filthy filth", "filth lots of filthy dirty filth"]

    # clean block / 8x first filth / clean block / 3x second filth / clean block
    document = (
        clean_line * 6
        + "FILTH. lots of dirty filtHy FIlTh " * 8
        + clean_line * 6
        + "FILTH. lots of filtHy dirty FIlTh " * 3
        + clean_line * 6
    )

    expected = ""

    janitor = Janitor(ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200)

    # With no contaminant registered the document must come back untouched.
    assert "".join(janitor.clean_python(document)) == document

    for filth in filths:
        janitor.register_contaminant(filth)
    assert janitor.dirt_ngrams == set(filths)

    assert "".join(janitor.clean_python(document)) == expected
def test_janitor8():
    """Placeholder for a save/load test of contamination n-grams; currently asserts nothing."""
    # This will test the save and load contams
    pass
# source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2
# contaminant = "dirty boy. Clean he he"
# jan = Janitor(ngram_n=3)
# jan.register_contaminant(contaminant)
# cleaned = " ".join(jan.clean(source))
# for contam in jan.dirt_ngrams:
# assert contam not in cleaned, contam
# filename = "data/saved_contam"
# jan.save_contamination_ngrams(filename)
# jan = Janitor(ngram_n=3)
# jan.load_contamination_ngrams(filename)
# cleaned = " ".join(jan.clean(source))
# for contam in jan.dirt_ngrams:
# assert contam not in cleaned, contam
import pytest
import lm_eval.metrics as metrics
import random
def test_bootstrapping():
    """bootstrap_stderr of the mean must agree with mean_stderr to within 1e-4 on seeded data."""
    # Fixed seed so the 1000 samples are reproducible.
    random.seed(42)
    samples = [random.random() for _ in range(1000)]

    analytic = metrics.mean_stderr(samples)
    resampled = metrics.bootstrap_stderr(metrics.mean, samples, iters=100000)

    assert resampled == pytest.approx(analytic, abs=1e-4)
import pytest
import lm_eval.models as models
def test_gpt2():
gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")
(ll_dog, ig_dog), (ll_cat, ig_cat) = gpt2.loglikelihood([
(ll_dog, ig_dog), (ll_cat, ig_cat), (_, ll_max_0), (_, ll_max_1), (_, ll_max_2), *vals = gpt2.loglikelihood([
('The quick brown fox jumps over the lazy', ' dog'),
('The quick brown fox jumps over the lazy', ' cat'),
('The quick brown fox jumps over the lazy', ', lazy dog'),
('The quick brown fox jumps over the lazy', ', lazy fox'),
('The quick brown fox jumps over the lazy', ', lazy fox and they both fall to the ground'),
("""A mult""", """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)"""),
("""The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons""", """ (with threshold activation); see § Terminology"""),
("""Multilayer perceptrons are sometimes coll""", """oquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]"""),
("""An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear""", """ activation function."""),
("""MLP utilizes a supervised""", """ learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]"""),
("""Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic""", """ in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. """),
("""Specifically, we train GPT-3, an autoregressive language model with 175""", """ billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general."""),
("""A mult""", """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)"""),
("""Hello""", """ World"""),
])
assert ll_dog > ll_cat
assert not ig_cat
assert not ll_max_0
assert ll_max_1
assert ll_max_2
# test empty context
gpt2.loglikelihood([('', 'test')])
......@@ -18,4 +36,24 @@ def test_gpt2():
('The quick brown fox jumps over the lazy', ['.', '\n'])
])
assert gen == ', lazy fox and they both fall to the ground'
\ No newline at end of file
assert gen == ', lazy fox and they both fall to the ground'
targets = [-61.60536193847656, -56.57843780517578, -62.131004333496094, -9.799489974975586, -153.96334838867188, -341.222900390625, -731.1475830078125, -61.60536193847656, -8.682319641113281]
for (pred, _), tgt in zip(vals, targets):
assert pred == pytest.approx(tgt, rel=1e-3)
def test_gpt2_perplexity():
    """Check loglikelihood_rolling on one sentence, with the default and a tiny context length."""
    gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")
    test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss."

    # Recorded per-token values; their sum is the expected rolling log-likelihood.
    full_context_terms = [-4.9599953, -8.069298, -8.308624, -10.178513, -8.906924, -1.9318912, -7.745445, -7.146077, -5.2072, -3.5882986, -1.9957212, -8.044922, -0.20841774, -5.1096807, -0.099879116, -8.888423, -4.6180487]
    full_context = gpt2.loglikelihood_rolling([(test_string,)])[0]
    assert full_context == pytest.approx(sum(full_context_terms), rel=1e-3)

    # Hack: modify gpt2 to have shorter context length to induce rolling windows
    gpt2.max_length = 5
    short_context_terms = [-4.96001, -8.069275, -8.308612, -10.178482, -8.90691, -4.037338, -8.09261, -11.662385, -10.206891, -4.425003, -2.2563353, -7.909143, -1.9304147, -7.3610134, -2.3120654, -7.3229, -2.1643813]
    short_context = gpt2.loglikelihood_rolling([(test_string,)])[0]
    assert short_context == pytest.approx(sum(short_context_terms), rel=1e-3)
......@@ -22,13 +22,19 @@ def test_basic_interface(taskname, Task):
for v in task.higher_is_better().values(): assert v in [True, False]
assert isinstance(task.VERSION, int)
# test deterministic docs
# (don't test train because it's slow)
task2 = Task()
limit = None
if taskname in ["triviaqa"]: limit = 10000
if task.has_validation_docs():
arr = list(islice(task.validation_docs(), 100))
arr2 = list(islice(task2.validation_docs(), 100))
arr = list(islice(task.validation_docs(), limit))
arr2 = list(islice(task2.validation_docs(), limit))
assert arr == arr2
......@@ -38,8 +44,8 @@ def test_basic_interface(taskname, Task):
assert reqs == reqs2
if task.has_test_docs():
arr = list(islice(task.test_docs(), 100))
arr2 = list(islice(task2.test_docs(), 100))
arr = list(islice(task.test_docs(), limit))
arr2 = list(islice(task2.test_docs(), limit))
assert arr == arr2
......@@ -48,6 +54,16 @@ def test_basic_interface(taskname, Task):
assert reqs == reqs2
if task.has_training_docs():
arr = list(islice(task.training_docs(), limit))
arr2 = list(islice(task2.training_docs(), limit))
assert arr == arr2
reqs = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
reqs2 = [task2.construct_requests(doc, task2.doc_to_text(doc)) for doc in arr2]
assert reqs == reqs2
@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
......@@ -57,7 +73,7 @@ def test_documents_and_requests(taskname, Task):
fns = []
if task.has_training_docs(): fns.append(task.training_docs)
if task.has_validation_docs(): fns.append(task.validation_docs)
# test doce might not have labels
# test doc might not have labels
#if task.has_test_docs(): fns.append(task.test_docs)
for fn in fns:
......@@ -71,8 +87,10 @@ def test_documents_and_requests(taskname, Task):
assert isinstance(tgt, str)
# space convention
assert txt[-1] != ' '
assert tgt[0] == ' ' or txt[-1] == '\n'
# allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
if len(txt) != 0:
assert txt[-1] != ' '
assert tgt[0] == ' ' or txt[-1] == '\n'
reqs = task.construct_requests(doc, txt)
......
from lm_eval.utils import get_rolling_token_windows, make_disjoint_window
# noinspection DuplicatedCode
def test_get_rolling_token_windows_v1():
    """Check the windows for 34 tokens with max_seq_len=10 and context_len=1.

    The expected output is three full windows followed by a last window whose
    input still holds 10 tokens but whose targets are only the 4 tokens left.
    """
    expected = [
        ([-100, 0, 1, 2, 3, 4, 5, 6, 7, 8], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
        ([9, 10, 11, 12, 13, 14, 15, 16, 17, 18], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]),
        ([19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]),
        ([23, 24, 25, 26, 27, 28, 29, 30, 31, 32], [30, 31, 32, 33]),
    ]
    tokens = list(range(34))
    windows = get_rolling_token_windows(
        token_list=tokens,
        prefix_token=-100,
        max_seq_len=10,
        context_len=1,
    )
    actual = [(inputs, preds) for inputs, preds in windows]
    # The prediction targets of all windows together must add up to the token count.
    predicted_total = sum(len(preds) for _, preds in actual)
    assert predicted_total == len(tokens)
    assert expected == actual
# noinspection DuplicatedCode
def test_get_rolling_token_windows_v2():
    """Check the windows for 34 tokens with max_seq_len=10 and context_len=8.

    After the first full window, each expected window advances by three
    tokens and carries exactly three prediction targets.
    """
    expected = [
        ([-100, 0, 1, 2, 3, 4, 5, 6, 7, 8], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
        ([2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [10, 11, 12]),
        ([5, 6, 7, 8, 9, 10, 11, 12, 13, 14], [13, 14, 15]),
        ([8, 9, 10, 11, 12, 13, 14, 15, 16, 17], [16, 17, 18]),
        ([11, 12, 13, 14, 15, 16, 17, 18, 19, 20], [19, 20, 21]),
        ([14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [22, 23, 24]),
        ([17, 18, 19, 20, 21, 22, 23, 24, 25, 26], [25, 26, 27]),
        ([20, 21, 22, 23, 24, 25, 26, 27, 28, 29], [28, 29, 30]),
        ([23, 24, 25, 26, 27, 28, 29, 30, 31, 32], [31, 32, 33]),
    ]
    tokens = list(range(34))
    windows = get_rolling_token_windows(
        token_list=tokens,
        prefix_token=-100,
        max_seq_len=10,
        context_len=8,
    )
    actual = [(inputs, preds) for inputs, preds in windows]
    # The prediction targets of all windows together must add up to the token count.
    predicted_total = sum(len(preds) for _, preds in actual)
    assert predicted_total == len(tokens)
    assert expected == actual
# noinspection DuplicatedCode
def test_get_rolling_token_windows_v3():
    """Check the windows for 34 tokens with max_seq_len=10 and context_len=10.

    After the first full window, each expected window slides forward by one
    token and predicts exactly one new token.
    """
    # First window: prefix token followed by tokens 0..8, predicting tokens 0..9.
    expected = [([-100, 0, 1, 2, 3, 4, 5, 6, 7, 8], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])]
    # Remaining 24 windows: inputs [start .. start+9] predict the single token start+10,
    # ending with ([23 .. 32], [33]).
    expected += [
        (list(range(start, start + 10)), [start + 10])
        for start in range(24)
    ]
    tokens = list(range(34))
    windows = get_rolling_token_windows(
        token_list=tokens,
        prefix_token=-100,
        max_seq_len=10,
        context_len=10,
    )
    actual = [(inputs, preds) for inputs, preds in windows]
    # The prediction targets of all windows together must add up to the token count.
    predicted_total = sum(len(preds) for _, preds in actual)
    assert predicted_total == len(tokens)
    assert expected == actual
# noinspection DuplicatedCode
def test_get_rolling_token_windows_v4():
    """Check the windows for 30 tokens with max_seq_len=10 and context_len=10.

    Same configuration as the 34-token case, but with a token count that is an
    exact multiple of max_seq_len.
    """
    # First window: prefix token followed by tokens 0..8, predicting tokens 0..9.
    expected = [([-100, 0, 1, 2, 3, 4, 5, 6, 7, 8], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])]
    # Remaining 20 windows: inputs [start .. start+9] predict the single token start+10,
    # ending with ([19 .. 28], [29]).
    expected += [
        (list(range(start, start + 10)), [start + 10])
        for start in range(20)
    ]
    tokens = list(range(30))
    windows = get_rolling_token_windows(
        token_list=tokens,
        prefix_token=-100,
        max_seq_len=10,
        context_len=10,
    )
    actual = [(inputs, preds) for inputs, preds in windows]
    # The prediction targets of all windows together must add up to the token count.
    predicted_total = sum(len(preds) for _, preds in actual)
    assert predicted_total == len(tokens)
    assert expected == actual
# noinspection DuplicatedCode
def test_get_rolling_token_windows_v5():
    """Check the windows for 30 tokens with max_seq_len=10 and context_len=1.

    The token count is an exact multiple of max_seq_len, so the expected
    output is three full windows and nothing else.
    """
    expected = [
        ([-100, 0, 1, 2, 3, 4, 5, 6, 7, 8], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
        ([9, 10, 11, 12, 13, 14, 15, 16, 17, 18], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]),
        ([19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]),
    ]
    tokens = list(range(30))
    windows = get_rolling_token_windows(
        token_list=tokens,
        prefix_token=-100,
        max_seq_len=10,
        context_len=1,
    )
    actual = [(inputs, preds) for inputs, preds in windows]
    # The prediction targets of all windows together must add up to the token count.
    predicted_total = sum(len(preds) for _, preds in actual)
    assert predicted_total == len(tokens)
    assert expected == actual
# noinspection DuplicatedCode
def test_get_rolling_token_windows_v6():
    """Check the windows for 9 tokens with max_seq_len=2 and context_len=1.

    The expected output is four two-token windows followed by a last window
    that still has two input tokens but only one prediction target.
    """
    expected = [
        ([-100, 0], [0, 1]),
        ([1, 2], [2, 3]),
        ([3, 4], [4, 5]),
        ([5, 6], [6, 7]),
        ([6, 7], [8]),
    ]
    tokens = list(range(9))
    windows = get_rolling_token_windows(
        token_list=tokens,
        prefix_token=-100,
        max_seq_len=2,
        context_len=1,
    )
    actual = [(inputs, preds) for inputs, preds in windows]
    # The prediction targets of all windows together must add up to the token count.
    predicted_total = sum(len(preds) for _, preds in actual)
    assert predicted_total == len(tokens)
    assert expected == actual
def test_get_rolling_token_windows_empty():
    """An empty token list must produce no windows at all."""
    windows = get_rolling_token_windows(
        token_list=[],
        prefix_token=-100,
        max_seq_len=2,
        context_len=1,
    )
    # Exhaust the generator and count how many windows it yields.
    window_count = sum(1 for _ in windows)
    assert window_count == 0
def test_make_disjoint_window():
    """Check make_disjoint_window on two (input, target) pairs with overlapping tokens."""
    # Each case is (window passed in, window expected back).
    cases = [
        (([1, 2, 3, 4, 5], [2, 3, 4, 5, 6]), ([1], [2, 3, 4, 5, 6])),
        (([1, 2, 3, 4, 5], [4, 5, 6]), ([1, 2, 3], [4, 5, 6])),
    ]
    for window, expected in cases:
        assert make_disjoint_window(window) == expected
\ No newline at end of file
import lm_eval.tasks as tasks
import lm_eval.models as models
import lm_eval.evaluator as evaluator
import random
import pytest
import os
import json
import hashlib
# Create the snapshot directory at import time; the assert_target* helpers below
# read and write their reference files under tests/testdata.
os.makedirs("tests/testdata", exist_ok=True)
def assert_target(name, ob):
    """Compare ``ob`` with the JSON snapshot ``tests/testdata/{name}.json``.

    If the snapshot file exists, ``ob`` must equal its contents after a JSON
    round trip. If it does not exist, it is created from ``ob`` so that later
    runs have something to compare against.

    Args:
        name: Snapshot name; used as the file stem under ``tests/testdata``.
        ob: JSON-serializable object to check or record.

    Raises:
        AssertionError: If a snapshot exists and differs from ``ob``.
        TypeError: If ``ob`` is not JSON-serializable (no file is written).
    """
    fname = f"tests/testdata/{name}.json"
    # Serialize before touching the file system: a non-serializable object must
    # not leave a truncated snapshot behind that would break every later run.
    serialized = json.dumps(ob, sort_keys=True)
    if os.path.exists(fname):
        with open(fname, encoding="utf-8") as fh:
            stored = json.load(fh)
        # Round-trip through JSON so tuples/int keys compare the way they were stored.
        current = json.loads(serialized)
        assert stored == current, f"result for {name!r} differs from snapshot {fname}"
    else:
        # Do not rely on the directory having been created elsewhere.
        os.makedirs(os.path.dirname(fname), exist_ok=True)
        with open(fname, "w", encoding="utf-8") as fh:
            fh.write(serialized)
def assert_target_hashed(name, ob):
    """Compare the SHA-256 of ``ob``'s JSON form with ``tests/testdata/{name}``.

    Only the hex digest of ``json.dumps(ob, sort_keys=True)`` is stored, not the
    object itself. If the digest file exists, the digest of ``ob`` must match
    it. If it does not exist, it is created.

    Args:
        name: Snapshot name; used verbatim as the file name under ``tests/testdata``.
        ob: JSON-serializable object to check or record.

    Raises:
        AssertionError: If a stored digest exists and differs from ``ob``'s digest.
        TypeError: If ``ob`` is not JSON-serializable (no file is written).
    """
    fname = f"tests/testdata/{name}"
    # Compute the digest once, before opening any file, so a serialization
    # failure cannot leave an empty digest file behind.
    digest = hashlib.sha256(json.dumps(ob, sort_keys=True).encode('utf-8')).hexdigest()
    if os.path.exists(fname):
        with open(fname, encoding="utf-8") as fh:
            # Tolerate a trailing newline added by an editor or a VCS hook.
            stored = fh.read().strip()
        assert stored == digest, (
            f"digest for {name!r} changed: stored {stored}, computed {digest}"
        )
    else:
        # Do not rely on the directory having been created elsewhere.
        os.makedirs(os.path.dirname(fname), exist_ok=True)
        with open(fname, "w", encoding="utf-8") as fh:
            fh.write(digest)
# make sure eval results for a task version are stable
@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
def test_versions_stable(taskname, Task):
    """Run one registered task against a stubbed model and snapshot the outcome.

    The dummy model's three request handlers are replaced with stubs that
    record a digest of the requests they receive and return seeded
    pseudo-random answers. The evaluator's result is then compared with the
    stored result for this task name and ``Task.VERSION``.
    """
    task_dict = tasks.get_task_dict([taskname])
    lm = models.get_model('dummy')()

    def stub_loglikelihood(reqs):
        for ctx, cont in reqs:
            if len(ctx) > 0:
                # space convention
                assert ctx[-1] != ' '
                assert cont[0] == ' ' or ctx[-1] == '\n'
        assert_target_hashed(f"{taskname}-v{Task.VERSION}-loglikelihood", reqs)
        # Fixed seed so every run hands the evaluator the same scores.
        random.seed(42)
        return [(-random.random(), False) for _ in reqs]

    def stub_loglikelihood_rolling(reqs):
        for (string,) in reqs:
            assert isinstance(string, str)
        assert_target_hashed(f"{taskname}-v{Task.VERSION}-loglikelihood_rolling", reqs)
        # Fixed seed so every run hands the evaluator the same scores.
        random.seed(42)
        return [-random.random() for _ in reqs]

    def stub_greedy_until(reqs):
        assert_target_hashed(f"{taskname}-v{Task.VERSION}-greedy_until", reqs)
        generations = []
        for ctx, _ in reqs:
            # A generation request must come with a non-blank context.
            assert ctx.strip() != ''
            generations.append("lol")
        return generations

    lm.loglikelihood = stub_loglikelihood
    lm.loglikelihood_rolling = stub_loglikelihood_rolling
    lm.greedy_until = stub_greedy_until

    limit = None
    results = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
    assert_target(f"{taskname}-v{Task.VERSION}-res", results)
7c0c5246d3f751f39119a5629ac1d4b2c6fd2a315f78d6de9b2c387e24e3fef1
\ No newline at end of file
{"results": {"anagrams1": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"anagrams1": 0}}
\ No newline at end of file
6700a3c44e48abe8337238dcbe3b54cf4abafe0c204c52d921e590872fbd05e7
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment