gaoqiong / lm-evaluation-harness

Commit 2bb67c98, authored May 15, 2021 by Leo Gao
Merge branch 'master' of github.com:EleutherAI/lm_evaluation_harness
Parents: 9c76176b, 99d40991

Showing 12 changed files with 1068 additions and 102 deletions:
    lm_eval/models/gpt2.py                                  +1    -3
    scripts/__init__.py                                     +0    -0
    scripts/clean_training_data/__init__.py                 +0    -0
    scripts/clean_training_data/archiver.py                 +90   -0
    scripts/clean_training_data/generate_13_grams.py        +131  -0
    scripts/clean_training_data/janitor.py                  +126  -99
    scripts/clean_training_data/janitor_util.cpp            +0    -0
    scripts/clean_training_data/process_sorted_buckets.py   +111  -0
    scripts/clean_training_data/sort_13_gram_buckets.py     +64   -0
    setup.py                                                +5    -0
    tests/test_generate_13_grams.py                         +65   -0
    tests/test_janitor.py                                   +475  -0
lm_eval/models/gpt2.py

```diff
@@ -71,10 +71,8 @@ class GPT2LM(LM):
         loglikelihoods = []
         with torch.no_grad():
             for string, in tqdm(requests):
-                encoded = self.tokenizer.encode_plus(string)["input_ids"]
-
                 rolling_token_windows = list(map(utils.make_disjoint_window, utils.get_rolling_token_windows(
-                    token_list=encoded,
+                    token_list=self.tokenizer.encode(string),
                     prefix_token=self.EOT_TOKEN_ID,
                     max_seq_len=self.max_length,
                     context_len=1,
```
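This hunk swaps the `encode_plus(...)["input_ids"]` call for a direct `self.tokenizer.encode(string)` as the token list handed to the rolling-window helper. For readers unfamiliar with the pattern, below is a minimal, self-contained sketch of a rolling-window decomposition; it only illustrates how `token_list`, `prefix_token`, `max_seq_len` and `context_len` interact, and it is not the harness's actual `utils.get_rolling_token_windows`, whose exact window semantics may differ.

```python
# Illustrative sketch only: shows how token_list, prefix_token, max_seq_len and
# context_len interact in a rolling-window decomposition. NOT the harness code.
def rolling_windows(token_list, prefix_token, max_seq_len, context_len=1):
    """Yield (context, continuation) pairs that cover token_list exactly once."""
    pred_start = 0
    while pred_start < len(token_list):
        pred_end = min(pred_start + max_seq_len, len(token_list))
        if pred_start == 0:
            # the first window is conditioned only on the prefix token (EOT for GPT-2)
            context = [prefix_token]
        else:
            # later windows keep `context_len` tokens of history
            context = token_list[pred_start - context_len:pred_start]
        yield context, token_list[pred_start:pred_end]
        pred_start = pred_end

# A 10-token document scored in windows of at most 4 predicted tokens:
for ctx, cont in rolling_windows(list(range(10)), prefix_token=-1, max_seq_len=4):
    print(ctx, cont)
```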
scripts/__init__.py (new file, empty)

scripts/clean_training_data/__init__.py (new file, empty)

scripts/clean_training_data/archiver.py (new file)
```python
import os
import zstandard
import json
import jsonlines
import io
import datetime


def json_serial(obj):
    """JSON serializer for objects not serializable by default json code"""
    if isinstance(obj, (datetime.datetime,)):
        return obj.isoformat()
    raise TypeError("Type %s not serializable" % type(obj))


# Modified version of lm_dataformat Archive for single file.
class Archive:
    def __init__(self, file_path, compression_level=3):
        self.file_path = file_path
        dir_name = os.path.dirname(file_path)
        if dir_name:
            os.makedirs(dir_name, exist_ok=True)
        self.fh = open(self.file_path, 'wb')
        self.cctx = zstandard.ZstdCompressor(level=compression_level)
        self.compressor = self.cctx.stream_writer(self.fh)

    def add_data(self, data, meta={}):
        self.compressor.write(
            json.dumps({'text': data, 'meta': meta}, default=json_serial).encode('UTF-8') + b'\n')

    def commit(self):
        self.compressor.flush(zstandard.FLUSH_FRAME)
        self.fh.flush()
        self.fh.close()


# Modified version of lm_dataformat Reader with self.fh set, allowing peeking for tqdm.
class Reader:
    def __init__(self):
        pass

    def read(self, file, get_meta=False, autojoin_paragraphs=True, para_joiner='\n\n'):
        with open(file, 'rb') as fh:
            self.fh = fh
            cctx = zstandard.ZstdDecompressor()
            reader = io.BufferedReader(cctx.stream_reader(fh))
            rdr = jsonlines.Reader(reader)
            for ob in rdr:
                # naive jsonl where each object is just the string itself, with no meta. For legacy compatibility.
                if isinstance(ob, str):
                    assert not get_meta
                    yield ob
                    continue

                text = ob['text']

                if autojoin_paragraphs and isinstance(text, list):
                    text = para_joiner.join(text)

                if get_meta:
                    yield text, (ob['meta'] if 'meta' in ob else {})
                else:
                    yield text


# Simple text reader and writer with same interface as above
class TextArchive:
    def __init__(self, file_path, mode="ab"):
        self.file_path = file_path
        dir_name = os.path.dirname(file_path)
        if dir_name:
            os.makedirs(dir_name, exist_ok=True)
        self.fh = open(self.file_path, mode)

    def add_data(self, data, meta={}):
        self.fh.write(data.encode('UTF-8') + b'\n')

    def commit(self):
        self.fh.flush()
        self.fh.close()


class TextReader:
    def __init__(self, file_path):
        self.file_path = file_path

    def read(self):
        with open(self.file_path, 'r', encoding="utf8") as fh:
            self.fh = fh
            while True:
                line = self.fh.readline()
                if line == "":  # readline returns "" at EOF
                    break
                yield line[:-1]  # strip the trailing newline
```
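A quick round-trip with the classes above (a minimal sketch; the file path and meta values are illustrative):

```python
# Write a compressed jsonl.zst archive, then stream it back.
from scripts.clean_training_data.archiver import Archive, Reader

archive = Archive("out/example.jsonl.zst")
archive.add_data("first document", meta={"source": "demo"})
archive.add_data("second document")
archive.commit()  # flushes the zstd frame and closes the file handle

reader = Reader()
for text in reader.read("out/example.jsonl.zst"):
    print(text)  # "first document", then "second document"
```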
scripts/clean_training_data/generate_13_grams.py (new file)

```python
"""
Outputs all 13-grams found in The Pile.

Loops through all documents and uses the logic found in janitor.py to extract 13-grams.
We bucket each 13-gram by hash into separate file buckets to allow easy parallel processing in the
next stage. We also include the current pile document_id with each ngram instance to allow the
filtering to exclude 13-grams that match more than 10 unique documents (done further down the pipeline).

We didn't use lm_dataformat to output as it increases time 4x (slow jsonify) and makes
resuming hard (and we had the storage).

Arguments
---------
--working_directory (-dir)
    Directory containing the pile distribution. An "output" subdirectory will be created underneath
    to store the bucketed 13-grams, checkpoint and done files. Default: current directory.
--n_value (-n)
    n value in n-gram, added for later use if ever needed. Default: 13
--bucket_count (-buckets)
    Number of file buckets to use when generating 13-grams. Default: 500
"""

import argparse
import pickle
import os
from pathlib import Path
import glob
import signal
from signal import SIGINT

from tqdm import tqdm

from scripts.clean_training_data.janitor import Janitor, word_ngrams
from scripts.clean_training_data.archiver import TextArchive, Reader

import logging
from tqdm_multiprocess.logger import setup_logger_tqdm

logger = logging.getLogger(__name__)

pile_document_count = 210607728

terminate = False
def handler(signal_received, frame):
    global terminate
    terminate = True

def get_pile(directory):
    reader = Reader()
    for file in glob.glob(os.path.join(directory, "*.jsonl.zst*")):
        for document in reader.read(file):
            yield document

def close_buckets(buckets):
    for bucket in buckets:
        bucket.commit()

def do_ngrams_in_buckets(n_value, working_directory, bucket_count):

    output_directory = os.path.join(working_directory, "output")
    os.makedirs(output_directory, exist_ok=True)

    logger.info(f"Generating {n_value}-grams and bucketing.")

    # Done file
    done_file = os.path.join(output_directory, "ngram_buckets.done")
    if os.path.exists(done_file):
        logger.info("ngrams already generated and bucketed, skipping")
        return

    # Checkpoint
    checkpoint_file = os.path.join(output_directory, "ngram_buckets.ckpt")
    if os.path.exists(checkpoint_file):
        start_id = pickle.load(open(checkpoint_file, "rb"))
    else:
        start_id = 0

    logger.info(f"Starting at pile document index {start_id}")

    bucket_files = [os.path.join(output_directory, f"ngrams_{i}.bkt.txt") for i in range(bucket_count)]
    buckets = list(map(TextArchive, bucket_files))

    janitor = Janitor()

    current_id = 0
    batch_size = 1000
    batch_counter = 0
    with tqdm(total=pile_document_count, dynamic_ncols=True, unit="docs") as progress:
        for document in get_pile(working_directory):
            if current_id < start_id:
                if terminate:
                    close_buckets(buckets)
                    return
                current_id += 1
                progress.update()
                continue

            # Save checkpoint every "batch_size", only allow terminate after checkpoint
            if batch_counter == batch_size:
                progress.update(batch_size)
                batch_counter = 0
                pickle.dump(current_id, open(checkpoint_file, "wb"))
                if terminate:
                    close_buckets(buckets)
                    return

            ngrams = word_ngrams(janitor.normalize_string(document), n_value)
            for ngram in ngrams:
                bucket = hash(ngram) % len(buckets)
                buckets[bucket].add_data(f"{ngram} {current_id}")

            batch_counter += 1
            current_id += 1

    close_buckets(buckets)
    Path(done_file).touch()

parser = argparse.ArgumentParser(description='Generate 13 grams from Pile.')
parser.add_argument("-dir", "--working_directory", default="")
parser.add_argument("-n", "--n_value", type=int, default=13)
parser.add_argument("-buckets", "--bucket_count", type=int, default=500)

if __name__ == '__main__':
    # Handle sigint (ctrl-c) cleanly
    previous_signal_int = signal.signal(SIGINT, handler)

    logfile_path = "ngrams.log"
    setup_logger_tqdm(logfile_path)

    args = parser.parse_args()
    do_ngrams_in_buckets(args.n_value, args.working_directory, args.bucket_count)
```
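The heart of `do_ngrams_in_buckets` is `buckets[hash(ngram) % len(buckets)].add_data(...)`: identical ngrams always land in the same bucket, so each bucket can later be sorted and counted independently. A toy illustration of that scheme (the inline `word_ngrams` here is a simplified stand-in for the janitor helper, which operates on normalized strings):

```python
# Toy stand-in for scripts.clean_training_data.janitor.word_ngrams, for illustration.
def word_ngrams(s, n):
    words = s.split()
    return (" ".join(words[i:i + n]) for i in range(len(words) - n + 1))

bucket_count = 4
document_id = 0
for ngram in word_ngrams("a b c d e", 3):
    print(hash(ngram) % bucket_count, f"{ngram} {document_id}")
# Each bucket line is "<ngram> <document_id>", so after sorting a bucket the
# next stage can count unique documents per ngram in a single pass.
```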
scripts/clean_training_data/janitor.py

```diff
@@ -41,6 +41,29 @@ def word_ngrams(s, n):
     ngram_seqs = form_ngrams(iter(tokens), n)
     return (" ".join(ngram) for ngram in ngram_seqs)
 
+
+# Does character sequences only - combined faster function to play around with later
+# def word_ngrams_indices_combined(sequence, n):
+#     current_word = ""
+#     history = []
+#     gap = False;
+#     start = 0
+#     end = 0
+#     for character in sequence:
+#         if character == " ":
+#             if not gap:
+#                 gap = True
+#                 history.append(current_word)
+#                 end += len(current_word) - 1
+#                 current_word = ""
+#                 if len(history) == n:
+#                     yield (tuple(history), start, end)
+#                     del history[0]
+#                     start = end + 1
+#                     end = start
+#         else:
+#             gap = False
+#             current_word += character
+
 # https://stackoverflow.com/questions/13734451/string-split-with-indices-in-python
 def split_indices(s):
@@ -140,8 +163,9 @@ class Janitor:
     def _split_chunks(self, dirty_string, dirty_parts):
         clean_chunks = []
         splice_idx = 0
+        end = -1
         for i, (ngram, start, end) in enumerate(dirty_parts):
-            if i > self.too_dirty_cutoff:
+            if i >= self.too_dirty_cutoff:
                 return []
             start = max(0, start - self.window_to_remove)
             end = min(len(dirty_string), end + self.window_to_remove)
@@ -150,6 +174,9 @@ class Janitor:
             clean_chunks.append(dirty_string[splice_idx:start])
             splice_idx = end
 
+        if end < len(dirty_string) - self.minimum_slice_length:
+            clean_chunks.append(dirty_string[end + 1:])
+
         return clean_chunks
 
     ##############
```

The last hunk (`@@ -186,101 +213,101 @@ class Janitor:`) comments out the module's ad-hoc test and benchmark helpers line for line, since the real tests now live in tests/test_janitor.py; in the process, `def test():` is renamed to `test_janitor_general`. The commented-out block now reads:

```python
#################################################################
# Tests
#################################################################

# def print_cpp():
#     source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2
#
#     for i in range(1, 10, 2):
#         pprint(janitor_util.clean_ngram(source, string.punctuation, i))
#         for ngram, start, end in \
#                 janitor_util.clean_ngram_with_indices(source, string.punctuation, i):
#             print(ngram, "\t", start, end, source[start:end].replace("\n", "\\n"))
#
# def test_cpp():
#     source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2
#     contaminant = "dirty boy. Clean he he"
#
#     jan_python = Janitor()
#     jan_cpp = Janitor()
#
#     jan_python.register_contaminant_python(contaminant)
#     jan_cpp.register_contaminant(contaminant)
#
#     assert jan_python.dirt_ngrams == jan_cpp.dirt_ngrams, (jan_python.dirt_ngrams, jan_cpp.dirt_ngrams)
#     assert jan_python.clean_python(source) == jan_cpp.clean(source), \
#         (jan_python.clean_python(source), jan_cpp.clean(source))
#
#     print("Passed test, python==cpp")
#
# def benchmark():
#     # Download and put in data folder: enwik8 (100 MB) from https://cs.fit.edu/~mmahoney/compression/textdata.html
#     setup = \
#         """
#         with open("data/enwik8", "r") as f:
#             data = f.read()
#         jan = Janitor(too_dirty_cutoff=1000)
#         jan.register_contaminant('''
#         theories is that there is a connection between "geekdom" and autism.
#         This is hinted, for instance, by a ''Wired Magazine'' article in 2001 entitled "
#         The [[Geek]] Syndrome", which is a point argued by many in the autism rights
#         movement{{ref|Wired}}. This article, many professionals assert, is just one example of
#         the media's application of mental disease labels to what is actually variant normal behavior
#         &mdash;they argue that shyness, lack of athletic ability or social skills, and intellectual
#         interests, even when they seem unusual to others, are not in themselves signs of autism or
#         Asperger's syndrome. Others assert that it is actually the medical profession which is applying
#         mental disease labels to children who in the past would have simply been accepted as a little
#         different or even labeled 'gifted'. See [[clinomorphism]] for further discussion of this issue.
#         Due to the recent publicity surrounding autism and autis
#         ultan Al Nahyan]] granted [[Petroleum]] concessions, and oil was first found in 1958. At first,
#         oil money had a marginal impact. A few lowrise concete buildings were erected, and the first
#         paved road was completed in 1961, but Sheikh Shakbut, uncertain whether the new oil royalties
#         would last, took a cautious approach, prefering to save the revenue rather than investing it in
#         development. His brother, [[Zayed bin Sultan Al Nahayan]], saw that oil wealth had the potential
#         to transform Abu Dhabi. The ruling Al Nahayan family decided that Sheikh Zayed should replace his
#         brother as Ruler and carry out his vision of developing the country. On [[August 6]], [[1966]],
#         with the assistance of the British, Sheikh Zayed became the new ruler. See generally, Al-Fahim, M,
#         ''From Rags to Riches: A Story of Abu Dhabi'', Chapter Six (London Centre of Arab Studies, 1995),
#         ISBN 1 900404 00 1. With the announcement by Britain in 1968 that it would withdraw from the
#         Gulf area by 1971, Sheikh Zayed became the main driving force behind the formation of the
#         [[United Arab Emirates]]. After the Emirates gained independence in 1971,
#         ''')
#         """
#
#     n = 1
#     print(f"Timing {n} run on 100 MB")
#
#     print("Register contaminant")
#     # print("\tPython", timeit.timeit("jan.register_contaminant_python(data)", setup=setup, globals=globals(), number=n))
#     print("\tCpp", timeit.timeit("jan.register_contaminant(data)", setup=setup, globals=globals(), number=n))
#
#     print("Clean")
#     # print("\tPython", timeit.timeit("jan.clean_python(data)", setup=setup, globals=globals(), number=n))
#     print("\tCpp", timeit.timeit("jan.clean(data)", setup=setup, globals=globals(), number=n))
#
# def test_janitor_general():
#     source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2
#     contaminant = "dirty boy. Clean he he"
#
#     jan = Janitor(ngram_n=3)
#     jan.register_contaminant(contaminant)
#     cleaned = " ".join(jan.clean(source))
#     for contam in jan.dirt_ngrams:
#         assert contam not in cleaned, contam
#
#     filename = "data/saved_contam"
#     jan.save_contamination_ngrams(filename)
#
#     jan = Janitor(ngram_n=3)
#     jan.load_contamination_ngrams(filename)
#     cleaned = " ".join(jan.clean(source))
#     for contam in jan.dirt_ngrams:
#         assert contam not in cleaned, contam
#
# if __name__ == "__main__":
#     test()
#     # print_cpp()
#     # test_cpp()
#     # benchmark()
```
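To make the two behavioral changes concrete: with `too_dirty_cutoff=10`, the fixed `>=` comparison now drops a document at the 11th dirty ngram (matching `test_janitor7` in the new test file), and the new trailing branch keeps text after the final dirty window when enough of it survives. Below is a compressed sketch of the method after this commit; the loop-body guard on slice length is an assumption for illustration, since the hunks above do not show it.

```python
# Compressed sketch of Janitor._split_chunks after this commit. The guard on
# slice length inside the loop is an ASSUMPTION; the cutoff check, window
# arithmetic, and trailing-slice branch mirror the diff above.
def split_chunks(dirty_string, dirty_parts, window_to_remove=200,
                 too_dirty_cutoff=10, minimum_slice_length=200):
    clean_chunks = []
    splice_idx = 0
    end = -1  # newly initialized so the trailing check is safe with no dirty parts
    for i, (ngram, start, end) in enumerate(dirty_parts):
        if i >= too_dirty_cutoff:  # was `>`: the cutoff-th dirty ngram now drops the doc
            return []
        start = max(0, start - window_to_remove)
        end = min(len(dirty_string), end + window_to_remove)
        if start - splice_idx > minimum_slice_length:  # assumed guard, not shown in hunk
            clean_chunks.append(dirty_string[splice_idx:start])
        splice_idx = end

    # new: keep whatever survives after the last dirty window
    if end < len(dirty_string) - minimum_slice_length:
        clean_chunks.append(dirty_string[end + 1:])
    return clean_chunks
```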
scripts/clean_training_data/janitor.cpp → scripts/clean_training_data/janitor_util.cpp (file moved)
scripts/clean_training_data/process_sorted_buckets.py (new file)

```python
"""
Processes each sorted bucket, creating a new file listing all ngrams that matched more than 10
unique documents with their unique document counts. Uses multiprocessing and very little memory
as we stream from presorted buckets. Will use a lot of disk though.

Arguments
---------
--working_directory (-dir)
    Directory containing the sorted buckets; processed files will be deposited here. Default: current directory.
--move_dir (-move)
    Directory to move processed 13-grams to. Default: do nothing.
--process_count (-procs)
    Number of processes to use. Default: 4
"""

import argparse
import glob
import os
from pathlib import Path
import re
import shutil

from tqdm import tqdm
from tqdm_multiprocess import TqdmMultiProcessPool

from scripts.clean_training_data.archiver import TextReader, TextArchive

import logging
from tqdm_multiprocess.logger import setup_logger_tqdm
logger = logging.getLogger(__name__)

# Multiprocessed
def process_bucket(bucket_file_path, processed_directory, move_dir, tqdm_func, global_tqdm):

    bucket_id = re.sub(r"\D", "", os.path.basename(bucket_file_path))
    done_file = os.path.join(processed_directory, f"ngram_bucket_processing_{bucket_id}.done")
    if os.path.exists(done_file):
        logger.info(f"bucket {bucket_id} already processed, skipping")
        return

    # For managing tqdm
    file_size = os.path.getsize(bucket_file_path)
    bucket_progress = tqdm_func(total=file_size, dynamic_ncols=True, unit="byte", unit_scale=1)
    current_file_position = 0
    update_frequency = 100 * 1000000  # 100MB
    update_counter = 0

    # Iterate through and output ngrams which occur in more than 10 documents
    bucket = TextReader(bucket_file_path)

    output_file_path = bucket_file_path + ".processed"
    output_archive = TextArchive(output_file_path, mode="wb")

    current_ngram = ""
    current_ngram_document_ids = set()
    for line in bucket.read():
        [ngram, document_id] = line.rsplit(" ", 1)

        # Write ngram if more than 10 unique document occurrences
        if ngram != current_ngram:
            if len(current_ngram_document_ids) > 10:
                output_archive.add_data(f"{current_ngram} {len(current_ngram_document_ids)}")
            current_ngram = ngram
            current_ngram_document_ids = set()

        current_ngram_document_ids.add(document_id)

        # Update tqdm
        update_counter += bucket.fh.tell() - current_file_position
        current_file_position = bucket.fh.tell()
        if update_counter > update_frequency:
            bucket_progress.update(update_counter)
            update_counter = 0

    # Remainder
    if len(current_ngram_document_ids) > 10:
        output_archive.add_data(f"{current_ngram} {len(current_ngram_document_ids)}")

    output_archive.commit()
    Path(done_file).touch()

    if move_dir:
        shutil.move(output_file_path, move_dir)

    global_tqdm.update()

def process_sorted_buckets(working_directory, move_dir, process_count):
    bucket_file_paths = glob.glob(os.path.join(working_directory, "*.bkt.txt.sorted"))

    processed_directory = os.path.join(working_directory, "processed")
    os.makedirs(processed_directory, exist_ok=True)

    pool = TqdmMultiProcessPool(process_count)
    tasks = [(process_bucket, (bucket_file, processed_directory, move_dir))
             for bucket_file in bucket_file_paths]

    global_tqdm = tqdm(total=len(bucket_file_paths), dynamic_ncols=True, unit="bucket")
    on_done = lambda _: None
    on_error = lambda _: None
    _ = pool.map(global_tqdm, tasks, on_error, on_done)

parser = argparse.ArgumentParser(description='Process 13 grams from sorted buckets.')
parser.add_argument("-dir", "--working_directory", default="")
parser.add_argument("-move", "--move_dir", default="")
parser.add_argument("-procs", "--process_count", type=int, default=4)

if __name__ == '__main__':
    logfile_path = "process13grams.log"
    setup_logger_tqdm(logfile_path)

    args = parser.parse_args()
    process_sorted_buckets(args.working_directory, args.move_dir, args.process_count)
```
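The single streaming pass in `process_bucket` works only because the bucket is sorted: all lines for a given ngram are adjacent, so one set of document ids suffices at a time. A minimal sketch of that invariant (the sample lines are illustrative):

```python
# Minimal sketch of the streaming pass above: because the input is sorted,
# all lines for one ngram are adjacent, so a single set suffices at a time.
sorted_lines = [
    "a b c 17", "a b c 17", "a b c 42",   # "a b c" seen in 2 unique docs
    "x y z 3",                            # "x y z" seen in 1 unique doc
]

current_ngram, doc_ids = None, set()
for line in sorted_lines:
    ngram, doc_id = line.rsplit(" ", 1)
    if ngram != current_ngram:
        if current_ngram is not None:
            print(current_ngram, len(doc_ids))
        current_ngram, doc_ids = ngram, set()
    doc_ids.add(doc_id)
if current_ngram is not None:
    print(current_ngram, len(doc_ids))  # remainder, as in process_bucket
```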
scripts/clean_training_data/sort_13_gram_buckets.py (new file)

```python
"""
Iteratively runs GNU sort on each bucket; GNU sort handles the multiprocessing.

Arguments
---------
--working_directory (-dir)
    Directory containing the bucketed 13-grams. Sorted buckets will be deposited in the same
    directory and the unsorted buckets are removed after.
"""

import glob
import argparse
import os
from pathlib import Path
import signal
from signal import SIGINT
import re
import subprocess

from tqdm import tqdm

import logging
from tqdm_multiprocess.logger import setup_logger_tqdm
logger = logging.getLogger(__name__)

terminate = False
def handler(signal_received, frame):
    global terminate
    terminate = True

def sort_13_gram_buckets(working_directory):
    bucket_file_paths = glob.glob(os.path.join(working_directory, "*.bkt.txt"))

    for bucket_file_path in tqdm(bucket_file_paths, dynamic_ncols=True):
        bucket_id = re.sub(r"\D", "", os.path.basename(bucket_file_path))
        done_file = os.path.join(working_directory, f"ngram_bucket_sorting_{bucket_id}.done")
        if os.path.exists(done_file):
            logger.info(f"bucket {bucket_id} already processed, skipping")
            continue  # skip this bucket and keep sorting the rest

        sorted_file_path = bucket_file_path + ".sorted"
        command = f"sort {bucket_file_path} > {sorted_file_path}"
        logger.info(command)
        subprocess.call(command, shell=True)

        if terminate:
            return

        Path(done_file).touch()
        os.remove(bucket_file_path)

parser = argparse.ArgumentParser(description='sort 13gram buckets')
parser.add_argument("-dir", "--working_directory", default="")

if __name__ == '__main__':
    # Handle sigint (ctrl-c) cleanly
    previous_signal_int = signal.signal(SIGINT, handler)

    logfile_path = "sort13grambuckets.log"
    setup_logger_tqdm(logfile_path)

    args = parser.parse_args()
    sort_13_gram_buckets(args.working_directory)
```
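Taken together, the three scripts form a resumable generate → sort → process pipeline, each stage guarded by `.done` files. A sketch of the intended invocation order; the directory layout is illustrative (generate_13_grams writes its buckets to `<dir>/output`, so the later stages take that subdirectory as their working directory):

```python
# Illustrative driver for the decontamination pipeline; paths are assumptions.
import subprocess

steps = [
    "python -m scripts.clean_training_data.generate_13_grams -dir pile/ -buckets 500",
    "python -m scripts.clean_training_data.sort_13_gram_buckets -dir pile/output",
    "python -m scripts.clean_training_data.process_sorted_buckets -dir pile/output -move filtered/",
]
for step in steps:
    subprocess.run(step, shell=True, check=True)
```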
setup.py

```diff
@@ -33,5 +33,10 @@ setuptools.setup(
         "pycountry==20.7.3",
         "numexpr==2.7.2",
         "lm_dataformat==0.0.19",
+        "pytest==6.2.3",
+        "pybind11==2.6.2",
+        "tqdm-multiprocess==0.0.11",
+        "zstandard==0.15.2",
+        "jsonlines==2.0.0",
     ]
 )
```
tests/test_generate_13_grams.py (new file)

```python
import os
from collections import Counter
import shutil
import glob

from scripts.clean_training_data.janitor import *
from scripts.clean_training_data.generate_13_grams import do_ngrams_in_buckets
from scripts.clean_training_data.archiver import Archive, TextReader


def test_generate_13_grams_1():
    data = """A goose (plural geese) is a bird of any of several waterfowl species in the family Anatidae.
    This group comprises the genera Anser (the grey geese and white geese) and Branta (the black geese).
    Some other birds, mostly related to the shelducks, have "goose" as part of their names.
    More distantly related members of the family Anatidae are swans, most of which are larger
    than true geese, and ducks, which are smaller. The term "goose" may refer to either a male
    or female bird, but when paired with "gander", refers specifically to a female one (the latter referring
    to a male). Young birds before fledging are called goslings. The collective noun for a group of
    geese on the ground is a gaggle; when in flight, they are called a skein, a team, or a wedge; when
    flying close together, they are called a plump."""

    data = data + data

    # Simple Generation
    n = 13
    janitor = Janitor()
    ngrams = word_ngrams(janitor.normalize_string(data), n)
    comparison = list(ngrams)
    comparison_counter = Counter(comparison)
    print(len(comparison))
    # print(comparison)

    # Generating into buckets
    test_working_directory = "test_generate_13_grams"
    output_directory = os.path.join(test_working_directory, "output")
    try:
        shutil.rmtree(output_directory)
    except FileNotFoundError:
        pass
    os.makedirs(test_working_directory, exist_ok=True)

    archive = Archive(os.path.join(test_working_directory, "test.jsonl.zst"))
    archive.add_data(data)
    archive.commit()

    bucket_count = 4
    do_ngrams_in_buckets(n, test_working_directory, bucket_count)

    # Rebuild from buckets
    rebuilt_ngrams = []
    bucket_file_paths = glob.glob(os.path.join(test_working_directory, "output", "*.bkt.txt"))
    for bucket_file_path in bucket_file_paths:
        reader = TextReader(bucket_file_path)
        for line in reader.read():
            [ngram, document_id] = line.rsplit(" ", 1)
            rebuilt_ngrams.append(ngram)

    # Compare
    result_counter = Counter(rebuilt_ngrams)
    # print(len(result_counter))
    # print(len(comparison_counter))
    assert len(result_counter) == len(comparison_counter)
    # print(result_counter)
    # print(comparison_counter)
    assert comparison_counter == result_counter
```
tests/test_janitor.py (new file)

```python
import re
from collections import defaultdict

from scripts.clean_training_data.janitor import *


def simple_ngram(sequence, n):
    ngrams = list()
    ngram = []
    for x in sequence:
        ngram.append(x)
        if len(ngram) == n:
            ngrams.append(tuple(ngram))
            ngram = ngram[1:]
    return ngrams


def test_form_ngrams():
    sequence = "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some" \
               " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."

    n_values = [1, 2, 3, 5, 13]
    for n in n_values:
        comparison = simple_ngram(sequence, n)
        result_to_test = list(form_ngrams(iter(sequence), n))
        assert len(comparison) == len(result_to_test)
        assert comparison == result_to_test


def test_word_ngrams():
    sequence = "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some" \
               " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."

    words = sequence.split()
    n_values = [1, 2, 3, 5, 13]
    for n in n_values:
        comparison = simple_ngram(words, n)
        comparison = [" ".join(ngram) for ngram in comparison]
        result_to_test = list(word_ngrams(sequence, n))
        assert len(comparison) == len(result_to_test)
        assert result_to_test == comparison


def test_split_indices():
    sequence = "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some" \
               " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."

    comparison = []
    current_word = ""
    for i, c in enumerate(sequence):
        if c != " ":
            current_word += c
        else:
            if current_word:
                comparison.append((current_word, (i - len(current_word), i - 1)))
            current_word = ""
    if current_word:
        comparison.append((current_word, (len(sequence) - len(current_word), len(sequence) - 1)))

    result_to_test = list(split_indices(sequence))
    assert len(comparison) == len(result_to_test)
    assert comparison == result_to_test


def test_word_ngrams_indices():
    sequence = "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some" \
               " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."

    n_values = [1, 2, 3, 5, 13]
    for n in n_values:
        ngrams = [" ".join(ngram) for ngram in simple_ngram(sequence.split(), n)]
        tracker = defaultdict(int)
        comparison = []
        for ngram in ngrams:
            while True:
                start = sequence.find(ngram, tracker[ngram])
                assert start != -1  # testing the test
                end = start + len(ngram) - 1
                tracker[ngram] = end + 1

                # ignore partial word matches
                if (start != 0 and sequence[start - 1] != " ") or \
                        (end != len(sequence) - 1 and sequence[end + 1] != " "):
                    pass
                else:
                    break

            comparison.append((ngram, (start, end)))

        result_to_test = list(word_ngrams_indices(sequence, n))
        assert len(result_to_test) == len(comparison)
        assert result_to_test == comparison


# Assumptions from the GPT-3 paper:
# the 200 characters to remove include punctuation and are actually a half-window.
# All tests below first run without any registered contaminants, expecting the same sequence back.

LINE = "This is a @line #containing a certain number of characters, 76 to be exact. "
DIRTY1 = "FILTH. lots of dirty filtHy FIlTh "
DIRTY2 = "FILTH. lots of filtHy dirty FIlTh "


def test_janitor1():
    # First test using a 1-gram: expect the block before the filth to keep some
    # characters, but the block after it should be removed completely.
    sequence = LINE * 6 + "FILTH. " + LINE * 5
    filth = "filth"
    expected_result = LINE * 3 + "This is a @line #containing "

    janitor = Janitor(ngram_n=1, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200)
    result = "".join(janitor.clean_python(sequence))
    assert result == sequence

    janitor.register_contaminant(filth)
    assert janitor.dirt_ngrams == {filth}

    result = "".join(janitor.clean_python(sequence))
    assert result == expected_result


def test_janitor2():
    # Second test using a 1-gram: the block after the filth is now longer than
    # 200 characters, so it should also keep some characters.
    sequence = LINE * 6 + "FILTH. " + LINE * 6
    filth = "filth"
    expected_result = LINE * 3 + "This is a @line #containing " \
                      + " characters, 76 to be exact. " + LINE * 3

    janitor = Janitor(ngram_n=1, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200)
    result = "".join(janitor.clean_python(sequence))
    assert result == sequence

    janitor.register_contaminant(filth)
    assert janitor.dirt_ngrams == {filth}

    result = "".join(janitor.clean_python(sequence))
    assert result == expected_result


def test_janitor3():
    # Same test as above but with a 6-gram.
    sequence = LINE * 6 + DIRTY1 + LINE * 6
    filth = "filth lots of dirty filthy filth"
    expected_result = LINE * 3 + "This is a @line #containing " \
                      + " characters, 76 to be exact. " + LINE * 3

    janitor = Janitor(ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200)
    result = "".join(janitor.clean_python(sequence))
    assert result == sequence

    janitor.register_contaminant(filth)
    assert janitor.dirt_ngrams == {filth}

    result = "".join(janitor.clean_python(sequence))
    assert result == expected_result


def test_janitor4():
    # Adds another clean block to the previous test. The middle block should be
    # entirely removed, as 200 characters are cut from each of its sides.
    sequence = LINE * 6 + DIRTY1 + LINE * 6 + DIRTY1 + LINE * 6
    filth = "filth lots of dirty filthy filth"
    expected_result = LINE * 3 + "This is a @line #containing " \
                      + " characters, 76 to be exact. " + LINE * 3

    janitor = Janitor(ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200)
    result = "".join(janitor.clean_python(sequence))
    assert result == sequence

    janitor.register_contaminant(filth)
    assert janitor.dirt_ngrams == {filth}

    result = "".join(janitor.clean_python(sequence))
    assert result == expected_result


def test_janitor5():
    # Same as above but using multiple different filth 6-grams.
    sequence = LINE * 6 + DIRTY1 + LINE * 6 + DIRTY2 + LINE * 6
    filths = ["filth lots of dirty filthy filth", "filth lots of filthy dirty filth"]
    expected_result = LINE * 3 + "This is a @line #containing " \
                      + " characters, 76 to be exact. " + LINE * 3

    janitor = Janitor(ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200)
    result = "".join(janitor.clean_python(sequence))
    assert result == sequence

    for filth in filths:
        janitor.register_contaminant(filth)
    assert janitor.dirt_ngrams == set(filths)

    result = "".join(janitor.clean_python(sequence))
    assert result == expected_result


def test_janitor6():
    # Same as above but with 10 filth instances in total, which stays under the
    # too_dirty_cutoff of 10 and so yields the same result; the next test uses 11.
    sequence = LINE * 6 + DIRTY1 * 8 + LINE * 6 + DIRTY2 * 2 + LINE * 6
    filths = ["filth lots of dirty filthy filth", "filth lots of filthy dirty filth"]
    expected_result = LINE * 3 + "This is a @line #containing " \
                      + " characters, 76 to be exact. " + LINE * 3

    janitor = Janitor(ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200)
    result = "".join(janitor.clean_python(sequence))
    assert result == sequence

    for filth in filths:
        janitor.register_contaminant(filth)
    assert janitor.dirt_ngrams == set(filths)

    result = "".join(janitor.clean_python(sequence))
    assert result == expected_result


def test_janitor7():
    # Same as above but with 11 filth instances, which reaches the
    # too_dirty_cutoff of 10: the document is considered too dirty and is
    # dropped entirely.
    sequence = LINE * 6 + DIRTY1 * 8 + LINE * 6 + DIRTY2 * 3 + LINE * 6
    filths = ["filth lots of dirty filthy filth", "filth lots of filthy dirty filth"]
    expected_result = ""

    janitor = Janitor(ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200)
    result = "".join(janitor.clean_python(sequence))
    assert result == sequence

    for filth in filths:
        janitor.register_contaminant(filth)
    assert janitor.dirt_ngrams == set(filths)

    result = "".join(janitor.clean_python(sequence))
    assert result == expected_result


def test_janitor8():
    # This will test the save and load contams
    pass
    # source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2
    # contaminant = "dirty boy. Clean he he"
    #
    # jan = Janitor(ngram_n=3)
    # jan.register_contaminant(contaminant)
    # cleaned = " ".join(jan.clean(source))
    # for contam in jan.dirt_ngrams:
    #     assert contam not in cleaned, contam
    #
    # filename = "data/saved_contam"
    # jan.save_contamination_ngrams(filename)
    #
    # jan = Janitor(ngram_n=3)
    # jan.load_contamination_ngrams(filename)
    # cleaned = " ".join(jan.clean(source))
    # for contam in jan.dirt_ngrams:
    #     assert contam not in cleaned, contam
```