Commit 7cb91943 authored by researcher2's avatar researcher2
Browse files

Janitor testing and changes

Small fixes to the Python version of Janitor. The C++ version is now named properly for import, but its code doesn't work — it may or may not be fixed in the future.

Add unit tests for python version of Janitor.
parent 2a11129d
...@@ -41,6 +41,29 @@ def word_ngrams(s, n): ...@@ -41,6 +41,29 @@ def word_ngrams(s, n):
ngram_seqs = form_ngrams(iter(tokens), n) ngram_seqs = form_ngrams(iter(tokens), n)
return (" ".join(ngram) for ngram in ngram_seqs) return (" ".join(ngram) for ngram in ngram_seqs)
# Does character sequences only - combined faster function to play around with later
# def word_ngrams_indices_combined(sequence, n):
# current_word = ""
# history = []
# gap = False;
# start = 0
# end = 0
# for character in sequence:
# if character == " ":
# if not gap:
# gap = True
# history.append(current_word)
# end += len(current_word) - 1
# current_word = ""
# if len(history) == n:
# yield (tuple(history), start, end)
# del history[0]
# start = end + 1
# end = start
# else:
# gap = False
# current_word += character
# https://stackoverflow.com/questions/13734451/string-split-with-indices-in-python # https://stackoverflow.com/questions/13734451/string-split-with-indices-in-python
def split_indices(s): def split_indices(s):
...@@ -140,8 +163,9 @@ class Janitor: ...@@ -140,8 +163,9 @@ class Janitor:
def _split_chunks(self, dirty_string, dirty_parts): def _split_chunks(self, dirty_string, dirty_parts):
clean_chunks = [] clean_chunks = []
splice_idx = 0 splice_idx = 0
end = -1
for i, (ngram, start, end) in enumerate(dirty_parts): for i, (ngram, start, end) in enumerate(dirty_parts):
if i > self.too_dirty_cutoff: if i >= self.too_dirty_cutoff:
return [] return []
start = max(0, start - self.window_to_remove) start = max(0, start - self.window_to_remove)
end = min(len(dirty_string), end + self.window_to_remove) end = min(len(dirty_string), end + self.window_to_remove)
...@@ -150,6 +174,9 @@ class Janitor: ...@@ -150,6 +174,9 @@ class Janitor:
clean_chunks.append(dirty_string[splice_idx: start]) clean_chunks.append(dirty_string[splice_idx: start])
splice_idx = end splice_idx = end
if end < len(dirty_string) - self.minimum_slice_length:
clean_chunks.append(dirty_string[end+1:])
return clean_chunks return clean_chunks
############## ##############
...@@ -259,24 +286,24 @@ def benchmark(): ...@@ -259,24 +286,24 @@ def benchmark():
print("\tCpp", timeit.timeit("jan.clean(data)", setup=setup, globals=globals(), number=n)) print("\tCpp", timeit.timeit("jan.clean(data)", setup=setup, globals=globals(), number=n))
def test(): # def test_janitor_general():
source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2 # source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2
contaminant = "dirty boy. Clean he he" # contaminant = "dirty boy. Clean he he"
jan = Janitor(ngram_n=3) # jan = Janitor(ngram_n=3)
jan.register_contaminant(contaminant) # jan.register_contaminant(contaminant)
cleaned = " ".join(jan.clean(source)) # cleaned = " ".join(jan.clean(source))
for contam in jan.dirt_ngrams: # for contam in jan.dirt_ngrams:
assert contam not in cleaned, contam # assert contam not in cleaned, contam
filename = "data/saved_contam" # filename = "data/saved_contam"
jan.save_contamination_ngrams(filename) # jan.save_contamination_ngrams(filename)
jan = Janitor(ngram_n=3) # jan = Janitor(ngram_n=3)
jan.load_contamination_ngrams(filename) # jan.load_contamination_ngrams(filename)
cleaned = " ".join(jan.clean(source)) # cleaned = " ".join(jan.clean(source))
for contam in jan.dirt_ngrams: # for contam in jan.dirt_ngrams:
assert contam not in cleaned, contam # assert contam not in cleaned, contam
if __name__ == "__main__": if __name__ == "__main__":
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment