"vscode:/vscode.git/clone" did not exist on "954f4e6bd607ae8ed08cc60dab7c8117e1ff1776"
Commit 7cb91943 authored by researcher2's avatar researcher2
Browse files

Janitor testing and changes

Small fixes to the Python version of Janitor. The C++ version is now named properly for import, but its code doesn't work yet - this may or may not be fixed in the future.

Add unit tests for the Python version of Janitor.
parent 2a11129d
...@@ -41,6 +41,29 @@ def word_ngrams(s, n): ...@@ -41,6 +41,29 @@ def word_ngrams(s, n):
ngram_seqs = form_ngrams(iter(tokens), n) ngram_seqs = form_ngrams(iter(tokens), n)
return (" ".join(ngram) for ngram in ngram_seqs) return (" ".join(ngram) for ngram in ngram_seqs)
# Does character sequences only - combined faster function to play around with later
# def word_ngrams_indices_combined(sequence, n):
# current_word = ""
# history = []
# gap = False;
# start = 0
# end = 0
# for character in sequence:
# if character == " ":
# if not gap:
# gap = True
# history.append(current_word)
# end += len(current_word) - 1
# current_word = ""
# if len(history) == n:
# yield (tuple(history), start, end)
# del history[0]
# start = end + 1
# end = start
# else:
# gap = False
# current_word += character
# https://stackoverflow.com/questions/13734451/string-split-with-indices-in-python # https://stackoverflow.com/questions/13734451/string-split-with-indices-in-python
def split_indices(s): def split_indices(s):
...@@ -140,8 +163,9 @@ class Janitor: ...@@ -140,8 +163,9 @@ class Janitor:
def _split_chunks(self, dirty_string, dirty_parts): def _split_chunks(self, dirty_string, dirty_parts):
clean_chunks = [] clean_chunks = []
splice_idx = 0 splice_idx = 0
end = -1
for i, (ngram, start, end) in enumerate(dirty_parts): for i, (ngram, start, end) in enumerate(dirty_parts):
if i > self.too_dirty_cutoff: if i >= self.too_dirty_cutoff:
return [] return []
start = max(0, start - self.window_to_remove) start = max(0, start - self.window_to_remove)
end = min(len(dirty_string), end + self.window_to_remove) end = min(len(dirty_string), end + self.window_to_remove)
...@@ -150,6 +174,9 @@ class Janitor: ...@@ -150,6 +174,9 @@ class Janitor:
clean_chunks.append(dirty_string[splice_idx: start]) clean_chunks.append(dirty_string[splice_idx: start])
splice_idx = end splice_idx = end
if end < len(dirty_string) - self.minimum_slice_length:
clean_chunks.append(dirty_string[end+1:])
return clean_chunks return clean_chunks
############## ##############
...@@ -259,24 +286,24 @@ def benchmark(): ...@@ -259,24 +286,24 @@ def benchmark():
print("\tCpp", timeit.timeit("jan.clean(data)", setup=setup, globals=globals(), number=n)) print("\tCpp", timeit.timeit("jan.clean(data)", setup=setup, globals=globals(), number=n))
def test(): # def test_janitor_general():
source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2 # source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2
contaminant = "dirty boy. Clean he he" # contaminant = "dirty boy. Clean he he"
jan = Janitor(ngram_n=3) # jan = Janitor(ngram_n=3)
jan.register_contaminant(contaminant) # jan.register_contaminant(contaminant)
cleaned = " ".join(jan.clean(source)) # cleaned = " ".join(jan.clean(source))
for contam in jan.dirt_ngrams: # for contam in jan.dirt_ngrams:
assert contam not in cleaned, contam # assert contam not in cleaned, contam
filename = "data/saved_contam" # filename = "data/saved_contam"
jan.save_contamination_ngrams(filename) # jan.save_contamination_ngrams(filename)
jan = Janitor(ngram_n=3) # jan = Janitor(ngram_n=3)
jan.load_contamination_ngrams(filename) # jan.load_contamination_ngrams(filename)
cleaned = " ".join(jan.clean(source)) # cleaned = " ".join(jan.clean(source))
for contam in jan.dirt_ngrams: # for contam in jan.dirt_ngrams:
assert contam not in cleaned, contam # assert contam not in cleaned, contam
if __name__ == "__main__": if __name__ == "__main__":
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment