Commit d0277a3f authored by researcher2
Browse files

Update janitor.py

Comment out cpp tests.
parent 7cb91943
...@@ -213,77 +213,77 @@ class Janitor: ...@@ -213,77 +213,77 @@ class Janitor:
# Tests # Tests
################################################################# #################################################################
def print_cpp(): # def print_cpp():
source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2 # source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2
for i in range(1, 10, 2): # for i in range(1, 10, 2):
pprint(janitor_util.clean_ngram(source, string.punctuation, i)) # pprint(janitor_util.clean_ngram(source, string.punctuation, i))
for ngram, start, end in \ # for ngram, start, end in \
janitor_util.clean_ngram_with_indices(source, string.punctuation, i): # janitor_util.clean_ngram_with_indices(source, string.punctuation, i):
print(ngram, "\t", start, end, source[start:end].replace("\n", "\\n")) # print(ngram, "\t", start, end, source[start:end].replace("\n", "\\n"))
def test_cpp(): # def test_cpp():
source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2 # source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2
contaminant = "dirty boy. Clean he he" # contaminant = "dirty boy. Clean he he"
jan_python = Janitor() # jan_python = Janitor()
jan_cpp = Janitor() # jan_cpp = Janitor()
jan_python.register_contaminant_python(contaminant) # jan_python.register_contaminant_python(contaminant)
jan_cpp.register_contaminant(contaminant) # jan_cpp.register_contaminant(contaminant)
assert jan_python.dirt_ngrams == jan_cpp.dirt_ngrams, (jan_python.dirt_ngrams, jan_cpp.dirt_ngrams) # assert jan_python.dirt_ngrams == jan_cpp.dirt_ngrams, (jan_python.dirt_ngrams, jan_cpp.dirt_ngrams)
assert jan_python.clean_python(source) == jan_cpp.clean(source), \ # assert jan_python.clean_python(source) == jan_cpp.clean(source), \
(jan_python.clean_python(source), jan_cpp.clean(source)) # (jan_python.clean_python(source), jan_cpp.clean(source))
print("Passed test, python==cpp") # print("Passed test, python==cpp")
def benchmark(): # def benchmark():
# Download and put in data folder: enwik8 (100 MB) from https://cs.fit.edu/~mmahoney/compression/textdata.html # # Download and put in data folder: enwik8 (100 MB) from https://cs.fit.edu/~mmahoney/compression/textdata.html
setup = \ # setup = \
""" # """
with open("data/enwik8", "r") as f: # with open("data/enwik8", "r") as f:
data = f.read() # data = f.read()
jan = Janitor(too_dirty_cutoff=1000) # jan = Janitor(too_dirty_cutoff=1000)
jan.register_contaminant(''' # jan.register_contaminant('''
theories is that there is a connection between "geekdom" and autism. # theories is that there is a connection between "geekdom" and autism.
This is hinted, for instance, by a ''Wired Magazine'' article in 2001 entitled " # This is hinted, for instance, by a ''Wired Magazine'' article in 2001 entitled "
The [[Geek]] Syndrome", which is a point argued by many in the autism rights # The [[Geek]] Syndrome", which is a point argued by many in the autism rights
movement{{ref|Wired}}. This article, many professionals assert, is just one example of # movement{{ref|Wired}}. This article, many professionals assert, is just one example of
the media's application of mental disease labels to what is actually variant normal behavior # the media's application of mental disease labels to what is actually variant normal behavior
—they argue that shyness, lack of athletic ability or social skills, and intellectual # —they argue that shyness, lack of athletic ability or social skills, and intellectual
interests, even when they seem unusual to others, are not in themselves signs of autism or # interests, even when they seem unusual to others, are not in themselves signs of autism or
Asperger's syndrome. Others assert that it is actually the medical profession which is applying # Asperger's syndrome. Others assert that it is actually the medical profession which is applying
mental disease labels to children who in the past would have simply been accepted as a little # mental disease labels to children who in the past would have simply been accepted as a little
different or even labeled 'gifted'. See [[clinomorphism]] for further discussion of this issue. # different or even labeled 'gifted'. See [[clinomorphism]] for further discussion of this issue.
Due to the recent publicity surrounding autism and autis # Due to the recent publicity surrounding autism and autis
ultan Al Nahyan]] granted [[Petroleum]] concessions, and oil was first found in 1958. At first, # ultan Al Nahyan]] granted [[Petroleum]] concessions, and oil was first found in 1958. At first,
oil money had a marginal impact. A few lowrise concete buildings were erected, and the first # oil money had a marginal impact. A few lowrise concete buildings were erected, and the first
paved road was completed in 1961, but Sheikh Shakbut, uncertain whether the new oil royalties # paved road was completed in 1961, but Sheikh Shakbut, uncertain whether the new oil royalties
would last, took a cautious approach, prefering to save the revenue rather than investing it in # would last, took a cautious approach, prefering to save the revenue rather than investing it in
development. His brother, [[Zayed bin Sultan Al Nahayan]], saw that oil wealth had the potential # development. His brother, [[Zayed bin Sultan Al Nahayan]], saw that oil wealth had the potential
to transform Abu Dhabi. The ruling Al Nahayan family decided that Sheikh Zayed should replace his # to transform Abu Dhabi. The ruling Al Nahayan family decided that Sheikh Zayed should replace his
brother as Ruler and carry out his vision of developing the country. On [[August 6]], [[1966]], # brother as Ruler and carry out his vision of developing the country. On [[August 6]], [[1966]],
with the assistance of the British, Sheikh Zayed became the new ruler. See generally, Al-Fahim, M, # with the assistance of the British, Sheikh Zayed became the new ruler. See generally, Al-Fahim, M,
''From Rags to Riches: A Story of Abu Dhabi'', Chapter Six (London Centre of Arab Studies, 1995), # ''From Rags to Riches: A Story of Abu Dhabi'', Chapter Six (London Centre of Arab Studies, 1995),
ISBN 1 900404 00 1. With the announcement by Britain in 1968 that it would withdraw from the # ISBN 1 900404 00 1. With the announcement by Britain in 1968 that it would withdraw from the
Gulf area by 1971, Sheikh Zayed became the main driving force behind the formation of the # Gulf area by 1971, Sheikh Zayed became the main driving force behind the formation of the
[[United Arab Emirates]]. After the Emirates gained independence in 1971, # [[United Arab Emirates]]. After the Emirates gained independence in 1971,
''') # ''')
""" # """
n = 1 # n = 1
print(f"Timing {n} run on 100 MB") # print(f"Timing {n} run on 100 MB")
print("Register contaminant") # print("Register contaminant")
# print("\tPython", timeit.timeit("jan.register_contaminant_python(data)", setup=setup, globals=globals(), number=n)) # # print("\tPython", timeit.timeit("jan.register_contaminant_python(data)", setup=setup, globals=globals(), number=n))
print("\tCpp", timeit.timeit("jan.register_contaminant(data)", setup=setup, globals=globals(), number=n)) # print("\tCpp", timeit.timeit("jan.register_contaminant(data)", setup=setup, globals=globals(), number=n))
print("Clean") # print("Clean")
# print("\tPython", timeit.timeit("jan.clean_python(data)", setup=setup, globals=globals(), number=n)) # # print("\tPython", timeit.timeit("jan.clean_python(data)", setup=setup, globals=globals(), number=n))
print("\tCpp", timeit.timeit("jan.clean(data)", setup=setup, globals=globals(), number=n)) # print("\tCpp", timeit.timeit("jan.clean(data)", setup=setup, globals=globals(), number=n))
# def test_janitor_general(): # def test_janitor_general():
...@@ -306,8 +306,8 @@ def benchmark(): ...@@ -306,8 +306,8 @@ def benchmark():
# assert contam not in cleaned, contam # assert contam not in cleaned, contam
if __name__ == "__main__": # if __name__ == "__main__":
test() # test()
# print_cpp() # # print_cpp()
# test_cpp() # # test_cpp()
# benchmark() # # benchmark()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment