Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
d0277a3f
Commit
d0277a3f
authored
May 12, 2021
by
researcher2
Browse files
Update janitor.py
Comment out cpp tests.
parent
7cb91943
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
76 additions
and
76 deletions
+76
-76
scripts/clean_training_data/janitor.py
scripts/clean_training_data/janitor.py
+76
-76
No files found.
scripts/clean_training_data/janitor.py
View file @
d0277a3f
...
@@ -213,77 +213,77 @@ class Janitor:
...
@@ -213,77 +213,77 @@ class Janitor:
# Tests
# Tests
#################################################################
#################################################################
def
print_cpp
():
#
def print_cpp():
source
=
""" ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy.
\n\n
he he he hehe heh. lastword """
*
2
#
source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2
for
i
in
range
(
1
,
10
,
2
):
#
for i in range(1, 10, 2):
pprint
(
janitor_util
.
clean_ngram
(
source
,
string
.
punctuation
,
i
))
#
pprint(janitor_util.clean_ngram(source, string.punctuation, i))
for
ngram
,
start
,
end
in
\
#
for ngram, start, end in \
janitor_util
.
clean_ngram_with_indices
(
source
,
string
.
punctuation
,
i
):
#
janitor_util.clean_ngram_with_indices(source, string.punctuation, i):
print
(
ngram
,
"
\t
"
,
start
,
end
,
source
[
start
:
end
].
replace
(
"
\n
"
,
"
\\
n"
))
#
print(ngram, "\t", start, end, source[start:end].replace("\n", "\\n"))
def
test_cpp
():
#
def test_cpp():
source
=
""" ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy.
\n\n
he he he hehe heh. lastword """
*
2
#
source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2
contaminant
=
"dirty boy. Clean he he"
#
contaminant = "dirty boy. Clean he he"
jan_python
=
Janitor
()
#
jan_python = Janitor()
jan_cpp
=
Janitor
()
#
jan_cpp = Janitor()
jan_python
.
register_contaminant_python
(
contaminant
)
#
jan_python.register_contaminant_python(contaminant)
jan_cpp
.
register_contaminant
(
contaminant
)
#
jan_cpp.register_contaminant(contaminant)
assert
jan_python
.
dirt_ngrams
==
jan_cpp
.
dirt_ngrams
,
(
jan_python
.
dirt_ngrams
,
jan_cpp
.
dirt_ngrams
)
#
assert jan_python.dirt_ngrams == jan_cpp.dirt_ngrams, (jan_python.dirt_ngrams, jan_cpp.dirt_ngrams)
assert
jan_python
.
clean_python
(
source
)
==
jan_cpp
.
clean
(
source
),
\
#
assert jan_python.clean_python(source) == jan_cpp.clean(source), \
(
jan_python
.
clean_python
(
source
),
jan_cpp
.
clean
(
source
))
#
(jan_python.clean_python(source), jan_cpp.clean(source))
print
(
"Passed test, python==cpp"
)
#
print("Passed test, python==cpp")
def
benchmark
():
#
def benchmark():
# Download and put in data folder: enwik8 (100 MB) from https://cs.fit.edu/~mmahoney/compression/textdata.html
#
# Download and put in data folder: enwik8 (100 MB) from https://cs.fit.edu/~mmahoney/compression/textdata.html
setup
=
\
#
setup = \
"""
#
"""
with open("data/enwik8", "r") as f:
#
with open("data/enwik8", "r") as f:
data = f.read()
#
data = f.read()
jan = Janitor(too_dirty_cutoff=1000)
#
jan = Janitor(too_dirty_cutoff=1000)
jan.register_contaminant('''
#
jan.register_contaminant('''
theories is that there is a connection between "geekdom" and autism.
#
theories is that there is a connection between "geekdom" and autism.
This is hinted, for instance, by a ''Wired Magazine'' article in 2001 entitled "
#
This is hinted, for instance, by a ''Wired Magazine'' article in 2001 entitled "
The [[Geek]] Syndrome", which is a point argued by many in the autism rights
#
The [[Geek]] Syndrome", which is a point argued by many in the autism rights
movement{{ref|Wired}}. This article, many professionals assert, is just one example of
#
movement{{ref|Wired}}. This article, many professionals assert, is just one example of
the media's application of mental disease labels to what is actually variant normal behavior
#
the media's application of mental disease labels to what is actually variant normal behavior
—they argue that shyness, lack of athletic ability or social skills, and intellectual
#
—they argue that shyness, lack of athletic ability or social skills, and intellectual
interests, even when they seem unusual to others, are not in themselves signs of autism or
#
interests, even when they seem unusual to others, are not in themselves signs of autism or
Asperger's syndrome. Others assert that it is actually the medical profession which is applying
#
Asperger's syndrome. Others assert that it is actually the medical profession which is applying
mental disease labels to children who in the past would have simply been accepted as a little
#
mental disease labels to children who in the past would have simply been accepted as a little
different or even labeled 'gifted'. See [[clinomorphism]] for further discussion of this issue.
#
different or even labeled 'gifted'. See [[clinomorphism]] for further discussion of this issue.
Due to the recent publicity surrounding autism and autis
#
Due to the recent publicity surrounding autism and autis
ultan Al Nahyan]] granted [[Petroleum]] concessions, and oil was first found in 1958. At first,
#
ultan Al Nahyan]] granted [[Petroleum]] concessions, and oil was first found in 1958. At first,
oil money had a marginal impact. A few lowrise concete buildings were erected, and the first
#
oil money had a marginal impact. A few lowrise concete buildings were erected, and the first
paved road was completed in 1961, but Sheikh Shakbut, uncertain whether the new oil royalties
#
paved road was completed in 1961, but Sheikh Shakbut, uncertain whether the new oil royalties
would last, took a cautious approach, prefering to save the revenue rather than investing it in
#
would last, took a cautious approach, prefering to save the revenue rather than investing it in
development. His brother, [[Zayed bin Sultan Al Nahayan]], saw that oil wealth had the potential
#
development. His brother, [[Zayed bin Sultan Al Nahayan]], saw that oil wealth had the potential
to transform Abu Dhabi. The ruling Al Nahayan family decided that Sheikh Zayed should replace his
#
to transform Abu Dhabi. The ruling Al Nahayan family decided that Sheikh Zayed should replace his
brother as Ruler and carry out his vision of developing the country. On [[August 6]], [[1966]],
#
brother as Ruler and carry out his vision of developing the country. On [[August 6]], [[1966]],
with the assistance of the British, Sheikh Zayed became the new ruler. See generally, Al-Fahim, M,
#
with the assistance of the British, Sheikh Zayed became the new ruler. See generally, Al-Fahim, M,
''From Rags to Riches: A Story of Abu Dhabi'', Chapter Six (London Centre of Arab Studies, 1995),
#
''From Rags to Riches: A Story of Abu Dhabi'', Chapter Six (London Centre of Arab Studies, 1995),
ISBN 1 900404 00 1. With the announcement by Britain in 1968 that it would withdraw from the
#
ISBN 1 900404 00 1. With the announcement by Britain in 1968 that it would withdraw from the
Gulf area by 1971, Sheikh Zayed became the main driving force behind the formation of the
#
Gulf area by 1971, Sheikh Zayed became the main driving force behind the formation of the
[[United Arab Emirates]]. After the Emirates gained independence in 1971,
#
[[United Arab Emirates]]. After the Emirates gained independence in 1971,
''')
#
''')
"""
#
"""
n
=
1
#
n = 1
print
(
f
"Timing
{
n
}
run on 100 MB"
)
#
print(f"Timing {n} run on 100 MB")
print
(
"Register contaminant"
)
#
print("Register contaminant")
# print("\tPython", timeit.timeit("jan.register_contaminant_python(data)", setup=setup, globals=globals(), number=n))
#
# print("\tPython", timeit.timeit("jan.register_contaminant_python(data)", setup=setup, globals=globals(), number=n))
print
(
"
\t
Cpp"
,
timeit
.
timeit
(
"jan.register_contaminant(data)"
,
setup
=
setup
,
globals
=
globals
(),
number
=
n
))
#
print("\tCpp", timeit.timeit("jan.register_contaminant(data)", setup=setup, globals=globals(), number=n))
print
(
"Clean"
)
#
print("Clean")
# print("\tPython", timeit.timeit("jan.clean_python(data)", setup=setup, globals=globals(), number=n))
#
# print("\tPython", timeit.timeit("jan.clean_python(data)", setup=setup, globals=globals(), number=n))
print
(
"
\t
Cpp"
,
timeit
.
timeit
(
"jan.clean(data)"
,
setup
=
setup
,
globals
=
globals
(),
number
=
n
))
#
print("\tCpp", timeit.timeit("jan.clean(data)", setup=setup, globals=globals(), number=n))
# def test_janitor_general():
# def test_janitor_general():
...
@@ -306,8 +306,8 @@ def benchmark():
...
@@ -306,8 +306,8 @@ def benchmark():
# assert contam not in cleaned, contam
# assert contam not in cleaned, contam
if
__name__
==
"__main__"
:
#
if __name__ == "__main__":
test
()
#
test()
# print_cpp()
#
# print_cpp()
# test_cpp()
#
# test_cpp()
# benchmark()
#
# benchmark()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment