janitor.py 12.4 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
import re
import string
import timeit
import pickle
import traceback
from pprint import pprint

# This is a cpp module. Compile janitor_util.cpp with:
# c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) -undefined dynamic_lookup
try:
    import janitor_util
bzantium's avatar
bzantium committed
12

13
    JANITOR_CPP = True
bzantium's avatar
bzantium committed
14
except Exception:
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
    print("WARNING: C++ module could not be loaded. Janitor running in python mode")
    traceback.print_exc()
    JANITOR_CPP = False


# Implementation from nltk source
# https://www.nltk.org/_modules/nltk/util.html
def form_ngrams(sequence, n):
    history = []
    while n > 1:
        # PEP 479, prevent RuntimeError from being raised when StopIteration bubbles out of generator
        try:
            next_item = next(sequence)
        except StopIteration:
            # no more data, terminate the generator
            return
        history.append(next_item)
        n -= 1
    for item in sequence:
        history.append(item)
        yield tuple(history)
        del history[0]


def word_ngrams(s, n):
    """Splits a string into ngram words"""
    tokens = s.split()  # not a generator :(
    ngram_seqs = form_ngrams(iter(tokens), n)
    return (" ".join(ngram) for ngram in ngram_seqs)

bzantium's avatar
bzantium committed
45

researcher2's avatar
researcher2 committed
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# Does character sequences only - combined faster function to play around with later
# def word_ngrams_indices_combined(sequence, n):
#     current_word = ""
#     history = []
#     gap = False;
#     start = 0
#     end = 0
#     for character in sequence:
#         if character == " ":
#             if not gap:
#                 gap = True
#                 history.append(current_word)
#                 end += len(current_word) - 1
#                 current_word = ""
#                 if len(history) == n:
#                     yield (tuple(history), start, end)
#                     del history[0]
#                     start = end + 1
#                     end = start
#         else:
#             gap = False
#             current_word += character

69
70
71
72
73
74

# https://stackoverflow.com/questions/13734451/string-split-with-indices-in-python
def split_indices(s):
    """Splits a string on whitespaces and records the indices of each in the original string.
    @:return generator((word, (start_idx, end_idx)), ...)
    """
bzantium's avatar
bzantium committed
75
    return ((m.group(0), (m.start(), m.end() - 1)) for m in re.finditer(r"\S+", s))
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94


def word_ngrams_indices(s, n):
    """Splits a string into pairs of (ngram words, their start/end indices)"""
    tokens_with_indices = split_indices(s)

    # Generator of ngrams of (word, idx_pairs)
    # (
    #   [(word, (start,end)), (word, (start, end))...],
    #   [(word, (start, end)), ...],
    #   ...
    # )
    ngram_seqs_with_indices = form_ngrams(tokens_with_indices, n)

    # Generator of pairs of word and index ngrams
    # (
    #   ([word, word, ...], [(start,end), (start,end), ...]),
    #   ...
    # )
bzantium's avatar
bzantium committed
95
96
97
    ngram_indices_pairs = (
        zip(*ngram_with_indices) for ngram_with_indices in ngram_seqs_with_indices
    )
98
99

    # Generator of ( (word_ngram, (start, end)), (word_ngram, start, end)), ...)
bzantium's avatar
bzantium committed
100
101
102
103
    return (
        (" ".join(ngram_seq), (indices[0][0], indices[-1][1]))
        for ngram_seq, indices in ngram_indices_pairs
    )
104
105
106
107
108
109


class Janitor:

    # FIXME delete_chars: Should anything else go here? Special chars?
    def __init__(
bzantium's avatar
bzantium committed
110
111
112
113
114
115
        self,
        ngram_n=13,
        window_to_remove=200,
        too_dirty_cutoff=10,
        minimum_slice_length=200,
        delete_chars=string.punctuation,
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
    ):
        self.ngram_n = ngram_n
        self.window_to_remove = window_to_remove
        self.too_dirty_cutoff = too_dirty_cutoff
        self.minimum_slice_length = minimum_slice_length
        self.delete_chars = delete_chars

        self.dirt_ngrams = set()

        # If in python, we'll translate uppercase to lowercase and delete naughty characters.
        # This is fast by python standards
        # https://stackoverflow.com/questions/638893/what-is-the-most-efficient-way-in-python-to-convert-a-string-to-all-lowercase-st
        self.translation_table = str.maketrans(
            string.ascii_lowercase + string.ascii_uppercase,  # These characters
            string.ascii_lowercase * 2,  # Become these characters
bzantium's avatar
bzantium committed
131
            self.delete_chars,  # These are deleted
132
133
134
135
136
137
138
        )

    ##############
    # I/O for saving contamination ngrams
    ##############

    def save_contamination_ngrams(self, filename):
bzantium's avatar
bzantium committed
139
        with open(filename, "wb") as fp:
140
141
142
            pickle.dump(filename, fp)

    def load_contamination_ngrams(self, filename):
bzantium's avatar
bzantium committed
143
        with open(filename, "rb") as fp:
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
            self.dirt_ngrams = pickle.load(fp)

    ##############
    # Call these :)
    ##############

    def register_contaminant(self, dirt_string):
        """Register a string as contamination to be removed, e.g. a test set
        This breaks the dirt_string into ngrams to store for future cleaning"""
        if JANITOR_CPP:
            return self.register_contaminant_cpp(dirt_string)
        else:
            print("WARNING: Janitor running in python mode")
            return self.register_contaminant_python(dirt_string)

    def clean(self, dirty_string):
        """Clean a string (e.g. a training set) by removing all ngrams previously
bzantium's avatar
bzantium committed
161
        registered as contaminants. Returns a list of clean chunks, or empty if
162
163
164
165
166
167
168
169
170
171
        the string was too dirty"""
        if JANITOR_CPP:
            return self.clean_cpp(dirty_string)
        else:
            print("WARNING: Janitor running in python mode")
            return self.clean_python(dirty_string)

    def _split_chunks(self, dirty_string, dirty_parts):
        clean_chunks = []
        splice_idx = 0
researcher2's avatar
researcher2 committed
172
        end = -1
173
        for i, (ngram, start, end) in enumerate(dirty_parts):
researcher2's avatar
researcher2 committed
174
            if i >= self.too_dirty_cutoff:
175
176
177
178
179
                return []
            start = max(0, start - self.window_to_remove)
            end = min(len(dirty_string), end + self.window_to_remove)

            if start - splice_idx > self.minimum_slice_length:
bzantium's avatar
bzantium committed
180
                clean_chunks.append(dirty_string[splice_idx:start])
181
182
            splice_idx = end

researcher2's avatar
researcher2 committed
183
        if end < len(dirty_string) - self.minimum_slice_length:
bzantium's avatar
bzantium committed
184
            clean_chunks.append(dirty_string[end + 1 :])
researcher2's avatar
researcher2 committed
185

186
187
188
189
190
191
192
        return clean_chunks

    ##############
    # Fast C++
    ##############

    def register_contaminant_cpp(self, dirt_string):
bzantium's avatar
bzantium committed
193
194
195
        self.dirt_ngrams.update(
            janitor_util.clean_ngram(dirt_string, self.delete_chars, self.ngram_n)
        )
196
197

    def clean_cpp(self, dirty_string):
bzantium's avatar
bzantium committed
198
199
200
        contamination_indices = janitor_util.clean_ngram_with_indices(
            dirty_string, self.delete_chars, self.ngram_n
        )
201
202
203
204
205
206
207
208
209
210
        return self._split_chunks(dirty_string, contamination_indices)

    ##############
    # Slow python
    ##############

    def normalize_string(self, s):
        return s.translate(self.translation_table)

    def register_contaminant_python(self, dirt_string):
bzantium's avatar
bzantium committed
211
212
213
        self.dirt_ngrams.update(
            word_ngrams(self.normalize_string(dirt_string), self.ngram_n)
        )
214
215
216
217
218
219
220
221
222
223
224
225
226
227

    def clean_python(self, dirty_string):
        contamination_indices = (
            (None, *idx_pair)
            for dirty_ngram, idx_pair in word_ngrams_indices(dirty_string, self.ngram_n)
            if self.normalize_string(dirty_ngram) in self.dirt_ngrams
        )
        return self._split_chunks(dirty_string, contamination_indices)


##################################################################
# Tests
#################################################################

researcher2's avatar
researcher2 committed
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
# def print_cpp():
#     source = """   ,, I'm a very !dirty,, ,,  dirty boy. Clean me daddy. \n\nhe he he hehe heh.  lastword  """ * 2

#     for i in range(1, 10, 2):
#         pprint(janitor_util.clean_ngram(source, string.punctuation, i))
#         for ngram, start, end in \
#                 janitor_util.clean_ngram_with_indices(source, string.punctuation, i):
#             print(ngram, "\t", start, end, source[start:end].replace("\n", "\\n"))


# def test_cpp():
#     source = """   ,, I'm a very !dirty,, ,,  dirty boy. Clean me daddy. \n\nhe he he hehe heh.  lastword  """ * 2
#     contaminant = "dirty boy. Clean he he"

#     jan_python = Janitor()
#     jan_cpp = Janitor()

#     jan_python.register_contaminant_python(contaminant)
#     jan_cpp.register_contaminant(contaminant)

#     assert jan_python.dirt_ngrams == jan_cpp.dirt_ngrams, (jan_python.dirt_ngrams, jan_cpp.dirt_ngrams)

#     assert jan_python.clean_python(source) == jan_cpp.clean(source), \
#         (jan_python.clean_python(source), jan_cpp.clean(source))

#     print("Passed test, python==cpp")


# def benchmark():
#     # Download and put in data folder: enwik8 (100 MB) from https://cs.fit.edu/~mmahoney/compression/textdata.html
#     setup = \
#         """
#         with open("data/enwik8", "r") as f:
#             data = f.read()
#         jan = Janitor(too_dirty_cutoff=1000)
#         jan.register_contaminant('''
bzantium's avatar
bzantium committed
264
#         theories is that there is a connection between &quot;geekdom&quot; and autism.
researcher2's avatar
researcher2 committed
265
#         This is hinted, for instance, by a ''Wired Magazine'' article in 2001 entitled &quot;
bzantium's avatar
bzantium committed
266
267
#         The [[Geek]] Syndrome&quot;, which is a point argued by many in the autism rights
#         movement{{ref|Wired}}.  This article, many professionals assert, is just one example of
researcher2's avatar
researcher2 committed
268
269
#         the media's application of mental disease labels to what is actually variant normal behavior
#         &amp;mdash;they argue that shyness, lack of athletic ability or social skills, and intellectual
bzantium's avatar
bzantium committed
270
#         interests, even when they seem unusual to others, are not in themselves signs of autism or
researcher2's avatar
researcher2 committed
271
272
273
274
275
#         Asperger's syndrome. Others assert that it is actually the medical profession which is applying
#         mental disease labels to children who in the past would have simply been accepted as a little
#         different or even labeled 'gifted'. See [[clinomorphism]] for further discussion of this issue.
#         Due to the recent publicity surrounding autism and autis
#         ultan Al Nahyan]] granted [[Petroleum]] concessions, and oil was first found in 1958.  At first,
bzantium's avatar
bzantium committed
276
277
278
279
280
281
282
283
284
285
286
#         oil money had a marginal impact.  A few lowrise concete buildings were erected, and the first
#         paved road was completed in 1961, but Sheikh Shakbut, uncertain whether the new oil royalties
#         would last, took a cautious approach, preferring to save the revenue rather than investing it in
#         development.  His brother, [[Zayed bin Sultan Al Nahayan]], saw that oil wealth had the potential
#         to transform Abu Dhabi.  The ruling Al Nahayan family decided that Sheikh Zayed should replace his
#         brother as Ruler and carry out his vision of developing the country.  On [[August 6]], [[1966]],
#         with the assistance of the British, Sheikh Zayed became the new ruler.  See generally, Al-Fahim, M,
#         ''From Rags to Riches: A Story of Abu Dhabi'', Chapter Six (London Centre of Arab Studies, 1995),
#         ISBN 1 900404 00 1. With the announcement by Britain in 1968 that it would withdraw from the
#         Gulf area by 1971, Sheikh Zayed became the main driving force behind the formation of the
#         [[United Arab Emirates]]. After the Emirates gained independence in 1971,
researcher2's avatar
researcher2 committed
287
288
289
290
291
292
293
294
295
296
297
298
#         ''')
#         """

#     n = 1
#     print(f"Timing {n} run on 100 MB")
#     print("Register contaminant")
#     # print("\tPython", timeit.timeit("jan.register_contaminant_python(data)", setup=setup, globals=globals(), number=n))
#     print("\tCpp", timeit.timeit("jan.register_contaminant(data)", setup=setup, globals=globals(), number=n))

#     print("Clean")
#     # print("\tPython", timeit.timeit("jan.clean_python(data)", setup=setup, globals=globals(), number=n))
#     print("\tCpp", timeit.timeit("jan.clean(data)", setup=setup, globals=globals(), number=n))
299
300


researcher2's avatar
researcher2 committed
301
302
303
# def test_janitor_general():
#     source = """   ,, I'm a very !dirty,, ,,  dirty boy. Clean me daddy. \n\nhe he he hehe heh.  lastword  """ * 2
#     contaminant = "dirty boy. Clean he he"
304

researcher2's avatar
researcher2 committed
305
306
307
308
309
#     jan = Janitor(ngram_n=3)
#     jan.register_contaminant(contaminant)
#     cleaned = " ".join(jan.clean(source))
#     for contam in jan.dirt_ngrams:
#         assert contam not in cleaned, contam
310

researcher2's avatar
researcher2 committed
311
312
#     filename = "data/saved_contam"
#     jan.save_contamination_ngrams(filename)
313

researcher2's avatar
researcher2 committed
314
315
316
317
318
#     jan = Janitor(ngram_n=3)
#     jan.load_contamination_ngrams(filename)
#     cleaned = " ".join(jan.clean(source))
#     for contam in jan.dirt_ngrams:
#         assert contam not in cleaned, contam
319
320


researcher2's avatar
researcher2 committed
321
322
323
324
325
# if __name__ == "__main__":
#     test()
#     # print_cpp()
#     # test_cpp()
#     # benchmark()