janitor.py 12.7 KB
Newer Older
1
import pickle
2
3
4
import re
import string
import traceback
5
6
from typing import Iterator, List, Sequence, Tuple, TypeVar

7

8
9
10
# This is a cpp module.
# See scripts/clean_training_data/README.md for instructions to compile janitor_util.cpp

11
12
try:
    import janitor_util
Fabrizio Milo's avatar
Fabrizio Milo committed
13

14
    JANITOR_CPP = True
15
except Exception:
16
17
18
19
    print("WARNING: C++ module could not be loaded. Janitor running in python mode")
    traceback.print_exc()
    JANITOR_CPP = False

Ethan Smith's avatar
Ethan Smith committed
20
T = TypeVar("T")
21

Ethan Smith's avatar
Ethan Smith committed
22

23
24
# Implementation from nltk source
# https://www.nltk.org/_modules/nltk/util.html
25
def form_ngrams(sequence: Iterator[T], n: int) -> Iterator[Tuple[T, ...]]:
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
    history = []
    while n > 1:
        # PEP 479, prevent RuntimeError from being raised when StopIteration bubbles out of generator
        try:
            next_item = next(sequence)
        except StopIteration:
            # no more data, terminate the generator
            return
        history.append(next_item)
        n -= 1
    for item in sequence:
        history.append(item)
        yield tuple(history)
        del history[0]


Ethan Smith's avatar
Ethan Smith committed
42
def word_ngrams(s: str, n: int) -> Iterator[str]:
43
44
45
46
47
    """Splits a string into ngram words"""
    tokens = s.split()  # not a generator :(
    ngram_seqs = form_ngrams(iter(tokens), n)
    return (" ".join(ngram) for ngram in ngram_seqs)

Fabrizio Milo's avatar
Fabrizio Milo committed
48

researcher2's avatar
researcher2 committed
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# Does character sequences only - combined faster function to play around with later
# def word_ngrams_indices_combined(sequence, n):
#     current_word = ""
#     history = []
#     gap = False;
#     start = 0
#     end = 0
#     for character in sequence:
#         if character == " ":
#             if not gap:
#                 gap = True
#                 history.append(current_word)
#                 end += len(current_word) - 1
#                 current_word = ""
#                 if len(history) == n:
#                     yield (tuple(history), start, end)
#                     del history[0]
#                     start = end + 1
#                     end = start
#         else:
#             gap = False
#             current_word += character

72
73

# https://stackoverflow.com/questions/13734451/string-split-with-indices-in-python
74
def split_indices(s: str) -> Iterator[Tuple[str, Tuple[int, int]]]:
75
76
77
    """Splits a string on whitespaces and records the indices of each in the original string.
    @:return generator((word, (start_idx, end_idx)), ...)
    """
Fabrizio Milo's avatar
Fabrizio Milo committed
78
    return ((m.group(0), (m.start(), m.end() - 1)) for m in re.finditer(r"\S+", s))
79
80


81
def word_ngrams_indices(s: str, n: int) -> Iterator[Tuple[str, Tuple[int, int]]]:
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
    """Splits a string into pairs of (ngram words, their start/end indices)"""
    tokens_with_indices = split_indices(s)

    # Generator of ngrams of (word, idx_pairs)
    # (
    #   [(word, (start,end)), (word, (start, end))...],
    #   [(word, (start, end)), ...],
    #   ...
    # )
    ngram_seqs_with_indices = form_ngrams(tokens_with_indices, n)

    # Generator of pairs of word and index ngrams
    # (
    #   ([word, word, ...], [(start,end), (start,end), ...]),
    #   ...
    # )
Fabrizio Milo's avatar
Fabrizio Milo committed
98
99
100
    ngram_indices_pairs = (
        zip(*ngram_with_indices) for ngram_with_indices in ngram_seqs_with_indices
    )
101
102

    # Generator of ( (word_ngram, (start, end)), (word_ngram, start, end)), ...)
Fabrizio Milo's avatar
Fabrizio Milo committed
103
104
105
106
    return (
        (" ".join(ngram_seq), (indices[0][0], indices[-1][1]))
        for ngram_seq, indices in ngram_indices_pairs
    )
107
108
109
110
111


class Janitor:
    # FIXME delete_chars: Should anything else go here? Special chars?
    def __init__(
Fabrizio Milo's avatar
Fabrizio Milo committed
112
        self,
Ethan Smith's avatar
Ethan Smith committed
113
114
115
116
117
118
        ngram_n: int = 13,
        window_to_remove: int = 200,
        too_dirty_cutoff: int = 10,
        minimum_slice_length: int = 200,
        delete_chars: str = string.punctuation,
    ) -> None:
119
120
121
122
123
124
125
126
127
128
129
130
131
132
        self.ngram_n = ngram_n
        self.window_to_remove = window_to_remove
        self.too_dirty_cutoff = too_dirty_cutoff
        self.minimum_slice_length = minimum_slice_length
        self.delete_chars = delete_chars

        self.dirt_ngrams = set()

        # If in python, we'll translate uppercase to lowercase and delete naughty characters.
        # This is fast by python standards
        # https://stackoverflow.com/questions/638893/what-is-the-most-efficient-way-in-python-to-convert-a-string-to-all-lowercase-st
        self.translation_table = str.maketrans(
            string.ascii_lowercase + string.ascii_uppercase,  # These characters
            string.ascii_lowercase * 2,  # Become these characters
Fabrizio Milo's avatar
Fabrizio Milo committed
133
            self.delete_chars,  # These are deleted
134
135
136
137
138
139
        )

    ##############
    # I/O for saving contamination ngrams
    ##############

Ethan Smith's avatar
Ethan Smith committed
140
    def save_contamination_ngrams(self, filename: str) -> None:
Fabrizio Milo's avatar
Fabrizio Milo committed
141
        with open(filename, "wb") as fp:
142
143
            pickle.dump(filename, fp)

Ethan Smith's avatar
Ethan Smith committed
144
    def load_contamination_ngrams(self, filename: str) -> None:
Fabrizio Milo's avatar
Fabrizio Milo committed
145
        with open(filename, "rb") as fp:
146
147
148
149
150
151
            self.dirt_ngrams = pickle.load(fp)

    ##############
    # Call these :)
    ##############

Ethan Smith's avatar
Ethan Smith committed
152
    def register_contaminant(self, dirt_string: str) -> None:
153
154
155
156
157
158
159
160
        """Register a string as contamination to be removed, e.g. a test set
        This breaks the dirt_string into ngrams to store for future cleaning"""
        if JANITOR_CPP:
            return self.register_contaminant_cpp(dirt_string)
        else:
            print("WARNING: Janitor running in python mode")
            return self.register_contaminant_python(dirt_string)

161
    def clean(self, dirty_string: str) -> List[str]:
162
        """Clean a string (e.g. a training set) by removing all ngrams previously
Fabrizio Milo's avatar
Fabrizio Milo committed
163
        registered as contaminants. Returns a list of clean chunks, or empty if
164
165
166
167
168
169
170
        the string was too dirty"""
        if JANITOR_CPP:
            return self.clean_cpp(dirty_string)
        else:
            print("WARNING: Janitor running in python mode")
            return self.clean_python(dirty_string)

Ethan Smith's avatar
Ethan Smith committed
171
    def _split_chunks(
172
173
        self, dirty_string: str, dirty_parts: Sequence[Tuple]
    ) -> List[str]:
174
175
        clean_chunks = []
        splice_idx = 0
researcher2's avatar
researcher2 committed
176
        end = -1
177
        for i, (ngram, start, end) in enumerate(dirty_parts):
researcher2's avatar
researcher2 committed
178
            if i >= self.too_dirty_cutoff:
179
180
181
182
183
                return []
            start = max(0, start - self.window_to_remove)
            end = min(len(dirty_string), end + self.window_to_remove)

            if start - splice_idx > self.minimum_slice_length:
Fabrizio Milo's avatar
Fabrizio Milo committed
184
                clean_chunks.append(dirty_string[splice_idx:start])
185
186
            splice_idx = end

researcher2's avatar
researcher2 committed
187
        if end < len(dirty_string) - self.minimum_slice_length:
Fabrizio Milo's avatar
Fabrizio Milo committed
188
            clean_chunks.append(dirty_string[end + 1 :])
researcher2's avatar
researcher2 committed
189

190
191
192
193
194
195
        return clean_chunks

    ##############
    # Fast C++
    ##############

Ethan Smith's avatar
Ethan Smith committed
196
    def register_contaminant_cpp(self, dirt_string) -> None:
Fabrizio Milo's avatar
Fabrizio Milo committed
197
198
199
        self.dirt_ngrams.update(
            janitor_util.clean_ngram(dirt_string, self.delete_chars, self.ngram_n)
        )
200

201
    def clean_cpp(self, dirty_string: str) -> List[str]:
Fabrizio Milo's avatar
Fabrizio Milo committed
202
203
204
        contamination_indices = janitor_util.clean_ngram_with_indices(
            dirty_string, self.delete_chars, self.ngram_n
        )
205
206
207
208
209
210
        return self._split_chunks(dirty_string, contamination_indices)

    ##############
    # Slow python
    ##############

Ethan Smith's avatar
Ethan Smith committed
211
    def normalize_string(self, s: str) -> str:
212
213
        return s.translate(self.translation_table)

Ethan Smith's avatar
Ethan Smith committed
214
    def register_contaminant_python(self, dirt_string: str) -> None:
Fabrizio Milo's avatar
Fabrizio Milo committed
215
216
217
        self.dirt_ngrams.update(
            word_ngrams(self.normalize_string(dirt_string), self.ngram_n)
        )
218

219
    def clean_python(self, dirty_string: str) -> List[str]:
220
221
222
223
224
225
226
227
228
229
230
231
        contamination_indices = (
            (None, *idx_pair)
            for dirty_ngram, idx_pair in word_ngrams_indices(dirty_string, self.ngram_n)
            if self.normalize_string(dirty_ngram) in self.dirt_ngrams
        )
        return self._split_chunks(dirty_string, contamination_indices)


##################################################################
# Tests
#################################################################

researcher2's avatar
researcher2 committed
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
# def print_cpp():
#     source = """   ,, I'm a very !dirty,, ,,  dirty boy. Clean me daddy. \n\nhe he he hehe heh.  lastword  """ * 2

#     for i in range(1, 10, 2):
#         pprint(janitor_util.clean_ngram(source, string.punctuation, i))
#         for ngram, start, end in \
#                 janitor_util.clean_ngram_with_indices(source, string.punctuation, i):
#             print(ngram, "\t", start, end, source[start:end].replace("\n", "\\n"))


# def test_cpp():
#     source = """   ,, I'm a very !dirty,, ,,  dirty boy. Clean me daddy. \n\nhe he he hehe heh.  lastword  """ * 2
#     contaminant = "dirty boy. Clean he he"

#     jan_python = Janitor()
#     jan_cpp = Janitor()

#     jan_python.register_contaminant_python(contaminant)
#     jan_cpp.register_contaminant(contaminant)

#     assert jan_python.dirt_ngrams == jan_cpp.dirt_ngrams, (jan_python.dirt_ngrams, jan_cpp.dirt_ngrams)

#     assert jan_python.clean_python(source) == jan_cpp.clean(source), \
#         (jan_python.clean_python(source), jan_cpp.clean(source))

#     print("Passed test, python==cpp")


# def benchmark():
#     # Download and put in data folder: enwik8 (100 MB) from https://cs.fit.edu/~mmahoney/compression/textdata.html
#     setup = \
#         """
#         with open("data/enwik8", "r") as f:
#             data = f.read()
#         jan = Janitor(too_dirty_cutoff=1000)
#         jan.register_contaminant('''
Fabrizio Milo's avatar
Fabrizio Milo committed
268
#         theories is that there is a connection between &quot;geekdom&quot; and autism.
researcher2's avatar
researcher2 committed
269
#         This is hinted, for instance, by a ''Wired Magazine'' article in 2001 entitled &quot;
Fabrizio Milo's avatar
Fabrizio Milo committed
270
271
#         The [[Geek]] Syndrome&quot;, which is a point argued by many in the autism rights
#         movement{{ref|Wired}}.  This article, many professionals assert, is just one example of
researcher2's avatar
researcher2 committed
272
273
#         the media's application of mental disease labels to what is actually variant normal behavior
#         &amp;mdash;they argue that shyness, lack of athletic ability or social skills, and intellectual
Fabrizio Milo's avatar
Fabrizio Milo committed
274
#         interests, even when they seem unusual to others, are not in themselves signs of autism or
researcher2's avatar
researcher2 committed
275
276
277
278
279
#         Asperger's syndrome. Others assert that it is actually the medical profession which is applying
#         mental disease labels to children who in the past would have simply been accepted as a little
#         different or even labeled 'gifted'. See [[clinomorphism]] for further discussion of this issue.
#         Due to the recent publicity surrounding autism and autis
#         ultan Al Nahyan]] granted [[Petroleum]] concessions, and oil was first found in 1958.  At first,
Fabrizio Milo's avatar
Fabrizio Milo committed
280
281
#         oil money had a marginal impact.  A few lowrise concete buildings were erected, and the first
#         paved road was completed in 1961, but Sheikh Shakbut, uncertain whether the new oil royalties
Fabrizio Milo's avatar
Fabrizio Milo committed
282
#         would last, took a cautious approach, preferring to save the revenue rather than investing it in
Fabrizio Milo's avatar
Fabrizio Milo committed
283
284
285
286
287
288
289
290
#         development.  His brother, [[Zayed bin Sultan Al Nahayan]], saw that oil wealth had the potential
#         to transform Abu Dhabi.  The ruling Al Nahayan family decided that Sheikh Zayed should replace his
#         brother as Ruler and carry out his vision of developing the country.  On [[August 6]], [[1966]],
#         with the assistance of the British, Sheikh Zayed became the new ruler.  See generally, Al-Fahim, M,
#         ''From Rags to Riches: A Story of Abu Dhabi'', Chapter Six (London Centre of Arab Studies, 1995),
#         ISBN 1 900404 00 1. With the announcement by Britain in 1968 that it would withdraw from the
#         Gulf area by 1971, Sheikh Zayed became the main driving force behind the formation of the
#         [[United Arab Emirates]]. After the Emirates gained independence in 1971,
researcher2's avatar
researcher2 committed
291
292
293
294
295
296
297
298
299
300
301
302
#         ''')
#         """

#     n = 1
#     print(f"Timing {n} run on 100 MB")
#     print("Register contaminant")
#     # print("\tPython", timeit.timeit("jan.register_contaminant_python(data)", setup=setup, globals=globals(), number=n))
#     print("\tCpp", timeit.timeit("jan.register_contaminant(data)", setup=setup, globals=globals(), number=n))

#     print("Clean")
#     # print("\tPython", timeit.timeit("jan.clean_python(data)", setup=setup, globals=globals(), number=n))
#     print("\tCpp", timeit.timeit("jan.clean(data)", setup=setup, globals=globals(), number=n))
303
304


researcher2's avatar
researcher2 committed
305
306
307
# def test_janitor_general():
#     source = """   ,, I'm a very !dirty,, ,,  dirty boy. Clean me daddy. \n\nhe he he hehe heh.  lastword  """ * 2
#     contaminant = "dirty boy. Clean he he"
308

researcher2's avatar
researcher2 committed
309
310
311
312
313
#     jan = Janitor(ngram_n=3)
#     jan.register_contaminant(contaminant)
#     cleaned = " ".join(jan.clean(source))
#     for contam in jan.dirt_ngrams:
#         assert contam not in cleaned, contam
314

researcher2's avatar
researcher2 committed
315
316
#     filename = "data/saved_contam"
#     jan.save_contamination_ngrams(filename)
317

researcher2's avatar
researcher2 committed
318
319
320
321
322
#     jan = Janitor(ngram_n=3)
#     jan.load_contamination_ngrams(filename)
#     cleaned = " ".join(jan.clean(source))
#     for contam in jan.dirt_ngrams:
#         assert contam not in cleaned, contam
323
324


researcher2's avatar
researcher2 committed
325
326
327
328
329
# if __name__ == "__main__":
#     test()
#     # print_cpp()
#     # test_cpp()
#     # benchmark()