janitor.py 12.9 KB
Newer Older
1
2
3
4
5
import re
import string
import pickle
import traceback
from pprint import pprint
6
from typing import Iterator, Sequence, TypeVar, List, Tuple
7
8
9
10
11

# This is a cpp module. Compile janitor_util.cpp with:
# c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) -undefined dynamic_lookup
try:
    import janitor_util
Fabrizio Milo's avatar
Fabrizio Milo committed
12

13
    JANITOR_CPP = True
14
except Exception:
15
16
17
18
    print("WARNING: C++ module could not be loaded. Janitor running in python mode")
    traceback.print_exc()
    JANITOR_CPP = False

Ethan Smith's avatar
Ethan Smith committed
19
T = TypeVar("T")
20

Ethan Smith's avatar
Ethan Smith committed
21

22
23
# Implementation from nltk source
# https://www.nltk.org/_modules/nltk/util.html
24
def form_ngrams(sequence: Iterator[T], n: int) -> Iterator[Tuple[T, ...]]:
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
    history = []
    while n > 1:
        # PEP 479, prevent RuntimeError from being raised when StopIteration bubbles out of generator
        try:
            next_item = next(sequence)
        except StopIteration:
            # no more data, terminate the generator
            return
        history.append(next_item)
        n -= 1
    for item in sequence:
        history.append(item)
        yield tuple(history)
        del history[0]


Ethan Smith's avatar
Ethan Smith committed
41
def word_ngrams(s: str, n: int) -> Iterator[str]:
42
43
44
45
46
    """Splits a string into ngram words"""
    tokens = s.split()  # not a generator :(
    ngram_seqs = form_ngrams(iter(tokens), n)
    return (" ".join(ngram) for ngram in ngram_seqs)

Fabrizio Milo's avatar
Fabrizio Milo committed
47

researcher2's avatar
researcher2 committed
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# Does character sequences only - combined faster function to play around with later
# def word_ngrams_indices_combined(sequence, n):
#     current_word = ""
#     history = []
#     gap = False;
#     start = 0
#     end = 0
#     for character in sequence:
#         if character == " ":
#             if not gap:
#                 gap = True
#                 history.append(current_word)
#                 end += len(current_word) - 1
#                 current_word = ""
#                 if len(history) == n:
#                     yield (tuple(history), start, end)
#                     del history[0]
#                     start = end + 1
#                     end = start
#         else:
#             gap = False
#             current_word += character

71
72

# https://stackoverflow.com/questions/13734451/string-split-with-indices-in-python
73
def split_indices(s: str) -> Iterator[Tuple[str, Tuple[int, int]]]:
74
75
76
    """Splits a string on whitespaces and records the indices of each in the original string.
    @:return generator((word, (start_idx, end_idx)), ...)
    """
Fabrizio Milo's avatar
Fabrizio Milo committed
77
    return ((m.group(0), (m.start(), m.end() - 1)) for m in re.finditer(r"\S+", s))
78
79


80
def word_ngrams_indices(s: str, n: int) -> Iterator[Tuple[str, Tuple[int, int]]]:
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
    """Splits a string into pairs of (ngram words, their start/end indices)"""
    tokens_with_indices = split_indices(s)

    # Generator of ngrams of (word, idx_pairs)
    # (
    #   [(word, (start,end)), (word, (start, end))...],
    #   [(word, (start, end)), ...],
    #   ...
    # )
    ngram_seqs_with_indices = form_ngrams(tokens_with_indices, n)

    # Generator of pairs of word and index ngrams
    # (
    #   ([word, word, ...], [(start,end), (start,end), ...]),
    #   ...
    # )
Fabrizio Milo's avatar
Fabrizio Milo committed
97
98
99
    ngram_indices_pairs = (
        zip(*ngram_with_indices) for ngram_with_indices in ngram_seqs_with_indices
    )
100
101

    # Generator of ( (word_ngram, (start, end)), (word_ngram, start, end)), ...)
Fabrizio Milo's avatar
Fabrizio Milo committed
102
103
104
105
    return (
        (" ".join(ngram_seq), (indices[0][0], indices[-1][1]))
        for ngram_seq, indices in ngram_indices_pairs
    )
106
107
108
109
110


class Janitor:
    # FIXME delete_chars: Should anything else go here? Special chars?
    def __init__(
Fabrizio Milo's avatar
Fabrizio Milo committed
111
        self,
Ethan Smith's avatar
Ethan Smith committed
112
113
114
115
116
117
        ngram_n: int = 13,
        window_to_remove: int = 200,
        too_dirty_cutoff: int = 10,
        minimum_slice_length: int = 200,
        delete_chars: str = string.punctuation,
    ) -> None:
118
119
120
121
122
123
124
125
126
127
128
129
130
131
        self.ngram_n = ngram_n
        self.window_to_remove = window_to_remove
        self.too_dirty_cutoff = too_dirty_cutoff
        self.minimum_slice_length = minimum_slice_length
        self.delete_chars = delete_chars

        self.dirt_ngrams = set()

        # If in python, we'll translate uppercase to lowercase and delete naughty characters.
        # This is fast by python standards
        # https://stackoverflow.com/questions/638893/what-is-the-most-efficient-way-in-python-to-convert-a-string-to-all-lowercase-st
        self.translation_table = str.maketrans(
            string.ascii_lowercase + string.ascii_uppercase,  # These characters
            string.ascii_lowercase * 2,  # Become these characters
Fabrizio Milo's avatar
Fabrizio Milo committed
132
            self.delete_chars,  # These are deleted
133
134
135
136
137
138
        )

    ##############
    # I/O for saving contamination ngrams
    ##############

Ethan Smith's avatar
Ethan Smith committed
139
    def save_contamination_ngrams(self, filename: str) -> None:
Fabrizio Milo's avatar
Fabrizio Milo committed
140
        with open(filename, "wb") as fp:
141
142
            pickle.dump(filename, fp)

Ethan Smith's avatar
Ethan Smith committed
143
    def load_contamination_ngrams(self, filename: str) -> None:
Fabrizio Milo's avatar
Fabrizio Milo committed
144
        with open(filename, "rb") as fp:
145
146
147
148
149
150
            self.dirt_ngrams = pickle.load(fp)

    ##############
    # Call these :)
    ##############

Ethan Smith's avatar
Ethan Smith committed
151
    def register_contaminant(self, dirt_string: str) -> None:
152
153
154
155
156
157
158
159
        """Register a string as contamination to be removed, e.g. a test set
        This breaks the dirt_string into ngrams to store for future cleaning"""
        if JANITOR_CPP:
            return self.register_contaminant_cpp(dirt_string)
        else:
            print("WARNING: Janitor running in python mode")
            return self.register_contaminant_python(dirt_string)

160
    def clean(self, dirty_string: str) -> List[str]:
161
        """Clean a string (e.g. a training set) by removing all ngrams previously
Fabrizio Milo's avatar
Fabrizio Milo committed
162
        registered as contaminants. Returns a list of clean chunks, or empty if
163
164
165
166
167
168
169
        the string was too dirty"""
        if JANITOR_CPP:
            return self.clean_cpp(dirty_string)
        else:
            print("WARNING: Janitor running in python mode")
            return self.clean_python(dirty_string)

Ethan Smith's avatar
Ethan Smith committed
170
    def _split_chunks(
171
172
        self, dirty_string: str, dirty_parts: Sequence[Tuple]
    ) -> List[str]:
173
174
        clean_chunks = []
        splice_idx = 0
researcher2's avatar
researcher2 committed
175
        end = -1
176
        for i, (ngram, start, end) in enumerate(dirty_parts):
researcher2's avatar
researcher2 committed
177
            if i >= self.too_dirty_cutoff:
178
179
180
181
182
                return []
            start = max(0, start - self.window_to_remove)
            end = min(len(dirty_string), end + self.window_to_remove)

            if start - splice_idx > self.minimum_slice_length:
Fabrizio Milo's avatar
Fabrizio Milo committed
183
                clean_chunks.append(dirty_string[splice_idx:start])
184
185
            splice_idx = end

researcher2's avatar
researcher2 committed
186
        if end < len(dirty_string) - self.minimum_slice_length:
Fabrizio Milo's avatar
Fabrizio Milo committed
187
            clean_chunks.append(dirty_string[end + 1 :])
researcher2's avatar
researcher2 committed
188

189
190
191
192
193
194
        return clean_chunks

    ##############
    # Fast C++
    ##############

Ethan Smith's avatar
Ethan Smith committed
195
    def register_contaminant_cpp(self, dirt_string) -> None:
Fabrizio Milo's avatar
Fabrizio Milo committed
196
197
198
        self.dirt_ngrams.update(
            janitor_util.clean_ngram(dirt_string, self.delete_chars, self.ngram_n)
        )
199

200
    def clean_cpp(self, dirty_string: str) -> List[str]:
Fabrizio Milo's avatar
Fabrizio Milo committed
201
202
203
        contamination_indices = janitor_util.clean_ngram_with_indices(
            dirty_string, self.delete_chars, self.ngram_n
        )
204
205
206
207
208
209
        return self._split_chunks(dirty_string, contamination_indices)

    ##############
    # Slow python
    ##############

Ethan Smith's avatar
Ethan Smith committed
210
    def normalize_string(self, s: str) -> str:
211
212
        return s.translate(self.translation_table)

Ethan Smith's avatar
Ethan Smith committed
213
    def register_contaminant_python(self, dirt_string: str) -> None:
Fabrizio Milo's avatar
Fabrizio Milo committed
214
215
216
        self.dirt_ngrams.update(
            word_ngrams(self.normalize_string(dirt_string), self.ngram_n)
        )
217

218
    def clean_python(self, dirty_string: str) -> List[str]:
219
220
221
222
223
224
225
226
227
228
229
230
        contamination_indices = (
            (None, *idx_pair)
            for dirty_ngram, idx_pair in word_ngrams_indices(dirty_string, self.ngram_n)
            if self.normalize_string(dirty_ngram) in self.dirt_ngrams
        )
        return self._split_chunks(dirty_string, contamination_indices)


##################################################################
# Tests
#################################################################

researcher2's avatar
researcher2 committed
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
# def print_cpp():
#     source = """   ,, I'm a very !dirty,, ,,  dirty boy. Clean me daddy. \n\nhe he he hehe heh.  lastword  """ * 2

#     for i in range(1, 10, 2):
#         pprint(janitor_util.clean_ngram(source, string.punctuation, i))
#         for ngram, start, end in \
#                 janitor_util.clean_ngram_with_indices(source, string.punctuation, i):
#             print(ngram, "\t", start, end, source[start:end].replace("\n", "\\n"))


# def test_cpp():
#     source = """   ,, I'm a very !dirty,, ,,  dirty boy. Clean me daddy. \n\nhe he he hehe heh.  lastword  """ * 2
#     contaminant = "dirty boy. Clean he he"

#     jan_python = Janitor()
#     jan_cpp = Janitor()

#     jan_python.register_contaminant_python(contaminant)
#     jan_cpp.register_contaminant(contaminant)

#     assert jan_python.dirt_ngrams == jan_cpp.dirt_ngrams, (jan_python.dirt_ngrams, jan_cpp.dirt_ngrams)

#     assert jan_python.clean_python(source) == jan_cpp.clean(source), \
#         (jan_python.clean_python(source), jan_cpp.clean(source))

#     print("Passed test, python==cpp")


# def benchmark():
#     # Download and put in data folder: enwik8 (100 MB) from https://cs.fit.edu/~mmahoney/compression/textdata.html
#     setup = \
#         """
#         with open("data/enwik8", "r") as f:
#             data = f.read()
#         jan = Janitor(too_dirty_cutoff=1000)
#         jan.register_contaminant('''
Fabrizio Milo's avatar
Fabrizio Milo committed
267
#         theories is that there is a connection between &quot;geekdom&quot; and autism.
researcher2's avatar
researcher2 committed
268
#         This is hinted, for instance, by a ''Wired Magazine'' article in 2001 entitled &quot;
Fabrizio Milo's avatar
Fabrizio Milo committed
269
270
#         The [[Geek]] Syndrome&quot;, which is a point argued by many in the autism rights
#         movement{{ref|Wired}}.  This article, many professionals assert, is just one example of
researcher2's avatar
researcher2 committed
271
272
#         the media's application of mental disease labels to what is actually variant normal behavior
#         &amp;mdash;they argue that shyness, lack of athletic ability or social skills, and intellectual
Fabrizio Milo's avatar
Fabrizio Milo committed
273
#         interests, even when they seem unusual to others, are not in themselves signs of autism or
researcher2's avatar
researcher2 committed
274
275
276
277
278
#         Asperger's syndrome. Others assert that it is actually the medical profession which is applying
#         mental disease labels to children who in the past would have simply been accepted as a little
#         different or even labeled 'gifted'. See [[clinomorphism]] for further discussion of this issue.
#         Due to the recent publicity surrounding autism and autis
#         ultan Al Nahyan]] granted [[Petroleum]] concessions, and oil was first found in 1958.  At first,
Fabrizio Milo's avatar
Fabrizio Milo committed
279
280
#         oil money had a marginal impact.  A few lowrise concete buildings were erected, and the first
#         paved road was completed in 1961, but Sheikh Shakbut, uncertain whether the new oil royalties
Fabrizio Milo's avatar
Fabrizio Milo committed
281
#         would last, took a cautious approach, preferring to save the revenue rather than investing it in
Fabrizio Milo's avatar
Fabrizio Milo committed
282
283
284
285
286
287
288
289
#         development.  His brother, [[Zayed bin Sultan Al Nahayan]], saw that oil wealth had the potential
#         to transform Abu Dhabi.  The ruling Al Nahayan family decided that Sheikh Zayed should replace his
#         brother as Ruler and carry out his vision of developing the country.  On [[August 6]], [[1966]],
#         with the assistance of the British, Sheikh Zayed became the new ruler.  See generally, Al-Fahim, M,
#         ''From Rags to Riches: A Story of Abu Dhabi'', Chapter Six (London Centre of Arab Studies, 1995),
#         ISBN 1 900404 00 1. With the announcement by Britain in 1968 that it would withdraw from the
#         Gulf area by 1971, Sheikh Zayed became the main driving force behind the formation of the
#         [[United Arab Emirates]]. After the Emirates gained independence in 1971,
researcher2's avatar
researcher2 committed
290
291
292
293
294
295
296
297
298
299
300
301
#         ''')
#         """

#     n = 1
#     print(f"Timing {n} run on 100 MB")
#     print("Register contaminant")
#     # print("\tPython", timeit.timeit("jan.register_contaminant_python(data)", setup=setup, globals=globals(), number=n))
#     print("\tCpp", timeit.timeit("jan.register_contaminant(data)", setup=setup, globals=globals(), number=n))

#     print("Clean")
#     # print("\tPython", timeit.timeit("jan.clean_python(data)", setup=setup, globals=globals(), number=n))
#     print("\tCpp", timeit.timeit("jan.clean(data)", setup=setup, globals=globals(), number=n))
302
303


researcher2's avatar
researcher2 committed
304
305
306
# def test_janitor_general():
#     source = """   ,, I'm a very !dirty,, ,,  dirty boy. Clean me daddy. \n\nhe he he hehe heh.  lastword  """ * 2
#     contaminant = "dirty boy. Clean he he"
307

researcher2's avatar
researcher2 committed
308
309
310
311
312
#     jan = Janitor(ngram_n=3)
#     jan.register_contaminant(contaminant)
#     cleaned = " ".join(jan.clean(source))
#     for contam in jan.dirt_ngrams:
#         assert contam not in cleaned, contam
313

researcher2's avatar
researcher2 committed
314
315
#     filename = "data/saved_contam"
#     jan.save_contamination_ngrams(filename)
316

researcher2's avatar
researcher2 committed
317
318
319
320
321
#     jan = Janitor(ngram_n=3)
#     jan.load_contamination_ngrams(filename)
#     cleaned = " ".join(jan.clean(source))
#     for contam in jan.dirt_ngrams:
#         assert contam not in cleaned, contam
322
323


researcher2's avatar
researcher2 committed
324
325
326
327
328
# if __name__ == "__main__":
#     test()
#     # print_cpp()
#     # test_cpp()
#     # benchmark()