# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import itertools
import json
from lsh import cache, minhash
import time
import pickle
import sys


# This function is adapted from:
#   https://github.com/mattilyra/LSH/blob/master/examples/Introduction.ipynb
def shingles(text, char_ngram=5):
    """Return the set of overlapping character n-grams of *text*.

    The upper bound is ``len(text) - char_ngram + 1`` so that the final
    n-gram is included; without the ``+ 1`` a text of exactly
    ``char_ngram`` characters would yield an empty set.
    """
    return set(text[head:head + char_ngram]
               for head in range(0, len(text) - char_ngram + 1))


# This function is adapted from:
#  https://github.com/mattilyra/LSH/blob/master/examples/Introduction.ipynb
def jaccard(set_a, set_b):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two sets.

    Two empty sets have an empty union; return 0.0 in that case instead
    of raising ZeroDivisionError (short documents can produce empty
    shingle sets).
    """
    intersection = set_a & set_b
    union = set_a | set_b
    if not union:
        return 0.0
    return len(intersection) / len(union)


if __name__ == '__main__':

    # Find near-duplicate documents across one or more JSON-lines corpora:
    # 1) fingerprint every document with MinHash and insert it into an LSH
    #    cache, 2) verify every LSH candidate pair with exact Jaccard
    #    similarity on character shingles, 3) write the duplicates found
    #    for each bucket representative to --output as JSON lines.

    print('parsing the inputs ...')

    parser = argparse.ArgumentParser()
    parser.add_argument('--inputs', nargs='*', default=None, help='Pairwise'
                        ' list of the input files and keys, e.g. --inputs '
                        ' cc.json cc_id news.json news_id')
    parser.add_argument('--load-fingerprints', type=str, default=None,
                        help='Load the fingerprints from pickle file.')
    parser.add_argument('--save-fingerprints', type=str, default=None,
                        help='Save the fingerprints of the inputs.')
    parser.add_argument('--output', type=str,
                        help='Output file name.')
    args = parser.parse_args()

    print('finding possible duplicate content ...')

    # MinHash + banded LSH: documents whose fingerprints collide in at
    # least one of the 10 bands become duplicate candidates.
    hasher = minhash.MinHasher(seeds=100, char_ngram=5, hashbytes=4)
    lshcache = cache.Cache(bands=10, hasher=hasher)

    # url -> document text; needed later to verify candidates with Jaccard.
    url_doc = {}

    # Load fingerprints from a previous run if requested.
    # NOTE(review): pickle.load on untrusted files can execute arbitrary
    # code — only load fingerprint files you produced yourself.
    if args.load_fingerprints is not None:
        print("Loading fingerprints from pickle file {}".format(
            args.load_fingerprints), flush=True)
        with open(args.load_fingerprints, "rb") as f:
            lshcache = pickle.load(f)
            url_doc = pickle.load(f)

    counter = 0
    start_time = time.time()

    print("Computing fingerprints", flush=True)

    # --inputs is a flat list of alternating (file, key) pairs.
    input_pairs = 0 if args.inputs is None else len(args.inputs) // 2
    for i in range(input_pairs):
        input_file = args.inputs[2 * i]
        key = args.inputs[2 * i + 1]
        print(' document processing {} with key {}'.format(input_file, key),
              flush=True)
        with open(input_file, 'r') as f:
            for line in f:
                # Best-effort per line: a malformed record is reported and
                # skipped rather than aborting the whole pass.
                try:
                    myjson = json.loads(line)
                    url = myjson[key]
                    text = myjson['text']
                    counter += 1
                    url_doc[url] = text
                    lshcache.add_fingerprint(hasher.fingerprint(text), url)
                except Exception as e:
                    print('Error:', e)
                if counter % 10000 == 0:
                    print(' [read]> processed {} documents in {:.2f} '
                          'seconds ...'.format(counter,
                                               time.time() - start_time),
                          flush=True)

    # Save the fingerprints so later runs can skip the read pass.
    if args.save_fingerprints is not None:
        print("Saving fingerprints to pickle file {}".format(
            args.save_fingerprints), flush=True)
        with open(args.save_fingerprints, 'wb') as f:
            pickle.dump(lshcache, f)
            pickle.dump(url_doc, f)

    counter = 0
    start_time = time.time()
    deduped = 0
    with open(args.output, 'wb') as f:
        # Every LSH bucket with more than one member holds duplicate
        # candidates; verify each against the bucket's first document
        # using exact Jaccard similarity on character shingles.
        for b in lshcache.bins:
            for bucket_id in b:
                if len(b[bucket_id]) > 1:
                    items = list(b[bucket_id])
                    main_url = items[0]
                    main_shingles = shingles(url_doc[main_url])
                    remove_urls = []
                    for i in range(1, len(items)):
                        counter += 1
                        other_url = items[i]
                        other_shingles = shingles(url_doc[other_url])
                        # Initialize before the try so a raised exception
                        # cannot leave jaccard_sim unbound (or stale from
                        # the previous iteration) at the threshold test.
                        jaccard_sim = 0.0
                        try:
                            jaccard_sim = jaccard(main_shingles,
                                                  other_shingles)
                        except Exception as e:
                            print('Error:', e)
                        if jaccard_sim > 0.5:
                            remove_urls.append({other_url: jaccard_sim})
                            deduped += 1
                        if counter % 10000 == 0:
                            print(' [write]> processed {} documents in {:.2f} '
                                  'seconds and deduped {} documents ...'.
                                  format(counter, time.time() - start_time,
                                         deduped), flush=True)
                    if len(remove_urls) > 0:
                        myjson = json.dumps({main_url: remove_urls},
                                            ensure_ascii=False)
                        f.write(myjson.encode('utf-8'))
                        f.write('\n'.encode('utf-8'))

    print('done :-)')