addressed the comments

ebc4a408 · Mostofa Patwary · 5a867963 · ebc4a408
Commit ebc4a408 authored Feb 22, 2021 by Mostofa Patwary
Hide whitespace changes
Inline Side-by-side

Showing with 29 additions and 29 deletions

tools/openwebtext/find_duplicates.py tools/openwebtext/find_duplicates.py +29 -29

No files found.
--- a/tools/openwebtext/find_duplicates.py
+++ b/tools/openwebtext/find_duplicates.py
@@ -39,9 +39,11 @@ def jaccard(set_a, set_b):

 if __name__ == '__main__':

-    print('parsing the inputs ...')
+    print('parsing the arguments ...')

    parser = argparse.ArgumentParser()
+    parser.add_argument('--seed', type=int, default=1234,
+                       help='Random seed used for python, numpy')
    parser.add_argument('--inputs', nargs = '*', default=None, help = \
                        'Pairwise list of the input files and keys, '
                        'e.g. --inputs cc.json cc_id news.json news_id')
@@ -58,7 +60,7 @@ if __name__ == '__main__':
    print('finding possible duplicate content ...')

    # set seed and get an array of seeds of 100 integers
-    np.random.seed(1234)
+    np.random.seed(args.seed)
    seeds = np.random.randint(0, 1e6, size=100)

    # initialize minhash and lsh cache
@@ -69,10 +71,7 @@ if __name__ == '__main__':

    # load fingerprints from pickle file if needed
    if args.load_fingerprints is not None:
-        count_fingerprints = len(args.load_fingerprints)
-
-        for count_fp in range(count_fingerprints):
-            fp_file_name = args.load_fingerprints[count_fp]
+        for count_fp, fp_file_name in enumerate(args.load_fingerprints):
            print("Loading fingerprints from pickle file {}".format(
                fp_file_name), flush=True)
            fp = open(fp_file_name, "rb")
@@ -87,6 +86,7 @@ if __name__ == '__main__':
                for url in local_lshcache.fingerprints.keys():
                    url_doc[url] = local_url_doc[url]
                    lshcache.add_fingerprint(local_lshcache.fingerprints[url], url)
+            fp.close()

    counter = 0
    start_time = time.time()
@@ -94,29 +94,28 @@ if __name__ == '__main__':
    print("Computing fingerprints", flush=True)

    # compute finger prints of the inputs if any
-    input_pairs = 0 if args.inputs is None else int(len(args.inputs)/2)
-    for input_pair in range(input_pairs):
-        # input file and the key to use as id
-        input_file = args.inputs[2 * input_pair]
-        key = args.inputs[2 * input_pair + 1]
-        print(' document processing {} with key {}'.format(input_file, key),
-            flush=True)
-        # traverse all the texts and add fingerprints
-        with open(input_file, 'r') as f_input:
-            for line in f_input:
-                try:
-                    myjson = json.loads(line)
-                    url = myjson[key]
-                    text = myjson['text']
-                    counter += 1
-                    url_doc[url] = text
-                    lshcache.add_fingerprint(hasher.fingerprint(text), url)
-                except Exception as e:
-                    print('Error:', e)
-                if counter % 10000 == 0:
-                    print(' [read]> processed {} documents in {:.2f} '
-                        'seconds ...'.format(counter, time.time() - \
-                        start_time), flush=True)
+    # input file and the key to use as id
+    if args.inputs is not None:
+        assert len(args.inputs) % 2 == 0
+        for input_file, key in zip(args.inputs[::2], args.inputs[1::2]):
+            print(' document processing {} with key {}'.format(input_file, key),
+                flush=True)
+            # traverse all the texts and add fingerprints
+            with open(input_file, 'r') as f_input:
+                for line in f_input:
+                    try:
+                        myjson = json.loads(line)
+                        url = myjson[key]
+                        text = myjson['text']
+                        counter += 1
+                        url_doc[url] = text
+                        lshcache.add_fingerprint(hasher.fingerprint(text), url)
+                    except Exception as e:
+                        print('Error:', e)
+                    if counter % 10000 == 0:
+                        print(' [read]> processed {} documents in {:.2f} '
+                            'seconds ...'.format(counter, time.time() - \
+                            start_time), flush=True)

    # Save the fingerprints if needed
    if args.save_fingerprints is not None:
@@ -160,5 +159,6 @@ if __name__ == '__main__':
                                            ensure_ascii=False)
                        f_out.write(myjson.encode('utf-8'))
                        f_out.write('\n'.encode('utf-8'))
+        f_out.close()

    print('done :-)')