Commit ebc4a408 authored by Mostofa Patwary
Browse files

addressed the comments

parent 5a867963
...@@ -39,9 +39,11 @@ def jaccard(set_a, set_b): ...@@ -39,9 +39,11 @@ def jaccard(set_a, set_b):
if __name__ == '__main__': if __name__ == '__main__':
print('parsing the inputs ...') print('parsing the arguments ...')
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('--seed', type=int, default=1234,
help='Random seed used for python, numpy')
parser.add_argument('--inputs', nargs = '*', default=None, help = \ parser.add_argument('--inputs', nargs = '*', default=None, help = \
'Pairwise list of the input files and keys, ' 'Pairwise list of the input files and keys, '
'e.g. --inputs cc.json cc_id news.json news_id') 'e.g. --inputs cc.json cc_id news.json news_id')
...@@ -58,7 +60,7 @@ if __name__ == '__main__': ...@@ -58,7 +60,7 @@ if __name__ == '__main__':
print('finding possible duplicate content ...') print('finding possible duplicate content ...')
# set seed and get an array of seeds of 100 integers # set seed and get an array of seeds of 100 integers
np.random.seed(1234) np.random.seed(args.seed)
seeds = np.random.randint(0, 1e6, size=100) seeds = np.random.randint(0, 1e6, size=100)
# initialize minhash and lsh cache # initialize minhash and lsh cache
...@@ -69,10 +71,7 @@ if __name__ == '__main__': ...@@ -69,10 +71,7 @@ if __name__ == '__main__':
# load fingerprints from pickle file if needed # load fingerprints from pickle file if needed
if args.load_fingerprints is not None: if args.load_fingerprints is not None:
count_fingerprints = len(args.load_fingerprints) for count_fp, fp_file_name in enumerate(args.load_fingerprints):
for count_fp in range(count_fingerprints):
fp_file_name = args.load_fingerprints[count_fp]
print("Loading fingerprints from pickle file {}".format( print("Loading fingerprints from pickle file {}".format(
fp_file_name), flush=True) fp_file_name), flush=True)
fp = open(fp_file_name, "rb") fp = open(fp_file_name, "rb")
...@@ -87,6 +86,7 @@ if __name__ == '__main__': ...@@ -87,6 +86,7 @@ if __name__ == '__main__':
for url in local_lshcache.fingerprints.keys(): for url in local_lshcache.fingerprints.keys():
url_doc[url] = local_url_doc[url] url_doc[url] = local_url_doc[url]
lshcache.add_fingerprint(local_lshcache.fingerprints[url], url) lshcache.add_fingerprint(local_lshcache.fingerprints[url], url)
fp.close()
counter = 0 counter = 0
start_time = time.time() start_time = time.time()
...@@ -94,29 +94,28 @@ if __name__ == '__main__': ...@@ -94,29 +94,28 @@ if __name__ == '__main__':
print("Computing fingerprints", flush=True) print("Computing fingerprints", flush=True)
# compute finger prints of the inputs if any # compute finger prints of the inputs if any
input_pairs = 0 if args.inputs is None else int(len(args.inputs)/2) # input file and the key to use as id
for input_pair in range(input_pairs): if args.inputs is not None:
# input file and the key to use as id assert len(args.inputs) % 2 == 0
input_file = args.inputs[2 * input_pair] for input_file, key in zip(args.inputs[::2], args.inputs[1::2]):
key = args.inputs[2 * input_pair + 1] print(' document processing {} with key {}'.format(input_file, key),
print(' document processing {} with key {}'.format(input_file, key), flush=True)
flush=True) # traverse all the texts and add fingerprints
# traverse all the texts and add fingerprints with open(input_file, 'r') as f_input:
with open(input_file, 'r') as f_input: for line in f_input:
for line in f_input: try:
try: myjson = json.loads(line)
myjson = json.loads(line) url = myjson[key]
url = myjson[key] text = myjson['text']
text = myjson['text'] counter += 1
counter += 1 url_doc[url] = text
url_doc[url] = text lshcache.add_fingerprint(hasher.fingerprint(text), url)
lshcache.add_fingerprint(hasher.fingerprint(text), url) except Exception as e:
except Exception as e: print('Error:', e)
print('Error:', e) if counter % 10000 == 0:
if counter % 10000 == 0: print(' [read]> processed {} documents in {:.2f} '
print(' [read]> processed {} documents in {:.2f} ' 'seconds ...'.format(counter, time.time() - \
'seconds ...'.format(counter, time.time() - \ start_time), flush=True)
start_time), flush=True)
# Save the fingerprints if needed # Save the fingerprints if needed
if args.save_fingerprints is not None: if args.save_fingerprints is not None:
...@@ -160,5 +159,6 @@ if __name__ == '__main__': ...@@ -160,5 +159,6 @@ if __name__ == '__main__':
ensure_ascii=False) ensure_ascii=False)
f_out.write(myjson.encode('utf-8')) f_out.write(myjson.encode('utf-8'))
f_out.write('\n'.encode('utf-8')) f_out.write('\n'.encode('utf-8'))
f_out.close()
print('done :-)') print('done :-)')
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment