"server/vscode:/vscode.git/clone" did not exist on "1da642bd0e6de28ef499f17cd226264f3ccdc824"
Commit ebc4a408 authored by Mostofa Patwary
Browse files

addressed the comments

parent 5a867963
......@@ -39,9 +39,11 @@ def jaccard(set_a, set_b):
if __name__ == '__main__':
print('parsing the inputs ...')
print('parsing the arguments ...')
parser = argparse.ArgumentParser()
parser.add_argument('--seed', type=int, default=1234,
help='Random seed used for python, numpy')
parser.add_argument('--inputs', nargs = '*', default=None, help = \
'Pairwise list of the input files and keys, '
'e.g. --inputs cc.json cc_id news.json news_id')
......@@ -58,7 +60,7 @@ if __name__ == '__main__':
print('finding possible duplicate content ...')
# set seed and get an array of seeds of 100 integers
np.random.seed(1234)
np.random.seed(args.seed)
seeds = np.random.randint(0, 1e6, size=100)
# initialize minhash and lsh cache
......@@ -69,10 +71,7 @@ if __name__ == '__main__':
# load fingerprints from pickle file if needed
if args.load_fingerprints is not None:
count_fingerprints = len(args.load_fingerprints)
for count_fp in range(count_fingerprints):
fp_file_name = args.load_fingerprints[count_fp]
for count_fp, fp_file_name in enumerate(args.load_fingerprints):
print("Loading fingerprints from pickle file {}".format(
fp_file_name), flush=True)
fp = open(fp_file_name, "rb")
......@@ -87,6 +86,7 @@ if __name__ == '__main__':
for url in local_lshcache.fingerprints.keys():
url_doc[url] = local_url_doc[url]
lshcache.add_fingerprint(local_lshcache.fingerprints[url], url)
fp.close()
counter = 0
start_time = time.time()
......@@ -94,29 +94,28 @@ if __name__ == '__main__':
print("Computing fingerprints", flush=True)
# compute finger prints of the inputs if any
input_pairs = 0 if args.inputs is None else int(len(args.inputs)/2)
for input_pair in range(input_pairs):
# input file and the key to use as id
input_file = args.inputs[2 * input_pair]
key = args.inputs[2 * input_pair + 1]
print(' document processing {} with key {}'.format(input_file, key),
flush=True)
# traverse all the texts and add fingerprints
with open(input_file, 'r') as f_input:
for line in f_input:
try:
myjson = json.loads(line)
url = myjson[key]
text = myjson['text']
counter += 1
url_doc[url] = text
lshcache.add_fingerprint(hasher.fingerprint(text), url)
except Exception as e:
print('Error:', e)
if counter % 10000 == 0:
print(' [read]> processed {} documents in {:.2f} '
'seconds ...'.format(counter, time.time() - \
start_time), flush=True)
# input file and the key to use as id
if args.inputs is not None:
assert len(args.inputs) % 2 == 0
for input_file, key in zip(args.inputs[::2], args.inputs[1::2]):
print(' document processing {} with key {}'.format(input_file, key),
flush=True)
# traverse all the texts and add fingerprints
with open(input_file, 'r') as f_input:
for line in f_input:
try:
myjson = json.loads(line)
url = myjson[key]
text = myjson['text']
counter += 1
url_doc[url] = text
lshcache.add_fingerprint(hasher.fingerprint(text), url)
except Exception as e:
print('Error:', e)
if counter % 10000 == 0:
print(' [read]> processed {} documents in {:.2f} '
'seconds ...'.format(counter, time.time() - \
start_time), flush=True)
# Save the fingerprints if needed
if args.save_fingerprints is not None:
......@@ -160,5 +159,6 @@ if __name__ == '__main__':
ensure_ascii=False)
f_out.write(myjson.encode('utf-8'))
f_out.write('\n'.encode('utf-8'))
f_out.close()
print('done :-)')
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment