Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Megatron-LM
Commits
ebc4a408
Commit
ebc4a408
authored
Feb 22, 2021
by
Mostofa Patwary
Browse files
addressed the comments
parent
5a867963
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
29 additions
and
29 deletions
+29
-29
tools/openwebtext/find_duplicates.py
tools/openwebtext/find_duplicates.py
+29
-29
No files found.
tools/openwebtext/find_duplicates.py
View file @
ebc4a408
...
@@ -39,9 +39,11 @@ def jaccard(set_a, set_b):
...
@@ -39,9 +39,11 @@ def jaccard(set_a, set_b):
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
print
(
'parsing the
inpu
ts ...'
)
print
(
'parsing the
argumen
ts ...'
)
parser
=
argparse
.
ArgumentParser
()
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
'--seed'
,
type
=
int
,
default
=
1234
,
help
=
'Random seed used for python, numpy'
)
parser
.
add_argument
(
'--inputs'
,
nargs
=
'*'
,
default
=
None
,
help
=
\
parser
.
add_argument
(
'--inputs'
,
nargs
=
'*'
,
default
=
None
,
help
=
\
'Pairwise list of the input files and keys, '
'Pairwise list of the input files and keys, '
'e.g. --inputs cc.json cc_id news.json news_id'
)
'e.g. --inputs cc.json cc_id news.json news_id'
)
...
@@ -58,7 +60,7 @@ if __name__ == '__main__':
...
@@ -58,7 +60,7 @@ if __name__ == '__main__':
print
(
'finding possible duplicate content ...'
)
print
(
'finding possible duplicate content ...'
)
# set seed and get an array of seeds of 100 integers
# set seed and get an array of seeds of 100 integers
np
.
random
.
seed
(
1234
)
np
.
random
.
seed
(
args
.
seed
)
seeds
=
np
.
random
.
randint
(
0
,
1e6
,
size
=
100
)
seeds
=
np
.
random
.
randint
(
0
,
1e6
,
size
=
100
)
# initialize minhash and lsh cache
# initialize minhash and lsh cache
...
@@ -69,10 +71,7 @@ if __name__ == '__main__':
...
@@ -69,10 +71,7 @@ if __name__ == '__main__':
# load fingerprints from pickle file if needed
# load fingerprints from pickle file if needed
if
args
.
load_fingerprints
is
not
None
:
if
args
.
load_fingerprints
is
not
None
:
count_fingerprints
=
len
(
args
.
load_fingerprints
)
for
count_fp
,
fp_file_name
in
enumerate
(
args
.
load_fingerprints
):
for
count_fp
in
range
(
count_fingerprints
):
fp_file_name
=
args
.
load_fingerprints
[
count_fp
]
print
(
"Loading fingerprints from pickle file {}"
.
format
(
print
(
"Loading fingerprints from pickle file {}"
.
format
(
fp_file_name
),
flush
=
True
)
fp_file_name
),
flush
=
True
)
fp
=
open
(
fp_file_name
,
"rb"
)
fp
=
open
(
fp_file_name
,
"rb"
)
...
@@ -87,6 +86,7 @@ if __name__ == '__main__':
...
@@ -87,6 +86,7 @@ if __name__ == '__main__':
for
url
in
local_lshcache
.
fingerprints
.
keys
():
for
url
in
local_lshcache
.
fingerprints
.
keys
():
url_doc
[
url
]
=
local_url_doc
[
url
]
url_doc
[
url
]
=
local_url_doc
[
url
]
lshcache
.
add_fingerprint
(
local_lshcache
.
fingerprints
[
url
],
url
)
lshcache
.
add_fingerprint
(
local_lshcache
.
fingerprints
[
url
],
url
)
fp
.
close
()
counter
=
0
counter
=
0
start_time
=
time
.
time
()
start_time
=
time
.
time
()
...
@@ -94,29 +94,28 @@ if __name__ == '__main__':
...
@@ -94,29 +94,28 @@ if __name__ == '__main__':
print
(
"Computing fingerprints"
,
flush
=
True
)
print
(
"Computing fingerprints"
,
flush
=
True
)
# compute finger prints of the inputs if any
# compute finger prints of the inputs if any
input_pairs
=
0
if
args
.
inputs
is
None
else
int
(
len
(
args
.
inputs
)
/
2
)
# input file and the key to use as id
for
input_pair
in
range
(
input_pairs
):
if
args
.
inputs
is
not
None
:
# input file and the key to use as id
assert
len
(
args
.
inputs
)
%
2
==
0
input_file
=
args
.
inputs
[
2
*
input_pair
]
for
input_file
,
key
in
zip
(
args
.
inputs
[::
2
],
args
.
inputs
[
1
::
2
]):
key
=
args
.
inputs
[
2
*
input_pair
+
1
]
print
(
' document processing {} with key {}'
.
format
(
input_file
,
key
),
print
(
' document processing {} with key {}'
.
format
(
input_file
,
key
),
flush
=
True
)
flush
=
True
)
# traverse all the texts and add fingerprints
# traverse all the texts and add fingerprints
with
open
(
input_file
,
'r'
)
as
f_input
:
with
open
(
input_file
,
'r'
)
as
f_input
:
for
line
in
f_input
:
for
line
in
f_input
:
try
:
try
:
myjson
=
json
.
loads
(
line
)
myjson
=
json
.
loads
(
line
)
url
=
myjson
[
key
]
url
=
myjson
[
key
]
text
=
myjson
[
'text'
]
text
=
myjson
[
'text'
]
counter
+=
1
counter
+=
1
url_doc
[
url
]
=
text
url_doc
[
url
]
=
text
lshcache
.
add_fingerprint
(
hasher
.
fingerprint
(
text
),
url
)
lshcache
.
add_fingerprint
(
hasher
.
fingerprint
(
text
),
url
)
except
Exception
as
e
:
except
Exception
as
e
:
print
(
'Error:'
,
e
)
print
(
'Error:'
,
e
)
if
counter
%
10000
==
0
:
if
counter
%
10000
==
0
:
print
(
' [read]> processed {} documents in {:.2f} '
print
(
' [read]> processed {} documents in {:.2f} '
'seconds ...'
.
format
(
counter
,
time
.
time
()
-
\
'seconds ...'
.
format
(
counter
,
time
.
time
()
-
\
start_time
),
flush
=
True
)
start_time
),
flush
=
True
)
# Save the fingerprints if needed
# Save the fingerprints if needed
if
args
.
save_fingerprints
is
not
None
:
if
args
.
save_fingerprints
is
not
None
:
...
@@ -160,5 +159,6 @@ if __name__ == '__main__':
...
@@ -160,5 +159,6 @@ if __name__ == '__main__':
ensure_ascii
=
False
)
ensure_ascii
=
False
)
f_out
.
write
(
myjson
.
encode
(
'utf-8'
))
f_out
.
write
(
myjson
.
encode
(
'utf-8'
))
f_out
.
write
(
'
\n
'
.
encode
(
'utf-8'
))
f_out
.
write
(
'
\n
'
.
encode
(
'utf-8'
))
f_out
.
close
()
print
(
'done :-)'
)
print
(
'done :-)'
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment