Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Megatron-LM
Commits
2f6e46bd
Commit
2f6e46bd
authored
Feb 18, 2021
by
Mostofa Patwary
Browse files
added storing and loading of pickle file of hash
parent
a449d312
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
58 additions
and
20 deletions
+58
-20
tools/openwebtext/find_duplicates.py
tools/openwebtext/find_duplicates.py
+58
-20
No files found.
tools/openwebtext/find_duplicates.py
View file @
2f6e46bd
...
@@ -13,11 +13,12 @@
...
@@ -13,11 +13,12 @@
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
import
argparse
import
itertools
import
itertools
import
json
import
json
from
lsh
import
cache
,
minhash
from
lsh
import
cache
,
minhash
import
time
import
time
import
pickle
import
sys
import
sys
...
@@ -38,22 +39,50 @@ def jaccard(set_a, set_b):
...
@@ -38,22 +39,50 @@ def jaccard(set_a, set_b):
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
print
(
'finding possible duplicate content ...'
)
print
(
'parsing the inputs ...'
)
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
'--inputs'
,
nargs
=
'*'
,
default
=
None
,
help
=
'List of '
'the input files'
)
parser
.
add_argument
(
'--load-fingerprints'
,
type
=
str
,
default
=
None
,
help
=
'Load the fingerprints from pickle file.'
)
parser
.
add_argument
(
'--save-fingerprints'
,
type
=
str
,
default
=
None
,
help
=
'Save the fingerprints of the inputs.'
)
parser
.
add_argument
(
'--output'
,
type
=
str
,
help
=
'Output file name.'
)
args
=
parser
.
parse_args
()
input
=
sys
.
argv
[
1
]
print
(
'finding possible duplicate content ...'
)
output
=
sys
.
argv
[
2
]
hasher
=
minhash
.
MinHasher
(
seeds
=
100
,
char_ngram
=
5
,
hashbytes
=
4
)
hasher
=
minhash
.
MinHasher
(
seeds
=
100
,
char_ngram
=
5
,
hashbytes
=
4
)
lshcache
=
cache
.
Cache
(
bands
=
10
,
hasher
=
hasher
)
lshcache
=
cache
.
Cache
(
bands
=
10
,
hasher
=
hasher
)
counter
=
0
url_doc
=
{}
url_doc
=
{}
# load fingerprints from pickle file if needed
if
args
.
load_fingerprints
is
not
None
:
print
(
"Loading fingerprints from pickle file {}"
.
format
(
args
.
load_fingerprints
),
flush
=
True
)
with
open
(
args
.
load_fingerprints
,
"rb"
)
as
f
:
lshcache
=
pickle
.
load
(
f
)
url_doc
=
pickle
.
load
(
f
)
counter
=
0
start_time
=
time
.
time
()
start_time
=
time
.
time
()
with
open
(
input
,
'r'
)
as
f
:
print
(
"Computing fingerprints"
,
flush
=
True
)
input_pairs
=
0
if
args
.
inputs
is
None
else
int
(
len
(
args
.
inputs
)
/
2
)
for
i
in
range
(
input_pairs
):
input_file
=
args
.
inputs
[
2
*
i
]
key
=
args
.
inputs
[
2
*
i
+
1
]
print
(
' document processing {} with key {}'
.
format
(
input_file
,
key
),
flush
=
True
)
with
open
(
input_file
,
'r'
)
as
f
:
for
line
in
f
:
for
line
in
f
:
try
:
try
:
myjson
=
json
.
loads
(
line
)
myjson
=
json
.
loads
(
line
)
url
=
myjson
[
'url'
]
url
=
myjson
[
key
]
text
=
myjson
[
'text'
]
text
=
myjson
[
'text'
]
counter
+=
1
counter
+=
1
url_doc
[
url
]
=
text
url_doc
[
url
]
=
text
...
@@ -61,13 +90,22 @@ if __name__ == '__main__':
...
@@ -61,13 +90,22 @@ if __name__ == '__main__':
except
Exception
as
e
:
except
Exception
as
e
:
print
(
'Error:'
,
e
)
print
(
'Error:'
,
e
)
if
counter
%
10000
==
0
:
if
counter
%
10000
==
0
:
print
(
' [read]> processed {} documents in {:.2f} seconds ...'
.
print
(
' [read]> processed {} documents in {:.2f} '
format
(
counter
,
time
.
time
()
-
start_time
),
flush
=
True
)
'seconds ...'
.
format
(
counter
,
time
.
time
()
-
\
start_time
),
flush
=
True
)
# Save the fingerprints if needed
if
args
.
save_fingerprints
is
not
None
:
print
(
"Saving fingerprints to pickle file {}"
.
format
(
args
.
save_fingerprints
),
flush
=
True
)
with
open
(
args
.
save_fingerprints
,
'wb'
)
as
f
:
pickle
.
dump
(
lshcache
,
f
)
pickle
.
dump
(
url_doc
,
f
)
counter
=
0
counter
=
0
start_time
=
time
.
time
()
start_time
=
time
.
time
()
deduped
=
0
deduped
=
0
with
open
(
output
,
'wb'
)
as
f
:
with
open
(
args
.
output
,
'wb'
)
as
f
:
for
b
in
lshcache
.
bins
:
for
b
in
lshcache
.
bins
:
for
bucket_id
in
b
:
for
bucket_id
in
b
:
if
len
(
b
[
bucket_id
])
>
1
:
if
len
(
b
[
bucket_id
])
>
1
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment