Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Megatron-LM
Commits
9c2eb971
Commit
9c2eb971
authored
Mar 05, 2021
by
Mostofa Patwary
Browse files
Fixed a bug
parent
b35e15f2
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
16 additions
and
16 deletions
+16
-16
tools/openwebtext/filter_ngrams.py
tools/openwebtext/filter_ngrams.py
+16
-16
No files found.
tools/openwebtext/filter_ngrams.py
View file @
9c2eb971
...
...
@@ -66,7 +66,7 @@ def check_and_clean_text(args, words, ngrams, text, start_position, \
seq
=
" "
.
join
(
words
)
if
seq
in
ngrams
:
#
print(" [matched]: {}".format(seq), flush=True)
print
(
" [matched]: {}"
.
format
(
seq
),
flush
=
True
)
if
args
.
get_ngram_freq_only
:
# increase freq of this seq and then only consider the later part
...
...
@@ -296,7 +296,7 @@ def compute_ngram_freq_sorted(args, ngrams):
ngrams_freq_sorted
)
-
1
][
0
]),
flush
=
True
)
return
ngrams_freq_sorted
def
get_ngrams_
above
_threshold
(
args
,
ngrams
,
ngrams_
above
_threshold
,
\
def
get_ngrams_
below
_threshold
(
args
,
ngrams
,
ngrams_
below
_threshold
,
\
dedup_file
,
dedup_key
,
ngrams_freq_sorted
):
start_time
=
time
.
time
()
...
...
@@ -329,17 +329,17 @@ def get_ngrams_above_threshold(args, ngrams, ngrams_above_threshold, \
start_time
=
time
.
time
()
counter_threshold
=
0
# Get ngram
above
threshold
# Get ngram
below
threshold
for
local_key
,
local_val
in
ngrams
.
items
():
if
ngrams
[
local_key
]
>
args
.
key_threshold
:
if
ngrams
[
local_key
]
<
args
.
key_threshold
:
print
(
" [threshold] {} {}"
.
format
(
local_key
,
local_val
),
flush
=
True
)
counter_threshold
+=
1
ngrams_
above
_threshold
[
local_key
]
=
1
ngrams_
below
_threshold
[
local_key
]
=
1
print
(
' Ngrams
above
threshold {}'
.
format
(
counter_threshold
),
flush
=
True
)
print
(
' Ngrams
below
threshold {}'
.
format
(
counter_threshold
),
flush
=
True
)
fin
.
close
()
def
clean_ngrams_
above
_threshold
(
args
,
ngrams_
above
_threshold
,
dedup_file
,
\
def
clean_ngrams_
below
_threshold
(
args
,
ngrams_
below
_threshold
,
dedup_file
,
\
dedup_key
):
start_time
=
time
.
time
()
...
...
@@ -348,19 +348,19 @@ def clean_ngrams_above_threshold(args, ngrams_above_threshold, dedup_file, \
id_prefix
=
'-'
.
join
(
args
.
tasks
[::
2
])
# get the range of the size of the ngrams
ngrams_freq_sorted
=
compute_ngram_freq_sorted
(
args
,
ngrams_
above
_threshold
)
ngrams_freq_sorted
=
compute_ngram_freq_sorted
(
args
,
ngrams_
below
_threshold
)
# Open the large file to process in parallel
counter
=
splitted
=
ignored
=
split_mt_thld
=
trimmed_count
=
0
num_workers
=
40
pool
=
multiprocessing
.
Pool
(
num_workers
)
fin
=
open
(
dedup_file
,
'r'
,
encoding
=
'utf-8'
)
free_ngram_clean_partial
=
partial
(
free_ngram
,
args
=
args
,
key
=
dedup_key
,
\
ngrams
=
ngrams_
above
_threshold
,
ngrams_freq_sorted
=
ngrams_freq_sorted
)
ngrams
=
ngrams_
below
_threshold
,
ngrams_freq_sorted
=
ngrams_freq_sorted
)
free_ngrams_clean
=
pool
.
imap
(
free_ngram_clean_partial
,
fin
,
500
)
out_f
=
open
(
args
.
output
,
'wb'
)
counter
=
splitted
=
ignored
=
split_mt_thld
=
trimmed_count
=
0
for
text_buf_ngram_free
,
trimmed
,
_
in
free_ngrams_clean
:
counter
+=
1
try
:
...
...
@@ -460,22 +460,22 @@ if __name__ == '__main__':
ngrams_freq_sorted
=
compute_ngram_freq_sorted
(
args
,
ngrams
)
# get ngram freq from large file in parallel
# get ngrams
above
threshold
ngrams_
above
_threshold
=
{}
get_ngrams_
above
_threshold
(
args
,
ngrams
,
ngrams_
above
_threshold
,
\
# get ngrams
below
threshold
ngrams_
below
_threshold
=
{}
get_ngrams_
below
_threshold
(
args
,
ngrams
,
ngrams_
below
_threshold
,
\
dedup_file
,
dedup_key
,
ngrams_freq_sorted
)
# save the dictionary if needed
if
args
.
save_dictionary
is
not
None
:
with
open
(
args
.
save_dictionary
,
'wb'
)
as
save_dict_handle
:
pickle
.
dump
(
ngrams_
above
_threshold
,
save_dict_handle
)
pickle
.
dump
(
ngrams_
below
_threshold
,
save_dict_handle
)
else
:
with
open
(
args
.
load_dictionary
,
'rb'
)
as
load_dict_handle
:
ngrams_
above
_threshold
=
pickle
.
load
(
load_dict_handle
)
ngrams_
below
_threshold
=
pickle
.
load
(
load_dict_handle
)
# filter the large file
if
args
.
output
is
not
None
:
clean_ngrams_
above
_threshold
(
args
,
ngrams_
above
_threshold
,
\
clean_ngrams_
below
_threshold
(
args
,
ngrams_
below
_threshold
,
\
dedup_file
,
dedup_key
)
print
(
'done :-)'
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment