Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Megatron-LM
Commits
d413bd5f
Commit
d413bd5f
authored
Apr 01, 2021
by
Mostofa Patwary
Browse files
More features added
parent
f559787d
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
13 additions
and
1 deletion
+13
-1
tools/openwebtext/cleanup_fix_dataset.py
tools/openwebtext/cleanup_fix_dataset.py
+13
-1
No files found.
tools/openwebtext/cleanup_fix_dataset.py
View file @
d413bd5f
...
@@ -69,6 +69,18 @@ def process_doc(json_line, args):
...
@@ -69,6 +69,18 @@ def process_doc(json_line, args):
# Cleaning extra spaces and newlines
# Cleaning extra spaces and newlines
if
"general_cleaning"
in
args
.
tasks
:
if
"general_cleaning"
in
args
.
tasks
:
cleaned_text
=
re
.
sub
(
r
" +|\b\n+ |\b\n+"
,
" "
,
text
)
cleaned_text
=
re
.
sub
(
r
" +|\b\n+ |\b\n+"
,
" "
,
text
)
#cleaned_text = re.sub(r"\n\n+", "\n\n", text) # used this for Gutenberg dataset
#cleaned_text = re.sub(r"\n", "\n\n", text) # Used this for realnews
# stories datasets
#cleaned_text = re.sub(r" \'", "'", text)
#cleaned_text = re.sub(r" \!", "!", cleaned_text)
#cleaned_text = re.sub(r" \.", ".", cleaned_text)
#cleaned_text = re.sub(r" \?", "?", cleaned_text)
#cleaned_text = re.sub(r" - ", "-", cleaned_text)
##cleaned_text = re.sub(r"\" ", "\"", cleaned_text)
#cleaned_text = re.sub(r" @ ", "@", cleaned_text)
output
[
'general_cleaning'
]
=
True
output
[
'general_cleaning'
]
=
True
return
output
,
cleaned_text
,
document
,
False
return
output
,
cleaned_text
,
document
,
False
...
@@ -110,7 +122,7 @@ def process_set(args, input_file, output_f_cleaned, output_f_filtered):
...
@@ -110,7 +122,7 @@ def process_set(args, input_file, output_f_cleaned, output_f_filtered):
num_remove_512_non_english
+=
1
if
output
[
'remove_512_non_english'
]
\
num_remove_512_non_english
+=
1
if
output
[
'remove_512_non_english'
]
\
else
0
else
0
num_ftfy_fix_text
+=
1
if
output
[
'ftfy_fix_text'
]
else
0
num_ftfy_fix_text
+=
1
if
output
[
'ftfy_fix_text'
]
else
0
num_general_cleaning
+
1
if
output
[
'general_cleaning'
]
else
0
num_general_cleaning
+
=
1
if
output
[
'general_cleaning'
]
else
0
document
[
'text'
]
=
text
document
[
'text'
]
=
text
myjson
=
json
.
dumps
(
document
,
ensure_ascii
=
False
)
myjson
=
json
.
dumps
(
document
,
ensure_ascii
=
False
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment