Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Megatron-LM
Commits
27ecc17a
Commit
27ecc17a
authored
Apr 13, 2020
by
Neel Kant
Browse files
Modify preprocess_data.py to accommodate titles dataset
parent
7bd2a3c5
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
11 additions
and
0 deletions
+11
-0
megatron/data/preprocess_data.py
megatron/data/preprocess_data.py
+11
-0
No files found.
megatron/data/preprocess_data.py
View file @
27ecc17a
import
argparse
import
itertools
import
json
import
multiprocessing
import
nltk
...
...
@@ -43,18 +44,28 @@ class Encoder(object):
def encode(self, json_line):
    """Tokenize one JSON-encoded document into per-sentence id lists.

    The line is parsed as JSON and the field named by
    ``self.args.json_key`` is used as the document text. The text is
    split with ``Encoder.splitter`` and each sentence is turned into
    ids with ``Encoder.tokenizer``.

    Args:
        json_line: A single JSON document serialized as a string.

    Returns:
        A ``(doc_ids, size)`` tuple. ``doc_ids`` holds one list of ids
        per sentence, or a single concatenated list when
        ``self.args.flatten`` is set and more than one sentence was
        produced. ``size`` is ``len(json_line)``.
    """
    # NOTE(review): block nesting was reconstructed from a flattened diff
    # view; the `else` is assumed to belong to the emptiness check on the
    # ids, not to the `for` loop -- confirm against the repository file.
    tokenizer = Encoder.tokenizer

    def to_ids(fragment):
        # Tokenize, then map the tokens to vocabulary ids.
        return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(fragment))

    # A falsy field (e.g. empty string or None) is replaced by a placeholder.
    document = json.loads(json_line)[self.args.json_key] or "no text"

    sentence_ids = []
    for sentence in Encoder.splitter.tokenize(document):
        ids = to_ids(sentence)
        if len(ids) == 0:
            # Sentence produced no ids: report it and store the
            # placeholder's ids so the sentence slot is not dropped.
            print("no ids!", flush=True)
            ids = to_ids("no text")
        sentence_ids.append(ids)

    if self.args.flatten and len(sentence_ids) > 1:
        # Merge all sentences into one sequence, kept inside a list.
        merged = list(itertools.chain.from_iterable(sentence_ids))
        sentence_ids = [merged]

    return sentence_ids, len(json_line)
def
main
():
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
'--input'
,
type
=
str
,
help
=
'Path to input JSON'
)
parser
.
add_argument
(
'--vocab'
,
type
=
str
,
help
=
'Path to vocab.txt'
)
parser
.
add_argument
(
'--flatten'
,
action
=
'store_true'
,
help
=
'Path to input JSON'
)
parser
.
add_argument
(
'--json-key'
,
type
=
str
,
default
=
'text'
,
help
=
'Key to extract from json'
)
parser
.
add_argument
(
'--output-prefix'
,
type
=
str
,
help
=
'Path to binary output file without suffix'
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment