OpenDAS / Megatron-LM · Commits

Commit 3f4bc91b, authored Nov 08, 2019 by Jared Casper:
Skip any empty sentences during preprocessing.
Parent: 61697cab
Showing 2 changed files with 8 additions and 5 deletions:

    megatron/data/preprocess_data.py             +2 -1
    megatron/data/test/test_indexed_dataset.py   +6 -4
megatron/data/preprocess_data.py

@@ -44,7 +44,8 @@ class Encoder(object):
         for sentence in Encoder.splitter.tokenize(text):
             tokens = Encoder.tokenizer.tokenize(sentence)
             ids = Encoder.tokenizer.convert_tokens_to_ids(tokens)
-            doc_ids.append(ids)
+            if len(ids) > 0:
+                doc_ids.append(ids)
         return doc_ids, len(json_line)
 
 def main():
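The guard added above keeps sentences whose token list comes back empty (for example a fragment of stray punctuation or whitespace that the sentence splitter still emits) from being appended to doc_ids, where they would later surface as zero-length samples in the indexed dataset. The following is a minimal, self-contained sketch of that behavior; the period-based splitter, dummy tokenizer, and "text" JSON key are stand-ins for illustration, not the repository's actual Encoder, splitter, or tokenizer.

import json

def split_sentences(text):
    # stand-in sentence splitter: naively split on periods
    return [s.strip() for s in text.split(".")]

def to_ids(sentence):
    # stand-in tokenizer: map each word to a dummy vocabulary id
    return [hash(word) % 30000 for word in sentence.split()]

def encode(json_line):
    # One JSON line in, a list of per-sentence id lists out,
    # skipping any sentence that tokenizes to nothing.
    text = json.loads(json_line)["text"]
    doc_ids = []
    for sentence in split_sentences(text):
        ids = to_ids(sentence)
        if len(ids) > 0:  # the change: drop sentences with no tokens
            doc_ids.append(ids)
    return doc_ids, len(json_line)

doc_ids, _ = encode('{"text": "First sentence. . Second sentence."}')
assert all(len(ids) > 0 for ids in doc_ids)  # nothing empty was appended

With the guard in place, the empty "sentence" between the two periods is silently dropped instead of becoming a zero-length entry in doc_ids.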
megatron/data/test/test_indexed_dataset.py

@@ -18,16 +18,18 @@ def test_indexed_dataset(args):
     if ds.supports_prefetch:
         # just prefetch the whole thing in test (so assume it is small)
         ds.prefetch(range(len(ds)))
-    for i in range(2):
+    for i in range(len(ds.doc_idx) - 1):
         start = ds.doc_idx[i]
         end = ds.doc_idx[i + 1]
         ids = ds[start:end]
         for s in ids:
+            assert len(s) > 0
             l = s.data.tolist()
-            print(l)
             tokens = tokenizer.convert_ids_to_tokens(l)
-            print(tokens)
-        print("******** END DOCUMENT **********")
+            for t in tokens:
+                if '\n' in t:
+                    print("Newline in string!")
+                    print(i)
 
 def main():
     parser = argparse.ArgumentParser()
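The test change pairs with the preprocessing fix: instead of inspecting only a hard-coded first two documents, it walks every document through the doc_idx boundaries and asserts that no zero-length sentence reached the dataset, dropping the per-sentence prints so the full sweep stays quiet. Below is a rough sketch of that doc_idx bookkeeping, with a plain Python list of made-up token ids standing in for the real indexed dataset:

# Sentences of three documents, flattened the way the dataset stores them.
sentences = [
    [101, 7592, 102], [101, 2088, 102],   # document 0: two sentences
    [101, 2023, 2003, 102],               # document 1: one sentence
    [101, 3231, 102],                     # document 2: one sentence
]
# doc_idx[i]:doc_idx[i + 1] is the slice of sentences belonging to document i.
doc_idx = [0, 2, 3, 4]

for i in range(len(doc_idx) - 1):  # every document; range(2) would skip document 2
    start = doc_idx[i]
    end = doc_idx[i + 1]
    for s in sentences[start:end]:
        assert len(s) > 0          # empty sentences were filtered out upstream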