OpenDAS / Megatron-LM / Commits

Commit 7fc96111: Data Preprocessing Optimizations
Authored Nov 17, 2022 by Vijay Korthikanti; committed by Mike Chrzanowski, Nov 17, 2022
Parent: 1ad1e1b1

Showing 4 changed files with 497 additions and 5 deletions:

    megatron/data/indexed_dataset.py       +9    -1
    tools/preprocess_data.py               +9    -4
    tools/preprocess_data_nmt.py           +113  -0
    tools/preprocess_data_partitions.py    +366  -0
megatron/data/indexed_dataset.py  (+9, -1)

@@ -484,7 +484,7 @@ class MMapIndexedDataset(torch.utils.data.Dataset):
     # @lru_cache(maxsize=8)
     def __getitem__(self, idx):
-        if isinstance(idx, int):
+        if isinstance(idx, (int, np.integer)):
             ptr, size = self._index[idx]
             np_array = np.frombuffer(self._bin_buffer, dtype=self._index.dtype,
                                      count=size, offset=ptr)
@@ -501,6 +501,8 @@ class MMapIndexedDataset(torch.utils.data.Dataset):
                                      count=total_size, offset=ptr)
             sents = np.split(np_array, offsets[:-1])
             return sents
+        else:
+            raise TypeError("Unexpected type received for idx: {}".format(type(idx)))
 
     def get(self, idx, offset=0, length=None):
         """ Retrieves a single item from the dataset with the option to only
@@ -553,6 +555,12 @@ class MMapIndexedDatasetBuilder(object):
         self._data_file.write(np_array.tobytes(order='C'))
         self._sizes.append(np_array.size)
 
+    def add_doc(self, tensor, sizes):
+        np_array = np.array(tensor, dtype=self._dtype)
+        self._data_file.write(np_array.tobytes(order='C'))
+        self._sizes.extend(sizes)
+        self._doc_idx.append(len(self._sizes))
+
     def end_document(self):
         self._doc_idx.append(len(self._sizes))
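These two changes work as a pair: __getitem__ now also accepts NumPy integer scalars (the kind produced by np.arange or by indexing another NumPy array), and the new add_doc writes an entire pre-tokenized document with its per-sentence sizes in one call instead of one add_item/end_document round trip per sentence. A small standalone sketch of the index-type point, plain NumPy rather than Megatron code:

import numpy as np

# A plain Python int and a NumPy integer scalar are different types, which is
# why the isinstance check is widened to (int, np.integer).
i = 5
j = np.arange(10)[5]        # np.int64 scalar, not a Python int

print(isinstance(i, int))                  # True
print(isinstance(j, int))                  # False
print(isinstance(j, (int, np.integer)))    # True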
tools/preprocess_data.py  (+9, -4)

@@ -50,12 +50,14 @@ class Encoder(object):
             if not nltk_available:
                 print("NLTK is not available to split sentences.")
                 exit()
-            splitter = nltk.load("tokenizers/punkt/english.pickle")
+            library = "tokenizers/punkt/{}.pickle".format(self.args.lang)
+            print("loading: " + library)
+            splitter = nltk.load(library)
             if self.args.keep_newlines:
                 # this prevents punkt from eating newlines after sentences
                 Encoder.splitter = nltk.tokenize.punkt.PunktSentenceTokenizer(
                     train_text=splitter._params,
                     lang_vars=CustomLanguageVars())
             else:
                 Encoder.splitter = splitter
@@ -92,7 +94,7 @@ def get_args():
     group = parser.add_argument_group(title='tokenizer')
     group.add_argument('--tokenizer-type', type=str, required=True,
                        choices=['BertWordPieceLowerCase', 'BertWordPieceCase',
-                                'GPT2BPETokenizer'],
+                                'GPT2BPETokenizer', 'SentencePieceTokenizer'],
                        help='What type of tokenizer to use.')
     group.add_argument('--vocab-file', type=str, default=None,
                        help='Path to the vocab file')
@@ -100,6 +102,8 @@ def get_args():
                        help='Path to the BPE merge file (if necessary).')
     group.add_argument('--append-eod', action='store_true',
                        help='Append an <eod> token to the end of a document.')
+    group.add_argument('--lang', type=str, default='english',
+                       help='Language to use for NLTK-powered sentence splitting.')
 
     group = parser.add_argument_group(title='output data')
@@ -184,6 +188,7 @@ def main():
             print(f"Processed {i} documents",
                   f"({i/elapsed} docs/s, {mbs} MB/s).",
                   file=sys.stderr)
 
+    print("Done! Now finalizing.")
     for key in args.json_keys:
         builders[key].finalize(output_idx_files[key])
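The new --lang flag swaps the hard-coded English model for tokenizers/punkt/{lang}.pickle. A quick standalone check of which language models are usable locally, assuming an NLTK version whose punkt package still ships the per-language pickles:

import nltk

# Fetch the punkt models once; preprocess_data_partitions.py does the same in main().
nltk.download("punkt", quiet=True)

for lang in ("english", "german", "spanish"):
    splitter = nltk.load("tokenizers/punkt/{}.pickle".format(lang))
    print(lang, splitter.tokenize("Dr. Smith went home. He slept."))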
tools/preprocess_data_nmt.py  (new file, +113)

# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.

"""Processing nmt data for finetuning."""

import argparse
import json
import multiprocessing
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
                                              os.path.pardir)))
import time

import torch
from megatron.tokenizer import build_tokenizer
from megatron.data import indexed_dataset


class Encoder(object):
    def __init__(self, args):
        self.args = args

    def initializer(self):
        # Use Encoder class as a container for global data
        Encoder.tokenizer = build_tokenizer(self.args)

    def encode(self, text):
        ids = {}
        ids = Encoder.tokenizer.tokenize(text)
        assert len(ids) > 0
        return ids, len(text)


def get_args():
    parser = argparse.ArgumentParser()
    group = parser.add_argument_group(title='input data')
    group.add_argument('--input', type=str, required=True,
                       help='Path to input JSON')

    group = parser.add_argument_group(title='tokenizer')
    group.add_argument('--tokenizer-type', type=str, default='YTTMTokenizer',
                       choices=['BertWordPieceLowerCase', 'BertWordPieceCase',
                                'GPT2BPETokenizer', 'SentencePieceTokenizer'],
                       help='What type of tokenizer to use.')
    group.add_argument('--vocab-file', type=str, default=None,
                       help='Path to the vocab file')
    group.add_argument('--merge-file', type=str, default=None,
                       help='Path to the BPE merge file (if necessary).')

    group = parser.add_argument_group(title='output data')
    group.add_argument('--output-prefix', type=str, required=True,
                       help='Path to binary output file without suffix')
    group.add_argument('--dataset-impl', type=str, default='mmap',
                       choices=['lazy', 'cached', 'mmap'])

    group = parser.add_argument_group(title='runtime')
    group.add_argument('--workers', type=int, default=1,
                       help='Number of worker processes to launch')
    group.add_argument('--log-interval', type=int, default=100,
                       help='Interval between progress updates')
    args = parser.parse_args()
    args.keep_empty = False

    # some default/dummy values for the tokenizer
    args.rank = 0
    args.make_vocab_size_divisible_by = 128
    args.tensor_model_parallel_size = 1
    args.vocab_extra_ids = 0

    return args


def main():
    args = get_args()
    startup_start = time.time()

    print("Opening", args.input)
    fin = open(args.input, 'r', encoding='utf-8')

    encoder = Encoder(args)
    tokenizer = build_tokenizer(args)
    pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer)
    encoded_sentences = pool.imap(encoder.encode, fin, 25)

    print(f"Vocab size: {tokenizer.vocab_size}")
    print(f"Output prefix: {args.output_prefix}")
    output_bin_file = "{}.bin".format(args.output_prefix)
    output_idx_file = "{}.idx".format(args.output_prefix)
    builder = indexed_dataset.make_builder(output_bin_file,
                                           impl=args.dataset_impl,
                                           vocab_size=tokenizer.vocab_size)

    startup_end = time.time()
    proc_start = time.time()
    total_bytes_processed = 0
    print("Time to startup:", startup_end - startup_start)

    for i, (sentence, bytes_processed) in enumerate(encoded_sentences, start=1):
        total_bytes_processed += bytes_processed
        builder.add_item(torch.IntTensor(sentence))
        # documents contain only one sentence.
        builder.end_document()
        if i % args.log_interval == 0:
            current = time.time()
            elapsed = current - proc_start
            mbs = total_bytes_processed / elapsed / 1024 / 1024
            print(f"Processed {i} sentences",
                  f"({i/elapsed} sentences/s, {mbs} MB/s).",
                  file=sys.stderr)

    builder.finalize(output_idx_file)


if __name__ == '__main__':
    main()
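Every input line becomes a single-sentence document in one {output-prefix}.bin/.idx pair. A minimal read-back sketch for sanity checking, assuming the default mmap impl, a hypothetical prefix my-nmt, and that MMapIndexedDataset (from megatron/data/indexed_dataset.py above) is constructed from the prefix without the .bin/.idx suffix:

import numpy as np
from megatron.data.indexed_dataset import MMapIndexedDataset

dataset = MMapIndexedDataset("my-nmt")   # hypothetical --output-prefix value

print("sentences:", len(dataset))
first = dataset[0]                       # numpy array of token ids
print("first sentence:", first[:10], first.dtype)

# NumPy integer indices are fine here thanks to the (int, np.integer) check.
for idx in np.arange(min(3, len(dataset))):
    print(int(idx), len(dataset[idx]))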
tools/preprocess_data_partitions.py  (new file, +366)

# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.

"""Processing large data for pretraining."""
import argparse
import math
import json
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
                                              os.path.pardir)))
import time
import gzip
import glob
import torch
import numpy as np
import multiprocessing
try:
    import nltk
    nltk_available = True
except ImportError:
    nltk_available = False

from megatron.tokenizer import build_tokenizer
from megatron.data import indexed_dataset


# https://stackoverflow.com/questions/33139531/preserve-empty-lines-with-nltks-punkt-tokenizer
class CustomLanguageVars(nltk.tokenize.punkt.PunktLanguageVars):

    _period_context_fmt = r"""
        \S*                          # some word material
        %(SentEndChars)s             # a potential sentence ending
        \s*                          # <-- THIS is what I changed
        (?=(?P<after_tok>
            %(NonWord)s              # either other punctuation
            |
            (?P<next_tok>\S+)        # <-- Normally you would have \s+ here
        ))"""


class IdentitySplitter(object):
    def tokenize(self, *text):
        return text


class Encoder(object):
    def __init__(self, args):
        self.args = args

    def initializer(self):
        # Use Encoder class as a container for global data
        Encoder.tokenizer = build_tokenizer(self.args)
        if self.args.split_sentences:
            if not nltk_available:
                print("NLTK is not available to split sentences.")
                exit()
            library = "tokenizers/punkt/{}.pickle".format(self.args.lang)
            splitter = nltk.load(library)
            if self.args.keep_newlines:
                # this prevents punkt from eating newlines after sentences
                Encoder.splitter = nltk.tokenize.punkt.PunktSentenceTokenizer(
                    train_text=splitter._params,
                    lang_vars=CustomLanguageVars())
            else:
                Encoder.splitter = splitter
        else:
            Encoder.splitter = IdentitySplitter()

    def split(self, json_line):
        data = json.loads(json_line)
        output = {}
        for key in self.args.json_keys:
            text = data[key]
            max_len = 1000000
            tokens_list = [Encoder.splitter.tokenize(text[i:i+max_len])
                           for i in range(0, len(text), max_len)]
            output[key] = [tokens for partial in tokens_list
                           for tokens in partial]
        return json.dumps(output), len(json_line)

    def encode(self, json_line):
        data = json.loads(json_line)
        ids = {}
        lens = {}
        for key in self.args.json_keys:
            text = data[key]
            if isinstance(text, list):
                sentences = text
            else:
                sentences = [text]
            doc_ids = []
            sentence_lens = []
            for sentence in sentences:
                sentence_ids = Encoder.tokenizer.tokenize(sentence)
                if len(sentence_ids) > 0:
                    doc_ids.extend(sentence_ids)
                    sentence_lens.append(len(sentence_ids))
            if len(doc_ids) > 0 and self.args.append_eod:
                doc_ids.append(Encoder.tokenizer.eod)
            ids[key] = doc_ids
            lens[key] = sentence_lens
        return ids, lens, len(json_line)
class Partition(object):
    def __init__(self, args, workers):
        self.args = args
        self.workers = workers

    def print_processing_stats(self, count, proc_start, total_bytes_processed):
        if count % self.args.log_interval == 0:
            current = time.time()
            elapsed = current - proc_start
            mbs = total_bytes_processed / elapsed / 1024 / 1024
            print(f"Processed {count} documents",
                  f"({count/elapsed} docs/s, {mbs} MB/s).",
                  file=sys.stderr)

    def split_sentences(self, file_name):
        input_file_name, output_file_name = file_name
        print("Opening", input_file_name)
        fin = open(input_file_name, 'r', encoding='utf-8')
        fout = open(output_file_name, 'w')

        encoder = Encoder(self.args)
        pool = multiprocessing.Pool(self.workers, initializer=encoder.initializer)
        split_docs = pool.imap(encoder.split, fin, 32)

        proc_start = time.time()
        total_bytes_processed = 0
        for i, (doc, bytes_processed) in enumerate(split_docs, start=1):
            total_bytes_processed += bytes_processed
            fout.write(doc + "\n")
            self.print_processing_stats(i, proc_start, total_bytes_processed)

        fin.close()
        fout.close()

    def process_json_file(self, file_name):
        input_file_name, output_prefix = file_name
        print("Opening", input_file_name)
        fin = open(input_file_name, 'r', encoding='utf-8')

        startup_start = time.time()
        encoder = Encoder(self.args)
        tokenizer = build_tokenizer(self.args)
        pool = multiprocessing.Pool(self.workers, initializer=encoder.initializer)
        encoded_docs = pool.imap(encoder.encode, fin, 32)

        level = "document"
        if self.args.split_sentences:
            level = "sentence"

        output_bin_files = {}
        output_idx_files = {}
        builders = {}

        for key in self.args.json_keys:
            output_bin_files[key] = "{}_{}_{}.bin".format(output_prefix,
                                                          key, level)
            output_idx_files[key] = "{}_{}_{}.idx".format(output_prefix,
                                                          key, level)
            builders[key] = indexed_dataset.make_builder(output_bin_files[key],
                                                         impl=self.args.dataset_impl,
                                                         vocab_size=tokenizer.vocab_size)

        startup_end = time.time()
        proc_start = time.time()
        total_bytes_processed = 0
        print("Time to startup:", startup_end - startup_start)
        for i, (doc, sentence_lens, bytes_processed) in enumerate(encoded_docs, start=1):
            total_bytes_processed += bytes_processed
            for key in doc.keys():
                builders[key].add_doc(doc[key], sentence_lens[key])
            self.print_processing_stats(i, proc_start, total_bytes_processed)
        fin.close()

        # finalize each partition's .bin/.idx pair so the per-partition index
        # files exist before main() merges them with merge_file_()
        for key in self.args.json_keys:
            builders[key].finalize(output_idx_files[key])
def get_args():
    parser = argparse.ArgumentParser()
    group = parser.add_argument_group(title='input data')
    group.add_argument('--input', type=str, required=True,
                       help='Path to input JSON')
    group.add_argument('--json-keys', nargs='+', default=['text'],
                       help='space separated list of keys to extract from json')
    group.add_argument('--split-sentences', action='store_true',
                       help='Split documents into sentences.')
    group.add_argument('--keep-newlines', action='store_true',
                       help='Keep newlines between sentences when splitting.')

    group = parser.add_argument_group(title='tokenizer')
    group.add_argument('--tokenizer-type', type=str, required=True,
                       choices=['BertWordPieceLowerCase', 'BertWordPieceCase',
                                'GPT2BPETokenizer', 'SentencePieceTokenizer'],
                       help='What type of tokenizer to use.')
    group.add_argument('--tokenizer-model', type=str, default=None,
                       help='YTTM tokenizer model.')
    group.add_argument('--vocab-file', type=str, default=None,
                       help='Path to the vocab file')
    group.add_argument('--merge-file', type=str, default=None,
                       help='Path to the BPE merge file (if necessary).')
    group.add_argument('--append-eod', action='store_true',
                       help='Append an <eod> token to the end of a document.')
    group.add_argument('--lang', type=str, default='english',
                       help='Language to use for NLTK-powered sentence splitting.')

    group = parser.add_argument_group(title='output data')
    group.add_argument('--output-prefix', type=str, required=True,
                       help='Path to binary output file without suffix')
    group.add_argument('--dataset-impl', type=str, default='mmap',
                       choices=['lazy', 'cached', 'mmap'])

    group = parser.add_argument_group(title='runtime')
    group.add_argument('--workers', type=int, default=1,
                       help='Number of worker processes to launch')
    group.add_argument('--partitions', type=int, default=1,
                       help='Number of file partitions')
    group.add_argument('--log-interval', type=int, default=1000,
                       help='Interval between progress updates')
    args = parser.parse_args()
    args.keep_empty = False
    if args.tokenizer_type.lower().startswith('bert'):
        if not args.split_sentences:
            print("Are you sure you don't want to split sentences?")

    # some default/dummy values for the tokenizer
    args.rank = 1
    args.make_vocab_size_divisible_by = 128
    args.tensor_model_parallel_size = 1
    args.vocab_extra_ids = 0

    return args
def get_file_name(args, file_id):
    file_name, extension = os.path.splitext(args.input)
    input_file_name = file_name + "_" + str(file_id) + extension
    sentence_split_file = file_name + "_ss_" + str(file_id) + extension
    output_prefix = args.output_prefix + "_" + str(file_id)
    file_names = {
        'partition': input_file_name,
        'sentence_split': sentence_split_file,
        'output_prefix': output_prefix}
    return file_names


def check_files_exist(in_ss_out_names, key, num_partitions):
    for i in range(num_partitions):
        if not os.path.exists(in_ss_out_names[i][key]):
            return False
    return True
def main():
    args = get_args()

    if args.split_sentences:
        if nltk_available:
            nltk.download("punkt", quiet=True)
        else:
            raise Exception(
                "nltk library required for sentence splitting is not available.")

    in_ss_out_names = []
    if args.partitions == 1:
        file_name, extension = os.path.splitext(args.input)
        sentence_split_file = file_name + "_ss" + extension
        # use the same dict layout as get_file_name() so the single-partition
        # path works with check_files_exist() and the name[...] lookups below
        in_ss_out_names.append({
            'partition': args.input,
            'sentence_split': sentence_split_file,
            'output_prefix': args.output_prefix})
    else:
        in_file_names = glob.glob(args.input)

        # create .jsonl partition files
        for idx in range(args.partitions):
            in_ss_out_name = get_file_name(args, idx)
            in_ss_out_names.append(in_ss_out_name)

        # check to see if partitions were already created
        partitions_present = check_files_exist(in_ss_out_names, 'partition', args.partitions)

        # check to see if partitions with split sentences already created
        split_sentences_present = check_files_exist(in_ss_out_names, 'sentence_split', args.partitions)

        if not partitions_present and not split_sentences_present:
            # populate .jsonl partition files from parent files
            partitioned_input_files = []
            for idx in range(args.partitions):
                partitioned_input_file = open(in_ss_out_names[idx]['partition'], 'w')
                partitioned_input_files.append(partitioned_input_file)

            index = 0
            for in_file_name in in_file_names:
                # support for gzip files
                if in_file_name.endswith(".gz"):
                    fin = gzip.open(in_file_name, 'rt')
                else:
                    fin = open(in_file_name, 'r', encoding='utf-8')

                for line in fin:
                    partitioned_input_files[index].write(line)
                    index = (index + 1) % args.partitions

                fin.close()

            for idx in range(args.partitions):
                partitioned_input_files[idx].close()

    assert args.workers % args.partitions == 0
    partition = Partition(args, args.workers // args.partitions)

    # check to see if partitions with split sentences already created
    split_sentences_present = check_files_exist(in_ss_out_names, 'sentence_split', args.partitions)

    # split sentences in partition files
    if args.split_sentences and not split_sentences_present:
        processes = []
        for name in in_ss_out_names:
            p = multiprocessing.Process(target=partition.split_sentences,
                                        args=((name['partition'], name['sentence_split']),))
            p.start()
            processes.append(p)

        for p in processes:
            p.join()

    # encode partition files in parallel
    processes = []
    input_key = 'sentence_split' if args.split_sentences else 'partition'
    for name in in_ss_out_names:
        p = multiprocessing.Process(target=partition.process_json_file,
                                    args=((name[input_key], name['output_prefix']),))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()

    # merge bin/idx partitions
    level = "document"
    if args.split_sentences:
        level = "sentence"

    output_bin_files = {}
    output_idx_files = {}
    builders = {}
    tokenizer = build_tokenizer(args)
    for key in args.json_keys:
        output_bin_files[key] = "{}_{}_{}.bin".format(args.output_prefix,
                                                      key, level)
        output_idx_files[key] = "{}_{}_{}.idx".format(args.output_prefix,
                                                      key, level)
        builders[key] = indexed_dataset.make_builder(output_bin_files[key],
                                                     impl=args.dataset_impl,
                                                     vocab_size=tokenizer.vocab_size)

        # merge this key's per-partition outputs, then finalize the merged index
        for name in in_ss_out_names:
            parition_output_prefix = name['output_prefix']
            full_partition_output_prefix = "{}_{}_{}".format(parition_output_prefix,
                                                             key, level)
            builders[key].merge_file_(full_partition_output_prefix)

        builders[key].finalize(output_idx_files[key])


if __name__ == '__main__':
    main()
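The partitioning pass above distributes input lines round-robin across the --partitions files (index = (index + 1) % args.partitions), so every partition ends up with a near-equal number of documents regardless of how the parent files are sized. A tiny in-memory sketch of just that distribution logic, with made-up data:

# Round-robin line distribution, mirroring the loop in main().
num_partitions = 3
lines = [f"doc {n}\n" for n in range(10)]   # stand-in for the parent .jsonl lines

partitions = [[] for _ in range(num_partitions)]
index = 0
for line in lines:
    partitions[index].append(line)
    index = (index + 1) % num_partitions

for i, part in enumerate(partitions):
    print(f"partition {i}: {len(part)} lines")   # 4, 3, 3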