Commit 6140718f, authored Nov 08, 2019 by Mohammad Shoeybi

before optimization

parent c125d247
Showing 2 changed files with 85 additions and 64 deletions (+85 -64):

megatron/data/dataset.py        +72 -57
megatron/data/dataset_utils.py  +13 -7
megatron/data/dataset.py
@@ -7,18 +7,55 @@ import numpy as np
 import torch
 from torch.utils.data import Dataset
-
-# WILL BE REPLACED WITH JARED'S
-class JaredDataset(object):
-
-    def __init__(self, doc_idx, sizes, sentences):
-        self.doc_idx = doc_idx
-        self.num_docs = len(self.doc_idx) - 1
-        self.sizes = sizes
-        self.sentences = sentences
-
-    def __getitem__(self, idx):
-        return self.sentences[idx]
+from dataset_utils import build_training_sample
+
+
+class AlbertDataSet(Dataset):
+
+    def __init__(self, indexed_dataset, tokenizer, num_epochs,
+                 masked_lm_prob, max_seq_length, short_seq_prob,
+                 seed):
+
+        # Params to store.
+        self.seed = seed
+        self.masked_lm_prob = masked_lm_prob
+        self.max_seq_length = max_seq_length
+
+        # Indexed dataset.
+        self.indexed_dataset = indexed_dataset
+
+        # Build the samples mapping.
+        self.samples_mapping = build_training_samples_mapping(
+            indexed_dataset, num_epochs, self.max_seq_length,
+            short_seq_prob, self.seed)
+
+        # Vocab stuff.
+        self.vocab_id_list = list(tokenizer.inv_vocab.keys())
+        self.vocab_id_to_token_dict = tokenizer.inv_vocab
+        self.cls_id = tokenizer.vocab['[CLS]']
+        self.sep_id = tokenizer.vocab['[SEP]']
+        self.mask_id = tokenizer.vocab['[MASK]']
+        self.pad_id = tokenizer.vocab['[PAD]']
+
+    def __len__(self):
+        return self.samples.shape[0]
+
+    def __getitem__(self, idx):
+        rng = random.Random(self.seed + idx)
+        start_index, end_index, seq_length = self.samples_mapping[idx]
+        sample = []
+        for index in range(start_index, end_index):
+            sample.append(self.indexed_dataset[index])
+        return build_training_sample(sample, seq_length,
+                                     self.max_seq_length,
+                                     self.vocab_id_list,
+                                     self.vocab_id_to_token_dict,
+                                     self.cls_id, self.sep_id,
+                                     self.mask_id, self.pad_id,
+                                     self.masked_lm_prob, rng)
 
 
 def get_target_seq_length(max_num_tokens, short_seq_prob, np_rng):
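For orientation, here is a minimal sketch of the data flow the new __getitem__ relies on: each row of samples_mapping is expected to carry (start_index, end_index, seq_length), and the index range selects consecutive tokenized sentences from the indexed dataset. The toy data below (toy_sentences, toy_mapping) is illustrative only and not part of the commit.

    import numpy as np

    # Hypothetical stand-ins for the structures built in the __main__ block:
    # a flat list of tokenized sentences and the samples-mapping rows.
    toy_sentences = [[101, 7592, 102], [101, 2088, 102], [101, 2742, 102]]
    toy_mapping = np.array([[0, 2, 6],   # sentences 0..1, target length 6
                            [1, 3, 6]])  # sentences 1..2, target length 6

    idx = 0
    start_index, end_index, seq_length = toy_mapping[idx]
    # This is the `sample` list that __getitem__ hands to build_training_sample.
    sample = [toy_sentences[i] for i in range(start_index, end_index)]
    print(sample, seq_length)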
@@ -87,6 +124,7 @@ def build_training_samples_mapping(indexed_dataset, num_epochs, max_seq_length,
         while sent_index < sent_index_last:
             # Get the size.
+            assert indexed_dataset.sizes[sent_index] > 0
             size += indexed_dataset.sizes[sent_index]
             sent_index += 1
@@ -133,51 +171,17 @@ def build_training_samples_mapping(indexed_dataset, num_epochs, max_seq_length,
     return samples_np
 
 
-class AlbertDataSet(Dataset):
-
-    def __init__(self, indexed_dataset, tokenizer, num_epochs,
-                 masked_lm_prob, max_seq_length, short_seq_prob,
-                 seed):
-
-        # Params to store.
-        self.seed = seed
-        self.masked_lm_prob = masked_lm_prob
-        self.max_seq_length = max_seq_length
-
-        # Indexed dataset.
-        self.indexed_dataset = indexed_dataset
-
-        # Build the samples mapping.
-        self.samples_mapping = build_training_samples_mapping(
-            indexed_dataset, num_epochs, self.max_seq_length,
-            short_seq_prob, self.seed)
-
-        # Vocab stuff.
-        self.vocab_id_list = list(tokenizer.inv_vocab.keys())
-        self.vocab_id_to_token_dict = tokenizer.inv_vocab
-        self.cls_id = tokenizer.vocab['[CLS]']
-        self.sep_id = tokenizer.vocab['[SEP]']
-        self.mask_id = tokenizer.vocab['[MASK]']
-        self.pad_id = tokenizer.vocab['[PAD]']
-
-    def __len__(self):
-        return self.samples.shape[0]
-
-    def __getitem__(self, idx):
-        rng = random.Random(self.seed + idx)
-        start_index, end_index = self.samples_mapping[idx]
-        sample = []
-        for index in range(start_index, end_index):
-            sample.append(self.indexed_dataset[index])
-        return build_training_sample(sample,
-                                     self.vocab_id_list,
-                                     self.vocab_id_to_token_dict,
-                                     self.cls_id, self.sep_id,
-                                     self.mask_id, self.pad_id,
-                                     self.masked_lm_prob,
-                                     self.max_seq_length, rng)
+# WILL BE REPLACED WITH JARED'S
+class JaredDataset(object):
+
+    def __init__(self, doc_idx, sizes, sentences):
+        self.doc_idx = doc_idx
+        self.num_docs = len(self.doc_idx) - 1
+        self.sizes = sizes
+        self.sentences = sentences
+
+    def __getitem__(self, idx):
+        return self.sentences[idx]
@@ -198,10 +202,12 @@ if __name__ == '__main__':
         sentences = []
         for line in text.split('\n'):
             if line != '\n':
-                sentences.extend(nltk.tokenize.sent_tokenize(line))
+                sent = nltk.tokenize.sent_tokenize(line)
+                if sent:
+                    sentences.extend(sent)
         yield sentences
 
-    input_file = '/raid/mshoeybi/data/albert/sample/samples_11.json'
+    input_file = '/raid/mshoeybi/data/albert/sample/samples_1000.json'
    vocab_file = '/raid/mshoeybi/data/albert/bert_vocab/vocab.txt'
    tokenizer = FullTokenizer(vocab_file, do_lower_case=True)
@@ -212,19 +218,28 @@ if __name__ == '__main__':
     sentences_list = []
     for sentences in document_generator:
-        doc_idx.append(len(sentences))
+        num_sent = 0
         for sentence in sentences:
             tokens = tokenizer.tokenize(sentence)
-            ids = tokenizer.convert_tokens_to_ids(tokens)
-            sizes.append(len(ids))
-            sentences_list.append(ids)
+            if tokens:
+                ids = tokenizer.convert_tokens_to_ids(tokens)
+                if len(ids) == 0:
+                    print('****************')
+                    print(sentence)
+                    print(tokens)
+                    print(ids)
+                    print('****************')
+                sizes.append(len(ids))
+                sentences_list.append(ids)
+                num_sent += 1
+        doc_idx.append(num_sent)
 
     for i in range(1, len(doc_idx)):
         doc_idx[i] += doc_idx[i-1]
 
     indexed_dataset = JaredDataset(doc_idx, sizes, sentences_list)
 
     dataset = AlbertDataSet(indexed_dataset=indexed_dataset,
                             tokenizer=tokenizer,
-                            num_epochs=3,
+                            num_epochs=10,
                             masked_lm_prob=0.15,
                             max_seq_length=512,
                             short_seq_prob=0.1,
...
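One detail of the __main__ wiring above that is easy to miss: doc_idx collects per-document sentence counts, and the doc_idx[i] += doc_idx[i-1] loop turns them into cumulative offsets into the flat sentences_list (hence num_docs = len(doc_idx) - 1 in JaredDataset). A minimal sketch with made-up counts; the leading 0 sentinel is an assumption, since doc_idx's initialization lies outside the hunks shown.

    # Made-up counts for three documents with 3, 2 and 4 sentences.
    doc_idx = [0, 3, 2, 4]
    for i in range(1, len(doc_idx)):
        doc_idx[i] += doc_idx[i - 1]
    print(doc_idx)               # [0, 3, 5, 9]: document d spans sentences
                                 # doc_idx[d]..doc_idx[d+1] in sentences_list
    num_docs = len(doc_idx) - 1  # 3, as computed in JaredDataset.__init__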
megatron/data/dataset_utils.py
@@ -5,13 +5,18 @@ import collections
 import numpy as np
 
 
-def build_training_sample(sample, vocab_id_list, vocab_id_to_token_dict,
+def build_training_sample(sample, target_seq_length, max_seq_length,
+                          vocab_id_list, vocab_id_to_token_dict,
                           cls_id, sep_id, mask_id, pad_id,
-                          masked_lm_prob, max_seq_length, rng):
+                          masked_lm_prob, rng):
     """Biuld training sample.
 
     Arguments:
         sample: A list of sentences in which each sentence is a list token ids.
+        target_seq_length: Desired sequence length.
+        max_seq_length: Maximum length of the sequence. All values are padded to
+            this length.
         vocab_id_list: List of vocabulary ids. Used to pick a random id.
         vocab_id_to_token_dict: A dictionary from vocab ids to text tokens.
         cls_id: Start of example id.
@@ -19,20 +24,19 @@ def build_training_sample(sample, vocab_id_list, vocab_id_to_token_dict,
         mask_id: Mask token id.
         pad_id: Padding token id.
         masked_lm_prob: Probability to mask tokens.
-        max_seq_length: Maximum length of the sequence. All values are padded to
-            this length.
         rng: Random number genenrator.
     """
 
     # We assume that we have at least two sentences in the sample
     assert len(sample) > 1
+    assert target_seq_length <= max_seq_length
 
     # Divide sample into two segments (A and B).
     tokens_a, tokens_b, is_next_random = get_a_and_b_segments(sample, rng)
 
-    # Truncate to `max_sequence_length`.
+    # Truncate to `target_sequence_length`.
     # Note that we have account for [CLS] A [SEP] B [SEP]
-    max_num_tokens = max_seq_length - 3
+    max_num_tokens = target_seq_length - 3
     truncate_segments(tokens_a, tokens_b, len(tokens_a), len(tokens_b),
                       max_num_tokens, rng)
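As a quick check of the "- 3" accounting above (one [CLS] plus two [SEP] tokens around segments A and B), a small worked sketch with hypothetical lengths; the numbers are illustrative, not from the commit.

    target_seq_length = 128
    max_seq_length = 512

    max_num_tokens = target_seq_length - 3   # token budget shared by A and B
    len_a, len_b = 70, 55                    # hypothetical lengths after truncate_segments
    assert len_a + len_b <= max_num_tokens
    total = 1 + len_a + 1 + len_b + 1        # [CLS] A [SEP] B [SEP] = 128 tokens
    num_pad = max_seq_length - total         # 384 positions filled with pad_id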
@@ -421,11 +425,13 @@ if __name__ == '__main__':
     for s in samples[0]:
         sample.append(tokenizer.convert_tokens_to_ids(s))
 
     max_seq_length = 512
+    target_seq_length = 444
     masked_lm_prob = 0.15
-    example = build_training_sample(sample,
+    example = build_training_sample(sample, target_seq_length, max_seq_length,
                                     vocab_id_list, vocab_id_to_token_dict,
                                     cls_id, sep_id, mask_id, pad_id,
-                                    masked_lm_prob, max_seq_length, rng)
+                                    masked_lm_prob, rng)
 
     orig_tokens = []
     for s in samples[0]:
...
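get_target_seq_length(max_num_tokens, short_seq_prob, np_rng) is referenced in dataset.py but its body lies outside the hunks shown. The sketch below is a guess at the usual BERT-style behaviour (with probability short_seq_prob, draw a shorter random target length); it is not the commit's implementation.

    import numpy as np

    def get_target_seq_length(max_num_tokens, short_seq_prob, np_rng):
        """Sketch: mostly use the full token budget, sometimes a shorter one."""
        if np_rng.rand() < short_seq_prob:
            return np_rng.randint(2, max_num_tokens + 1)
        return max_num_tokens

    np_rng = np.random.RandomState(1234)
    print(get_target_seq_length(509, 0.1, np_rng))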