OpenDAS / Megatron-LM / Commits

Commit cf0100cf, authored Apr 23, 2020 by Neel Kant
Parent: f332d7e1

Restructure BertDataset to help with RealmDataset

Showing 3 changed files with 57 additions and 97 deletions (+57, -97)
  megatron/data/bert_dataset.py   (+10, -12)
  megatron/data/dataset_utils.py  (+0,  -36)
  megatron/data/realm_dataset.py  (+47, -49)
megatron/data/bert_dataset.py

...
@@ -152,6 +152,7 @@ class BertDataset(Dataset):
         self.sep_id = tokenizer.sep
         self.mask_id = tokenizer.mask
         self.pad_id = tokenizer.pad
+        self.build_sample_fn = build_training_sample
 
     def __len__(self):
...
@@ -159,21 +160,18 @@ class BertDataset(Dataset):
 
     def __getitem__(self, idx):
-        start_index, end_index, seq_length = self.samples_mapping[idx]
-        sample = []
-        for index in range(start_index, end_index):
-            sample.append(self.indexed_dataset[index])
+        start_idx, end_idx, seq_length = self.samples_mapping[idx]
+        sample = [self.indexed_dataset[i] for i in range(start_idx, end_idx)]
         # Note that this rng state should be numpy and not python since
         # python randint is inclusive whereas the numpy one is exclusive.
         np_rng = np.random.RandomState(seed=(self.seed + idx))
-        return build_training_sample(sample, seq_length,
-                                     self.max_seq_length,  # needed for padding
-                                     self.vocab_id_list,
-                                     self.vocab_id_to_token_dict,
-                                     self.cls_id, self.sep_id,
-                                     self.mask_id, self.pad_id,
-                                     self.masked_lm_prob, np_rng)
+        return self.build_sample_fn(sample, seq_length,
+                                    self.max_seq_length,  # needed for padding
+                                    self.vocab_id_list,
+                                    self.vocab_id_to_token_dict,
+                                    self.cls_id, self.sep_id,
+                                    self.mask_id, self.pad_id,
+                                    self.masked_lm_prob, np_rng)
 
 
 def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):
...
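The change above turns the sample builder into an instance attribute, so a subclass can plug in its own builder without re-implementing __getitem__. A minimal sketch of that hook, assuming only what this commit shows; MyDataset and my_build_sample are hypothetical names, not part of the repository:

# Sketch of the build_sample_fn hook (hypothetical subclass, not from this commit).
from megatron.data.bert_dataset import BertDataset

def my_build_sample(sample, target_seq_length, max_seq_length,
                    vocab_id_list, vocab_id_to_token_dict,
                    cls_id, sep_id, mask_id, pad_id,
                    masked_lm_prob, np_rng):
    # Must accept the same arguments that BertDataset.__getitem__ forwards to
    # self.build_sample_fn and return a dict of numpy arrays.
    ...

class MyDataset(BertDataset):
    def __init__(self, *args, **kwargs):
        super(MyDataset, self).__init__(*args, **kwargs)
        # Swap the builder; __len__ and __getitem__ are inherited unchanged.
        self.build_sample_fn = my_build_sample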
megatron/data/dataset_utils.py

...
@@ -82,33 +82,6 @@ def build_training_sample(sample,
     return train_sample
 
 
-def build_simple_training_sample(sample, target_seq_length, max_seq_length,
-                                 vocab_id_list, vocab_id_to_token_dict,
-                                 cls_id, sep_id, mask_id, pad_id,
-                                 masked_lm_prob, np_rng):
-    tokens = list(itertools.chain(*sample))[:max_seq_length - 2]
-    tokens, tokentypes = create_single_tokens_and_tokentypes(tokens)
-
-    max_predictions_per_seq = masked_lm_prob * max_seq_length
-    (tokens, masked_positions, masked_labels, _) = create_masked_lm_predictions(
-        tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob,
-        cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng)
-
-    tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np \
-        = pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
-                                   masked_labels, pad_id, max_seq_length)
-
-    train_sample = {
-        'text': tokens_np,
-        'types': tokentypes_np,
-        'labels': labels_np,
-        'loss_mask': loss_mask_np,
-        'padding_mask': padding_mask_np}
-    return train_sample
-
-
 def get_a_and_b_segments(sample, np_rng):
     """Divide sample into a and b segments."""
...
@@ -188,15 +161,6 @@ def create_tokens_and_tokentypes(tokens_a, tokens_b, cls_id, sep_id):
     return tokens, tokentypes
 
 
-def create_single_tokens_and_tokentypes(_tokens, cls_id, sep_id):
-    tokens = []
-    tokens.append(cls_id)
-    tokens.extend(list(_tokens))
-    tokens.append(sep_id)
-    tokentypes = [0] * len(tokens)
-    return tokens, tokentypes
-
-
 MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
                                           ["index", "label"])
...
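Both functions removed here reappear in megatron/data/realm_dataset.py below; unlike create_tokens_and_tokentypes, which builds an A/B segment pair, the single-segment helper wraps one token list in [CLS]/[SEP] with every token type set to 0. A toy illustration of that, assuming the post-commit module layout and standing in 101/102 for the [CLS]/[SEP] ids purely for the example:

# Toy illustration of the relocated helper (ids 101/102 are assumptions for
# this example; the real ids come from the Megatron tokenizer).
from megatron.data.realm_dataset import create_single_tokens_and_tokentypes

tokens, tokentypes = create_single_tokens_and_tokentypes([7, 8, 9], 101, 102)
# tokens     -> [101, 7, 8, 9, 102]   single segment wrapped in [CLS] ... [SEP]
# tokentypes -> [0, 0, 0, 0, 0]       one segment, so every token type is 0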
megatron/data/realm_dataset.py

+import itertools
+
 import numpy as np
 import spacy
-from torch.utils.data import Dataset
 
 from megatron import get_tokenizer
-from megatron.data.bert_dataset import get_samples_mapping_
-from megatron.data.dataset_utils import build_simple_training_sample
+from megatron.data.bert_dataset import BertDataset, get_samples_mapping_
+from megatron.data.dataset_utils import create_masked_lm_predictions, pad_and_convert_to_numpy
 
 qa_nlp = spacy.load('en_core_web_lg')
 
 
-class RealmDataset(Dataset):
+class RealmDataset(BertDataset):
     """Dataset containing simple masked sentences for masked language modeling.
 
     The dataset should yield sentences just like the regular BertDataset
...
@@ -21,52 +23,48 @@ class RealmDataset(Dataset):
     def __init__(self, name, indexed_dataset, data_prefix,
                  num_epochs, max_num_samples, masked_lm_prob,
                  max_seq_length, short_seq_prob, seed):
+        super(RealmDataset, self).__init__(name, indexed_dataset, data_prefix,
+                                           num_epochs, max_num_samples, masked_lm_prob,
+                                           max_seq_length, short_seq_prob, seed)
+        self.build_sample_fn = build_simple_training_sample
 
-        # Params to store.
-        self.name = name
-        self.seed = seed
-        self.masked_lm_prob = masked_lm_prob
-        self.max_seq_length = max_seq_length
 
-        # Dataset.
-        self.indexed_dataset = indexed_dataset
+def build_simple_training_sample(sample, target_seq_length, max_seq_length,
+                                 vocab_id_list, vocab_id_to_token_dict,
+                                 cls_id, sep_id, mask_id, pad_id,
+                                 masked_lm_prob, np_rng):
+    tokens = list(itertools.chain(*sample))[:max_seq_length - 2]
+    tokens, tokentypes = create_single_tokens_and_tokentypes(tokens, cls_id, sep_id)
 
-        # Build the samples mapping.
-        self.samples_mapping = get_samples_mapping_(self.indexed_dataset,
-                                                    data_prefix,
-                                                    num_epochs,
-                                                    max_num_samples,
-                                                    self.max_seq_length,
-                                                    short_seq_prob,
-                                                    self.seed,
-                                                    self.name)
+    max_predictions_per_seq = masked_lm_prob * max_seq_length
+    (tokens, masked_positions, masked_labels, _) = create_masked_lm_predictions(
+        tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob,
+        cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng)
 
-        # Vocab stuff.
-        tokenizer = get_tokenizer()
-        self.vocab_id_list = list(tokenizer.inv_vocab.keys())
-        self.vocab_id_to_token_dict = tokenizer.inv_vocab
-        self.cls_id = tokenizer.cls
-        self.sep_id = tokenizer.sep
-        self.mask_id = tokenizer.mask
-        self.pad_id = tokenizer.pad
+    tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np \
+        = pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
+                                   masked_labels, pad_id, max_seq_length)
 
-    def __len__(self):
-        return self.samples_mapping.shape[0]
+    # REALM true sequence length is twice as long but none of that is to be predicted with LM
+    loss_mask_np = np.concatenate((loss_mask_np, np.ones(loss_mask_np.shape)), -1)
 
-    def __getitem__(self, idx):
-        start_idx, end_idx, seq_length = self.samples_mapping[idx]
-        sample = [self.indexed_dataset[i] for i in range(start_idx, end_idx)]
-        # Note that this rng state should be numpy and not python since
-        # python randint is inclusive whereas the numpy one is exclusive.
-        np_rng = np.random.RandomState(seed=(self.seed + idx))
-        return build_simple_training_sample(sample, seq_length,
-                                            self.max_seq_length,  # needed for padding
-                                            self.vocab_id_list,
-                                            self.vocab_id_to_token_dict,
-                                            self.cls_id, self.sep_id,
-                                            self.mask_id, self.pad_id,
-                                            self.masked_lm_prob, np_rng)
+    train_sample = {
+        'tokens': tokens_np,
+        'labels': labels_np,
+        'loss_mask': loss_mask_np,
+        'pad_mask': padding_mask_np}
+    return train_sample
+
+
+def create_single_tokens_and_tokentypes(_tokens, cls_id, sep_id):
+    tokens = []
+    tokens.append(cls_id)
+    tokens.extend(list(_tokens))
+    tokens.append(sep_id)
+    tokentypes = [0] * len(tokens)
+    return tokens, tokentypes
 
 
 def spacy_ner(block_text):
...
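The REALM-specific twist in build_simple_training_sample is the step flagged by the in-line comment: before the sample dict is assembled, the loss mask is extended along its last axis to cover a sequence twice as long as max_seq_length. A small standalone sketch of just that shape effect, with toy mask values in place of the real output of pad_and_convert_to_numpy:

# Shape effect of the loss-mask extension (toy values, not the real mask).
import numpy as np

loss_mask_np = np.array([0, 1, 0, 0])
extended = np.concatenate((loss_mask_np, np.ones(loss_mask_np.shape)), -1)
# extended.shape == (8,): the mask now spans twice the padded length,
# matching the doubled REALM sequence described in the code comment.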