OpenDAS / Megatron-LM, commit cf0100cf

Restructure BertDataset to help with RealmDataset

Authored Apr 23, 2020 by Neel Kant
Parent: f332d7e1

Showing 3 changed files with 57 additions and 97 deletions.
megatron/data/bert_dataset.py     +10  -12
megatron/data/dataset_utils.py     +0  -36
megatron/data/realm_dataset.py    +47  -49
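The three diffs that follow implement a single refactor: BertDataset.__getitem__ now dispatches through a new self.build_sample_fn attribute, and RealmDataset becomes a BertDataset subclass whose main job is to swap that attribute. A minimal, self-contained sketch of the dispatch pattern (illustrative class and builder names, not the Megatron code itself):

    def build_training_sample(sample, seq_length):
        # Stand-in for the full BERT pair/masking pipeline.
        return {'text': sample, 'seq_length': seq_length}

    def build_simple_training_sample(sample, seq_length):
        # Stand-in for the REALM single-segment builder.
        return {'tokens': sample, 'seq_length': seq_length}

    class BaseDataset:
        def __init__(self):
            # Hook a subclass may overwrite (mirrors the new
            # self.build_sample_fn attribute on BertDataset).
            self.build_sample_fn = build_training_sample

        def __getitem__(self, idx):
            sample, seq_length = [idx], 8  # placeholder sample lookup
            return self.build_sample_fn(sample, seq_length)

    class SimpleDataset(BaseDataset):
        def __init__(self):
            super().__init__()
            # The only dataset-specific change: a different sample builder.
            self.build_sample_fn = build_simple_training_sample

    print(BaseDataset()[0])    # {'text': [0], 'seq_length': 8}
    print(SimpleDataset()[0])  # {'tokens': [0], 'seq_length': 8}

Routing through an attribute rather than overriding __getitem__ keeps the samples-mapping and RNG logic in one place; only the sample construction differs per dataset.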
megatron/data/bert_dataset.py  (+10, -12)
@@ -152,6 +152,7 @@ class BertDataset(Dataset):
         self.sep_id = tokenizer.sep
         self.mask_id = tokenizer.mask
         self.pad_id = tokenizer.pad
+        self.build_sample_fn = build_training_sample

     def __len__(self):
@@ -159,21 +160,18 @@ class BertDataset(Dataset):
     def __getitem__(self, idx):
-        start_index, end_index, seq_length = self.samples_mapping[idx]
-        sample = []
-        for index in range(start_index, end_index):
-            sample.append(self.indexed_dataset[index])
+        start_idx, end_idx, seq_length = self.samples_mapping[idx]
+        sample = [self.indexed_dataset[i] for i in range(start_idx, end_idx)]
         # Note that this rng state should be numpy and not python since
         # python randint is inclusive whereas the numpy one is exclusive.
         np_rng = np.random.RandomState(seed=(self.seed + idx))
-        return build_training_sample(sample, seq_length,
-                                     self.max_seq_length,  # needed for padding
-                                     self.vocab_id_list,
-                                     self.vocab_id_to_token_dict,
-                                     self.cls_id, self.sep_id,
-                                     self.mask_id, self.pad_id,
-                                     self.masked_lm_prob, np_rng)
+        return self.build_sample_fn(sample, seq_length,
+                                    self.max_seq_length,  # needed for padding
+                                    self.vocab_id_list,
+                                    self.vocab_id_to_token_dict,
+                                    self.cls_id, self.sep_id,
+                                    self.mask_id, self.pad_id,
+                                    self.masked_lm_prob, np_rng)


 def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):
...
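The rng comment preserved in the rewritten __getitem__ is load-bearing: each sample gets its own np.random.RandomState(self.seed + idx), and the downstream masking code relies on numpy's randint treating the upper bound as exclusive, whereas Python's stdlib randint includes it. A quick stdlib-vs-numpy check of that difference (plain Python, not Megatron code):

    import random

    import numpy as np

    # numpy: the high bound is exclusive, so randint(0, 1) can only return 0.
    assert all(np.random.RandomState(s).randint(0, 1) == 0 for s in range(100))

    # stdlib: both bounds are inclusive, so randint(0, 1) returns 0 or 1.
    print(sorted({random.randint(0, 1) for _ in range(1000)}))  # almost surely [0, 1]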
megatron/data/dataset_utils.py  (+0, -36)
@@ -82,33 +82,6 @@ def build_training_sample(sample,
     return train_sample


-def build_simple_training_sample(sample, target_seq_length, max_seq_length,
-                                 vocab_id_list, vocab_id_to_token_dict,
-                                 cls_id, sep_id, mask_id, pad_id,
-                                 masked_lm_prob, np_rng):
-    tokens = list(itertools.chain(*sample))[:max_seq_length - 2]
-    tokens, tokentypes = create_single_tokens_and_tokentypes(tokens)
-    max_predictions_per_seq = masked_lm_prob * max_seq_length
-    (tokens, masked_positions,
-     masked_labels, _) = create_masked_lm_predictions(
-        tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob,
-        cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng)
-    tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np \
-        = pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
-                                   masked_labels, pad_id, max_seq_length)
-    train_sample = {
-        'text': tokens_np,
-        'types': tokentypes_np,
-        'labels': labels_np,
-        'loss_mask': loss_mask_np,
-        'padding_mask': padding_mask_np}
-    return train_sample


 def get_a_and_b_segments(sample, np_rng):
     """Divide sample into a and b segments."""
@@ -188,15 +161,6 @@ def create_tokens_and_tokentypes(tokens_a, tokens_b, cls_id, sep_id):
     return tokens, tokentypes


-def create_single_tokens_and_tokentypes(_tokens, cls_id, sep_id):
-    tokens = []
-    tokens.append(cls_id)
-    tokens.extend(list(_tokens))
-    tokens.append(sep_id)
-    tokentypes = [0] * len(tokens)
-    return tokens, tokentypes


 MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
                                           ["index", "label"])
...
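Both deleted functions reappear in megatron/data/realm_dataset.py below, and the move also appears to fix a latent bug: the version removed here called create_single_tokens_and_tokentypes(tokens) without the cls_id and sep_id arguments the helper requires. A worked example of the single-segment construction as it behaves after the move (helper body copied from this diff; 101 and 102 are illustrative stand-ins for the [CLS] and [SEP] vocab ids):

    import itertools

    def create_single_tokens_and_tokentypes(_tokens, cls_id, sep_id):
        tokens = []
        tokens.append(cls_id)
        tokens.extend(list(_tokens))
        tokens.append(sep_id)
        tokentypes = [0] * len(tokens)
        return tokens, tokentypes

    sample = [[7, 8], [9, 10, 11]]  # two sentences from the indexed dataset
    max_seq_length = 6

    # Flatten the sentences and keep max_seq_length - 2 tokens, leaving room
    # for the [CLS] and [SEP] that the helper prepends/appends.
    tokens = list(itertools.chain(*sample))[:max_seq_length - 2]
    tokens, tokentypes = create_single_tokens_and_tokentypes(tokens, 101, 102)
    print(tokens)      # [101, 7, 8, 9, 10, 102]
    print(tokentypes)  # [0, 0, 0, 0, 0, 0] -- one segment, so all type ids are 0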
megatron/data/realm_dataset.py  (+47, -49)
 import itertools

 import numpy as np
 import spacy
 from torch.utils.data import Dataset

 from megatron import get_tokenizer
-from megatron.data.bert_dataset import get_samples_mapping_
-from megatron.data.dataset_utils import build_simple_training_sample
+from megatron.data.bert_dataset import BertDataset, get_samples_mapping_
+from megatron.data.dataset_utils import create_masked_lm_predictions, pad_and_convert_to_numpy

 qa_nlp = spacy.load('en_core_web_lg')


-class RealmDataset(Dataset):
+class RealmDataset(BertDataset):
     """Dataset containing simple masked sentences for masked language modeling.

     The dataset should yield sentences just like the regular BertDataset
     ...
@@ -21,52 +23,48 @@ class RealmDataset(Dataset):
     def __init__(self, name, indexed_dataset, data_prefix,
                  num_epochs, max_num_samples, masked_lm_prob,
                  max_seq_length, short_seq_prob, seed):
+        super(RealmDataset, self).__init__(name, indexed_dataset, data_prefix,
+                                           num_epochs, max_num_samples, masked_lm_prob,
+                                           max_seq_length, short_seq_prob, seed)
+        self.build_sample_fn = build_simple_training_sample

-        # Params to store.
-        self.name = name
-        self.seed = seed
-        self.masked_lm_prob = masked_lm_prob
-        self.max_seq_length = max_seq_length
-
-        # Dataset.
-        self.indexed_dataset = indexed_dataset
-
-        # Build the samples mapping.
-        self.samples_mapping = get_samples_mapping_(self.indexed_dataset,
-                                                    data_prefix,
-                                                    num_epochs,
-                                                    max_num_samples,
-                                                    self.max_seq_length,
-                                                    short_seq_prob,
-                                                    self.seed,
-                                                    self.name)
-
-        # Vocab stuff.
-        tokenizer = get_tokenizer()
-        self.vocab_id_list = list(tokenizer.inv_vocab.keys())
-        self.vocab_id_to_token_dict = tokenizer.inv_vocab
-        self.cls_id = tokenizer.cls
-        self.sep_id = tokenizer.sep
-        self.mask_id = tokenizer.mask
-        self.pad_id = tokenizer.pad
-
-    def __len__(self):
-        return self.samples_mapping.shape[0]
-
-    def __getitem__(self, idx):
-        start_idx, end_idx, seq_length = self.samples_mapping[idx]
-        sample = [self.indexed_dataset[i] for i in range(start_idx, end_idx)]
-        # Note that this rng state should be numpy and not python since
-        # python randint is inclusive whereas the numpy one is exclusive.
-        np_rng = np.random.RandomState(seed=(self.seed + idx))
-        return build_simple_training_sample(sample, seq_length,
-                                            self.max_seq_length,  # needed for padding
-                                            self.vocab_id_list,
-                                            self.vocab_id_to_token_dict,
-                                            self.cls_id, self.sep_id,
-                                            self.mask_id, self.pad_id,
-                                            self.masked_lm_prob, np_rng)

+
+def build_simple_training_sample(sample, target_seq_length, max_seq_length,
+                                 vocab_id_list, vocab_id_to_token_dict,
+                                 cls_id, sep_id, mask_id, pad_id,
+                                 masked_lm_prob, np_rng):
+    tokens = list(itertools.chain(*sample))[:max_seq_length - 2]
+    tokens, tokentypes = create_single_tokens_and_tokentypes(tokens, cls_id, sep_id)
+    max_predictions_per_seq = masked_lm_prob * max_seq_length
+    (tokens, masked_positions,
+     masked_labels, _) = create_masked_lm_predictions(
+        tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob,
+        cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng)
+    tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np \
+        = pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
+                                   masked_labels, pad_id, max_seq_length)
+
+    # REALM true sequence length is twice as long but none of that is to be predicted with LM
+    loss_mask_np = np.concatenate((loss_mask_np, np.ones(loss_mask_np.shape)), -1)
+
+    train_sample = {
+        'tokens': tokens_np,
+        'labels': labels_np,
+        'loss_mask': loss_mask_np,
+        'pad_mask': padding_mask_np}
+    return train_sample
+
+
+def create_single_tokens_and_tokentypes(_tokens, cls_id, sep_id):
+    tokens = []
+    tokens.append(cls_id)
+    tokens.extend(list(_tokens))
+    tokens.append(sep_id)
+    tokentypes = [0] * len(tokens)
+    return tokens, tokentypes


 def spacy_ner(block_text):
...
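The one behavioral addition made while moving build_simple_training_sample into this file is the loss-mask line under the REALM comment: the mask computed over the max_seq_length query tokens is extended along the last axis to twice its length, matching a true REALM sequence that is twice as long. A toy shape check of that numpy call (made-up mask values, not Megatron output):

    import numpy as np

    # Toy stand-in for the masked-LM loss mask over the query tokens.
    loss_mask_np = np.array([0, 1, 0, 0, 1])

    # The REALM line above: append a same-shaped block of ones along the
    # last axis, doubling the mask to span the concatenated sequence.
    loss_mask_np = np.concatenate((loss_mask_np, np.ones(loss_mask_np.shape)), -1)

    print(loss_mask_np.shape)  # (10,) -- twice the original length
    print(loss_mask_np)        # [0. 1. 0. 0. 1. 1. 1. 1. 1. 1.] (promoted to float64)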