OpenDAS / Megatron-LM · Commit e2add0fd
Authored Nov 24, 2019 by Mohammad Shoeybi
resolved reproducibity issue
Parent: 0601702a

Showing 2 changed files with 26 additions and 26 deletions:
megatron/data/albert_dataset.py  (+4, -4)
megatron/data/dataset_utils.py   (+22, -22)
megatron/data/albert_dataset.py
@@ -137,9 +137,6 @@ class AlbertDataset(Dataset):
     def __getitem__(self, idx):
-        # Note that this rng state should be python and not numpy since
-        # python randint is inclusive whereas the numpy one is exclusive.
-        rng = random.Random(self.seed + idx)
         start_index, end_index, seq_length = self.samples_mapping[idx]
         sample = []
         for index in range(start_index, end_index):
@@ -149,13 +146,16 @@ class AlbertDataset(Dataset):
         if len(s) > 1000:
             print(self.tokenizer.convert_ids_to_tokens(s))
         '''
+        # Note that this rng state should be numpy and not python since
+        # python randint is inclusive whereas the numpy one is exclusive.
+        np_rng = np.random.RandomState(seed=(self.seed + idx))
         return build_training_sample(sample, seq_length,
                                      self.max_seq_length,  # needed for padding
                                      self.vocab_id_list,
                                      self.vocab_id_to_token_dict,
                                      self.cls_id, self.sep_id,
                                      self.mask_id, self.pad_id,
-                                     self.masked_lm_prob, rng)
+                                     self.masked_lm_prob, np_rng)


 def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):
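The change above is what makes each `__getitem__(idx)` deterministic: instead of a Python `random.Random(self.seed + idx)` plus whatever the global NumPy state happens to be, every draw used to build the sample now comes from one `np.random.RandomState` seeded with `self.seed + idx`. A minimal sketch of that idea, with a hypothetical `build_sample` helper standing in for the dataset code:

import numpy as np

def build_sample(seed, idx):
    # One RandomState per (seed, idx): every draw below is a pure function of
    # the seed and the sample index, independent of any global RNG state.
    np_rng = np.random.RandomState(seed=(seed + idx))
    a_end = int(np_rng.randint(1, 4))                 # NumPy randint excludes the upper bound.
    is_next_random = bool(np_rng.random_sample() < 0.5)
    return a_end, is_next_random

# The same (seed, idx) pair always produces the same sample, across runs and workers.
assert build_sample(1234, 7) == build_sample(1234, 7)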
megatron/data/dataset_utils.py
@@ -9,7 +9,7 @@ def build_training_sample(sample,
                           target_seq_length, max_seq_length,
                           vocab_id_list, vocab_id_to_token_dict,
                           cls_id, sep_id, mask_id, pad_id,
-                          masked_lm_prob, rng):
+                          masked_lm_prob, np_rng):
     """Biuld training sample.

     Arguments:
@@ -24,8 +24,8 @@ def build_training_sample(sample,
         mask_id: Mask token id.
         pad_id: Padding token id.
         masked_lm_prob: Probability to mask tokens.
-        rng: Random number genenrator. Note that this rng state should be
-             python and not numpy since python randint is inclusive for
+        np_rng: Random number genenrator. Note that this rng state should be
+             numpy and not python since python randint is inclusive for
              the opper bound whereas the numpy one is exclusive.
     """
@@ -34,12 +34,12 @@ def build_training_sample(sample,
     assert target_seq_length <= max_seq_length

     # Divide sample into two segments (A and B).
-    tokens_a, tokens_b, is_next_random = get_a_and_b_segments(sample, rng)
+    tokens_a, tokens_b, is_next_random = get_a_and_b_segments(sample, np_rng)

     # Truncate to `target_sequence_length`.
     max_num_tokens = target_seq_length
     truncated = truncate_segments(tokens_a, tokens_b, len(tokens_a),
-                                  len(tokens_b), max_num_tokens, rng)
+                                  len(tokens_b), max_num_tokens, np_rng)

     # Build tokens and toketypes.
     tokens, tokentypes = create_tokens_and_tokentypes(tokens_a, tokens_b,
@@ -49,7 +49,7 @@ def build_training_sample(sample,
     max_predictions_per_seq = masked_lm_prob * max_num_tokens
     (tokens, masked_positions, masked_labels, _) = create_masked_lm_predictions(
         tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob,
-        cls_id, sep_id, mask_id, max_predictions_per_seq, rng)
+        cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng)

     # Padding.
     tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np \
@@ -67,7 +67,7 @@ def build_training_sample(sample,
     return train_sample


-def get_a_and_b_segments(sample, rng):
+def get_a_and_b_segments(sample, np_rng):
     """Divide sample into a and b segments."""
     # Number of sentences in the sample.
@@ -79,8 +79,8 @@ def get_a_and_b_segments(sample, rng):
     # `a_end` is how many sentences go into the `A`.
     a_end = 1
     if n_sentences >= 3:
-        # Note that randin in python is inclusive.
-        a_end = rng.randint(1, n_sentences - 1)
+        # Note that randin in numpy is exclusive.
+        a_end = np_rng.randint(1, n_sentences)
     tokens_a = []
     for j in range(a_end):
         tokens_a.extend(sample[j])
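The upper bound changes together with the RNG because the two APIs treat it differently: Python's `random.Random.randint(a, b)` includes `b`, while NumPy's `RandomState.randint(low, high)` excludes `high`, so `randint(1, n_sentences - 1)` and `randint(1, n_sentences)` draw from the same set of values. A standalone check of that equivalence (not part of the repository):

import random
import numpy as np

n_sentences = 5
py_rng = random.Random(0)
np_rng = np.random.RandomState(0)

# Python randint: upper bound inclusive -> values in {1, 2, 3, 4}.
py_vals = {py_rng.randint(1, n_sentences - 1) for _ in range(1000)}
# NumPy randint: upper bound exclusive -> also values in {1, 2, 3, 4}.
np_vals = {int(np_rng.randint(1, n_sentences)) for _ in range(1000)}

assert py_vals <= {1, 2, 3, 4} and np_vals <= {1, 2, 3, 4}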
@@ -92,14 +92,14 @@ def get_a_and_b_segments(sample, rng):
     # Random next:
     is_next_random = False
-    if rng.random() < 0.5:
+    if np_rng.random() < 0.5:
         is_next_random = True
         tokens_a, tokens_b = tokens_b, tokens_a

     return tokens_a, tokens_b, is_next_random


-def truncate_segments(tokens_a, tokens_b, len_a, len_b, max_num_tokens, rng):
+def truncate_segments(tokens_a, tokens_b, len_a, len_b, max_num_tokens, np_rng):
     """Truncates a pair of sequences to a maximum sequence length."""
     #print(len_a, len_b, max_num_tokens)
     assert len_a > 0
@@ -113,7 +113,7 @@ def truncate_segments(tokens_a, tokens_b, len_a, len_b, max_num_tokens, rng):
         else:
             len_b -= 1
             tokens = tokens_b
-        if rng.random() < 0.5:
+        if np_rng.random() < 0.5:
             del tokens[0]
         else:
             tokens.pop()
@@ -163,7 +163,7 @@ def create_masked_lm_predictions(tokens,
                                  masked_lm_prob,
                                  cls_id, sep_id, mask_id,
                                  max_predictions_per_seq,
-                                 rng,
+                                 np_rng,
                                  max_ngrams=3,
                                  do_whole_word_mask=True,
                                  favor_longer_ngram=False,
@@ -223,7 +223,7 @@ def create_masked_lm_predictions(tokens,
             ngram_index.append(cand_indexes[idx:idx + n])
         ngram_indexes.append(ngram_index)

-    rng.shuffle(ngram_indexes)
+    np_rng.shuffle(ngram_indexes)

     masked_lms = []
     covered_indexes = set()
@@ -239,9 +239,9 @@ def create_masked_lm_predictions(tokens,
                 if index in covered_indexes:
                     continue

-        n = np.random.choice(ngrams[:len(cand_index_set)],
+        n = np_rng.choice(ngrams[:len(cand_index_set)],
                              p=pvals[:len(cand_index_set)] /
                              pvals[:len(cand_index_set)].sum(keepdims=True))
         index_set = sum(cand_index_set[n - 1], [])
         n -= 1
         # Note(mingdachen):
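This hunk is the core of the reproducibility fix in `create_masked_lm_predictions`: the old code sampled the n-gram length from the module-level `np.random.choice`, whose global state drifts with every other caller in the process, while the new code samples from the `np_rng` that was seeded per sample with `self.seed + idx`. A small illustration under those assumptions (`pick_global` and `pick_seeded` are made-up names, not functions in `dataset_utils.py`):

import numpy as np

ngrams = np.arange(1, 4)            # candidate n-gram lengths
pvals = np.array([0.6, 0.3, 0.1])   # their sampling probabilities (sum to 1)

def pick_global():
    # Global state: the outcome depends on everything drawn before this call.
    return int(np.random.choice(ngrams, p=pvals))

def pick_seeded(seed, idx):
    # Per-sample state: the outcome depends only on (seed, idx).
    np_rng = np.random.RandomState(seed=(seed + idx))
    return int(np_rng.choice(ngrams, p=pvals))

# Reproducible no matter what else has consumed the global np.random stream.
first = pick_seeded(1234, 0)
_ = np.random.randint(0, 10, size=100)   # unrelated draws perturb only the global state
assert pick_seeded(1234, 0) == first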
@@ -268,22 +268,22 @@ def create_masked_lm_predictions(tokens,
             masked_token = None
             # 80% of the time, replace with [MASK]
-            if rng.random() < 0.8:
+            if np_rng.random() < 0.8:
                 masked_token = mask_id
             else:
                 # 10% of the time, keep original
-                if rng.random() < 0.5:
+                if np_rng.random() < 0.5:
                     masked_token = tokens[index]
                 # 10% of the time, replace with random word
                 else:
-                    masked_token = vocab_id_list[rng.randint(0, len(vocab_id_list) - 1)]
+                    masked_token = vocab_id_list[np_rng.randint(0, len(vocab_id_list))]

             output_tokens[index] = masked_token

             masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
         assert len(masked_lms) <= num_to_predict

-    rng.shuffle(ngram_indexes)
+    np_rng.shuffle(ngram_indexes)

     select_indexes = set()
     if do_permutation:
@@ -327,7 +327,7 @@ def create_masked_lm_predictions(tokens,
         select_indexes = sorted(select_indexes)
         permute_indexes = list(select_indexes)
-        rng.shuffle(permute_indexes)
+        np_rng.shuffle(permute_indexes)
         orig_token = list(output_tokens)

         for src_i, tgt_i in zip(select_indexes, permute_indexes):