OpenDAS / Megatron-LM · Commits

Commit 72fb0d5c, authored Apr 06, 2020 by Neel Kant

Complete implementation of InverseClozeDataset with IndexedDataset

Parent: 2f6d2a3a
Showing 1 changed file with 27 additions and 125 deletions.

megatron/data/ict_dataset.py  (+27, -125)
 import random
+import os
+import time
 import numpy as np
+import torch
 from torch.utils.data import Dataset
 from megatron import get_tokenizer
+from megatron import print_rank_0
+from megatron import mpu
+from megatron.data import helpers


 class InverseClozeDataset(Dataset):
     """Dataset containing sentences and various 'blocks' for an inverse cloze task."""
@@ -14,17 +19,8 @@ class InverseClozeDataset(Dataset):
         self.name = name
         self.seed = seed
         self.max_seq_length = max_seq_length
         self.indexed_dataset = indexed_dataset
-        self.short_seq_prob = short_seq_prob
-        self.samples_mapping = get_samples_mapping(self.indexed_dataset,
-                                                   data_prefix,
-                                                   num_epochs,
-                                                   max_num_samples,
-                                                   self.max_seq_length,
-                                                   short_seq_prob,
-                                                   self.seed,
-                                                   self.name)

         tokenizer = get_tokenizer()
         self.vocab_id_list = list(tokenizer.inv_vocab.keys())
@@ -35,11 +31,11 @@ class InverseClozeDataset(Dataset):
         self.pad_id = tokenizer.pad

     def __len__(self):
-        return self.samples_mapping.shape[0]
+        return self.indexed_dataset.doc_idx.shape[0]

     def __getitem__(self, idx):
         # get rng state corresponding to index (allows deterministic random pair)
-        rng = random.Random(idx + 1000)
+        rng = random.Random(idx + self.seed)
         np_rng = np.random.RandomState(seed=[rng.randint(0, 2**32 - 1) for _ in range(16)])

         # get seq length. Save 2 tokens for beginning and end
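
A side note on the seeding pattern above: keying a Python random.Random on the sample index (now idx + self.seed rather than a hard-coded offset) and then drawing a 16-integer seed list for NumPy makes every __getitem__ call reproducible. A minimal sketch of that pattern follows; the make_rngs helper is invented for illustration and is not part of the commit.

import random
import numpy as np

def make_rngs(idx, seed):
    # One Python RNG keyed on the sample index plus the dataset seed, as in __getitem__ above.
    rng = random.Random(idx + seed)
    # A NumPy RNG seeded from 16 draws of the Python RNG, mirroring the np_rng line.
    np_rng = np.random.RandomState(seed=[rng.randint(0, 2**32 - 1) for _ in range(16)])
    return rng, np_rng

rng_a, np_a = make_rngs(idx=7, seed=1234)
rng_b, np_b = make_rngs(idx=7, seed=1234)
assert rng_a.random() == rng_b.random()              # same idx and seed give the same draws
assert np_a.randint(0, 100) == np_b.randint(0, 100)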
@@ -64,29 +60,23 @@ class InverseClozeDataset(Dataset):
     def get_sentence_split_doc(self, idx):
         """fetch document at index idx and split into sentences"""
-        document = self.indexed_dataset[idx]
-        if isinstance(document, dict):
-            document = document['text']
-        lines = document.split('\n')
-        return [line for line in lines if line]
-
-    def sentence_tokenize(self, sent, sentence_num=0):
-        """tokenize sentence and get token types"""
-        tokens = self.tokenizer.EncodeAsIds(sent).tokenization
-        str_type = 'str' + str(sentence_num)
-        token_types = [self.tokenizer.get_type(str_type).Id] * len(tokens)
-        return tokens, token_types
-
-    def concat_and_pad_tokens(self, tokens, token_types):
+        doc_start = self.indexed_dataset.doc_idx[idx]
+        doc_end = self.indexed_dataset.doc_idx[idx + 1]
+        doc_sentences_array = self.indexed_dataset[doc_start:doc_end]
+        doc_sentences = [list(arr) for arr in doc_sentences_array]
+        return doc_sentences
+
+    def concat_and_pad_tokens(self, tokens):
         """concat with special tokens and pad sequence to self.max_seq_length"""
         tokens = [self.cls_id] + tokens + [self.sep_id]
-        token_types = [token_types[0]] + token_types + [token_types[0]]
         assert len(tokens) <= self.max_seq_length
-        num_pad = max(0, self.max_seq_length - len(tokens))
+        num_pad = self.max_seq_length - len(tokens)
         pad_mask = [0] * len(tokens) + [1] * num_pad
         tokens += [self.pad_id] * num_pad
-        token_types += [token_types[0]] * num_pad
+        token_types = [0] * self.max_seq_length
         return tokens, token_types, pad_mask

     def get_input_and_context(self, target_seq_length, rng, np_rng):
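
A self-contained sketch (assumptions, not code from this commit) of what the two rewritten helpers above do: a document is recovered by slicing the indexed dataset between consecutive doc_idx offsets, and concat_and_pad_tokens wraps a token list in CLS/SEP, pads it to max_seq_length, and marks the padded positions. The FakeIndexedDataset class and the literal CLS/SEP/pad ids are invented stand-ins.

import numpy as np

class FakeIndexedDataset:
    """Toy stand-in for an IndexedDataset: each item is one tokenized
    sentence, and doc_idx[i] is the index of document i's first sentence."""
    def __init__(self, sentences, doc_idx):
        self._sentences = [np.array(s, dtype=np.int64) for s in sentences]
        self.doc_idx = np.array(doc_idx, dtype=np.int64)

    def __getitem__(self, idx):
        # Works for both a single index and a slice of sentence indices.
        return self._sentences[idx]

def get_sentence_split_doc(ds, idx):
    # Mirrors the new implementation: slice between consecutive doc_idx offsets.
    doc_start, doc_end = ds.doc_idx[idx], ds.doc_idx[idx + 1]
    return [list(arr) for arr in ds[doc_start:doc_end]]

def concat_and_pad_tokens(tokens, max_seq_length, cls_id=101, sep_id=102, pad_id=0):
    # Mirrors the new implementation: add CLS/SEP, pad, and mark pad positions with 1.
    tokens = [cls_id] + tokens + [sep_id]
    assert len(tokens) <= max_seq_length
    num_pad = max_seq_length - len(tokens)
    pad_mask = [0] * len(tokens) + [1] * num_pad
    tokens += [pad_id] * num_pad
    token_types = [0] * max_seq_length
    return tokens, token_types, pad_mask

ds = FakeIndexedDataset(
    sentences=[[5, 6], [7, 8, 9], [10], [11, 12], [13, 14, 15]],
    doc_idx=[0, 3, 5])                       # doc 0 = sentences 0-2, doc 1 = sentences 3-4

doc = get_sentence_split_doc(ds, 0)          # [[5, 6], [7, 8, 9], [10]]
tokens, types, mask = concat_and_pad_tokens(doc[1], max_seq_length=8)
print(tokens)                                # [101, 7, 8, 9, 102, 0, 0, 0]
print(mask)                                  # [0, 0, 0, 0, 0, 1, 1, 1]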
@@ -102,26 +92,22 @@ class InverseClozeDataset(Dataset):
             if not doc:
                 doc = None

-            # set up and tokenize the entire selected document
             num_sentences = len(doc)
             padless_max_len = self.max_seq_length - 2

             # select a random sentence from the document as input
             # TODO: consider adding multiple input sentences.
             input_sentence_idx = rng.randint(0, num_sentences - 1)
-            tokens, token_types = self.sentence_tokenize(doc[input_sentence_idx], 0)
-            input_tokens, input_token_types = tokens[:target_seq_length], token_types[:target_seq_length]
+            input_tokens = doc[input_sentence_idx][:target_seq_length]
             if not len(input_tokens) > 0:
                 continue

-            context_tokens, context_token_types = [], []
+            context_tokens = []
             # 10% of the time, the input sentence is left in the context.
             # The other 90% of the time, keep it out.
             if rng.random() < 0.1:
                 context_tokens = input_tokens.copy()
-                context_token_types = input_token_types.copy()

-            # parameters for examining sentences to add to the context
             view_preceding = True
             view_radius = 1
             while len(context_tokens) < padless_max_len:
@@ -129,15 +115,13 @@ class InverseClozeDataset(Dataset):
                 if view_preceding:
                     examine_idx = input_sentence_idx - view_radius
                     if examine_idx >= 0:
-                        new_tokens, new_token_types = self.sentence_tokenize(doc[examine_idx], 0)
+                        new_tokens = doc[examine_idx]
                         context_tokens = new_tokens + context_tokens
-                        context_token_types = new_token_types + context_token_types
                 else:
                     examine_idx = input_sentence_idx + view_radius
                     if examine_idx < num_sentences:
-                        new_tokens, new_token_types = self.sentence_tokenize(doc[examine_idx], 0)
+                        new_tokens = doc[examine_idx]
                         context_tokens += new_tokens
-                        context_token_types += new_token_types
                     view_radius += 1
                 view_preceding = not view_preceding
                 if view_radius > num_sentences:
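
For reference, a hedged, standalone sketch of the block-building loop above: sentences are appended alternately before and after the chosen input sentence (view_preceding / view_radius) until the context reaches its length budget, and roughly 10% of the time the input sentence itself is kept in the context. The toy document and the build_context helper are illustration only, not the dataset's API.

import random

def build_context(doc, input_sentence_idx, padless_max_len, rng):
    """Grow a context block around doc[input_sentence_idx], loosely mirroring get_input_and_context."""
    num_sentences = len(doc)
    input_tokens = doc[input_sentence_idx]

    context_tokens = []
    # 10% of the time, the input sentence is left in the context.
    if rng.random() < 0.1:
        context_tokens = input_tokens.copy()

    view_preceding = True
    view_radius = 1
    while len(context_tokens) < padless_max_len:
        if view_preceding:
            examine_idx = input_sentence_idx - view_radius
            if examine_idx >= 0:
                context_tokens = doc[examine_idx] + context_tokens
        else:
            examine_idx = input_sentence_idx + view_radius
            if examine_idx < num_sentences:
                context_tokens += doc[examine_idx]
            view_radius += 1
        view_preceding = not view_preceding
        if view_radius > num_sentences:
            break
    return input_tokens, context_tokens[:padless_max_len]

doc = [[1, 2], [3, 4, 5], [6], [7, 8], [9]]       # five tokenized sentences
rng = random.Random(0)
inp, ctx = build_context(doc, input_sentence_idx=2, padless_max_len=6, rng=rng)
print(inp)   # [6]
print(ctx)   # surrounding sentences, truncated to 6 tokens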
@@ -145,15 +129,12 @@ class InverseClozeDataset(Dataset):
             # assemble the tokens and token types of the context
             context_tokens = context_tokens[:padless_max_len]
-            context_token_types = context_token_types[:padless_max_len]
             if not len(context_tokens) > 0:
                 continue

             # concatenate 'CLS' and 'SEP' tokens and add extra token types
-            input_tokens, input_token_types, input_pad_mask = self.concat_and_pad_tokens(input_tokens,
-                                                                                          input_token_types)
-            context_tokens, context_token_types, context_pad_mask = self.concat_and_pad_tokens(context_tokens,
-                                                                                                context_token_types)
+            input_tokens, input_token_types, input_pad_mask = self.concat_and_pad_tokens(input_tokens)
+            context_tokens, context_token_types, context_pad_mask = self.concat_and_pad_tokens(context_tokens)

             return (input_tokens, input_token_types, input_pad_mask), \
                    (context_tokens, context_token_types, context_pad_mask)
@@ -161,82 +142,3 @@ class InverseClozeDataset(Dataset):
         raise RuntimeError("Could not get a valid data point from InverseClozeDataset")
-
-
-def get_samples_mapping(indexed_dataset, data_prefix, num_epochs, max_num_samples,
-                        max_seq_length, short_seq_prob, seed, name):
-    if not num_epochs:
-        if not max_num_samples:
-            raise ValueError("Need to specify either max_num_samples "
-                             "or num_epochs")
-        num_epochs = np.iinfo(np.int32).max - 1
-    if not max_num_samples:
-        max_num_samples = np.iinfo(np.int64).max - 1
-
-    # Filename of the index mapping
-    indexmap_filename = data_prefix
-    indexmap_filename += '_{}_indexmap'.format(name)
-    if num_epochs != (np.iinfo(np.int32).max - 1):
-        indexmap_filename += '_{}ep'.format(num_epochs)
-    if max_num_samples != (np.iinfo(np.int64).max - 1):
-        indexmap_filename += '_{}mns'.format(max_num_samples)
-    indexmap_filename += '_{}msl'.format(max_seq_length)
-    indexmap_filename += '_{:0.2f}ssp'.format(short_seq_prob)
-    indexmap_filename += '_{}s'.format(seed)
-    indexmap_filename += '.npy'
-
-    # Build the indexed mapping if not exist.
-    if torch.distributed.get_rank() == 0 and \
-       not os.path.isfile(indexmap_filename):
-        print(' > WARNING: could not find index map file {}, building '
-              'the indices on rank 0 ...'.format(indexmap_filename))
-
-        # Make sure the types match the helpers input types.
-        assert indexed_dataset.doc_idx.dtype == np.int64
-        assert indexed_dataset.sizes.dtype == np.int32
-
-        # Build samples mapping
-        verbose = torch.distributed.get_rank() == 0
-        start_time = time.time()
-        print_rank_0(' > building sapmles index mapping for {} ...'.format(name))
-        samples_mapping = helpers.build_mapping(
-            indexed_dataset.doc_idx,
-            indexed_dataset.sizes,
-            num_epochs,
-            max_num_samples,
-            max_seq_length - 3,  # account for added tokens
-            short_seq_prob,
-            seed,
-            verbose)
-        print_rank_0(' > done building sapmles index maping')
-        np.save(indexmap_filename, samples_mapping, allow_pickle=True)
-        print_rank_0(' > saved the index mapping in {}'.format(indexmap_filename))
-        # Make sure all the ranks have built the mapping
-        print_rank_0(' > elasped time to build and save samples mapping '
-                     '(seconds): {:4f}'.format(time.time() - start_time))
-    # This should be a barrier but nccl barrier assumes
-    # device_index=rank which is not the case for model
-    # parallel case
-    counts = torch.cuda.LongTensor([1])
-    torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group())
-    assert counts[0].item() == torch.distributed.get_world_size(
-        group=mpu.get_data_parallel_group())
-
-    # Load indexed dataset.
-    print_rank_0(' > loading indexed mapping from {}'.format(indexmap_filename))
-    start_time = time.time()
-    samples_mapping = np.load(indexmap_filename, allow_pickle=True)
-    print_rank_0('    loaded indexed file in {:3.3f} seconds'.format(
-        time.time() - start_time))
-    print_rank_0('    total number of samples: {}'.format(
-        samples_mapping.shape[0]))
-
-    return samples_mapping
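
The removed get_samples_mapping helper above followed the cache-on-disk pattern used by Megatron's BERT data pipeline: build a samples index once, save it as a descriptively named .npy file derived from the data prefix, then load it back. A minimal sketch of that naming-and-caching pattern, with an invented build_mapping_stub standing in for the compiled helpers.build_mapping extension:

import os
import numpy as np

def build_mapping_stub(num_samples):
    # Stand-in for helpers.build_mapping: the real helper walks doc_idx/sizes
    # and emits one index triple per training sample.
    return np.arange(num_samples * 3, dtype=np.int64).reshape(num_samples, 3)

def get_samples_mapping_cached(data_prefix, name, max_seq_length, seed, num_samples=4):
    # Descriptive cache filename, mirroring the removed code's naming scheme.
    indexmap_filename = data_prefix
    indexmap_filename += '_{}_indexmap'.format(name)
    indexmap_filename += '_{}msl'.format(max_seq_length)
    indexmap_filename += '_{}s'.format(seed)
    indexmap_filename += '.npy'

    if not os.path.isfile(indexmap_filename):
        samples_mapping = build_mapping_stub(num_samples)
        np.save(indexmap_filename, samples_mapping, allow_pickle=True)

    return np.load(indexmap_filename, allow_pickle=True)

mapping = get_samples_mapping_cached('demo_prefix', 'train', max_seq_length=128, seed=1234)
print(mapping.shape)   # (4, 3)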