OpenDAS / Megatron-LM
Commit 898fcb94, authored Apr 08, 2020 by Mohammad (parent: c3691006)

    moved current gpt2 dataloader to old

Showing 5 changed files with 716 additions and 551 deletions:

    megatron/data/gpt2_dataset.py       +370  -110
    megatron/data/new_gpt2_dataset.py     +0  -396
    megatron/data/old_gpt2_dataset.py   +136    -0
    pretrain_gpt2.py                     +42   -45
    pretrain_gpt2_old.py                +168    -0
megatron/data/gpt2_dataset.py    View file @ 898fcb94

@@ -13,124 +13,384 @@
# See the License for the specific language governing permissions and
# limitations under the License.

This hunk replaces everything below the license header: the old shard-based
GPT2Dataset is removed (it reappears verbatim as megatron/data/old_gpt2_dataset.py
later in this commit) and the new indexed-dataset implementation is added.
After this commit the module body reads:

"""GPT2 style dataset."""

import os
import time

import numpy as np
import torch

from megatron import print_rank_0
from megatron import mpu
from megatron.data.bert_dataset import get_train_valid_test_split_
from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset


def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
                                    train_valid_test_num_samples,
                                    seq_length, seed, skip_warmup):
    """Build train, valid, and test datasets."""

    # Indexed dataset.
    indexed_dataset = get_indexed_dataset_(data_prefix, data_impl, skip_warmup)

    total_num_of_documents = indexed_dataset.sizes.shape[0]
    splits = get_train_valid_test_split_(splits_string, total_num_of_documents)

    # Print stats about the splits.
    print_rank_0(' > dataset split:')

    def print_split_stats(name, index):
        print_rank_0('    {}:'.format(name))
        print_rank_0('     document indices in [{}, {}) total of {} '
                     'documents'.format(splits[index], splits[index + 1],
                                        splits[index + 1] - splits[index]))
    print_split_stats('train', 0)
    print_split_stats('validation', 1)
    print_split_stats('test', 2)

    def build_dataset(index, name):
        dataset = None
        if splits[index + 1] > splits[index]:
            documents = np.arange(start=splits[index], stop=splits[index + 1],
                                  step=1, dtype=np.int32)
            dataset = GPT2Dataset(name, data_prefix,
                                  documents, indexed_dataset,
                                  train_valid_test_num_samples[index],
                                  seq_length, seed)
        return dataset

    train_dataset = build_dataset(0, 'train')
    valid_dataset = build_dataset(1, 'valid')
    test_dataset = build_dataset(2, 'test')

    return (train_dataset, valid_dataset, test_dataset)


def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):
    """Build indexed dataset."""
    print_rank_0(' > building dataset index ...')

    start_time = time.time()
    indexed_dataset = make_indexed_dataset(data_prefix,
                                           data_impl,
                                           skip_warmup)
    print_rank_0(' > finished creating indexed dataset in {:4f} '
                 'seconds'.format(time.time() - start_time))
    print_rank_0('    number of documents: {}'.format(
        indexed_dataset.sizes.shape[0]))

    return indexed_dataset


class GPT2Dataset(torch.utils.data.Dataset):

    def __init__(self, name, data_prefix, documents, indexed_dataset,
                 num_samples, seq_length, seed):

        self.name = name
        self.indexed_dataset = indexed_dataset

        # Checks
        assert np.min(documents) >= 0
        assert np.max(documents) < indexed_dataset.sizes.shape[0]

        # Build index mappings.
        self.doc_idx, self.sample_idx, self.shuffle_idx = _build_index_mappings(
            self.name, data_prefix, documents, self.indexed_dataset.sizes,
            num_samples, seq_length, seed)

    def __len__(self):
        return self.sample_idx.shape[0]

    def __getitem__(self, idx):
        # Get the shuffled index.
        idx = self.shuffle_idx[idx]
        # Start and end documents and offsets.
        doc_index_f = self.sample_idx[idx][0]
        doc_index_l = self.sample_idx[idx + 1][0]
        offset_f = self.sample_idx[idx][1]
        offset_l = self.sample_idx[idx + 1][1]
        # If we are within the same document, just extract the chunk.
        if doc_index_f == doc_index_l:
            sample = self.indexed_dataset.get(self.doc_idx[doc_index_f],
                                              offset=offset_f,
                                              length=offset_l - offset_f + 1)
        else:
            # Otherwise, get the rest of the initial document.
            sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f],
                                                    offset=offset_f)]
            # Loop over all in between documents and add the entire document.
            for i in range(doc_index_f + 1, doc_index_l):
                sample_list.append(self.indexed_dataset.get(self.doc_idx[i]))
            # And finally add the relevant portion of last document.
            sample_list.append(self.indexed_dataset.get(
                self.doc_idx[doc_index_l],
                length=offset_l + 1))
            sample = np.concatenate(sample_list)

        return {'text': np.array(sample, dtype=np.int64)}


def _build_index_mappings(name, data_prefix, documents, sizes,
                          num_samples, seq_length, seed):
    """doc-idx, sample-idx, and shuffle-idx."""
    # Number of tokens in each epoch and number of required epochs.
    tokens_per_epoch = _num_tokens(documents, sizes)
    num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples)
    # rng state
    np_rng = np.random.RandomState(seed=seed)

    # Filename of the index mappings.
    _filename = data_prefix
    _filename += '_{}_indexmap'.format(name)
    _filename += '_{}ns'.format(num_samples)
    _filename += '_{}sl'.format(seq_length)
    _filename += '_{}s'.format(seed)
    doc_idx_filename = _filename + '_doc_idx.npy'
    sample_idx_filename = _filename + '_sample_idx.npy'
    shuffle_idx_filename = _filename + '_shuffle_idx.npy'

    # Build the indexed mapping if not exist.
    if torch.distributed.get_rank() == 0:
        if (not os.path.isfile(doc_idx_filename)) or \
           (not os.path.isfile(sample_idx_filename)) or \
           (not os.path.isfile(shuffle_idx_filename)):
            print_rank_0(' > WARNING: could not find index map files, building '
                         'the indices on rank 0 ...')
            # doc-idx.
            start_time = time.time()
            doc_idx = _build_doc_idx(documents, num_epochs, np_rng)
            np.save(doc_idx_filename, doc_idx, allow_pickle=True)
            print_rank_0(' > elapsed time to build and save doc-idx mapping '
                         '(seconds): {:4f}'.format(time.time() - start_time))
            # sample-idx.
            start_time = time.time()
            # Use C++ implementation for speed.
            from megatron.data import helpers
            assert doc_idx.dtype == np.int32
            assert sizes.dtype == np.int32
            sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length,
                                                  num_epochs, tokens_per_epoch)
            #sample_idx = _build_sample_idx(sizes, doc_idx, seq_length,
            #                               num_epochs, tokens_per_epoch)
            np.save(sample_idx_filename, sample_idx, allow_pickle=True)
            print_rank_0(' > elapsed time to build and save sample-idx mapping '
                         '(seconds): {:4f}'.format(time.time() - start_time))
            # shuffle-idx.
            start_time = time.time()
            shuffle_idx = _build_shuffle_idx(sample_idx.shape[0], np_rng)
            np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True)
            print_rank_0(' > elapsed time to build and save shuffle-idx mapping'
                         ' (seconds): {:4f}'.format(time.time() - start_time))

    # This should be a barrier but nccl barrier assumes
    # device_index=rank which is not the case for model
    # parallel case
    counts = torch.cuda.LongTensor([1])
    torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group())
    assert counts[0].item() == torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())

    # Load mappings.
    start_time = time.time()
    print_rank_0(' > loading doc-idx mapping from {}'.format(
        doc_idx_filename))
    doc_idx = np.load(doc_idx_filename, allow_pickle=True)
    print_rank_0(' > loading sample-idx mapping from {}'.format(
        sample_idx_filename))
    sample_idx = np.load(sample_idx_filename, allow_pickle=True)
    print_rank_0(' > loading shuffle-idx mapping from {}'.format(
        shuffle_idx_filename))
    shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True)
    print_rank_0('    loaded indexed file in {:3.3f} seconds'.format(
        time.time() - start_time))
    print_rank_0('    total number of samples: {}'.format(
        sample_idx.shape[0]))
    print_rank_0('    total number of epochs: {}'.format(num_epochs))

    return doc_idx, sample_idx, shuffle_idx


def _num_tokens(documents, sizes):
    """Total number of tokens in the dataset."""
    return np.sum(sizes[documents])


def _num_epochs(tokens_per_epoch, seq_length, num_samples):
    """Based on number of samples and sequence length, calculate how many
    epochs will be needed."""
    num_epochs = 0
    total_tokens = 0
    while True:
        num_epochs += 1
        total_tokens += tokens_per_epoch
        # -1 is because we need to retrieve seq_length + 1 token each time
        # but the last token will overlap with the first token of the next
        # sample except for the last sample.
        if ((total_tokens - 1) // seq_length) >= num_samples:
            return num_epochs


def _build_doc_idx(documents, num_epochs, np_rng):
    """Build an array with length = number-of-epochs * number-of-documents.
    Each index is mapped to a corresponding document."""
    doc_idx = np.mgrid[0:num_epochs, 0:len(documents)][1]
    doc_idx[:] = documents
    doc_idx = doc_idx.reshape(-1)
    doc_idx = doc_idx.astype(np.int32)
    np_rng.shuffle(doc_idx)
    return doc_idx


def _build_sample_idx(sizes, doc_idx, seq_length,
                      num_epochs, tokens_per_epoch):
    """Sample index mapping is a 2D array with sizes
    [number-of-samples + 1, 2] where [..., 0] contains
    the index into `doc_idx` and [..., 1] is the
    starting offset in that document."""

    # Total number of samples. For -1 see comments in `_num_epochs`.
    num_samples = (num_epochs * tokens_per_epoch - 1) // seq_length
    sample_idx = np.zeros([num_samples + 1, 2], dtype=np.int32)

    # Index into sample_idx.
    sample_index = 0
    # Index into doc_idx.
    doc_idx_index = 0
    # Beginning offset for each document.
    doc_offset = 0
    # Start with first document and no offset.
    sample_idx[sample_index][0] = doc_idx_index
    sample_idx[sample_index][1] = doc_offset
    sample_index += 1
    while sample_index <= num_samples:
        # Start with a fresh sequence.
        remaining_seq_length = seq_length + 1
        while remaining_seq_length != 0:
            # Get the document length.
            doc_id = doc_idx[doc_idx_index]
            doc_length = sizes[doc_id] - doc_offset
            # And add it to the current sequence.
            remaining_seq_length -= doc_length
            # If we have more than a full sequence, adjust offset and set
            # remaining length to zero so we return from the while loop.
            # Note that -1 here is for the same reason we have -1 in
            # `_num_epochs` calculations.
            if remaining_seq_length <= 0:
                doc_offset += (remaining_seq_length + doc_length - 1)
                remaining_seq_length = 0
            else:
                # Otherwise, start from the beginning of the next document.
                doc_idx_index += 1
                doc_offset = 0
        # Record the sequence.
        sample_idx[sample_index][0] = doc_idx_index
        sample_idx[sample_index][1] = doc_offset
        sample_index += 1

    return sample_idx


def _build_shuffle_idx(size, np_rng):
    """Build the range [0, size) and shuffle."""
    dtype_ = np.uint32
    if size >= (np.iinfo(np.uint32).max - 1):
        dtype_ = np.int64
    shuffle_idx = np.arange(start=0, stop=size, step=1, dtype=dtype_)
    np_rng.shuffle(shuffle_idx)
    return shuffle_idx
'''
class IndexedDataset:

    def __init__(self, num_docs, min_doc_length, max_doc_length, seq_length):
        self.seq_length = seq_length
        assert min_doc_length > 0
        self.tokens = []
        self.sizes = np.zeros(num_docs, dtype=np.int32)
        for i in range(num_docs):
            size = np.random.randint(low=min_doc_length, high=max_doc_length,
                                     size=1, dtype=np.uint32)[0]
            tokens_ = np.random.randint(low=1, high=60000,
                                        size=size, dtype=np.uint32)
            tokens_[-1] = 0
            self.sizes[i] = size
            self.tokens.append(tokens_)
        self.tokens_flat = None

    def get(self, doc_idx, offset=None, length=None):
        if length is None:
            if offset is None:
                return self.tokens[doc_idx]
            else:
                return self.tokens[doc_idx][offset:]
        if offset is None:
            return self.tokens[doc_idx][0:length]
        return self.tokens[doc_idx][offset:(offset + length)]

    def get_sample(self, index):
        start = index * self.seq_length
        end = start + self.seq_length + 1
        return self.tokens_flat[start:end]

    def build_tokens_flat(self, doc_idx):
        self.tokens_flat = np.concatenate([self.tokens[i] for i in doc_idx])


def test(seed, data_prefix, seq_length, num_samples,
         num_docs, min_doc_length, max_doc_length):

    print('testing for seed: {}, seq-length: {}, num-samples: {}, '
          'num-docs: {}, min-doc-length: {}, max-doc-length: {}'.format(
              seed, seq_length, num_samples,
              num_docs, min_doc_length, max_doc_length))

    np.random.seed(seed)
    indexed_dataset = IndexedDataset(num_docs, min_doc_length,
                                     max_doc_length, seq_length)
    indices = np.random.randint(indexed_dataset.sizes.shape[0] - 2, size=2)
    documents = np.arange(np.min(indices), np.max(indices) + 1)
    dataset = GPT2Dataset('gpt2', data_prefix, documents, indexed_dataset,
                          num_samples, seq_length, seed)
    print(' > number of epochs:', dataset.num_epochs)
    indexed_dataset.build_tokens_flat(dataset.doc_idx)

    for idx in range(num_samples):
        a = dataset[idx]
        b = indexed_dataset.get_sample(idx)
        assert np.sum(a - b) == 0
    print('passed')


if __name__ == '__main__':

    print('gpt2 dataset ...')
    import random

    data_prefix = 'junk/'
    for seed in range(1234, 1245):
        random.seed(seed)
        num_docs = random.randint(1, 999)
        min_doc_length = random.randint(1, 99)
        max_doc_length = random.randint(100, 9999)
        num_samples = random.randint(num_docs, 100 * num_docs)
        seq_length = random.randint(min_doc_length, max_doc_length)
        test(seed, data_prefix, seq_length, num_samples,
             num_docs, min_doc_length, max_doc_length)
'''
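For readers of this diff, the following is a minimal, self-contained sketch (not part of the commit) of the sample-index packing that helpers.build_sample_idx and the Python fallback _build_sample_idx above perform. The toy sizes, document order, and seq_length are made up for illustration; only plain NumPy is used.

import numpy as np

def build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch):
    # Same two-pointer packing as the fallback in gpt2_dataset.py: walk the
    # shuffled document order and record where each fixed-length sample starts.
    num_samples = (num_epochs * tokens_per_epoch - 1) // seq_length
    sample_idx = np.zeros([num_samples + 1, 2], dtype=np.int32)
    sample_index, doc_idx_index, doc_offset = 0, 0, 0
    sample_idx[0] = [doc_idx_index, doc_offset]
    sample_index += 1
    while sample_index <= num_samples:
        remaining = seq_length + 1          # one extra token for the label shift
        while remaining != 0:
            doc_length = sizes[doc_idx[doc_idx_index]] - doc_offset
            remaining -= doc_length
            if remaining <= 0:
                doc_offset += remaining + doc_length - 1
                remaining = 0
            else:
                doc_idx_index += 1
                doc_offset = 0
        sample_idx[sample_index] = [doc_idx_index, doc_offset]
        sample_index += 1
    return sample_idx

sizes = np.array([5, 3, 7, 4], dtype=np.int32)    # tokens per document (toy)
doc_idx = np.array([2, 0, 3, 1], dtype=np.int32)  # one shuffled epoch of documents
seq_length = 4
tokens_per_epoch = int(sizes.sum())               # 19 tokens in this toy epoch
print(build_sample_idx(sizes, doc_idx, seq_length, num_epochs=1,
                       tokens_per_epoch=tokens_per_epoch))
# Each row i is (index into doc_idx, offset) where sample i starts; consecutive
# rows delimit the seq_length + 1 tokens that __getitem__ concatenates.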
megatron/data/new_gpt2_dataset.py    deleted 100644 → 0    View file @ c3691006

This file is removed by the commit. Its deleted contents (license header,
"""GPT2 style dataset.""" module, and the commented-out test harness) are
identical to the new megatron/data/gpt2_dataset.py shown above, into which
it was moved.
megatron/data/old_gpt2_dataset.py    new file 0 → 100644    View file @ 898fcb94
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""GPT2 dataset."""

import json
import os

import numpy as np
import torch
from torch.utils.data import Dataset


class GPT2Dataset(Dataset):

    def __init__(self, data_path, sizes_filename, seq_length,
                 initial_seed, max_epochs=100):

        # Input parameters.
        self.data_path = data_path
        self.sizes_filename = sizes_filename
        self.seq_length = seq_length
        self.initial_seed = initial_seed
        self.max_epochs = max_epochs

        # Shard stuff.
        # Dictionary from shard name to its size (number of elements).
        self.master_shard_size_dict = None
        # Dictionary from shard name to modified size so it is
        # divisible by self.seq_length.
        self.shard_size_dict = None
        # Long array (self.max_epochs * num-shards) populated
        # randomly with shard names.
        self.shards_name = None
        # Start index of the data for a shard.
        self.shards_start_index = None
        self.build_shard_mappings_()
        self.data_length = self.shards_start_index[-1]

        # Data.
        self.shards_data = [None] * self.shards_name.size
        self.shards_sample_index = [None] * self.shards_name.size

    def __len__(self):
        return self.data_length

    def __getitem__(self, idx):
        # Find which shard we need.
        shard_index = np.searchsorted(self.shards_start_index, idx,
                                      side='right') - 1
        # Data index in the shard.
        data_idx = idx - self.shards_start_index[shard_index]
        # Load the shard if it is not in memory.
        if self.shards_data[shard_index] is None:
            print('global rank {} is building data for shard index {} ...'.
                  format(torch.distributed.get_rank(), shard_index))
            self.build_dataset_(shard_index)
        #assert self.shards_data[shard_index] is not None
        # Start index.
        start_index = self.shards_sample_index[shard_index][data_idx]
        # Add one for label shift.
        end_index = start_index + self.seq_length + 1
        data = self.shards_data[shard_index][start_index:end_index]
        return {'text': np.array(data, dtype=np.int64)}

    def build_dataset_(self, shard_index):
        # Garbage collect so we don't use a lot of memory.
        # Leave the last one in case other threads have not caught up yet.
        #for i in range(shard_index - 1):
        for i in range(shard_index):
            self.shards_data[i] = None
            self.shards_sample_index[i] = None
        # Read the shard.
        filename = os.path.join(self.data_path, self.shards_name[shard_index])
        print('loading {}'.format(filename))
        data = np.load(filename, allow_pickle=True)
        # Shuffle the data.
        rng = np.random.RandomState(self.initial_seed + shard_index)
        rng.shuffle(data)
        # Flatten.
        data = np.hstack(data)
        size = (data.shape[0] - 1) // self.seq_length
        last_index = size * self.seq_length + 1
        data = data[0:last_index]
        self.shards_data[shard_index] = data
        indices = np.arange(size) * self.seq_length
        rng.shuffle(indices)
        self.shards_sample_index[shard_index] = indices

    def build_shard_mappings_(self):
        # Load the sizes file.
        sizes_filename = os.path.join(self.data_path, self.sizes_filename)
        if torch.distributed.get_rank() == 0:
            print(' > loading sizes from {}'.format(sizes_filename))
        with open(sizes_filename, 'r') as f:
            self.master_shard_size_dict = json.load(f)
        if torch.distributed.get_rank() == 0:
            print('    found {} shards'.format(
                len(self.master_shard_size_dict)))
        # Adjust sizes to be a multiple of seq_length.
        self.shard_size_dict = self.master_shard_size_dict.copy()
        total_samples = 0
        for shard in self.shard_size_dict:
            size = self.shard_size_dict[shard]
            size = ((size - 1) // self.seq_length) * self.seq_length
            total_samples += size // self.seq_length
            self.shard_size_dict[shard] = size
        if torch.distributed.get_rank() == 0:
            print('    found {} samples in the dataset'.format(total_samples))
        # Build a list of shards.
        shards_ = np.sort(np.array(list(self.shard_size_dict.keys())))
        rng = np.random.RandomState(self.initial_seed)
        self.shards_name = np.copy(shards_)
        rng.shuffle(self.shards_name)
        for i in range(1, self.max_epochs):
            shards_c = np.copy(shards_)
            rng.shuffle(shards_c)
            self.shards_name = np.append(self.shards_name, shards_c)
        # Build the global indexing.
        self.shards_start_index = np.zeros(self.shards_name.size, dtype=np.int)
        self.shards_start_index[0] = 0
        for i in range(1, self.shards_name.size):
            shard = str(self.shards_name[i - 1])
            size = self.shard_size_dict[shard]
            self.shards_start_index[i] = self.shards_start_index[i - 1] + \
                                         size // self.seq_length
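For context, a hedged sketch (not part of the commit) of the on-disk layout this shard-based loader appears to expect: a split directory holding .npy shards, each an object array of per-document token arrays, plus a JSON sizes file mapping shard filename to its total token count. All file names here ('toy_gpt2_data/train', 'shard_000.npy', 'sizes.json'), the vocabulary size, and the dtype are made-up assumptions for illustration.

import json
import os
import numpy as np

def write_toy_shards(data_path, num_shards=2, docs_per_shard=3, vocab=100):
    # Write a toy directory in the assumed layout consumed by the old GPT2Dataset.
    os.makedirs(data_path, exist_ok=True)
    sizes = {}
    rng = np.random.RandomState(0)
    for s in range(num_shards):
        docs = [rng.randint(1, vocab, size=rng.randint(5, 20)).astype(np.uint16)
                for _ in range(docs_per_shard)]
        arr = np.empty(len(docs), dtype=object)   # ragged array of documents
        arr[:] = docs
        name = 'shard_{:03d}.npy'.format(s)
        np.save(os.path.join(data_path, name), arr, allow_pickle=True)
        # Token count per shard, as read back by build_shard_mappings_().
        sizes[name] = int(sum(len(d) for d in docs))
    with open(os.path.join(data_path, 'sizes.json'), 'w') as f:
        json.dump(sizes, f)

write_toy_shards('toy_gpt2_data/train')
# GPT2Dataset('toy_gpt2_data/train', 'sizes.json', seq_length=8, initial_seed=1234)
# would then expose fixed-length samples across these shards; note the class
# calls torch.distributed.get_rank(), so it assumes an initialized process group.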
pretrain_gpt2.py    View file @ 898fcb94

@@ -24,7 +24,7 @@ from megatron import get_timers
 from megatron import get_tokenizer
 from megatron import mpu
 from megatron import print_rank_0
-from megatron.data.gpt2_dataset import GPT2Dataset
+from megatron.data.gpt2_dataset import build_train_valid_test_datasets
 from megatron.model import GPT2Model
 from megatron.training import pretrain
 from megatron.utils import get_ltor_masks_and_position_ids

@@ -98,56 +98,53 @@ def forward_step(data_iterator, model):

    return loss, {'lm loss': reduced_loss[0]}

Removed: the make_gpt2_dataloaders() helper and the old body of
get_train_val_test_data(), which built the shard-based GPT2Dataset loaders per
split directory; both are preserved verbatim in pretrain_gpt2_old.py, added
below. The local name val_data is also renamed to valid_data.

Added: get_train_val_test_data() now sizes the three datasets from the training
schedule and builds them with build_train_valid_test_datasets():

def get_train_val_test_data():
    """Load the data on rank zero and broadcast number of tokens to all GPUs."""
    args = get_args()

    (train_data, valid_data, test_data) = (None, None, None)

    # Data loader only on rank 0 of each model parallel group.
    if mpu.get_model_parallel_rank() == 0:
        print_rank_0('> building train, validation, and test datasets '
                     'for GPT2 ...')
        data_parallel_size = mpu.get_data_parallel_world_size()
        data_parallel_rank = mpu.get_data_parallel_rank()
        global_batch_size = args.batch_size * data_parallel_size
        # Number of train/valid/test samples.
        train_iters = args.train_iters
        eval_iters = (train_iters // args.eval_interval + 1) * args.eval_iters
        test_iters = args.eval_iters
        train_val_test_num_samples = [train_iters * global_batch_size,
                                      eval_iters * global_batch_size,
                                      test_iters * global_batch_size]
        print_rank_0(' > datasets target sizes (minimum size):')
        print_rank_0('    train:      {}'.format(train_val_test_num_samples[0]))
        print_rank_0('    validation: {}'.format(train_val_test_num_samples[1]))
        print_rank_0('    test:       {}'.format(train_val_test_num_samples[2]))

        train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
            data_prefix=args.data_path,
            data_impl=args.data_impl,
            splits_string=args.split,
            train_valid_test_num_samples=train_val_test_num_samples,
            seq_length=args.seq_length,
            seed=args.seed,
            skip_warmup=(not args.mmap_warmup))
        print_rank_0("> finished creating GPT2 datasets ...")

        train_data = make_data_loader(train_ds)
        valid_data = make_data_loader(valid_ds)
        test_data = make_data_loader(test_ds)

        do_train = train_data is not None and args.train_iters > 0
        do_valid = valid_data is not None and args.eval_iters > 0
        do_test = test_data is not None and args.eval_iters > 0
        # Need to broadcast num_tokens and num_type_tokens.
        flags = torch.cuda.LongTensor(
            [int(do_train), int(do_valid), int(do_test)])
    else:
        flags = torch.cuda.LongTensor([0, 0, 0])

@@ -159,7 +156,7 @@ def get_train_val_test_data():
     args.do_valid = flags[1].item()
     args.do_test = flags[2].item()

-    return train_data, val_data, test_data
+    return train_data, valid_data, test_data


 if __name__ == "__main__":
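A hedged arithmetic sketch (all values are made up) of how the new get_train_val_test_data() turns the training schedule into the target sample counts it passes to build_train_valid_test_datasets():

batch_size = 8               # assumed args.batch_size
data_parallel_size = 4       # assumed data-parallel world size
train_iters = 500000         # assumed args.train_iters
eval_interval = 1000         # assumed args.eval_interval
eval_iters_arg = 100         # assumed args.eval_iters

global_batch_size = batch_size * data_parallel_size                # 32
eval_iters = (train_iters // eval_interval + 1) * eval_iters_arg   # 50100
test_iters = eval_iters_arg                                        # 100
train_val_test_num_samples = [train_iters * global_batch_size,     # 16,000,000
                              eval_iters * global_batch_size,      # 1,603,200
                              test_iters * global_batch_size]      # 3,200
print(train_val_test_num_samples)
# The dataset then chooses num_epochs so that at least this many
# seq_length-token samples exist for each split.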
pretrain_gpt2_old.py    new file 0 → 100644    View file @ 898fcb94
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Pretrain GPT2"""

import os

import torch

from megatron import get_args
from megatron import get_timers
from megatron import get_tokenizer
from megatron import mpu
from megatron import print_rank_0
from megatron.data.old_gpt2_dataset import GPT2Dataset
from megatron.model import GPT2Model
from megatron.training import pretrain
from megatron.utils import get_ltor_masks_and_position_ids
from megatron.utils import make_data_loader
from megatron.utils import reduce_losses


def model_provider():
    """Build the model."""
    args = get_args()

    print_rank_0('building GPT2 model ...')
    model = GPT2Model(num_tokentypes=0, parallel_output=True)

    return model


def get_batch(data_iterator):
    """Generate a batch."""
    args = get_args()
    tokenizer = get_tokenizer()

    # Items and their type.
    keys = ['text']
    datatype = torch.int64

    # Broadcast data.
    if data_iterator is not None:
        data = next(data_iterator)
    else:
        data = None
    data_b = mpu.broadcast_data(keys, data, datatype)

    # Unpack.
    tokens_ = data_b['text'].long()
    labels = tokens_[:, 1:].contiguous()
    tokens = tokens_[:, :-1].contiguous()

    # Get the masks and position ids.
    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
        tokens,
        tokenizer.eod,
        args.reset_position_ids,
        args.reset_attention_mask,
        args.eod_mask_loss,
        args.fp16)

    return tokens, labels, loss_mask, attention_mask, position_ids


def forward_step(data_iterator, model):
    """Forward step."""
    timers = get_timers()

    # Get the batch.
    timers('batch generator').start()
    tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
        data_iterator)
    timers('batch generator').stop()

    # Forward model.
    output = model(tokens, position_ids, attention_mask)
    losses = mpu.vocab_parallel_cross_entropy(output.contiguous().float(),
                                              labels)
    loss_mask = loss_mask.view(-1)
    loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()

    # Reduce loss for logging.
    reduced_loss = reduce_losses([loss])

    return loss, {'lm loss': reduced_loss[0]}


def make_gpt2_dataloaders():
    """Build gpt2 dataloaders."""
    args = get_args()

    # Input parameters.
    input_data_sizes_file = args.input_data_sizes_file
    seq_length = args.seq_length
    initial_seed = args.seed

    # Build the datasets.
    def _build_dataset(name):
        return GPT2Dataset(os.path.join(args.data_path, name),
                           args.input_data_sizes_file,
                           args.seq_length, args.seed)
    train_ds = _build_dataset('train')
    valid_ds = _build_dataset('valid')
    test_ds = _build_dataset('test')

    # Dataloaders.
    train = make_data_loader(train_ds)
    valid = make_data_loader(valid_ds)
    test = make_data_loader(test_ds)

    args.do_train = False
    args.do_valid = False
    args.do_test = False
    if train is not None:
        args.do_train = True
    if valid is not None:
        args.do_valid = True
    if test is not None:
        args.do_test = True

    return (train, valid, test)


def get_train_val_test_data():
    """Load the data on rank zero and broadcast number of tokens to all GPUs."""
    args = get_args()

    (train_data, val_data, test_data) = (None, None, None)

    # Data loader only on rank 0 of each model parallel group.
    if mpu.get_model_parallel_rank() == 0:
        (train_data, val_data, test_data) = make_gpt2_dataloaders()
        flags = torch.cuda.LongTensor([int(args.do_train),
                                       int(args.do_valid),
                                       int(args.do_test)])
    else:
        flags = torch.cuda.LongTensor([0, 0, 0])

    # Broadcast num tokens.
    torch.distributed.broadcast(flags,
                                mpu.get_model_parallel_src_rank(),
                                group=mpu.get_model_parallel_group())
    args.do_train = flags[0].item()
    args.do_valid = flags[1].item()
    args.do_test = flags[2].item()

    return train_data, val_data, test_data


if __name__ == "__main__":

    pretrain(get_train_val_test_data, model_provider, forward_step,
             args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})
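A minimal sketch (toy tensors, no distributed setup or Megatron dependencies) of the label shift that get_batch() performs on the [batch, seq_length + 1] 'text' field returned by both dataset implementations:

import torch

seq_length = 4
tokens_ = torch.tensor([[10, 11, 12, 13, 14]])   # seq_length + 1 token ids
tokens = tokens_[:, :-1].contiguous()            # model inputs: [10, 11, 12, 13]
labels = tokens_[:, 1:].contiguous()             # targets:      [11, 12, 13, 14]
assert tokens.shape[1] == seq_length == labels.shape[1]
print(tokens, labels)
# This is why the datasets return seq_length + 1 tokens per sample: each input
# position predicts the next token, so inputs and labels overlap by one step.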