OpenDAS / Megatron-LM · Commit 5f8623db

Authored Mar 29, 2020 by Mohammad
Parent: 46379244

Commit message: both bert and gpt are working

Showing 3 changed files with 72 additions and 107 deletions (+72, −107):
- megatron/data/gpt2_dataset.py (+3, −78)
- megatron/tokenizer/tokenizer.py (+1, −2)
- pretrain_gpt2.py (+68, −27)
gpt2_data_loader.py → megatron/data/gpt2_dataset.py

@@ -13,71 +13,15 @@
```python
# See the License for the specific language governing permissions and
# limitations under the License.

"""GPT2 dataset."""

import json
import os

import numpy as np
import torch
from torch.multiprocessing import Lock
from torch.utils.data import Dataset

from megatron import mpu
from megatron.data_utils.samplers import DistributedBatchSampler
from megatron.data_utils.tokenization_gpt2 import GPT2Tokenizer


def make_gpt2_dataloaders(args):

    # Input parameters.
    input_data_sizes_file = args.input_data_sizes_file
    seq_length = args.seq_length
    initial_seed = args.seed

    # Data parallel arguments.
    world_size = mpu.get_data_parallel_world_size()
    rank = mpu.get_data_parallel_rank()
    global_batch_size = args.batch_size * world_size
    num_workers = args.num_workers

    def make_data_loader_(data_path):
        # Build the dataset.
        dataset = GPT2Dataset(data_path, input_data_sizes_file,
                              seq_length, initial_seed)
        # Use a simple sampler with distributed batch sampler.
        sampler = torch.utils.data.SequentialSampler(dataset)
        batch_sampler = DistributedBatchSampler(sampler=sampler,
                                                batch_size=global_batch_size,
                                                drop_last=True,
                                                rank=rank,
                                                world_size=world_size)
        # Torch dataloader.
        return torch.utils.data.DataLoader(dataset,
                                           batch_sampler=batch_sampler,
                                           num_workers=num_workers,
                                           pin_memory=True)

    train = make_data_loader_(args.train_data)
    valid = make_data_loader_(args.valid_data)
    test = make_data_loader_(args.test_data)

    args.do_train = False
    args.do_valid = False
    args.do_test = False
    if train is not None:
        args.do_train = True
    if valid is not None:
        args.do_valid = True
    if test is not None:
        args.do_test = True

    # Tokenizer.
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2', cache_dir=args.cache_dir)
    eod_token = tokenizer.encoder['<|endoftext|>']
    num_tokens = eod_token + 1

    return (train, valid, test), num_tokens, eod_token


class GPT2Dataset(Dataset):
```
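DistributedBatchSampler (imported here from megatron.data_utils.samplers) is referenced but not shown in this commit. As a rough illustration of what such a sampler does, here is a minimal sketch that slices each global batch into equal per-rank chunks; the class below is hypothetical and is not the actual Megatron implementation:

```python
from torch.utils.data import BatchSampler, Dataset, SequentialSampler


class SimpleDistributedBatchSampler(BatchSampler):
    """Illustrative only: yield the slice of each global batch that
    belongs to this data-parallel rank."""

    def __init__(self, sampler, batch_size, drop_last, rank, world_size):
        super().__init__(sampler, batch_size, drop_last)
        assert batch_size % world_size == 0
        self.rank = rank
        self.per_rank = batch_size // world_size

    def __iter__(self):
        for global_batch in super().__iter__():
            start = self.rank * self.per_rank
            yield global_batch[start:start + self.per_rank]


# Usage sketch: rank 1 of 4 sees indices 4..7 of every global batch of 16.
class _Toy(Dataset):
    def __len__(self):
        return 64
    def __getitem__(self, i):
        return i

batch_sampler = SimpleDistributedBatchSampler(
    SequentialSampler(_Toy()), batch_size=16, drop_last=True, rank=1, world_size=4)
print(next(iter(batch_sampler)))  # [4, 5, 6, 7]
```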
@@ -89,8 +33,6 @@ class GPT2Dataset(Dataset):
```python
        self.seq_length = seq_length
        self.initial_seed = initial_seed
        self.max_epochs = max_epochs

        # Lock for building the dataset.
        self.lock = Lock()

        # Shard stuff.
        # Dictionary from shard nameto its size (number of element).
```
@@ -120,13 +62,11 @@ class GPT2Dataset(Dataset):
```python
        # data index in the shard.
        data_idx = idx - self.shards_start_index[shard_index]
        # Load the shard if it is not in memory.
        #self.lock.acquire()
        if self.shards_data[shard_index] is None:
            print('global rank {} is building data for shard index {} ...'.format(
                torch.distributed.get_rank(), shard_index))
            self.build_dataset_(shard_index)
        #assert self.shards_data[shard_index] is not None
        #self.lock.release()
        # Start index.
        start_index = self.shards_sample_index[shard_index][data_idx]
        # Add one for label shift.
```
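The hunk ends at the "# Add one for label shift." comment, which refers to the usual causal-LM convention: read seq_length + 1 tokens per sample so that inputs and labels come from shifting the same span by one position. A minimal sketch of that convention (variable names are illustrative, not the actual fields of GPT2Dataset):

```python
import numpy as np

seq_length = 8
# Pretend this is the flat token stream of one shard.
shard_tokens = np.arange(100, dtype=np.int64)

start_index = 16                           # where this sample begins in the shard
end_index = start_index + seq_length + 1   # one extra token for the shift
sample = shard_tokens[start_index:end_index]

tokens = sample[:-1]   # model input:  positions t
labels = sample[1:]    # model target: positions t + 1
assert tokens.shape == labels.shape == (seq_length,)
```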
@@ -194,18 +134,3 @@ class GPT2Dataset(Dataset):
```python
                size = self.shard_size_dict[shard]
                self.shards_start_index[i] = self.shards_start_index[i-1] + \
                    size // self.seq_length


'''
if __name__ == '__main__':

    print('gpt2 data loader ...')
    path = '/raid/mshoeybi/data/gpt2/adlr/reddit_all_ftfy_lg200/npys'
    dataset = GPT2Dataset(path, 'sizes.txt', 1024, 1234, 100)
    print('dataset contains {} samples'.format(dataset.data_length))
    for i in range(len(dataset)):
        if i % 512000 == 0:
            print(i)
        data = dataset[i]
'''
```
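The bookkeeping above accumulates shards_start_index as a running sum of samples per shard (size // seq_length), which is what lets __getitem__ map a global sample index to a shard and an offset within it. A hedged sketch of that mapping with hypothetical shard sizes (the real class keeps this state in instance attributes rather than module-level arrays):

```python
import numpy as np

seq_length = 1024
# Hypothetical shard sizes in tokens.
shard_sizes = np.array([10_000, 7_500, 12_288], dtype=np.int64)

samples_per_shard = shard_sizes // seq_length                 # [9, 7, 12]
shards_start_index = np.zeros(len(shard_sizes), dtype=np.int64)
shards_start_index[1:] = np.cumsum(samples_per_shard)[:-1]    # [0, 9, 16]


def locate(idx):
    """Map a global sample index to (shard_index, data_idx within that shard)."""
    shard_index = int(np.searchsorted(shards_start_index, idx, side='right')) - 1
    data_idx = idx - shards_start_index[shard_index]
    return shard_index, int(data_idx)


print(locate(0))    # (0, 0)
print(locate(10))   # (1, 1)
print(locate(27))   # (2, 11) -- last sample overall
```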
megatron/tokenizer/tokenizer.py

@@ -13,7 +13,7 @@
```diff
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Megatron tokenizer."""
+"""Megatron tokenizers."""
 
 from abc import ABC
 from abc import abstractmethod
```
@@ -100,7 +100,6 @@ class AbstractTokenizer(ABC):
```python
                                  'tokenizer'.format(self.name))


class _BertWordPieceTokenizer(AbstractTokenizer):
    """Original BERT wordpiece tokenizer."""
```
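megatron/tokenizer/tokenizer.py also provides _vocab_size_with_padding, which the previous version of pretrain_gpt2.py imported (see the removed lines in the last hunk below) to pad the GPT-2 vocabulary. The idea is to grow the vocab until the embedding table divides evenly across model-parallel partitions; a sketch of that rule, with illustrative argument names rather than Megatron's exact signature:

```python
def vocab_size_with_padding(orig_vocab_size, divisible_by=128, model_parallel_size=1):
    """Pad the vocab so the embedding table splits evenly across
    model-parallel ranks. Sketch of the idea behind _vocab_size_with_padding;
    the argument names here are illustrative."""
    multiple = divisible_by * model_parallel_size
    after = orig_vocab_size
    while after % multiple != 0:
        after += 1
    return after


# GPT-2's 50257-token vocab padded for 2-way model parallelism:
print(vocab_size_with_padding(50257, divisible_by=128, model_parallel_size=2))  # 50432
```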
pretrain_gpt2.py

@@ -15,18 +15,22 @@
```diff
 """Pretrain GPT2"""
 
+import os
+
 import torch
 
-from gpt2_data_loader import make_gpt2_dataloaders
 from megatron import get_args
 from megatron import get_timers
+from megatron import get_tokenizer
 from megatron import mpu
 from megatron import print_rank_0
+from megatron.data.gpt2_dataset import GPT2Dataset
+from megatron.data_utils.samplers import DistributedBatchSampler
 from megatron.model import GPT2Model
 from megatron.training import pretrain
 from megatron.utils import get_ltor_masks_and_position_ids
 from megatron.utils import reduce_losses
-import os
 
 
 def model_provider():
     """Build the model."""
```
@@ -87,7 +91,6 @@ def get_batch(data_iterator):
```python
def forward_step(data_iterator, model):
    """Forward step."""
    args = get_args()
    timers = get_timers()

    # Get the batch.
```
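In the next hunk, forward_step returns loss together with {'lm loss': reduced_loss[0]}, where reduce_losses (from megatron.utils) averages the scalar losses across workers so every rank logs the same number. A hedged sketch of that kind of helper, not necessarily the exact body of megatron.utils.reduce_losses:

```python
import torch


def reduce_losses_sketch(losses):
    """Average a list of 0-dim loss tensors across all ranks.
    Sketch only; assumes torch.distributed is already initialized."""
    reduced = torch.cat([loss.clone().detach().view(1) for loss in losses])
    torch.distributed.all_reduce(reduced)          # sum over ranks
    reduced = reduced / torch.distributed.get_world_size()
    return reduced


# Usage inside a forward step (loss is a 0-dim tensor):
#   reduced_loss = reduce_losses_sketch([loss])
#   return loss, {'lm loss': reduced_loss[0]}
```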
@@ -109,44 +112,82 @@ def forward_step(data_iterator, model):
```diff
     return loss, {'lm loss': reduced_loss[0]}
 
 
+def make_gpt2_dataloaders():
+    """Build gpt2 dataloders."""
+    args = get_args()
+
+    # Input parameters.
+    input_data_sizes_file = args.input_data_sizes_file
+    seq_length = args.seq_length
+    initial_seed = args.seed
+
+    # Data parallel arguments.
+    world_size = mpu.get_data_parallel_world_size()
+    rank = mpu.get_data_parallel_rank()
+    global_batch_size = args.batch_size * world_size
+    num_workers = args.num_workers
+
+    def make_data_loader_(data_path):
+        # Build the dataset.
+        dataset = GPT2Dataset(data_path, input_data_sizes_file,
+                              seq_length, initial_seed)
+        # Use a simple sampler with distributed batch sampler.
+        sampler = torch.utils.data.SequentialSampler(dataset)
+        batch_sampler = DistributedBatchSampler(sampler=sampler,
+                                                batch_size=global_batch_size,
+                                                drop_last=True,
+                                                rank=rank,
+                                                world_size=world_size)
+        # Torch dataloader.
+        return torch.utils.data.DataLoader(dataset,
+                                           batch_sampler=batch_sampler,
+                                           num_workers=num_workers,
+                                           pin_memory=True)
+
+    train = make_data_loader_(os.path.join(args.data_path, 'train'))
+    valid = make_data_loader_(os.path.join(args.data_path, 'valid'))
+    test = make_data_loader_(os.path.join(args.data_path, 'test'))
+
+    args.do_train = False
+    args.do_valid = False
+    args.do_test = False
+    if train is not None:
+        args.do_train = True
+    if valid is not None:
+        args.do_valid = True
+    if test is not None:
+        args.do_test = True
+
+    return (train, valid, test)
+
+
 def get_train_val_test_data():
     """Load the data on rank zero and boradcast number of tokens to all GPUS."""
 
     args = get_args()
 
     (train_data, val_data, test_data) = (None, None, None)
 
     # Data loader only on rank 0 of each model parallel group.
     if mpu.get_model_parallel_rank() == 0:
-        args.cache_dir = 'cache'
-        args.train_data = os.path.join(args.data_path, 'train')
-        args.valid_data = os.path.join(args.data_path, 'valid')
-        args.test_data = os.path.join(args.data_path, 'test')
-        (train_data, val_data, test_data), num_tokens, \
-            eod_token = make_gpt2_dataloaders(args)
-        # pad.
-        from megatron.tokenizer.tokenizer import _vocab_size_with_padding
-        num_tokens = _vocab_size_with_padding(num_tokens, args)
-        print_rank_0('> found end-of-document token: {}'.format(eod_token))
-        token_counts = torch.cuda.LongTensor([num_tokens,
-                                              eod_token,
-                                              int(args.do_train),
-                                              int(args.do_valid),
-                                              int(args.do_test)])
+        (train_data, val_data, test_data) = make_gpt2_dataloaders()
+        flags = torch.cuda.LongTensor([int(args.do_train),
+                                       int(args.do_valid),
+                                       int(args.do_test)])
     else:
-        token_counts = torch.cuda.LongTensor([0, 0, 0, 0, 0])
+        flags = torch.cuda.LongTensor([0, 0, 0])
 
     # Broadcast num tokens.
-    torch.distributed.broadcast(token_counts,
+    torch.distributed.broadcast(flags,
                                 mpu.get_model_parallel_src_rank(),
                                 group=mpu.get_model_parallel_group())
-    num_tokens = token_counts[0].item()
-    eod_token = token_counts[1].item()
-    args.do_train = token_counts[2].item()
-    args.do_valid = token_counts[3].item()
-    args.do_test = token_counts[4].item()
+    args.do_train = flags[0].item()
+    args.do_valid = flags[1].item()
+    args.do_test = flags[2].item()
 
-    args.eod_token = eod_token
+    tokenizer = get_tokenizer()
+    args.eod_token = tokenizer.eod_id
 
     return train_data, val_data, test_data
```
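The rewritten get_train_val_test_data follows a pattern used throughout Megatron: only rank 0 of each model-parallel group builds the dataloaders, packs the do_train/do_valid/do_test flags into a CUDA tensor, and broadcasts it from the group's source rank so all ranks agree on which splits exist. A minimal single-group sketch of that pattern using torch.distributed directly; the launch details below are illustrative, and Megatron obtains the group and source rank from mpu rather than using src=0:

```python
import os

import torch
import torch.distributed as dist


def sync_data_flags():
    """Rank 0 decides the flags; every other rank receives them via broadcast."""
    if dist.get_rank() == 0:
        # Pretend rank 0 built the dataloaders and knows which splits exist.
        do_train, do_valid, do_test = True, True, False
        flags = torch.cuda.LongTensor([int(do_train), int(do_valid), int(do_test)])
    else:
        flags = torch.cuda.LongTensor([0, 0, 0])
    # In Megatron this uses src=mpu.get_model_parallel_src_rank()
    # and group=mpu.get_model_parallel_group().
    dist.broadcast(flags, src=0)
    return bool(flags[0].item()), bool(flags[1].item()), bool(flags[2].item())


if __name__ == '__main__':
    # Illustrative launch: torchrun --nproc_per_node=2 sync_flags_sketch.py
    local_rank = int(os.environ.get('LOCAL_RANK', 0))
    torch.cuda.set_device(local_rank)
    dist.init_process_group(backend='nccl')
    print(dist.get_rank(), sync_data_flags())
```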