OpenDAS / Megatron-LM / Commits

Commit 5f8623db
Authored Mar 29, 2020 by Mohammad

    both bert and gpt are working

Parent: 46379244
Changes: 3
Showing 3 changed files with 72 additions and 107 deletions (+72 -107)

  megatron/data/gpt2_dataset.py    +3   -78
  megatron/tokenizer/tokenizer.py  +1   -2
  pretrain_gpt2.py                 +68  -27
gpt2_data_loader.py → megatron/data/gpt2_dataset.py  (view file @ 5f8623db)

@@ -13,71 +13,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+"""GPT2 dataset."""

 import json
 import os

 import numpy as np
 import torch
-from torch.multiprocessing import Lock
 from torch.utils.data import Dataset

-from megatron import mpu
-from megatron.data_utils.samplers import DistributedBatchSampler
-from megatron.data_utils.tokenization_gpt2 import GPT2Tokenizer
-
-
-def make_gpt2_dataloaders(args):
-    # Input parameters.
-    input_data_sizes_file = args.input_data_sizes_file
-    seq_length = args.seq_length
-    initial_seed = args.seed
-
-    # Data parallel arguments.
-    world_size = mpu.get_data_parallel_world_size()
-    rank = mpu.get_data_parallel_rank()
-    global_batch_size = args.batch_size * world_size
-    num_workers = args.num_workers
-
-    def make_data_loader_(data_path):
-        # Build the dataset.
-        dataset = GPT2Dataset(data_path, input_data_sizes_file,
-                              seq_length, initial_seed)
-        # Use a simple sampler with distributed batch sampler.
-        sampler = torch.utils.data.SequentialSampler(dataset)
-        batch_sampler = DistributedBatchSampler(sampler=sampler,
-                                                batch_size=global_batch_size,
-                                                drop_last=True,
-                                                rank=rank,
-                                                world_size=world_size)
-        # Torch dataloader.
-        return torch.utils.data.DataLoader(dataset,
-                                           batch_sampler=batch_sampler,
-                                           num_workers=num_workers,
-                                           pin_memory=True)
-
-    train = make_data_loader_(args.train_data)
-    valid = make_data_loader_(args.valid_data)
-    test = make_data_loader_(args.test_data)
-
-    args.do_train = False
-    args.do_valid = False
-    args.do_test = False
-    if train is not None:
-        args.do_train = True
-    if valid is not None:
-        args.do_valid = True
-    if test is not None:
-        args.do_test = True
-
-    # Tokenizer.
-    tokenizer = GPT2Tokenizer.from_pretrained('gpt2', cache_dir=args.cache_dir)
-    eod_token = tokenizer.encoder['<|endoftext|>']
-    num_tokens = eod_token + 1
-
-    return (train, valid, test), num_tokens, eod_token
-

 class GPT2Dataset(Dataset):

@@ -89,8 +33,6 @@ class GPT2Dataset(Dataset):
         self.seq_length = seq_length
         self.initial_seed = initial_seed
         self.max_epochs = max_epochs
-        # Lock for building the dataset.
-        self.lock = Lock()

         # Shard stuff.
         # Dictionary from shard nameto its size (number of element).

@@ -120,13 +62,11 @@ class GPT2Dataset(Dataset):
         # data index in the shard.
         data_idx = idx - self.shards_start_index[shard_index]
         # Load the shard if it is not in memory.
-        #self.lock.acquire()
         if self.shards_data[shard_index] is None:
             print('global rank {} is building data for shard index {} ...'.
                   format(torch.distributed.get_rank(), shard_index))
             self.build_dataset_(shard_index)
             #assert self.shards_data[shard_index] is not None
-        #self.lock.release()
         # Start index.
         start_index = self.shards_sample_index[shard_index][data_idx]
         # Add one for label shift.

@@ -194,18 +134,3 @@ class GPT2Dataset(Dataset):
             size = self.shard_size_dict[shard]
             self.shards_start_index[i] = self.shards_start_index[i-1] + \
                                          size // self.seq_length
-
-
-'''
-if __name__ == '__main__':
-
-    print('gpt2 data loader ...')
-    path = '/raid/mshoeybi/data/gpt2/adlr/reddit_all_ftfy_lg200/npys'
-    dataset = GPT2Dataset(path, 'sizes.txt', 1024, 1234, 100)
-    print('dataset contains {} samples'.format(dataset.data_length))
-    for i in range(len(dataset)):
-        if i % 512000 == 0:
-            print(i)
-        data = dataset[i]
-'''
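The dataloader construction removed above reappears in pretrain_gpt2.py (below), built on Megatron's DistributedBatchSampler, which carves each global batch into one slice per data-parallel rank. A rough, self-contained sketch of that slicing pattern using only stock PyTorch; ToyDataset and the hard-coded rank/world_size values are stand-ins for GPT2Dataset and the mpu calls, not part of the commit:

import torch
from torch.utils.data import BatchSampler, DataLoader, Dataset, SequentialSampler


class ToyDataset(Dataset):
    """Stand-in for GPT2Dataset: fixed-length token windows plus one label-shift token."""

    def __init__(self, num_samples=32, seq_length=8):
        self.data = torch.arange(num_samples * (seq_length + 1)).view(num_samples, seq_length + 1)

    def __len__(self):
        return self.data.size(0)

    def __getitem__(self, idx):
        return self.data[idx]


def make_data_loader(dataset, batch_size, rank, world_size, num_workers=0):
    # The global batch is batch_size * world_size; each rank keeps only its slice,
    # which is roughly what megatron.data_utils.samplers.DistributedBatchSampler does.
    global_batch_size = batch_size * world_size
    sampler = SequentialSampler(dataset)
    global_batches = BatchSampler(sampler, batch_size=global_batch_size, drop_last=True)
    # Slice every global batch of indices down to this rank's shard.
    local_batches = [batch[rank * batch_size:(rank + 1) * batch_size]
                     for batch in global_batches]
    return DataLoader(dataset, batch_sampler=local_batches,
                      num_workers=num_workers, pin_memory=True)


if __name__ == '__main__':
    loader = make_data_loader(ToyDataset(), batch_size=4, rank=0, world_size=2)
    for batch in loader:
        print(batch.shape)  # torch.Size([4, 9]) for each local micro-batch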
megatron/tokenizer/tokenizer.py  (view file @ 5f8623db)

@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-"""Megatron tokenizer."""
+"""Megatron tokenizers."""

 from abc import ABC
 from abc import abstractmethod

@@ -100,7 +100,6 @@ class AbstractTokenizer(ABC):
                                   'tokenizer'.format(self.name))
-

 class _BertWordPieceTokenizer(AbstractTokenizer):
     """Original BERT wordpiece tokenizer."""
pretrain_gpt2.py  (view file @ 5f8623db)

@@ -15,18 +15,22 @@
 """Pretrain GPT2"""

+import os
+
 import torch

-from gpt2_data_loader import make_gpt2_dataloaders
 from megatron import get_args
 from megatron import get_timers
+from megatron import get_tokenizer
 from megatron import mpu
 from megatron import print_rank_0
+from megatron.data.gpt2_dataset import GPT2Dataset
+from megatron.data_utils.samplers import DistributedBatchSampler
 from megatron.model import GPT2Model
 from megatron.training import pretrain
 from megatron.utils import get_ltor_masks_and_position_ids
 from megatron.utils import reduce_losses
-import os


 def model_provider():
     """Build the model."""

@@ -87,7 +91,6 @@ def get_batch(data_iterator):
 def forward_step(data_iterator, model):
     """Forward step."""
-    args = get_args()
     timers = get_timers()

     # Get the batch.

@@ -109,6 +112,56 @@ def forward_step(data_iterator, model):
     return loss, {'lm loss': reduced_loss[0]}


+def make_gpt2_dataloaders():
+    """Build gpt2 dataloders."""
+    args = get_args()
+
+    # Input parameters.
+    input_data_sizes_file = args.input_data_sizes_file
+    seq_length = args.seq_length
+    initial_seed = args.seed
+
+    # Data parallel arguments.
+    world_size = mpu.get_data_parallel_world_size()
+    rank = mpu.get_data_parallel_rank()
+    global_batch_size = args.batch_size * world_size
+    num_workers = args.num_workers
+
+    def make_data_loader_(data_path):
+        # Build the dataset.
+        dataset = GPT2Dataset(data_path, input_data_sizes_file,
+                              seq_length, initial_seed)
+        # Use a simple sampler with distributed batch sampler.
+        sampler = torch.utils.data.SequentialSampler(dataset)
+        batch_sampler = DistributedBatchSampler(sampler=sampler,
+                                                batch_size=global_batch_size,
+                                                drop_last=True,
+                                                rank=rank,
+                                                world_size=world_size)
+        # Torch dataloader.
+        return torch.utils.data.DataLoader(dataset,
+                                           batch_sampler=batch_sampler,
+                                           num_workers=num_workers,
+                                           pin_memory=True)
+
+    train = make_data_loader_(os.path.join(args.data_path, 'train'))
+    valid = make_data_loader_(os.path.join(args.data_path, 'valid'))
+    test = make_data_loader_(os.path.join(args.data_path, 'test'))
+
+    args.do_train = False
+    args.do_valid = False
+    args.do_test = False
+    if train is not None:
+        args.do_train = True
+    if valid is not None:
+        args.do_valid = True
+    if test is not None:
+        args.do_test = True
+
+    return (train, valid, test)
+
+
 def get_train_val_test_data():
     """Load the data on rank zero and boradcast number of tokens to all GPUS."""
     args = get_args()

@@ -118,35 +171,23 @@ def get_train_val_test_data():
     # Data loader only on rank 0 of each model parallel group.
     if mpu.get_model_parallel_rank() == 0:
-        args.cache_dir = 'cache'
-        args.train_data = os.path.join(args.data_path, 'train')
-        args.valid_data = os.path.join(args.data_path, 'valid')
-        args.test_data = os.path.join(args.data_path, 'test')
-        (train_data, val_data, test_data), num_tokens, \
-            eod_token = make_gpt2_dataloaders(args)
-        # pad.
-        from megatron.tokenizer.tokenizer import _vocab_size_with_padding
-        num_tokens = _vocab_size_with_padding(num_tokens, args)
-        print_rank_0('> found end-of-document token: {}'.format(eod_token))
-        token_counts = torch.cuda.LongTensor([num_tokens, eod_token,
-                                              int(args.do_train),
+        (train_data, val_data, test_data) = make_gpt2_dataloaders()
+        flags = torch.cuda.LongTensor([int(args.do_train),
                                        int(args.do_valid),
                                        int(args.do_test)])
     else:
-        token_counts = torch.cuda.LongTensor([0, 0, 0, 0, 0])
+        flags = torch.cuda.LongTensor([0, 0, 0])

     # Broadcast num tokens.
-    torch.distributed.broadcast(token_counts,
+    torch.distributed.broadcast(flags,
                                 mpu.get_model_parallel_src_rank(),
                                 group=mpu.get_model_parallel_group())
-    num_tokens = token_counts[0].item()
-    eod_token = token_counts[1].item()
-    args.do_train = token_counts[2].item()
-    args.do_valid = token_counts[3].item()
-    args.do_test = token_counts[4].item()
-    args.eod_token = eod_token
+    args.do_train = flags[0].item()
+    args.do_valid = flags[1].item()
+    args.do_test = flags[2].item()
+
+    tokenizer = get_tokenizer()
+    args.eod_token = tokenizer.eod_id

     return train_data, val_data, test_data
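With this change, get_train_val_test_data() broadcasts only the do_train/do_valid/do_test flags from rank 0 of each model-parallel group; the vocabulary size and end-of-document token now come from get_tokenizer() on every rank instead of being broadcast. A minimal sketch of that flag-broadcast pattern, using CPU tensors and a single-process gloo group in place of the commit's torch.cuda.LongTensor and mpu.get_model_parallel_group(); broadcast_data_flags is a hypothetical helper name, not part of the commit:

import os
import torch
import torch.distributed as dist


def broadcast_data_flags(do_train, do_valid, do_test, src_rank=0, group=None):
    """Pack the source rank's decisions into a tensor, broadcast, and unpack."""
    if dist.get_rank() == src_rank:
        flags = torch.LongTensor([int(do_train), int(do_valid), int(do_test)])
    else:
        # Non-source ranks allocate a placeholder of the same shape.
        flags = torch.LongTensor([0, 0, 0])
    dist.broadcast(flags, src_rank, group=group)
    return bool(flags[0].item()), bool(flags[1].item()), bool(flags[2].item())


if __name__ == '__main__':
    # Single-process gloo group, just to exercise the call path.
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    os.environ.setdefault('MASTER_PORT', '29500')
    dist.init_process_group('gloo', rank=0, world_size=1)
    print(broadcast_data_flags(do_train=True, do_valid=True, do_test=False))
    dist.destroy_process_group()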