".github/vscode:/vscode.git/clone" did not exist on "029cb3fe4526084172c30be14278d46ecd5bf17c"
Commit 5f8623db authored by Mohammad

both bert and gpt are working

parent 46379244
@@ -13,71 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""GPT2 dataset."""
import json
import os
import numpy as np
import torch
from torch.multiprocessing import Lock
from torch.utils.data import Dataset
from megatron import mpu
from megatron.data_utils.samplers import DistributedBatchSampler
from megatron.data_utils.tokenization_gpt2 import GPT2Tokenizer
def make_gpt2_dataloaders(args):
# Input parameters.
input_data_sizes_file = args.input_data_sizes_file
seq_length = args.seq_length
initial_seed = args.seed
# Data parallel arguments.
world_size = mpu.get_data_parallel_world_size()
rank = mpu.get_data_parallel_rank()
global_batch_size = args.batch_size * world_size
num_workers = args.num_workers
def make_data_loader_(data_path):
# Build the dataset.
dataset = GPT2Dataset(data_path, input_data_sizes_file,
seq_length, initial_seed)
# Use a simple sampler with distributed batch sampler.
sampler = torch.utils.data.SequentialSampler(dataset)
batch_sampler = DistributedBatchSampler(sampler=sampler,
batch_size=global_batch_size,
drop_last=True,
rank=rank,
world_size=world_size)
# Torch dataloader.
return torch.utils.data.DataLoader(dataset,
batch_sampler=batch_sampler,
num_workers=num_workers,
pin_memory=True)
train = make_data_loader_(args.train_data)
valid = make_data_loader_(args.valid_data)
test = make_data_loader_(args.test_data)
args.do_train = False
args.do_valid = False
args.do_test = False
if train is not None:
args.do_train = True
if valid is not None:
args.do_valid = True
if test is not None:
args.do_test = True
# Tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', cache_dir=args.cache_dir)
eod_token = tokenizer.encoder['<|endoftext|>']
num_tokens = eod_token + 1
return (train, valid, test), num_tokens, eod_token
class GPT2Dataset(Dataset):
@@ -89,8 +33,6 @@ class GPT2Dataset(Dataset):
self.seq_length = seq_length
self.initial_seed = initial_seed
self.max_epochs = max_epochs
# Lock for building the dataset.
self.lock = Lock()
# Shard stuff.
# Dictionary from shard name to its size (number of elements).
@@ -120,13 +62,11 @@ class GPT2Dataset(Dataset):
# data index in the shard.
data_idx = idx - self.shards_start_index[shard_index]
# Load the shard if it is not in memory.
#self.lock.acquire()
if self.shards_data[shard_index] is None:
print('global rank {} is building data for shard index {} ...'.
format(torch.distributed.get_rank(), shard_index))
self.build_dataset_(shard_index)
#assert self.shards_data[shard_index] is not None
#self.lock.release()
# Start index.
start_index = self.shards_sample_index[shard_index][data_idx]
# Add one for label shift.
@@ -194,18 +134,3 @@ class GPT2Dataset(Dataset):
size = self.shard_size_dict[shard]
self.shards_start_index[i] = self.shards_start_index[i-1] + \
size // self.seq_length
'''
if __name__ == '__main__':
print('gpt2 data loader ...')
path = '/raid/mshoeybi/data/gpt2/adlr/reddit_all_ftfy_lg200/npys'
dataset = GPT2Dataset(path, 'sizes.txt', 1024, 1234, 100)
print('dataset contains {} samples'.format(dataset.data_length))
for i in range(len(dataset)):
if i % 512000 == 0:
print(i)
data = dataset[i]
'''
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-"""Megatron tokenizer."""
+"""Megatron tokenizers."""
from abc import ABC
from abc import abstractmethod
@@ -100,7 +100,6 @@ class AbstractTokenizer(ABC):
'tokenizer'.format(self.name))
class _BertWordPieceTokenizer(AbstractTokenizer):
"""Original BERT wordpiece tokenizer."""
...
@@ -15,18 +15,22 @@
"""Pretrain GPT2"""
import os
import torch
from gpt2_data_loader import make_gpt2_dataloaders
from megatron import get_args
from megatron import get_timers
from megatron import get_tokenizer
from megatron import mpu
from megatron import print_rank_0
from megatron.data.gpt2_dataset import GPT2Dataset
from megatron.data_utils.samplers import DistributedBatchSampler
from megatron.model import GPT2Model
from megatron.training import pretrain
from megatron.utils import get_ltor_masks_and_position_ids
from megatron.utils import reduce_losses
import os
def model_provider():
"""Build the model."""
@@ -87,7 +91,6 @@ def get_batch(data_iterator):
def forward_step(data_iterator, model):
"""Forward step."""
args = get_args()
timers = get_timers()
# Get the batch.
@@ -109,6 +112,56 @@ def forward_step(data_iterator, model):
return loss, {'lm loss': reduced_loss[0]}
def make_gpt2_dataloaders():
"""Build gpt2 dataloders."""
args = get_args()
# Input parameters.
input_data_sizes_file = args.input_data_sizes_file
seq_length = args.seq_length
initial_seed = args.seed
# Data parallel arguments.
world_size = mpu.get_data_parallel_world_size()
rank = mpu.get_data_parallel_rank()
global_batch_size = args.batch_size * world_size
num_workers = args.num_workers
def make_data_loader_(data_path):
# Build the dataset.
dataset = GPT2Dataset(data_path, input_data_sizes_file,
seq_length, initial_seed)
# Use a simple sampler with distributed batch sampler.
sampler = torch.utils.data.SequentialSampler(dataset)
batch_sampler = DistributedBatchSampler(sampler=sampler,
batch_size=global_batch_size,
drop_last=True,
rank=rank,
world_size=world_size)
# Torch dataloader.
return torch.utils.data.DataLoader(dataset,
batch_sampler=batch_sampler,
num_workers=num_workers,
pin_memory=True)
train = make_data_loader_(os.path.join(args.data_path, 'train'))
valid = make_data_loader_(os.path.join(args.data_path, 'valid'))
test = make_data_loader_(os.path.join(args.data_path, 'test'))
args.do_train = False
args.do_valid = False
args.do_test = False
if train is not None:
args.do_train = True
if valid is not None:
args.do_valid = True
if test is not None:
args.do_test = True
return (train, valid, test)
def get_train_val_test_data():
"""Load the data on rank zero and broadcast number of tokens to all GPUs."""
args = get_args()
@@ -118,35 +171,23 @@ def get_train_val_test_data():
# Data loader only on rank 0 of each model parallel group.
if mpu.get_model_parallel_rank() == 0:
-args.cache_dir = 'cache'
-args.train_data = os.path.join(args.data_path, 'train')
-args.valid_data = os.path.join(args.data_path, 'valid')
-args.test_data = os.path.join(args.data_path, 'test')
-(train_data, val_data, test_data), num_tokens, \
-eod_token = make_gpt2_dataloaders(args)
-# pad.
-from megatron.tokenizer.tokenizer import _vocab_size_with_padding
-num_tokens = _vocab_size_with_padding(num_tokens, args)
-print_rank_0('> found end-of-document token: {}'.format(eod_token))
-token_counts = torch.cuda.LongTensor([num_tokens, eod_token,
-int(args.do_train),
-int(args.do_valid),
-int(args.do_test)])
+(train_data, val_data, test_data) = make_gpt2_dataloaders()
+flags = torch.cuda.LongTensor([int(args.do_train),
+int(args.do_valid),
+int(args.do_test)])
else:
-token_counts = torch.cuda.LongTensor([0, 0, 0, 0, 0])
+flags = torch.cuda.LongTensor([0, 0, 0])
# Broadcast num tokens.
-torch.distributed.broadcast(token_counts,
+torch.distributed.broadcast(flags,
mpu.get_model_parallel_src_rank(),
group=mpu.get_model_parallel_group())
-num_tokens = token_counts[0].item()
-eod_token = token_counts[1].item()
-args.do_train = token_counts[2].item()
-args.do_valid = token_counts[3].item()
-args.do_test = token_counts[4].item()
+args.do_train = flags[0].item()
+args.do_valid = flags[1].item()
+args.do_test = flags[2].item()
-args.eod_token = eod_token
+tokenizer = get_tokenizer()
+args.eod_token = tokenizer.eod_id
return train_data, val_data, test_data
...