Commit 0e5dfd7f authored by Mohammad

added gpt2 tokenizer

parent b6e0377b
@@ -35,6 +35,8 @@ def parse_args(extra_args_provider=None, defaults={}):
     parser = _add_validation_args(parser)
     parser = _add_data_args(parser)
     parser = _add_autoresume_args(parser)
+    # TODO: Refactor
+    parser = _add_gpt2_args(parser)
 
     # Custom arguments.
     if extra_args_provider is not None:
@@ -293,6 +295,8 @@ def _add_data_args(parser):
                             'validation and 5% for test.')
     group.add_argument('--vocab-file', type=str, required=True,
                        help='Path to the vocab file.')
+    group.add_argument('--merge-file', type=str, default=None,
+                       help='Path to the BPE merge file.')
     group.add_argument('--seq-length', type=int, required=True,
                        help="Maximum sequence length to process.")
     group.add_argument('--mask-prob', type=float, default=0.15,
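For reference, the two files behind `--vocab-file` and `--merge-file` in a GPT-2 style BPE setup are a JSON vocabulary (token string to integer id) and an ordered list of merge rules. A minimal sketch of loading them, with placeholder file names that are not part of this commit:

```python
import json

# Placeholder paths; the real files are whatever --vocab-file / --merge-file point at.
with open('gpt2-vocab.json') as f:
    encoder = json.load(f)                 # maps token string -> integer id
with open('gpt2-merges.txt') as f:
    lines = f.read().splitlines()
merges = [tuple(line.split()) for line in lines[1:]]   # first line is a '#version' header
print(len(encoder), 'tokens;', len(merges), 'merge rules')
```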
@@ -330,19 +334,19 @@ def _add_autoresume_args(parser):
 ########################################################################
-def add_training_args_(parser):
-    """Training arguments."""
-    group = parser.add_argument_group('train', 'training configurations')
-
-    # Batch prodecuer arguments
+def _add_gpt2_args(parser):
+    group = parser.add_argument_group(title='gpt2')
+    group.add_argument('--input-data-sizes-file', type=str, default='sizes.txt',
+                       help='The filename containing all the shards '
+                       'sizes for numpy data loader')
     group.add_argument('--reset-position-ids', action='store_true',
                        help='Reset posistion ids after end-of-document token.')
     group.add_argument('--reset-attention-mask', action='store_true',
                        help='Reset self attention maske after '
                             'end-of-document token.')
     group.add_argument('--eod-mask-loss', action='store_true',
-                       help='Mask loss for the end of document tokens')
+                       help='Mask loss for the end of document tokens.')
 
     return parser
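The new `_add_gpt2_args` follows the same convention as the other `_add_*_args` helpers chained in `parse_args`: take the shared parser, attach a titled argument group, and return the parser. A self-contained sketch of that pattern with an illustrative group and flag (not the module's real API):

```python
import argparse

def _add_example_args(parser):
    # A titled group keeps related flags together in --help output.
    group = parser.add_argument_group(title='example')
    group.add_argument('--reset-position-ids', action='store_true',
                       help='Reset position ids after end-of-document token.')
    return parser

parser = argparse.ArgumentParser(description='sketch')
parser = _add_example_args(parser)
args = parser.parse_args(['--reset-position-ids'])
assert args.reset_position_ids is True
```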
@@ -411,18 +415,6 @@ def add_data_args_(parser):
                        choices=['raw', 'lazy', 'tfrecords', 'numpy', 'binary'],
                        help='Which data loader to use. Default varies by model.')
-    group.add_argument('--train-data', nargs='+', default=None,
-                       help='Whitespace separated paths or corpora names '
-                       'for training.')
-    group.add_argument('--valid-data', nargs='*', default=None,
-                       help='path(s) to the validation data.')
-    group.add_argument('--test-data', nargs='*', default=None,
-                       help='path(s) to the testing data.')
-
-    # arguments for binary data loader
-    # arguments for numpy data loader
-    group.add_argument('--input-data-sizes-file', type=str, default='sizes.txt',
-                       help='the filename containing all the shards sizes for numpy data loader')
 
     return parser
@@ -19,6 +19,7 @@ from abc import ABC
 from abc import abstractmethod
 
 from .bert_tokenization import FullTokenizer as FullBertTokenizer
+from .gpt2_tokenization import GPT2Tokenizer
 
 
 def build_tokenizer(args):
@@ -28,9 +29,13 @@ def build_tokenizer(args):
               flush=True)
 
     # Select and instantiate the tokenizer.
+    assert args.vocab_file is not None
     if args.tokenizer_type == 'BertWordPieceLowerCase':
         tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file,
                                             lower_case=True)
+    elif args.tokenizer_type == 'GPT2BPETokenizer':
+        assert args.merge_file is not None
+        tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
     else:
         raise NotImplementedError('{} tokenizer is not '
                                   'implemented.'.format(args.tokenizer_type))
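To illustrate the new branch: `build_tokenizer` routes on `args.tokenizer_type`, and the GPT-2 path additionally requires a merge file. The namespace and file names below are assumptions used only to show the control flow:

```python
from types import SimpleNamespace

# Hypothetical stand-in for the parsed arguments; the real args object
# carries more fields used elsewhere in the module.
args = SimpleNamespace(tokenizer_type='GPT2BPETokenizer',
                       vocab_file='gpt2-vocab.json',   # placeholder path
                       merge_file='gpt2-merges.txt')   # placeholder path

# build_tokenizer(args) returns a _GPT2BPETokenizer for this configuration;
# merge_file=None trips the new assert, and any unrecognized tokenizer_type
# still raises NotImplementedError.
```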
@@ -129,3 +134,26 @@ class _BertWordPieceTokenizer(AbstractTokenizer):
     @property
     def pad(self):
         return self.pad_id
+
+
+class _GPT2BPETokenizer(AbstractTokenizer):
+    """Original GPT2 BPE tokenizer."""
+
+    def __init__(self, vocab_file, merge_file):
+        name = 'GPT2 BPE'
+        super().__init__(name)
+
+        self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace',
+                                       special_tokens=[], max_len=None)
+        self.eod_id = self.tokenizer.encoder['<|endoftext|>']
+
+    @property
+    def vocab_size(self):
+        return len(self.tokenizer.encoder)
+
+    def tokenize(self, text):
+        return self.tokenizer.encode(text)
+
+    @property
+    def eod(self):
+        return self.eod_id
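A short usage sketch of the wrapper, assuming the class above is importable and GPT-2 vocab/merge files exist at the placeholder paths: `tokenize` returns BPE token ids and `eod` exposes the id of `<|endoftext|>` used for document packing.

```python
# Placeholder paths; not part of the commit.
tokenizer = _GPT2BPETokenizer('gpt2-vocab.json', 'gpt2-merges.txt')

ids = tokenizer.tokenize('Hello world.')   # list of BPE token ids
print(tokenizer.vocab_size)                # 50257 for the original GPT-2 vocab
print(tokenizer.eod)                       # id of '<|endoftext|>'
```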
@@ -17,20 +17,16 @@
 
 import torch
 
+from gpt2_data_loader import make_gpt2_dataloaders
 from megatron import get_args
 from megatron import get_timers
-from configure_data import configure_data
-from gpt2_data_loader import make_gpt2_dataloaders
 from megatron import mpu
-from megatron import print_rank_0
 from megatron.model import GPT2Model
-from megatron.training import pretrain
 from megatron.utils import get_ltor_masks_and_position_ids
+from megatron import print_rank_0
 from megatron.utils import reduce_losses
-from megatron.utils import vocab_size_with_padding
+import os
+from megatron.training import pretrain
 
 
 def model_provider():
     """Build the model."""
@@ -97,7 +93,7 @@ def forward_step(data_iterator, model):
     # Get the batch.
     timers('batch generator').start()
     tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
-        data_iterator, args, timers)
+        data_iterator)
     timers('batch generator').stop()
 
     # Forward model.
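`get_batch` now takes only the data iterator; `args` and `timers` are no longer passed through because helpers fetch them from process-wide accessors (`get_args()`, `get_timers()`). A generic sketch of that accessor pattern, with made-up names rather than Megatron's actual globals module:

```python
_GLOBAL_ARGS = None

def set_global_args(args):
    """Store parsed arguments once during start-up."""
    global _GLOBAL_ARGS
    _GLOBAL_ARGS = args

def get_args():
    """Fetch the stored arguments from anywhere instead of threading them through."""
    assert _GLOBAL_ARGS is not None, 'arguments are not initialized'
    return _GLOBAL_ARGS

def get_batch(data_iterator):
    args = get_args()                # previously an explicit parameter
    sample = next(data_iterator)
    return sample[:args.seq_length]  # illustrative use of a global setting
```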
@@ -121,28 +117,17 @@ def get_train_val_test_data():
     # Data loader only on rank 0 of each model parallel group.
     if mpu.get_model_parallel_rank() == 0:
-        if args.data_loader == 'numpy':
-            assert len(args.train_data) == 1
-            args.train_data = args.train_data[0]
-            assert len(args.valid_data) == 1
-            args.valid_data = args.valid_data[0]
-            assert len(args.test_data) == 1
-            args.test_data = args.test_data[0]
-            (train_data, val_data, test_data), num_tokens, \
-                eod_token = make_gpt2_dataloaders(args)
-        elif args.data_loader == 'raw' or args.data_loader == 'lazy':
-            data_config = configure_data()
-            data_config.set_defaults(data_set_type='GPT2', transpose=False)
-            (train_data, val_data, test_data), tokenizer = data_config.apply(
-                args)
-            num_tokens = tokenizer.num_tokens
-            eod_token = tokenizer.get_command('eos').Id
-            assert eod_token == tokenizer.get_command('pad').Id
-        else:
-            print("Unsupported data loader for GPT2.")
-            exit(1)
+        args.cache_dir = 'cache'
+        args.train_data = os.path.join(args.data_path, 'train')
+        args.valid_data = os.path.join(args.data_path, 'valid')
+        args.test_data = os.path.join(args.data_path, 'test')
+        (train_data, val_data, test_data), num_tokens, \
+            eod_token = make_gpt2_dataloaders(args)
 
         # pad.
-        num_tokens = vocab_size_with_padding(num_tokens, args)
+        from megatron.tokenizer.tokenizer import _vocab_size_with_padding
+        num_tokens = _vocab_size_with_padding(num_tokens, args)
         print_rank_0('> found end-of-document token: {}'.format(eod_token))
         token_counts = torch.cuda.LongTensor([num_tokens, eod_token,
                                               int(args.do_train),
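`_vocab_size_with_padding` (now imported from the tokenizer module) pads the raw vocabulary size so the embedding table splits evenly across model-parallel ranks. A minimal sketch of that rounding under an assumed divisibility factor of 128; the exact factor and flag names in Megatron may differ:

```python
def padded_vocab_size(orig_vocab_size, divisible_by=128, model_parallel_size=1):
    # Round up to the next multiple of (divisible_by * model_parallel_size).
    multiple = divisible_by * model_parallel_size
    return ((orig_vocab_size + multiple - 1) // multiple) * multiple

# GPT-2's 50257-token vocabulary pads to 50304 under these assumptions.
assert padded_vocab_size(50257) == 50304
```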
@@ -161,7 +146,6 @@ def get_train_val_test_data():
     args.do_valid = token_counts[3].item()
     args.do_test = token_counts[4].item()
 
-    args.vocab_size = num_tokens
     args.eod_token = eod_token
 
     return train_data, val_data, test_data
@@ -169,5 +153,5 @@
 
 if __name__ == "__main__":
 
-    pretrain(get_train_val_test_data,
-             model_provider, forward_step)
+    pretrain(get_train_val_test_data, model_provider, forward_step,
+             args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})
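The entry point now passes `args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}`, so the GPT-2 script selects the BPE tokenizer without the flag having to appear on the command line. A generic sketch of how such a defaults dict can be folded into an argparse parser (an illustration of the idea, not the project's actual plumbing):

```python
import argparse

def parse_with_defaults(defaults, argv):
    parser = argparse.ArgumentParser()
    parser.add_argument('--tokenizer-type', type=str, default=None)
    # Caller-supplied defaults apply only when the flag is absent from argv.
    parser.set_defaults(**defaults)
    return parser.parse_args(argv)

args = parse_with_defaults({'tokenizer_type': 'GPT2BPETokenizer'}, [])
assert args.tokenizer_type == 'GPT2BPETokenizer'
```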