Commit 6c521eb1 authored by Jared Casper

Merge branch 'staging_args' into 'staging'

Major refactor: args, global variables, tokenizer

See merge request ADLR/megatron-lm!36
parents 37ff534f b07f1d0a
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Megatron global variables."""
import os
import sys
import time
import torch
from megatron.tokenizer import build_tokenizer
from .arguments import parse_args
_GLOBAL_ARGS = None
_GLOBAL_TOKENIZER = None
_GLOBAL_TENSORBOARD_WRITER = None
_GLOBAL_ADLR_AUTORESUME = None
_GLOBAL_TIMERS = None
def get_args():
    """Return arguments."""
    _ensure_var_is_initialized(_GLOBAL_ARGS, 'args')
    return _GLOBAL_ARGS


def get_tokenizer():
    """Return tokenizer."""
    _ensure_var_is_initialized(_GLOBAL_TOKENIZER, 'tokenizer')
    return _GLOBAL_TOKENIZER


def get_tensorboard_writer():
    """Return tensorboard writer. It can be None so no need
    to check if it is initialized."""
    return _GLOBAL_TENSORBOARD_WRITER


def get_adlr_autoresume():
    """ADLR autoresume object. It can be None so no need
    to check if it is initialized."""
    return _GLOBAL_ADLR_AUTORESUME


def get_timers():
    """Return timers."""
    _ensure_var_is_initialized(_GLOBAL_TIMERS, 'timers')
    return _GLOBAL_TIMERS


def set_global_variables(extra_args_provider=None, args_defaults={}):
    """Set args, tokenizer, tensorboard-writer, adlr-autoresume, and timers."""
    args = _parse_args(extra_args_provider=extra_args_provider,
                       defaults=args_defaults)
    _build_tokenizer(args)
    _set_tensorboard_writer(args)
    _set_adlr_autoresume(args)
    _set_timers()


def _parse_args(extra_args_provider=None, defaults={}):
    """Parse entire arguments."""
    global _GLOBAL_ARGS
    _ensure_var_is_not_initialized(_GLOBAL_ARGS, 'args')
    _GLOBAL_ARGS = parse_args(extra_args_provider=extra_args_provider,
                              defaults=defaults)
    return _GLOBAL_ARGS


def _build_tokenizer(args):
    """Initialize tokenizer."""
    global _GLOBAL_TOKENIZER
    _ensure_var_is_not_initialized(_GLOBAL_TOKENIZER, 'tokenizer')
    _GLOBAL_TOKENIZER = build_tokenizer(args)


def _set_tensorboard_writer(args):
    """Set tensorboard writer."""
    global _GLOBAL_TENSORBOARD_WRITER
    _ensure_var_is_not_initialized(_GLOBAL_TENSORBOARD_WRITER,
                                   'tensorboard writer')
    if hasattr(args, 'tensorboard_dir') and \
       args.tensorboard_dir and args.rank == 0:
        try:
            from torch.utils.tensorboard import SummaryWriter
            print('> setting tensorboard ...')
            _GLOBAL_TENSORBOARD_WRITER = SummaryWriter(
                log_dir=args.tensorboard_dir)
        except ModuleNotFoundError:
            print('WARNING: TensorBoard writing requested but is not '
                  'available (are you using PyTorch 1.1.0 or later?), '
                  'no TensorBoard logs will be written.', flush=True)


def _set_adlr_autoresume(args):
    """Initialize ADLR autoresume."""
    global _GLOBAL_ADLR_AUTORESUME
    _ensure_var_is_not_initialized(_GLOBAL_ADLR_AUTORESUME, 'adlr autoresume')

    if args.adlr_autoresume:
        if args.rank == 0:
            print('enabling autoresume ...', flush=True)
        sys.path.append(os.environ.get('SUBMIT_SCRIPTS', '.'))
        try:
            from userlib.auto_resume import AutoResume
        except ImportError:
            print('ADLR autoresume is not available, exiting ...')
            sys.exit()
        _GLOBAL_ADLR_AUTORESUME = AutoResume


def _set_timers():
    """Initialize timers."""
    global _GLOBAL_TIMERS
    _ensure_var_is_not_initialized(_GLOBAL_TIMERS, 'timers')
    _GLOBAL_TIMERS = Timers()


def _ensure_var_is_initialized(var, name):
    """Make sure the input variable is not None."""
    assert var is not None, '{} is not initialized.'.format(name)


def _ensure_var_is_not_initialized(var, name):
    """Make sure the input variable is None (i.e. not yet initialized)."""
    assert var is None, '{} is already initialized.'.format(name)
class _Timer:
    """Timer."""

    def __init__(self, name):
        self.name_ = name
        self.elapsed_ = 0.0
        self.started_ = False
        self.start_time = time.time()

    def start(self):
        """Start the timer."""
        assert not self.started_, 'timer has already been started'
        torch.cuda.synchronize()
        self.start_time = time.time()
        self.started_ = True

    def stop(self):
        """Stop the timer."""
        assert self.started_, 'timer is not started'
        torch.cuda.synchronize()
        self.elapsed_ += (time.time() - self.start_time)
        self.started_ = False

    def reset(self):
        """Reset timer."""
        self.elapsed_ = 0.0
        self.started_ = False

    def elapsed(self, reset=True):
        """Calculate the elapsed time."""
        started_ = self.started_
        # If timing is in progress, end it first.
        if self.started_:
            self.stop()
        # Get the elapsed time.
        elapsed_ = self.elapsed_
        # Reset the elapsed time.
        if reset:
            self.reset()
        # If timing was in progress, restart it.
        if started_:
            self.start()
        return elapsed_


class Timers:
    """Group of timers."""

    def __init__(self):
        self.timers = {}

    def __call__(self, name):
        if name not in self.timers:
            self.timers[name] = _Timer(name)
        return self.timers[name]

    def write(self, names, writer, iteration, normalizer=1.0, reset=False):
        """Write timers to a tensorboard writer."""
        # Currently, torch.utils.tensorboard's add_scalars makes each timer
        # its own run, which pollutes the runs list, so we just add each
        # timer as a scalar.
        assert normalizer > 0.0
        for name in names:
            value = self.timers[name].elapsed(reset=reset) / normalizer
            writer.add_scalar(name + '_time', value, iteration)

    def log(self, names, normalizer=1.0, reset=True):
        """Log a group of timers."""
        assert normalizer > 0.0
        string = 'time (ms)'
        for name in names:
            elapsed_time = self.timers[name].elapsed(
                reset=reset) * 1000.0 / normalizer
            string += ' | {}: {:.2f}'.format(name, elapsed_time)
        if torch.distributed.is_initialized():
            if torch.distributed.get_rank() == 0:
                print(string, flush=True)
        else:
            print(string, flush=True)
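
A minimal usage sketch of the Timers API above, assuming a CUDA-capable environment (since _Timer.start()/stop() call torch.cuda.synchronize()) and that this module is importable as megatron.global_vars:

from megatron.global_vars import Timers

timers = Timers()
timers('data loading').start()
# ... build or fetch a batch here ...
timers('data loading').stop()
timers('forward').start()
# ... run the forward pass here ...
timers('forward').stop()
# Prints e.g. "time (ms) | data loading: 1.23 | forward: 4.56" on rank 0,
# or unconditionally if torch.distributed is not initialized.
timers.log(['data loading', 'forward'], normalizer=1.0)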
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Megatron initialization."""
import random
import os
import numpy as np
import torch
from megatron import get_adlr_autoresume
from megatron import get_args
from megatron import get_tensorboard_writer
from megatron import mpu
from megatron.global_vars import set_global_variables
def initialize_megatron(extra_args_provider=None, args_defaults={}):
    """Set global variables, initialize distributed, and
    set autoresume and random seeds."""
    # Make sure cuda is available.
    assert torch.cuda.is_available(), 'Megatron requires CUDA.'

    # Parse args, build tokenizer, and set adlr-autoresume,
    # tensorboard-writer, and timers.
    set_global_variables(extra_args_provider=extra_args_provider,
                         args_defaults=args_defaults)

    # Pytorch distributed.
    _initialize_distributed()

    # Autoresume.
    _init_autoresume()

    # Random seeds for reproducibility.
    args = get_args()
    if args.rank == 0:
        print('> setting random seeds to {} ...'.format(args.seed))
    _set_random_seed(args.seed)

    # Write arguments to tensorboard.
    _write_args_to_tensorboard()


def _initialize_distributed():
    """Initialize torch.distributed and mpu."""
    args = get_args()

    if torch.distributed.is_initialized():
        if args.rank == 0:
            print('torch distributed is already initialized, '
                  'skipping initialization ...', flush=True)
        args.rank = torch.distributed.get_rank()
        args.world_size = torch.distributed.get_world_size()
        device = torch.cuda.current_device()
        local_rank = args.rank % torch.cuda.device_count()
        assert local_rank == device, \
            'expected local-rank to be the same as rank % device-count.'
    else:
        if args.rank == 0:
            print('> initializing torch distributed ...', flush=True)
        # Manually set the device ids.
        device = args.rank % torch.cuda.device_count()
        if args.local_rank is not None:
            assert args.local_rank == device, \
                'expected local-rank to be the same as rank % device-count.'
        else:
            args.local_rank = device
        torch.cuda.set_device(device)
        # Call the init process.
        init_method = 'tcp://'
        master_ip = os.getenv('MASTER_ADDR', 'localhost')
        master_port = os.getenv('MASTER_PORT', '6000')
        init_method += master_ip + ':' + master_port
        torch.distributed.init_process_group(
            backend=args.distributed_backend,
            world_size=args.world_size, rank=args.rank,
            init_method=init_method)

    # Set the model-parallel / data-parallel communicators.
    mpu.initialize_model_parallel(args.model_parallel_size)


def _init_autoresume():
    """Set autoresume start time."""
    autoresume = get_adlr_autoresume()
    if autoresume:
        torch.distributed.barrier()
        autoresume.init()
        torch.distributed.barrier()


def _set_random_seed(seed):
    """Set random seed for reproducibility."""
    if seed is not None and seed > 0:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        mpu.model_parallel_cuda_manual_seed(seed)
    else:
        raise ValueError('Seed ({}) should be a positive integer.'.format(seed))


def _write_args_to_tensorboard():
    """Write arguments to tensorboard."""
    args = get_args()
    writer = get_tensorboard_writer()
    if writer:
        for arg in vars(args):
            writer.add_text(arg, str(getattr(args, arg)))
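
A sketch of how a training script drives the initialization path above, assuming the usual Megatron command-line flags are on sys.argv, MASTER_ADDR/MASTER_PORT are exported, the module is importable as megatron.initialize, and megatron/__init__.py re-exports the accessors defined in global_vars (the diffs below already import get_args and print_rank_0 from megatron):

from megatron import get_args, get_timers, get_tokenizer
from megatron.initialize import initialize_megatron   # assumed module path

initialize_megatron(args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})

args = get_args()          # parsed once, then shared through the global store
tokenizer = get_tokenizer()
timers = get_timers()
print(args.rank, args.world_size, tokenizer.vocab_size)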
@@ -12,59 +12,68 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Learning rate decay functions."""

import math

from megatron import print_rank_0


class AnnealingLR(object):
    """Anneals the learning rate."""

    def __init__(self, optimizer, start_lr,
                 warmup_iter, total_iters,
                 decay_style, last_iter, min_lr=0.0,
                 use_checkpoint_lr_scheduler=True,
                 override_lr_scheduler=False):

        # Class values.
        self.optimizer = optimizer
        self.start_lr = start_lr
        self.min_lr = min_lr
        self.warmup_iter = warmup_iter
        self.num_iters = last_iter
        self.end_iter = total_iters
        assert self.end_iter > 0
        self.decay_style = decay_style
        self.override_lr_scheduler = override_lr_scheduler
        self.use_checkpoint_lr_scheduler = use_checkpoint_lr_scheduler
        if self.override_lr_scheduler:
            assert not self.use_checkpoint_lr_scheduler, 'both override and '\
                'use-checkpoint are set.'
        # Set the learning rate
        self.step(self.num_iters)

        print_rank_0('> learning rate decay style: {}'.format(self.decay_style))

    def get_lr(self):
        """Learning rate decay functions from:
              https://openreview.net/pdf?id=BJYwwY9ll pg. 4"""

        num_iters_ = min(self.num_iters, self.end_iter - self.warmup_iter)
        # Warmup.
        if self.warmup_iter > 0 and self.num_iters <= self.warmup_iter:
            return float(self.start_lr) * num_iters_ / self.warmup_iter

        num_iters_ = num_iters_ - self.warmup_iter
        if self.decay_style == 'linear':
            lr = self.start_lr * (self.end_iter - num_iters_) / self.end_iter
        elif self.decay_style == 'cosine':
            lr = self.start_lr / 2.0 * (math.cos(
                math.pi * num_iters_ / self.end_iter) + 1)
        elif self.decay_style == 'exponential':
            # exp(-0.693) = 1/2
            lr = self.start_lr * math.exp(-0.693 * num_iters_ / self.end_iter)
        else:
            lr = self.start_lr
        return max(lr, self.min_lr)

    def step(self, step_num=None):
        """Set lr for all parameters groups."""
        if step_num is None:
            step_num = self.num_iters + 1
        self.num_iters = step_num

@@ -72,42 +81,46 @@
        for group in self.optimizer.param_groups:
            group['lr'] = new_lr

    def state_dict(self):
        state_dict = {
            'start_lr': self.start_lr,
            'warmup_iter': self.warmup_iter,
            'num_iters': self.num_iters,
            'decay_style': self.decay_style,
            'end_iter': self.end_iter,
            'min_lr': self.min_lr
        }
        return state_dict

    def _check_and_set(self, cls_value, sd_value, name):
        """Auxiliary function for checking the values in the checkpoint and
        setting them."""
        if self.override_lr_scheduler:
            print_rank_0(' > overriding {} value to {}'.format(name, cls_value))
            return cls_value

        if not self.use_checkpoint_lr_scheduler:
            assert cls_value == sd_value, 'AnnealingLR: class input value' \
                'and checkpoint values for {} do not match'.format(name)
        print_rank_0(' > using checkpoint value {} for {}'.format(sd_value,
                                                                  name))
        return sd_value

    def load_state_dict(self, sd):
        self.start_lr = self._check_and_set(self.start_lr, sd['start_lr'],
                                            'learning rate')
        self.min_lr = self._check_and_set(self.min_lr, sd['min_lr'],
                                          'minimum learning rate')
        self.warmup_iter = self._check_and_set(self.warmup_iter,
                                               sd['warmup_iter'],
                                               'warmup iterations')
        self.end_iter = self._check_and_set(self.end_iter, sd['end_iter'],
                                            'total number of iterations')
        self.decay_style = self._check_and_set(self.decay_style,
                                               sd['decay_style'],
                                               'decay style')
...
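A small self-contained check of the schedule above against a dummy optimizer; the module path is an assumption based on the repository layout, and the expected values follow directly from the warmup and cosine formulas shown:

import torch
from megatron.learning_rates import AnnealingLR   # assumed module path

opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=1.0)
sched = AnnealingLR(opt, start_lr=1.0, warmup_iter=100, total_iters=1000,
                    decay_style='cosine', last_iter=0)
sched.step(50)     # inside warmup: lr = 1.0 * 50 / 100 = 0.5
sched.step(550)    # past warmup: lr = 0.5 * (cos(pi * 450 / 1000) + 1) ~= 0.58
print(opt.param_groups[0]['lr'])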
@@ -17,6 +17,7 @@

import torch

from megatron import get_args
from megatron.module import MegatronModule

from .language_model import parallel_lm_logits

@@ -106,60 +107,33 @@ class BertLMHead(MegatronModule):

class BertModel(MegatronModule):
    """Bert Language model."""

    def __init__(self, num_tokentypes=2, add_binary_head=True,
                 parallel_output=True):
        super(BertModel, self).__init__()
        args = get_args()

        self.add_binary_head = add_binary_head
        self.parallel_output = parallel_output
        init_method = init_method_normal(args.init_method_std)
        scaled_init_method = scaled_init_method_normal(args.init_method_std,
                                                       args.num_layers)

        self.language_model, self._language_model_key = get_language_model(
            attention_mask_func=bert_attention_mask_func,
            num_tokentypes=num_tokentypes,
            add_pooler=self.add_binary_head,
            init_method=init_method,
            scaled_init_method=scaled_init_method)

        self.lm_head = BertLMHead(
            self.language_model.embedding.word_embeddings.weight.size(0),
            args.hidden_size, init_method, args.layernorm_epsilon,
            parallel_output)
        self._lm_head_key = 'lm_head'

        if self.add_binary_head:
            self.binary_head = get_linear_layer(args.hidden_size, 2,
                                                init_method)
            self._binary_head_key = 'binary_head'
...
@@ -17,6 +17,7 @@

import torch

from megatron import get_args
from megatron.model.bert_model import bert_attention_mask_func
from megatron.model.bert_model import bert_extended_attention_mask
from megatron.model.bert_model import bert_position_ids

@@ -25,59 +26,29 @@ from megatron.model.utils import get_linear_layer
from megatron.model.utils import init_method_normal
from megatron.model.utils import scaled_init_method_normal
from megatron.module import MegatronModule
from megatron import print_rank_0


class Classification(MegatronModule):

    def __init__(self, num_classes, num_tokentypes=2):
        super(Classification, self).__init__()
        args = get_args()

        self.num_classes = num_classes
        init_method = init_method_normal(args.init_method_std)

        self.language_model, self._language_model_key = get_language_model(
            attention_mask_func=bert_attention_mask_func,
            num_tokentypes=num_tokentypes,
            add_pooler=True,
            init_method=init_method,
            scaled_init_method=scaled_init_method_normal(args.init_method_std,
                                                         args.num_layers))

        # Multi-choice head.
        self.classification_dropout = torch.nn.Dropout(args.hidden_dropout)
        self.classification_head = get_linear_layer(args.hidden_size,
                                                    self.num_classes,
                                                    init_method)
        self._classification_head_key = 'classification_head'
...
@@ -17,6 +17,7 @@

import torch

from megatron import get_args
from megatron.module import MegatronModule

from .language_model import parallel_lm_logits

@@ -34,49 +35,19 @@ def gpt2_attention_mask_func(attention_scores, ltor_mask):

class GPT2Model(MegatronModule):
    """GPT-2 Language model."""

    def __init__(self, num_tokentypes=0, parallel_output=True):
        super(GPT2Model, self).__init__()
        args = get_args()

        self.parallel_output = parallel_output

        self.language_model, self._language_model_key = get_language_model(
            attention_mask_func=gpt2_attention_mask_func,
            num_tokentypes=num_tokentypes,
            add_pooler=False,
            init_method=init_method_normal(args.init_method_std),
            scaled_init_method=scaled_init_method_normal(args.init_method_std,
                                                         args.num_layers))

    def forward(self, input_ids, position_ids, attention_mask,
...
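The point of the signature changes above is that model constructors now pull their hyperparameters from the global args store instead of long argument lists. A hedged construction sketch, assuming initialize_megatron() has already run (so get_args() and mpu are set up) and that the module paths match the megatron.model imports shown in this merge:

from megatron import get_args
from megatron.model.bert_model import BertModel     # paths assumed
from megatron.model.gpt2_model import GPT2Model

args = get_args()   # hidden_size, num_layers, hidden_dropout, ... live here
gpt2 = GPT2Model(num_tokentypes=0, parallel_output=True)
bert = BertModel(num_tokentypes=2, add_binary_head=True)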
@@ -18,13 +18,13 @@

import torch
import torch.nn.functional as F

from megatron import get_args
from megatron import mpu
from megatron.module import MegatronModule
from megatron.model.transformer import ParallelTransformer
from megatron.model.utils import gelu
from megatron.model.utils import get_linear_layer


def parallel_lm_logits(input_, word_embeddings_weight, parallel_output,

@@ -40,52 +40,20 @@ def parallel_lm_logits(input_, word_embeddings_weight, parallel_output,
    # Gather if needed.
    if parallel_output:
        return logits_parallel

    return mpu.gather_from_model_parallel_region(logits_parallel)


def get_language_model(attention_mask_func, num_tokentypes, add_pooler,
                       init_method, scaled_init_method):
    """Build language model and return along with the key to save."""

    # Language model.
    language_model = TransformerLanguageModel(
        attention_mask_func=attention_mask_func,
        mlp_activation_func=gelu,
        init_method=init_method,
        output_layer_init_method=scaled_init_method,
        num_tokentypes=num_tokentypes,
        add_pooler=add_pooler)

    # key used for checkpoints.

@@ -293,33 +261,33 @@ class TransformerLanguageModel(MegatronModule):
       will ignore this embedding
    """

    def __init__(self,
                 attention_mask_func,
                 mlp_activation_func,
                 init_method,
                 output_layer_init_method,
                 num_tokentypes=0,
                 add_pooler=False):
        super(TransformerLanguageModel, self).__init__()
        args = get_args()

        self.hidden_size = args.hidden_size
        self.num_tokentypes = num_tokentypes
        self.init_method = init_method
        self.add_pooler = add_pooler

        # Embeddings
        self.embedding = Embedding(self.hidden_size,
                                   args.padded_vocab_size,
                                   args.max_position_embeddings,
                                   args.hidden_dropout,
                                   self.init_method,
                                   self.num_tokentypes)
        self._embedding_key = 'embedding'

        # Transformer
        self.transformer = ParallelTransformer(
            attention_mask_func, mlp_activation_func,
            self.init_method, output_layer_init_method)
        self._transformer_key = 'transformer'

        # Pooler
...
@@ -17,6 +17,7 @@

import torch

from megatron import get_args
from megatron.model.bert_model import bert_attention_mask_func
from megatron.model.bert_model import bert_extended_attention_mask
from megatron.model.bert_model import bert_position_ids

@@ -25,57 +26,29 @@ from megatron.model.utils import get_linear_layer
from megatron.model.utils import init_method_normal
from megatron.model.utils import scaled_init_method_normal
from megatron.module import MegatronModule
from megatron import print_rank_0


class MultipleChoice(MegatronModule):

    def __init__(self, num_tokentypes=2):
        super(MultipleChoice, self).__init__()
        args = get_args()

        init_method = init_method_normal(args.init_method_std)

        self.language_model, self._language_model_key = get_language_model(
            attention_mask_func=bert_attention_mask_func,
            num_tokentypes=num_tokentypes,
            add_pooler=True,
            init_method=init_method,
            scaled_init_method=scaled_init_method_normal(args.init_method_std,
                                                         args.num_layers))

        # Multi-choice head.
        self.multichoice_dropout = torch.nn.Dropout(args.hidden_dropout)
        self.multichoice_head = get_linear_layer(args.hidden_size, 1,
                                                 init_method)
        self._multichoice_head_key = 'multichoice_head'
...
@@ -20,6 +20,7 @@ import math

import torch
from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm

from megatron import get_args
from megatron import mpu
from megatron.module import MegatronModule

@@ -45,85 +46,6 @@ from megatron.module import MegatronModule
       unmaksed-attention-scores, attention-mask)
    """


class ParallelMLP(MegatronModule):
    """MLP.

@@ -133,26 +55,28 @@ class ParallelMLP(MegatronModule):
    applied.
    """

    def __init__(self, mlp_activation_func, init_method,
                 output_layer_init_method):
        super(ParallelMLP, self).__init__()
        args = get_args()

        # Project to 4h.
        self.dense_h_to_4h = mpu.ColumnParallelLinear(
            args.hidden_size,
            4*args.hidden_size,
            gather_output=False,
            init_method=init_method)

        self.activation_func = mlp_activation_func

        # Project back to h.
        self.dense_4h_to_h = mpu.RowParallelLinear(
            4*args.hidden_size,
            args.hidden_size,
            input_is_parallel=True,
            init_method=output_layer_init_method)

        self.dropout = torch.nn.Dropout(args.hidden_dropout)

    def forward(self, hidden_states):

@@ -174,51 +98,47 @@ class ParallelSelfAttention(MegatronModule):
    Self-attention layer takes input with size [b, s, h]
    and returns output of the same size.
    """

    def __init__(self, attention_mask_func, init_method,
                 output_layer_init_method, layer_number):
        super(ParallelSelfAttention, self).__init__()
        args = get_args()

        self.attention_mask_func = attention_mask_func
        self.apply_query_key_layer_scaling = args.apply_query_key_layer_scaling
        self.attention_softmax_in_fp32 = args.attention_softmax_in_fp32
        if self.apply_query_key_layer_scaling:
            self.attention_softmax_in_fp32 = True
        self.layer_number = max(1, layer_number)

        # Per attention head and per partition values.
        world_size = mpu.get_model_parallel_world_size()
        self.hidden_size_per_partition = mpu.divide(args.hidden_size,
                                                    world_size)
        self.hidden_size_per_attention_head = mpu.divide(
            args.hidden_size, args.num_attention_heads)
        self.num_attention_heads_per_partition = mpu.divide(
            args.num_attention_heads, world_size)

        # Strided linear layer.
        self.query_key_value = mpu.ColumnParallelLinear(
            args.hidden_size,
            3*args.hidden_size,
            stride=3,
            gather_output=False,
            init_method=init_method)

        # Dropout. Note that for a single iteration, this layer will generate
        # different outputs on different number of parallel partitions but
        # on average it should not be partition dependent.
        self.attention_dropout = torch.nn.Dropout(args.attention_dropout)

        # Output.
        self.dense = mpu.RowParallelLinear(
            args.hidden_size,
            args.hidden_size,
            input_is_parallel=True,
            init_method=output_layer_init_method)
        self.output_dropout = torch.nn.Dropout(args.hidden_dropout)

    def _transpose_for_scores(self, tensor):

@@ -369,30 +289,34 @@ class ParallelTransformerLayer(MegatronModule):
    Transformore layer takes input with size [b, s, h] and returns an
    output of the same size.
    """

    def __init__(self, attention_mask_func, mlp_activation_func,
                 init_method, output_layer_init_method, layer_number):
        args = get_args()

        super(ParallelTransformerLayer, self).__init__()
        self.layer_number = layer_number

        self.apply_residual_connection_post_layernorm \
            = args.apply_residual_connection_post_layernorm

        # Layernorm on the input data.
        self.input_layernorm = LayerNorm(
            args.hidden_size,
            eps=args.layernorm_epsilon)

        # Self attention.
        self.attention = ParallelSelfAttention(attention_mask_func, init_method,
                                               output_layer_init_method,
                                               layer_number)

        # Layernorm on the input data.
        self.post_attention_layernorm = LayerNorm(
            args.hidden_size,
            eps=args.layernorm_epsilon)

        # MLP
        self.mlp = ParallelMLP(mlp_activation_func, init_method,
                               output_layer_init_method)

    def forward(self, hidden_states, attention_mask, layer_past=None,

@@ -434,25 +358,28 @@ class ParallelTransformerLayer(MegatronModule):

class ParallelTransformer(MegatronModule):
    """Transformer class."""

    def __init__(self, attention_mask_func, mlp_activation_func,
                 init_method, output_layer_init_method):
        super(ParallelTransformer, self).__init__()
        args = get_args()

        # Store activation checkpoiting flag.
        self.checkpoint_activations = args.checkpoint_activations
        self.checkpoint_num_layers = args.checkpoint_num_layers

        def get_layer(layer_number):
            return ParallelTransformerLayer(
                attention_mask_func, mlp_activation_func,
                init_method, output_layer_init_method, layer_number)

        # Transformer layers.
        self.layers = torch.nn.ModuleList(
            [get_layer(i+1) for i in range(args.num_layers)])

        # Final layer norm before output.
        self.final_layernorm = LayerNorm(
            args.hidden_size,
            eps=args.layernorm_epsilon)

    def _checkpointed_forward(self, hidden_states, attention_mask):
...
@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

"""Megatron Module"""

import torch
...
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .tokenizer import build_tokenizer
# coding=utf-8
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT."""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import sys
import json
import logging
import os
import regex as re
from io import open
try:
    from functools import lru_cache
except ImportError:
    # Just a dummy decorator to get the checks to run on python2
    # because honestly I don't want to support a byte-level unicode BPE
    # tokenizer on python 2 right now.
    def lru_cache():
        return lambda func: func
logger = logging.getLogger(__name__)
PRETRAINED_VOCAB_ARCHIVE_MAP = {
'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
}
PRETRAINED_MERGES_ARCHIVE_MAP = {
'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
}
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
'gpt2': 1024,
}
VOCAB_NAME = 'vocab.json'
MERGES_NAME = 'merges.txt'
SPECIAL_TOKENS_NAME = 'special_tokens.txt'
@lru_cache()
def bytes_to_unicode():
    """
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you
    want to avoid UNKs. When you're at something like a 10B token dataset you
    end up needing around 5K for decent coverage. This is a significant
    percentage of your normal, say, 32K bpe vocab. To avoid that, we want
    lookup tables between utf-8 bytes and unicode strings that avoid mapping
    to whitespace/control characters the bpe code barfs on.
    """
    _chr = unichr if sys.version_info[0] == 2 else chr
    bs = list(range(ord("!"), ord("~")+1)) + \
        list(range(ord("¡"), ord("¬")+1)) + \
        list(range(ord("®"), ord("ÿ")+1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8+n)
            n += 1
    cs = [_chr(n) for n in cs]
    return dict(zip(bs, cs))
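
# A quick concrete check of the mapping built above (illustrative, not part
# of the tokenizer): printable ASCII maps to itself, while bytes the BPE code
# would choke on (e.g. the space byte 0x20) are shifted to unused code points
# starting at 256.
#   b2u = bytes_to_unicode()
#   b2u[ord('A')] == 'A'
#   b2u[ord(' ')] == '\u0120'   # 'Ġ', which is why GPT-2 tokens often start with it
#   len(b2u) == 256 and len(set(b2u.values())) == 256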
def get_pairs(word):
    """Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length
    strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs
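
# Example of the pairs fed to the BPE merge loop below:
#   get_pairs(('l', 'o', 'w')) -> {('l', 'o'), ('o', 'w')}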
class GPT2Tokenizer(object):
    """
    GPT-2 BPE tokenizer. Peculiarities:
        - Byte-level BPE
    """

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None,
                        *inputs, **kwargs):
        """
        Instantiate a PreTrainedBertModel from a pre-trained model file.
        Download and cache the pre-trained model file if needed.
        """
        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
            special_tokens_file = None
        else:
            vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
            merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
            special_tokens_file = os.path.join(pretrained_model_name_or_path,
                                               SPECIAL_TOKENS_NAME)
            if not os.path.exists(special_tokens_file):
                special_tokens_file = None
            else:
                logger.info("loading special tokens file {}".format(
                    special_tokens_file))
        # redirect to the cache, if necessary
        try:
            from .file_utils import cached_path
            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
            resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
        except EnvironmentError:
            logger.error(
                "Model name '{}' was not found in model name list ({}). "
                "We assumed '{}' was a path or url but couldn't find files {} "
                "and {} at this path or url.".format(
                    pretrained_model_name_or_path,
                    ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                    pretrained_model_name_or_path,
                    vocab_file, merges_file))
            return None
        if resolved_vocab_file == vocab_file and \
                resolved_merges_file == merges_file:
            logger.info("loading vocabulary file {}".format(vocab_file))
            logger.info("loading merges file {}".format(merges_file))
        else:
            logger.info("loading vocabulary file {} from cache at {}".format(
                vocab_file, resolved_vocab_file))
            logger.info("loading merges file {} from cache at {}".format(
                merges_file, resolved_merges_file))
        if pretrained_model_name_or_path in \
                PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
            # if we're using a pretrained model, ensure the tokenizer won't
            # index sequences longer than the number of positional embeddings
            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[
                pretrained_model_name_or_path]
            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
        # Instantiate tokenizer.
        if special_tokens_file and 'special_tokens' not in kwargs:
            special_tokens = open(special_tokens_file,
                                  encoding='utf-8').read().split('\n')[:-1]
        else:
            special_tokens = kwargs.pop('special_tokens', [])
        tokenizer = cls(resolved_vocab_file, resolved_merges_file,
                        special_tokens=special_tokens, *inputs, **kwargs)
        return tokenizer

    def __init__(self, vocab_file, merges_file, errors='replace',
                 special_tokens=None, max_len=None):
        self.max_len = max_len if max_len is not None else int(1e12)
        self.encoder = json.load(open(vocab_file))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}

        # Should have added re.IGNORECASE so BPE merges can happen for
        # capitalized versions of contractions.
        self.pat = re.compile(
            r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

        self.special_tokens = {}
        self.special_tokens_decoder = {}
        self.set_special_tokens(special_tokens)

    def __len__(self):
        return len(self.encoder) + len(self.special_tokens)
    def set_special_tokens(self, special_tokens):
        """Add a list of additional tokens to the encoder.
        The additional tokens are indexed starting from the last index of the
        current vocabulary in the order of the `special_tokens` list.
        """
        if not special_tokens:
            self.special_tokens = {}
            self.special_tokens_decoder = {}
            return
        self.special_tokens = dict((tok, len(self.encoder) + i)
                                   for i, tok in enumerate(special_tokens))
        self.special_tokens_decoder = {v: k
                                       for k, v in self.special_tokens.items()}
        logger.info("Special tokens {}".format(self.special_tokens))

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            bigram = min(pairs,
                         key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word)-1 and word[i+1] == second:
                    new_word.append(first+second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word
def tokenize(self, text):
""" Tokenize a string. """
bpe_tokens = []
for token in re.findall(self.pat, text):
if sys.version_info[0] == 2:
token = ''.join(self.byte_encoder[ord(b)] for b in token)
else:
token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
return bpe_tokens
def convert_tokens_to_ids(self, tokens):
""" Converts a sequence of tokens into ids using the vocab. """
ids = []
if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
if tokens in self.special_tokens:
return self.special_tokens[tokens]
else:
return self.encoder.get(tokens, 0)
for token in tokens:
if token in self.special_tokens:
ids.append(self.special_tokens[token])
else:
ids.append(self.encoder.get(token, 0))
if len(ids) > self.max_len:
logger.warning(
"Token indices sequence length is longer than the specified maximum "
" sequence length for this OpenAI GPT model ({} > {}). Running this"
" sequence through the model will result in indexing errors".format(len(ids), self.max_len)
)
return ids
def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
"""Converts a sequence of ids in BPE tokens using the vocab."""
tokens = []
for i in ids:
if i in self.special_tokens_decoder:
if not skip_special_tokens:
tokens.append(self.special_tokens_decoder[i])
else:
tokens.append(self.decoder[i])
return tokens
def encode(self, text):
return self.convert_tokens_to_ids(self.tokenize(text))
def decode(self, tokens):
text = ''.join([self.decoder[token] for token in tokens])
text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
return text
def save_vocabulary(self, vocab_path):
"""Save the tokenizer vocabulary and merge files to a directory."""
if not os.path.isdir(vocab_path):
logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
return
vocab_file = os.path.join(vocab_path, VOCAB_NAME)
merge_file = os.path.join(vocab_path, MERGES_NAME)
special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)
with open(vocab_file, 'w', encoding='utf-8') as f:
f.write(json.dumps(self.encoder, ensure_ascii=False))
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write(u'#version: 0.2\n')
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!".format(merge_file))
index = token_index
writer.write(' '.join(bpe_tokens) + u'\n')
index += 1
index = len(self.encoder)
with open(special_tokens_file, 'w', encoding='utf-8') as writer:
for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive."
" Please check that the tokenizer is not corrupted!".format(special_tokens_file))
index = token_index
writer.write(token + u'\n')
index += 1
return vocab_file, merge_file, special_tokens_file
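For reference, a minimal usage sketch of this tokenizer class, exercising the encode/decode round trip. It assumes the module lives at megatron/tokenizer/gpt2_tokenization.py (as imported by the tokenizer module below), and the vocab/merge file names are placeholders for the standard GPT-2 files, which are not part of this change:

# Hypothetical usage sketch; 'gpt2-vocab.json' and 'gpt2-merges.txt' are
# assumed paths to the standard GPT-2 vocabulary and merge files.
from megatron.tokenizer.gpt2_tokenization import GPT2Tokenizer

tokenizer = GPT2Tokenizer('gpt2-vocab.json', 'gpt2-merges.txt',
                          errors='replace', special_tokens=[], max_len=None)
ids = tokenizer.encode('Hello world')   # text -> byte-level BPE ids
text = tokenizer.decode(ids)            # ids -> text, lossless for this input
assert text == 'Hello world'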
# coding=utf-8
"""Megatron tokenizer.""" # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Megatron tokenizers."""
from abc import ABC
from abc import abstractmethod
from .bert_tokenization import FullTokenizer as FullBertTokenizer
from .gpt2_tokenization import GPT2Tokenizer
def build_tokenizer(args):
"""Initialize tokenizer."""
if args.rank == 0:
print('> building {} tokenizer ...'.format(args.tokenizer_type),
flush=True)
# Select and instantiate the tokenizer.
assert args.vocab_file is not None
if args.tokenizer_type == 'BertWordPieceLowerCase':
tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file,
lower_case=True)
elif args.tokenizer_type == 'GPT2BPETokenizer':
assert args.merge_file is not None
tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
else:
raise NotImplementedError('{} tokenizer is not '
'implemented.'.format(args.tokenizer_type))
# Add vocab size.
args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size,
args)
return tokenizer
def _vocab_size_with_padding(orig_vocab_size, args):
"""Pad vocab size so it is divisible by model parallel size and
still having GPU friendly size."""
after = orig_vocab_size
multiple = args.make_vocab_size_divisible_by * \
args.model_parallel_size
while (after % multiple) != 0:
after += 1
if args.rank == 0:
print(' > padded vocab (size: {}) with {} dummy tokens '
'(new size: {})'.format(
orig_vocab_size, after - orig_vocab_size, after), flush=True)
return after
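As a worked illustration of the rounding performed above (the concrete sizes are illustrative, not values taken from this merge request): with the GPT-2 vocabulary of 50257 tokens, make_vocab_size_divisible_by=128 and a model parallel size of 8, the multiple is 1024 and the padded size becomes 51200. A standalone sketch of the same arithmetic:

# Standalone sketch of the padding rule; the sample sizes below are
# hypothetical and only meant to show the arithmetic.
def pad_vocab_size(orig_vocab_size, divisible_by, model_parallel_size):
    multiple = divisible_by * model_parallel_size
    after = orig_vocab_size
    while after % multiple != 0:
        after += 1
    return after

assert pad_vocab_size(50257, 128, 8) == 51200   # rounded up to 50 * 1024
assert pad_vocab_size(50257, 128, 1) == 50304   # rounded up to 393 * 128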
class AbstractTokenizer(ABC):
...@@ -39,6 +75,18 @@ class AbstractTokenizer(ABC):
def vocab_size(self):
pass
@property
@abstractmethod
def vocab(self):
"""Dictionary from vocab text token to id token."""
pass
@property
@abstractmethod
def inv_vocab(self):
"""Dictionary from vocab id token to text token."""
pass
@abstractmethod
def tokenize(self, text):
pass
...@@ -63,6 +111,10 @@ class AbstractTokenizer(ABC):
raise NotImplementedError('EOD is not provided for {} '
'tokenizer'.format(self.name))
@property
def mask(self):
raise NotImplementedError('MASK is not provided for {} '
'tokenizer'.format(self.name))
class _BertWordPieceTokenizer(AbstractTokenizer):
...@@ -78,11 +130,20 @@ class _BertWordPieceTokenizer(AbstractTokenizer):
self.cls_id = self.tokenizer.vocab['[CLS]']
self.sep_id = self.tokenizer.vocab['[SEP]']
self.pad_id = self.tokenizer.vocab['[PAD]']
self.mask_id = self.tokenizer.vocab['[MASK]']
@property
def vocab_size(self):
return self.tokenizer.vocab_size()
@property
def vocab(self):
return self.tokenizer.vocab
@property
def inv_vocab(self):
return self.tokenizer.inv_vocab
def tokenize(self, text):
text_tokens = self.tokenizer.tokenize(text)
return self.tokenizer.convert_tokens_to_ids(text_tokens)
...@@ -98,3 +159,37 @@ class _BertWordPieceTokenizer(AbstractTokenizer):
@property
def pad(self):
return self.pad_id
@property
def mask(self):
return self.mask_id
class _GPT2BPETokenizer(AbstractTokenizer):
"""Original GPT2 BPE tokenizer."""
def __init__(self, vocab_file, merge_file):
name = 'GPT2 BPE'
super().__init__(name)
self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace',
special_tokens=[], max_len=None)
self.eod_id = self.tokenizer.encoder['<|endoftext|>']
@property
def vocab_size(self):
return len(self.tokenizer.encoder)
@property
def vocab(self):
return self.tokenizer.encoder
@property
def inv_vocab(self):
return self.tokenizer.decoder
def tokenize(self, text):
return self.tokenizer.encode(text)
@property
def eod(self):
return self.eod_id
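To make the new entry point concrete, here is a minimal, hypothetical sketch of driving build_tokenizer outside the normal argument-parsing path. It assumes build_tokenizer is exported from the megatron.tokenizer package; the Namespace fields mirror the attributes read above, and the file paths and model-parallel settings are placeholders:

# Hypothetical usage sketch; paths and sizes are assumptions,
# not values taken from this merge request.
from argparse import Namespace

from megatron.tokenizer import build_tokenizer

args = Namespace(rank=0,
                 tokenizer_type='GPT2BPETokenizer',
                 vocab_file='gpt2-vocab.json',
                 merge_file='gpt2-merges.txt',
                 make_vocab_size_divisible_by=128,
                 model_parallel_size=1)
tokenizer = build_tokenizer(args)
ids = tokenizer.tokenize('Megatron-LM')          # list of BPE ids
print(tokenizer.vocab_size, args.padded_vocab_size)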
...@@ -18,55 +18,42 @@
import torch
import torch.nn.functional as F
from megatron import get_args
from megatron import get_timers
from megatron import mpu
from megatron import print_rank_0
from megatron.data.bert_dataset import build_train_valid_test_datasets
from megatron.model import BertModel
from megatron.training import pretrain
from megatron.utils import make_data_loader
from megatron.utils import reduce_losses
def model_provider():
"""Build the model."""
args = get_args()
print_rank_0('building BERT model ...')
model = BertModel(
num_tokentypes=2,
add_binary_head=True,
parallel_output=True)
return model
def get_batch(data_iterator):
# Items and their type.
keys = ['text', 'types', 'labels', 'is_random', 'loss_mask', 'padding_mask']
datatype = torch.int64
# Broadcast data.
if data_iterator is not None:
data = next(data_iterator)
else:
data = None
data_b = mpu.broadcast_data(keys, data, datatype)
# Unpack.
...@@ -80,13 +67,14 @@ def get_batch(data_iterator, timers):
return tokens, types, sentence_order, loss_mask, lm_labels, padding_mask
def forward_step(data_iterator, model):
"""Forward step."""
timers = get_timers()
# Get the batch.
timers('batch generator').start()
tokens, types, sentence_order, loss_mask, lm_labels, padding_mask \
= get_batch(data_iterator)
timers('batch generator').stop()
# Forward model.
...@@ -108,8 +96,9 @@ def forward_step(data_iterator, model, args, timers):
return loss, {'lm loss': reduced_losses[0], 'sop loss': reduced_losses[1]}
def get_train_val_test_data():
"""Load the data on rank zero and broadcast number of tokens to all GPUs."""
args = get_args()
(train_data, valid_data, test_data) = (None, None, None)
...@@ -118,17 +107,6 @@ def get_train_val_test_data(args):
print_rank_0('> building train, validation, and test datasets '
'for BERT ...')
data_parallel_size = mpu.get_data_parallel_world_size()
data_parallel_rank = mpu.get_data_parallel_rank()
global_batch_size = args.batch_size * data_parallel_size
...@@ -137,7 +115,7 @@ def get_train_val_test_data(args):
train_iters = args.train_iters
eval_iters = (train_iters // args.eval_interval + 1) * args.eval_iters
test_iters = args.eval_iters
train_val_test_num_samples = [train_iters * global_batch_size,
eval_iters * global_batch_size,
test_iters * global_batch_size]
print_rank_0(' > datasets target sizes (minimum size):')
...@@ -145,10 +123,8 @@ def get_train_val_test_data(args):
print_rank_0(' validation: {}'.format(train_val_test_num_samples[1]))
print_rank_0(' test: {}'.format(train_val_test_num_samples[2]))
train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
data_prefix=args.data_path,
data_impl=args.data_impl,
splits_string=args.split,
train_valid_test_num_samples=train_val_test_num_samples,
...@@ -156,57 +132,34 @@
masked_lm_prob=args.mask_prob,
short_seq_prob=args.short_seq_prob,
seed=args.seed,
skip_warmup=(not args.mmap_warmup))
print_rank_0("> finished creating BERT datasets ...")
train_data = make_data_loader(train_ds)
valid_data = make_data_loader(valid_ds)
test_data = make_data_loader(test_ds)
do_train = train_data is not None and args.train_iters > 0
do_valid = valid_data is not None and args.eval_iters > 0
do_test = test_data is not None and args.eval_iters > 0
# Need to broadcast num_tokens and num_type_tokens.
flags = torch.cuda.LongTensor(
[int(do_train), int(do_valid), int(do_test)])
else:
flags = torch.cuda.LongTensor([0, 0, 0])
# Broadcast num tokens.
torch.distributed.broadcast(flags,
mpu.get_model_parallel_src_rank(),
group=mpu.get_model_parallel_group())
args.do_train = flags[0].item()
args.do_valid = flags[1].item()
args.do_test = flags[2].item()
return train_data, valid_data, test_data
if __name__ == "__main__":
pretrain(get_train_val_test_data, model_provider, forward_step,
args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'})
...@@ -15,55 +15,46 @@
"""Pretrain GPT2"""
import os
import torch
from megatron import get_args
from megatron import get_timers
from megatron import get_tokenizer
from megatron import mpu
from megatron import print_rank_0
from megatron.data.gpt2_dataset import GPT2Dataset
from megatron.model import GPT2Model
from megatron.training import pretrain
from megatron.utils import get_ltor_masks_and_position_ids
from megatron.utils import make_data_loader
from megatron.utils import reduce_losses
def model_provider():
"""Build the model."""
args = get_args()
print_rank_0('building GPT2 model ...')
model = GPT2Model(num_tokentypes=0, parallel_output=True)
return model
def get_batch(data_iterator):
"""Generate a batch"""
args = get_args()
# Items and their type.
keys = ['text']
datatype = torch.int64
# Broadcast data.
if data_iterator is not None:
data = next(data_iterator)
else:
data = None
data_b = mpu.broadcast_data(keys, data, datatype)
# Unpack.
...@@ -85,13 +76,14 @@ def get_batch(data_iterator, args, timers):
return tokens, labels, loss_mask, attention_mask, position_ids
def forward_step(data_iterator, model):
"""Forward step."""
timers = get_timers()
# Get the batch.
timers('batch generator').start()
tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
data_iterator)
timers('batch generator').stop()
# Forward model.
...@@ -107,60 +99,74 @@ def forward_step(data_iterator, model, args, timers):
return loss, {'lm loss': reduced_loss[0]}
def make_gpt2_dataloaders():
"""Build gpt2 dataloaders."""
args = get_args()
# Input parameters.
input_data_sizes_file = args.input_data_sizes_file
seq_length = args.seq_length
initial_seed = args.seed
# Build the datasets.
def _build_dataset(name):
return GPT2Dataset(os.path.join(args.data_path, name),
args.input_data_sizes_file,
args.seq_length, args.seed)
train_ds = _build_dataset('train')
valid_ds = _build_dataset('valid')
test_ds = _build_dataset('test')
# Dataloaders
train = make_data_loader(train_ds)
valid = make_data_loader(valid_ds)
test = make_data_loader(test_ds)
args.do_train = False
args.do_valid = False
args.do_test = False
if train is not None:
args.do_train = True
if valid is not None:
args.do_valid = True
if test is not None:
args.do_test = True
return (train, valid, test)
def get_train_val_test_data():
"""Load the data on rank zero and boradcast number of tokens to all GPUS.""" """Load the data on rank zero and boradcast number of tokens to all GPUS."""
args = get_args()
(train_data, val_data, test_data) = (None, None, None) (train_data, val_data, test_data) = (None, None, None)
# Data loader only on rank 0 of each model parallel group. # Data loader only on rank 0 of each model parallel group.
if mpu.get_model_parallel_rank() == 0: if mpu.get_model_parallel_rank() == 0:
if args.data_loader == 'numpy':
assert len(args.train_data) == 1 (train_data, val_data, test_data) = make_gpt2_dataloaders()
args.train_data = args.train_data[0] flags = torch.cuda.LongTensor([int(args.do_train),
assert len(args.valid_data) == 1 int(args.do_valid),
args.valid_data = args.valid_data[0] int(args.do_test)])
assert len(args.test_data) == 1
args.test_data = args.test_data[0]
(train_data, val_data, test_data), num_tokens, \
eod_token = make_gpt2_dataloaders(args)
elif args.data_loader == 'raw' or args.data_loader == 'lazy':
data_config = configure_data()
data_config.set_defaults(data_set_type='GPT2', transpose=False)
(train_data, val_data, test_data), tokenizer = data_config.apply(
args)
num_tokens = tokenizer.num_tokens
eod_token = tokenizer.get_command('eos').Id
assert eod_token == tokenizer.get_command('pad').Id
else:
print("Unsupported data loader for GPT2.")
exit(1)
# pad.
num_tokens = vocab_size_with_padding(num_tokens, args)
print_rank_0('> found end-of-document token: {}'.format(eod_token))
token_counts = torch.cuda.LongTensor([num_tokens, eod_token,
int(args.do_train),
int(args.do_valid),
int(args.do_test)])
else:
flags = torch.cuda.LongTensor([0, 0, 0])
# Broadcast num tokens.
torch.distributed.broadcast(flags,
mpu.get_model_parallel_src_rank(),
group=mpu.get_model_parallel_group())
args.do_train = flags[0].item()
args.do_valid = flags[1].item()
args.do_test = flags[2].item()
tokenizer = get_tokenizer()
args.eod_token = tokenizer.eod_id
return train_data, val_data, test_data
if __name__ == "__main__":
pretrain(get_train_val_test_data, model_provider, forward_step,
args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})
import os
import os.path as osp
import pathlib
import subprocess
def recursively_lint_files():
"""Recursively lint all python files in chosen subdirectories of megatron-lm"""
try:
import autopep8
except ModuleNotFoundError:
print("Please first install autopep8 via `pip install autopep8`")
return
# get all python file paths from top level directory
file_dir = str(pathlib.Path(__file__).parent.absolute())
working_dir = osp.join(file_dir, os.pardir)
all_py_paths = set(os.path.join(working_dir, fname)
for fname in os.listdir(working_dir) if fname.endswith('.py'))
# get all python file paths from chosen subdirectories
check_dirs = ['docker', 'megatron', 'openwebtext', 'scripts', 'tasks']
for sub_dir in check_dirs:
for path, _, fnames in os.walk(osp.join(working_dir, sub_dir)):
all_py_paths.update(set(osp.join(path, fname) for fname in fnames if fname.endswith('.py')))
print("Linting the following: ")
for py_path in all_py_paths:
print(py_path)
command = 'autopep8 --max-line-length 100 --aggressive --in-place {}'.format(py_path)
# Pass an argument list since check_call is not invoked with shell=True.
subprocess.check_call(command.split())
if __name__ == "__main__":
recursively_lint_files()
...@@ -20,26 +20,28 @@ import time
import torch
from megatron import get_args
from megatron import mpu
from megatron import print_rank_0
from tasks.finetune_utils import build_data_loader
from tasks.finetune_utils import process_batch
def accuracy_func_provider(single_dataset_provider):
"""Provide function that calculates accuracies."""
args = get_args()
# Build dataloaders.
datapaths = args.valid_data
dataloaders = []
for datapath in datapaths:
dataset = single_dataset_provider(datapath)
dataloader = build_data_loader(
dataset, args.batch_size, num_workers=args.num_workers,
drop_last=(mpu.get_data_parallel_world_size() > 1))
dataloaders.append((dataset.dataset_name, dataloader))
def metrics_func(model, epoch, output_predictions=False):
print_rank_0('calculating metrics ...')
correct = 0
total = 0
...@@ -48,7 +50,7 @@ def accuracy_func_provider(args, single_dataset_provider):
named_predictions = []
names = 'predictions'
for name, dataloader in dataloaders:
output = calculate_correct_answers(name, model, dataloader,
epoch, output_predictions)
if not output_predictions:
correct_ans, total_count = output
...@@ -70,7 +72,7 @@ def accuracy_func_provider(args, single_dataset_provider):
return metrics_func
def calculate_correct_answers(name, model, dataloader,
epoch, output_predictions):
"""Calculate correct over total answers and return prediction if the
`output_predictions` is true."""
...@@ -89,7 +91,7 @@ def calculate_correct_answers(name, model, dataloader, args,
ids = []
for _, batch in enumerate(dataloader):
# Run the model forward.
tokens, types, labels_, attention_mask = process_batch(batch)
logits = model(tokens, attention_mask, types)
# Add output predictions.
if output_predictions:
...