Commit b84d7a90 authored by mohammad

moved module to model and removed fp16

parent 97ba5c0e
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .fp16util import (
BN_convert_float,
network_to_half,
prep_param_lists,
model_grads_to_master_grads,
master_params_to_model_params,
tofp16,
to_python_float,
clip_grad_norm,
convert_module,
convert_network,
FP16Model,
)
from .fp16 import *
from .loss_scaler import *
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from apex.multi_tensor_apply import multi_tensor_applier
import amp_C
from megatron import mpu
class tofp16(nn.Module):
"""
Utility module that implements::
def forward(self, input):
return input.half()
"""
def __init__(self):
super(tofp16, self).__init__()
def forward(self, input):
return input.half()
def BN_convert_float(module):
"""
Utility function for network_to_half().
Retained for legacy purposes.
"""
if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True:
module.float()
for child in module.children():
BN_convert_float(child)
return module
def network_to_half(network):
"""
Convert model to half precision in a batchnorm-safe way.
Retained for legacy purposes. It is recommended to use FP16Model.
"""
return nn.Sequential(tofp16(), BN_convert_float(network.half()))
def convert_module(module, dtype):
"""
Converts a module's immediate parameters and buffers to dtype.
"""
for param in module.parameters(recurse=False):
if param is not None:
if param.data.dtype.is_floating_point:
param.data = param.data.to(dtype=dtype)
if param._grad is not None and param._grad.data.dtype.is_floating_point:
param._grad.data = param._grad.data.to(dtype=dtype)
for buf in module.buffers(recurse=False):
if buf is not None and buf.data.dtype.is_floating_point:
buf.data = buf.data.to(dtype=dtype)
def convert_network(network, dtype):
"""
Converts a network's parameters and buffers to dtype.
"""
for module in network.modules():
if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True:
continue
convert_module(module, dtype)
return network
class FP16Model(nn.Module):
"""
Convert model to half precision in a batchnorm-safe way.
"""
def __init__(self, network):
super(FP16Model, self).__init__()
self.network = convert_network(network, dtype=torch.half)
def forward(self, *inputs):
inputs = tuple(t.half() for t in inputs)
return self.network(*inputs)
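# --- Illustrative usage (editor's sketch, not part of the original file),
# assuming a CUDA device is available:
#
#   import torch
#   import torch.nn as nn
#
#   net = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8), nn.ReLU()).cuda()
#   fp16_net = FP16Model(net)
#   out = fp16_net(torch.randn(4, 3, 32, 32).cuda())
#   out.dtype             # torch.float16 -- inputs and compute are cast to half
#   net[1].weight.dtype   # torch.float32 -- affine batchnorm is kept in fp32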
def backwards_debug_hook(grad):
raise RuntimeError("master_params received a gradient in the backward pass!")
def prep_param_lists(model, flat_master=False):
"""
Creates a list of FP32 master parameters for a given model, as in
`Training Neural Networks with Mixed Precision: Real Examples`_.
Args:
model (torch.nn.Module): Existing Pytorch model
flat_master (bool, optional, default=False): Flatten the master parameters into a single tensor, as a performance optimization.
Returns:
A tuple (``model_params``, ``master_params``). ``model_params`` is a list of the model's parameters for later use with :func:`model_grads_to_master_grads` and :func:`master_params_to_model_params`. ``master_params`` is a list of FP32 master parameters. If ``flat_master=True``, ``master_params`` will be a list with one element.
Example::
model_params, master_params = prep_param_lists(model)
.. warning::
Currently, if ``flat_master=True``, all the model's parameters must be the same type. If the model has parameters of different types, use ``flat_master=False``, or use :class:`FP16_Optimizer`.
.. _`Training Neural Networks with Mixed Precision: Real Examples`:
http://on-demand.gputechconf.com/gtc/2018/video/S81012/
"""
model_params = [param for param in model.parameters() if param.requires_grad]
if flat_master:
# Give the user some more useful error messages
try:
# flatten_dense_tensors returns a contiguous flat array.
# http://pytorch.org/docs/master/_modules/torch/_utils.html
master_params = _flatten_dense_tensors([param.data for param in model_params]).float()
except BaseException:
print("Error in prep_param_lists: model may contain a mixture of parameters "
"of different types. Use flat_master=False, or use F16_Optimizer.")
raise
master_params = torch.nn.Parameter(master_params)
master_params.requires_grad = True
# master_params.register_hook(backwards_debug_hook)
if master_params.grad is None:
master_params.grad = master_params.new(*master_params.size())
return model_params, [master_params]
else:
master_params = [param.clone().float().detach() for param in model_params]
for param in master_params:
param.requires_grad = True
return model_params, master_params
def model_grads_to_master_grads(model_params, master_params, flat_master=False):
"""
Copy model gradients to master gradients.
Args:
model_params: List of model parameters created by :func:`prep_param_lists`.
master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`model_grads_to_master_grads`.
"""
if flat_master:
# The flattening may incur one more deep copy than is necessary.
master_params[0].grad.data.copy_(
_flatten_dense_tensors([p.grad.data for p in model_params]))
else:
for model, master in zip(model_params, master_params):
if model.grad is not None:
if master.grad is None:
master.grad = Variable(master.data.new(*master.data.size()))
else:
master.grad = None
model_grads = [p.grad for p in model_params if p.grad is not None]
master_grads = [p.grad for p in master_params if p.grad is not None]
_overflow_buf = torch.cuda.IntTensor([0])
multi_tensor_applier(amp_C.multi_tensor_scale,
_overflow_buf,
[model_grads, master_grads],
1.0)
def master_params_to_model_params(model_params, master_params, flat_master=False):
"""
Copy master parameters to model parameters.
Args:
model_params: List of model parameters created by :func:`prep_param_lists`.
master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`master_params_to_model_params`.
"""
if flat_master:
for model, master in zip(model_params,
_unflatten_dense_tensors(master_params[0].data, model_params)):
model.data.copy_(master)
else:
for model, master in zip(model_params, master_params):
model.data.copy_(master.data)
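# --- Illustrative usage (editor's sketch, not part of the original file): the
# typical fp16 "master weights" step built from the three helpers above.
# `compute_loss`, `batch`, and `loss_scale` are hypothetical placeholders.
#
#   model = network_to_half(model)
#   model_params, master_params = prep_param_lists(model)
#   optimizer = torch.optim.SGD(master_params, lr=1e-3)
#
#   model.zero_grad()
#   loss = compute_loss(model, batch)
#   (loss * loss_scale).backward()                     # fp16 grads on model_params
#   model_grads_to_master_grads(model_params, master_params)
#   for p in master_params:
#       p.grad.data.mul_(1.0 / loss_scale)             # unscale in fp32
#   optimizer.step()                                   # update fp32 master weights
#   master_params_to_model_params(model_params, master_params)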
# Backward compatibility fixes
def to_python_float(t):
if hasattr(t, 'item'):
return t.item()
else:
return t[0]
TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1])
clip_grad_norm = None #mpu.clip_grad_norm
# elif TORCH_MAJOR == 0 and TORCH_MINOR <= 4:
# clip_grad_norm = torch.nn.utils.clip_grad_norm
# else:
# clip_grad_norm = torch.nn.utils.clip_grad_norm_
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from apex.multi_tensor_apply import multi_tensor_applier
import amp_C
from megatron import mpu
# item() is a recent addition, so this helps with backward compatibility.
def to_python_float(t):
if hasattr(t, 'item'):
return t.item()
else:
return t[0]
class LossScaler:
"""
Class that manages a static loss scale. This class is intended to interact with
:class:`FP16_Optimizer`, and should not be directly manipulated by the user.
Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to
:class:`FP16_Optimizer`'s constructor.
Args:
scale (float, optional, default=1.0): The loss scale.
"""
def __init__(self, scale=1):
self.cur_scale = scale
# `params` is a list / generator of torch.Variable
def has_overflow(self, params):
return False
# `x` is a torch.Tensor
def _has_inf_or_nan(x):
return False
def update_scale(self, overflow):
pass
@property
def loss_scale(self):
return self.cur_scale
def scale_gradient(self, module, grad_in, grad_out):
_overflow_buf = torch.cuda.IntTensor([0])
multi_tensor_applier(amp_C.multi_tensor_scale,
_overflow_buf,
[grad_in, grad_in],
self.loss_scale)
return grad_in
def backward(self, output_tensor, retain_graph=False, output_tensor_grad=None):
# If output_tensor_grad is None, this is the last stage, and
# output_tensor is actually the loss and needs to be scaled.
# Otherwise, output_tensor does not need to be scaled again since
# output_tensor_grad is already scaled.
if output_tensor_grad is None:
scaled_output_tensor = output_tensor * self.loss_scale
else:
scaled_output_tensor = output_tensor
torch.autograd.backward(scaled_output_tensor, grad_tensors=output_tensor_grad,
retain_graph=retain_graph)
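# --- Illustrative note (editor's sketch, not part of the original file): with a
# static scale only the last pipeline stage multiplies the loss; earlier stages
# receive an already-scaled output_tensor_grad and pass it through unchanged.
#
#   scaler = LossScaler(scale=1024.0)
#   scaler.backward(loss)                                    # last stage: scale the loss
#   scaler.backward(output_tensor,
#                   output_tensor_grad=output_tensor_grad)   # earlier stages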
class DynamicLossScaler:
"""
Class that manages dynamic loss scaling. It is recommended to use :class:`DynamicLossScaler`
indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of
:class:`FP16_Optimizer`. However, it's important to understand how :class:`DynamicLossScaler`
operates, because the default options can be changed using the
``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor.
Loss scaling is designed to combat the problem of underflowing gradients encountered late
in fp16 training. Dynamic loss scaling begins by attempting a very high loss
scale. Ironically, this may result in OVERflowing gradients. If overflowing gradients are
encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has
occurred.
:class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch,
and :class:`DynamicLossScaler` adjusts the loss scale to a lower value.
If a certain number of iterations occur without overflowing gradients detected,
:class:`DynamicLossScaler` increases the loss scale once more.
In this way :class:`DynamicLossScaler` attempts to "ride the edge" of
always using the highest loss scale possible without incurring overflow.
Args:
init_scale (float, optional, default=2**32): Initial loss scale attempted by :class:`DynamicLossScaler.`
scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``.
scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale.
"""
def __init__(self,
init_scale=2**32,
scale_factor=2.,
scale_window=1000,
min_scale=1,
delayed_shift=1,
consecutive_hysteresis=False):
self.cur_scale = init_scale
self.cur_iter = 0
self.last_overflow_iter = -1
self.scale_factor = scale_factor
self.scale_window = scale_window
self.min_scale = min_scale
self.delayed_shift = delayed_shift
self.cur_hysteresis = delayed_shift
self.consecutive_hysteresis = consecutive_hysteresis
# `params` is a list / generator of torch.Variable
def has_overflow_serial(self, params):
for p in params:
if p.grad is not None and DynamicLossScaler._has_inf_or_nan(p.grad.data):
return True
return False
def has_overflow(self, params):
overflow = self.has_overflow_serial(params)
# Since each model parallel GPU carries only part of the model,
# make sure overflow flag is synced across all the model parallel GPUs
overflow_gpu = torch.cuda.ByteTensor([overflow])
torch.distributed.all_reduce(overflow_gpu,
op=torch.distributed.ReduceOp.MAX,
group=mpu.get_model_parallel_group())
overflow = overflow_gpu[0].item()
return bool(overflow)
# `x` is a torch.Tensor
def _has_inf_or_nan(x):
try:
# if x is half, the .float() incurs an additional deep copy, but it's necessary if
# Pytorch's .sum() creates a one-element tensor of the same type as x
# (which is true for some recent version of pytorch).
cpu_sum = float(x.float().sum())
# More efficient version that can be used if .sum() returns a Python scalar
# cpu_sum = float(x.sum())
except RuntimeError as instance:
# We want to check if instance is actually an overflow exception.
# RuntimeError could come from a different error.
# If so, we still want the exception to propagate.
if "value cannot be converted" not in instance.args[0]:
raise
return True
else:
if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
return True
return False
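# --- Illustrative checks (editor's sketch): the sum-based test above flags any
# tensor containing inf or NaN, mirroring how has_overflow_serial() uses it.
#
#   DynamicLossScaler._has_inf_or_nan(torch.tensor([1.0, 2.0]))           # False
#   DynamicLossScaler._has_inf_or_nan(torch.tensor([1.0, float('inf')]))  # True
#   DynamicLossScaler._has_inf_or_nan(torch.tensor([float('nan')]))       # True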
# `overflow` is boolean indicating whether the gradient overflowed
def update_scale(self, overflow):
if not hasattr(self, 'min_scale'):
self.min_scale = 1
if not hasattr(self, 'delayed_shift'):
self.delayed_shift = 1
if not hasattr(self, 'cur_hysteresis'):
self.cur_hysteresis = 1
if not hasattr(self, 'consecutive_hysteresis'):
self.consecutive_hysteresis = True
if overflow:
# self.cur_scale /= self.scale_factor
if self.delayed_shift == 1 or self.cur_hysteresis == 1:
self.cur_scale = max(self.cur_scale / self.scale_factor, self.min_scale)
else:
self.cur_hysteresis -= 1
self.last_overflow_iter = self.cur_iter
else:
if self.consecutive_hysteresis:
self.cur_hysteresis = self.delayed_shift
if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0:
if not self.consecutive_hysteresis:
self.cur_hysteresis = self.delayed_shift
self.cur_scale *= self.scale_factor
self.cur_iter += 1
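# --- Illustrative dynamics (editor's sketch), assuming the defaults
# init_scale=2**32, scale_factor=2.0, scale_window=1000, delayed_shift=1:
#
#   scaler = DynamicLossScaler()
#   scaler.update_scale(True)     # overflow: scale halves, 2**32 -> 2**31
#   for _ in range(1000):         # 1000 clean iterations after the overflow
#       scaler.update_scale(False)
#   scaler.loss_scale             # doubled back to 2**32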
@property
def loss_scale(self):
return self.cur_scale
def scale_gradient(self, module, grad_in, grad_out):
_overflow_buf = torch.cuda.IntTensor([0])
multi_tensor_applier(amp_C.multi_tensor_scale,
_overflow_buf,
[grad_in, grad_in],
self.loss_scale)
return grad_in
def backward(self, output_tensor, retain_graph=False, output_tensor_grad=None):
# If output_tensor_grad is None, this is the last stage, and
# output_tensor is actually the loss and needs to be scaled.
# Otherwise, output_tensor does not need to be scaled again since
# output_tensor_grad is already scaled.
if output_tensor_grad is None:
scaled_output_tensor = output_tensor * self.loss_scale
else:
scaled_output_tensor = output_tensor
torch.autograd.backward(scaled_output_tensor, grad_tensors=output_tensor_grad,
retain_graph=retain_graph)
##############################################################
# Example usage below here -- assuming it's in a separate file
##############################################################
"""
TODO: separate out into an example.
if __name__ == "__main__":
import torch
from torch.autograd import Variable
from dynamic_loss_scaler import DynamicLossScaler
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10
# Create random Tensors to hold inputs and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in), requires_grad=False)
y = Variable(torch.randn(N, D_out), requires_grad=False)
w1 = Variable(torch.randn(D_in, H), requires_grad=True)
w2 = Variable(torch.randn(H, D_out), requires_grad=True)
parameters = [w1, w2]
learning_rate = 1e-6
optimizer = torch.optim.SGD(parameters, lr=learning_rate)
loss_scaler = DynamicLossScaler()
for t in range(500):
y_pred = x.mm(w1).clamp(min=0).mm(w2)
loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale
print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale))
print('Iter {} scaled loss: {}'.format(t, loss.item()))
print('Iter {} unscaled loss: {}'.format(t, loss.item() / loss_scaler.loss_scale))
# Run backprop
optimizer.zero_grad()
loss.backward()
# Check for overflow
has_overflow = loss_scaler.has_overflow(parameters)
# If no overflow, unscale grad and update as usual
if not has_overflow:
for param in parameters:
param.grad.data.mul_(1. / loss_scaler.loss_scale)
optimizer.step()
# Otherwise, don't do anything -- ie, skip iteration
else:
print('OVERFLOW!')
# Update loss scale for next iteration
loss_scaler.update_scale(has_overflow)
"""
@@ -30,9 +30,16 @@ def import_layernorm(fp32_residual_connection):
 from .distributed import *
-from .bert_model import BertModel, BertModelFirstStage, BertModelIntermediateStage, BertModelLastStage
-from .realm_model import ICTBertModel
-from .gpt2_model import GPT2Model, GPT2ModelFirstStage, GPT2ModelIntermediateStage, GPT2ModelLastStage
+from .bert_model import (BertModel,
+                         BertModelFirstStage,
+                         BertModelIntermediateStage,
+                         BertModelLastStage)
+from .gpt2_model import (GPT2Model,
+                         GPT2ModelFirstStage,
+                         GPT2ModelIntermediateStage,
+                         GPT2ModelLastStage)
 from .language_model import get_language_model
+from .module import FP16Module
+from .realm_model import ICTBertModel
@@ -26,7 +26,7 @@ from megatron.model.utils import openai_gelu, erf_gelu
 from megatron.model.utils import get_linear_layer
 from megatron.model.utils import init_method_normal
 from megatron.model.utils import scaled_init_method_normal
-from megatron.module import MegatronModule, PipelinedMegatronModule
+from .module import MegatronModule
 def bert_attention_mask_func(attention_scores, attention_mask):
     attention_scores.masked_fill_(attention_mask, -10000.0)
@@ -127,7 +127,7 @@ def post_language_model_processing(lm_output, pooled_output,
     return lm_loss, binary_logits
-class BertModelBase(PipelinedMegatronModule):
+class BertModelBase(MegatronModule):
     """Bert Language model."""
     def __init__(self, num_tokentypes=2, add_binary_head=True,
...
@@ -24,10 +24,10 @@ from megatron.model.language_model import get_language_model
 from megatron.model.utils import get_linear_layer
 from megatron.model.utils import init_method_normal
 from megatron.model.utils import scaled_init_method_normal
-from megatron.module import PipelinedMegatronModule
+from .module import MegatronModule
-class ClassificationBase(PipelinedMegatronModule):
+class ClassificationBase(MegatronModule):
     def __init__(self, num_classes, num_tokentypes=2):
         super(ClassificationBase, self).__init__(share_word_embeddings=False)
...
@@ -20,7 +20,7 @@ from torch.nn.modules import Module
 from torch.autograd import Variable
 from megatron import mpu
-from megatron.module import MegatronModule
+from .module import MegatronModule
 class DistributedDataParallel(MegatronModule):
...
File mode changed from 100755 to 100644
@@ -19,7 +19,7 @@ import torch
 from megatron import get_args
 from megatron import mpu
-from megatron.module import PipelinedMegatronModule
+from .module import MegatronModule
 from .language_model import parallel_lm_logits
 from .language_model import get_language_model
@@ -61,7 +61,7 @@ def post_language_model_processing(lm_output, labels, logit_weights,
     return loss
-class GPT2ModelBase(PipelinedMegatronModule):
+class GPT2ModelBase(MegatronModule):
     """GPT-2 Language model."""
     def __init__(self, num_tokentypes=0, parallel_output=True):
...
@@ -20,7 +20,7 @@ import torch.nn.functional as F
 from megatron import get_args
 from megatron import mpu
-from megatron.module import MegatronModule
+from .module import MegatronModule
 from megatron.model.transformer import ParallelTransformer
 from megatron.model.utils import get_linear_layer
 from megatron.model.utils import init_method_normal, scaled_init_method_normal
...
@@ -16,16 +16,25 @@
 """Megatron Module"""
 import torch
+from torch.autograd import Variable
+from torch.nn.parameter import Parameter
 from megatron import get_args
 from megatron import mpu
+_FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor)
+_HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor)
 class MegatronModule(torch.nn.Module):
-    """Megatron specific extensions of torch Module."""
+    """Megatron specific extensions of torch Module with support
+    for pipelining."""
-    def __init__(self):
+    def __init__(self, share_word_embeddings=True):
         super(MegatronModule, self).__init__()
+        self.share_word_embeddings = share_word_embeddings
     def state_dict_for_save_checkpoint(self, destination=None, prefix='',
                                        keep_vars=False):
@@ -34,53 +43,115 @@ class MegatronModule(torch.nn.Module):
         return self.state_dict(destination, prefix, keep_vars)
-class PipelinedMegatronModule(MegatronModule):
-    """Pipelining specific extensions of MegatronModule."""
-    def __init__(self, share_word_embeddings=True):
-        super(PipelinedMegatronModule, self).__init__()
-        args = get_args()
-        self.share_word_embeddings = share_word_embeddings
     def word_embeddings_weight(self):
         if mpu.is_pipeline_first_stage():
             return self.language_model.embedding.word_embeddings.weight
         if mpu.is_pipeline_last_stage():
             if not self.share_word_embeddings:
-                raise Exception('word_embeddings_weight() called for last stage, '
-                                'but share_word_embeddings is false')
+                raise Exception('word_embeddings_weight() called for last '
+                                'stage, but share_word_embeddings is false')
             return self.word_embeddings.weight
         raise Exception('word_embeddings_weight() should be '
                         'called for first and last stage only')
     def initialize_word_embeddings(self, init_method_normal):
         args = get_args()
         if not self.share_word_embeddings:
             raise Exception('initialize_word_embeddings() was called but '
                             'share_word_embeddings is false')
-        # Parameters are shared between the word embeddings layer, and the heads at
-        # the end of the model. In a pipelined setup with more than one stage, the
-        # initial embedding layer and the head are on different workers, so we do
-        # the following:
-        # 1. Create a second copy of word_embeddings on the last stage, with initial
-        #    parameters of 0.0.
-        # 2. Do an all-reduce between the first and last stage to ensure that the
-        #    two copies of word_embeddings start off with the same parameter values.
-        # 3. In the training loop, before an all-reduce between the grads of the two
-        #    word_embeddings layers to ensure that every applied weight update is the
-        #    same on both stages.
+        # Parameters are shared between the word embeddings layer, and the
+        # heads at the end of the model. In a pipelined setup with more than
+        # one stage, the initial embedding layer and the head are on different
+        # workers, so we do the following:
+        # 1. Create a second copy of word_embeddings on the last stage, with
+        #    initial parameters of 0.0.
+        # 2. Do an all-reduce between the first and last stage to ensure that
+        #    the two copies of word_embeddings start off with the same
+        #    parameter values.
+        # 3. In the training loop, before an all-reduce between the grads of
+        #    the two word_embeddings layers to ensure that every applied weight
+        #    update is the same on both stages.
         if mpu.is_pipeline_last_stage():
             if not mpu.is_pipeline_first_stage():
                 self._word_embeddings_for_head_key = 'word_embeddings_for_head'
                 # If first and last stages are different, set word_embeddings
-                # weights to 0 here, then copy first stage's weights using all_reduce
-                # below.
+                # weights to 0 here, then copy first stage's weights using
+                # all_reduce below.
                 self.word_embeddings = mpu.VocabParallelEmbedding(
                     args.padded_vocab_size, args.hidden_size,
                     init_method=init_method_normal(args.init_method_std))
                 self.word_embeddings.weight.data.fill_(0)
                 self.word_embeddings.weight.shared = True
-        # Ensure that first and last stages have the same initial parameter values.
+        # Ensure that first and last stages have the same initial parameter
+        # values.
         if mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage():
             torch.distributed.all_reduce(self.word_embeddings_weight().data,
                                          group=mpu.get_embedding_group())
+def conversion_helper(val, conversion):
+    """Apply conversion to val. Recursively apply conversion if `val`
+    is a nested tuple/list structure."""
+    if not isinstance(val, (tuple, list)):
+        return conversion(val)
+    rtn = [conversion_helper(v, conversion) for v in val]
+    if isinstance(val, tuple):
+        rtn = tuple(rtn)
+    return rtn
+def fp32_to_fp16(val):
+    """Convert fp32 `val` to fp16"""
+    def half_conversion(val):
+        val_typecheck = val
+        if isinstance(val_typecheck, (Parameter, Variable)):
+            val_typecheck = val.data
+        if isinstance(val_typecheck, _FLOAT_TYPES):
+            val = val.half()
+        return val
+    return conversion_helper(val, half_conversion)
+def fp16_to_fp32(val):
+    """Convert fp16 `val` to fp32"""
+    def float_conversion(val):
+        val_typecheck = val
+        if isinstance(val_typecheck, (Parameter, Variable)):
+            val_typecheck = val.data
+        if isinstance(val_typecheck, _HALF_TYPES):
+            val = val.float()
+        return val
+    return conversion_helper(val, float_conversion)
+class FP16Module(MegatronModule):
+    def __init__(self, module):
+        super(FP16Module, self).__init__()
+        self.add_module('module', module.half())
+    def forward(self, *inputs, **kwargs):
+        if mpu.is_pipeline_first_stage():
+            inputs = fp32_to_fp16(inputs)
+        outputs = self.module(*inputs, **kwargs)
+        if mpu.is_pipeline_last_stage():
+            outputs = fp16_to_fp32(outputs)
+        return outputs
+    def state_dict(self, destination=None, prefix='', keep_vars=False):
+        return self.module.state_dict(destination, prefix, keep_vars)
+    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
+                                       keep_vars=False):
+        return self.module.state_dict_for_save_checkpoint(destination, prefix,
+                                                          keep_vars)
+    def load_state_dict(self, state_dict, strict=True):
+        self.module.load_state_dict(state_dict, strict=strict)
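# --- Illustrative usage (editor's sketch, not part of this diff): FP16Module is
# what training.py wraps the model with when args.fp16 is set (see the
# get_model() hunk further below). Parameters are halved once at construction;
# conversion_helper() casts (possibly nested) inputs fp32 -> fp16 on the first
# pipeline stage and outputs fp16 -> fp32 on the last, so callers keep passing
# and receiving fp32 tensors. Argument names here are placeholders.
#
#   model = model_provider_func()      # built in fp32
#   if args.fp16:
#       model = FP16Module(model)
#   output = model(*fp32_inputs)       # fp32 in, fp32 out; fp16 compute inside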
@@ -24,10 +24,10 @@ from megatron.model.language_model import get_language_model
 from megatron.model.utils import get_linear_layer
 from megatron.model.utils import init_method_normal
 from megatron.model.utils import scaled_init_method_normal
-from megatron.module import PipelinedMegatronModule
+from .module import MegatronModule
-class MultipleChoiceBase(PipelinedMegatronModule):
+class MultipleChoiceBase(MegatronModule):
     def __init__(self, num_tokentypes=2):
         super(MultipleChoiceBase, self).__init__(share_word_embeddings=False)
...
@@ -4,7 +4,7 @@ import torch
 from megatron import get_args, print_rank_0
 from megatron.checkpointing import get_checkpoint_tracker_filename, get_checkpoint_name
 from megatron.model import BertModel
-from megatron.module import MegatronModule
+from .module import MegatronModule
 from megatron import mpu
 from megatron.model.utils import get_linear_layer
 from megatron.model.utils import init_method_normal
...
@@ -21,7 +21,7 @@ import torch.nn.functional as F
 from megatron import get_args
 from megatron import mpu
-from megatron.module import MegatronModule
+from .module import MegatronModule
 from megatron.checkpointing import get_checkpoint_version
 from megatron.model import import_layernorm
 from megatron.model.fused_softmax import FusedScaleMaskSoftmax
...
@@ -37,7 +37,7 @@ from megatron import print_rank_0
 from megatron import print_rank_last
 from megatron.checkpointing import load_checkpoint
 from megatron.checkpointing import save_checkpoint
-from megatron.fp16 import FP16_Module
+from megatron.model import FP16Module
 from megatron.optimizer import get_megatron_optimizer
 from megatron.initialize import initialize_megatron
@@ -199,7 +199,7 @@ def get_model(model_provider_func):
     # Fp16 conversion.
     if args.fp16:
-        model = FP16_Module(model)
+        model = FP16Module(model)
     if args.DDP_impl == 'torch':
         i = torch.cuda.current_device()
@@ -264,7 +264,7 @@ def setup_model_and_optimizer(model_provider_func):
     model = get_model(model_provider_func)
     unwrapped_model = model
-    while isinstance(unwrapped_model, (torchDDP, LocalDDP, FP16_Module)):
+    while isinstance(unwrapped_model, (torchDDP, LocalDDP, FP16Module)):
         unwrapped_model = unwrapped_model.module
     optimizer = get_megatron_optimizer(unwrapped_model)
@@ -588,7 +588,7 @@ def train_step(forward_step_func, data_iterator,
     if (mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage()) and \
             mpu.get_pipeline_model_parallel_world_size() > 1:
         unwrapped_model = model
-        while isinstance(unwrapped_model, (torchDDP, LocalDDP, FP16_Module)):
+        while isinstance(unwrapped_model, (torchDDP, LocalDDP, FP16Module)):
             unwrapped_model = unwrapped_model.module
         if unwrapped_model.share_word_embeddings:
...
@@ -24,7 +24,6 @@ from megatron import print_rank_0
 from megatron import get_adlr_autoresume
 from megatron import mpu
 from megatron.checkpointing import save_checkpoint
-from megatron.fp16 import FP16_Optimizer
 def average_losses_across_data_parallel_group(losses):
@@ -47,11 +46,13 @@ def report_memory(name):
         torch.cuda.memory_allocated() / mega_bytes)
     string += ' | max allocated: {}'.format(
         torch.cuda.max_memory_allocated() / mega_bytes)
-    string += ' | reserved: {}'.format(torch.cuda.memory_reserved() / mega_bytes)
+    string += ' | reserved: {}'.format(
+        torch.cuda.memory_reserved() / mega_bytes)
     string += ' | max reserved: {}'.format(
         torch.cuda.max_memory_reserved() / mega_bytes)
     if mpu.get_data_parallel_rank() == 0:
-        print("[Rank {}] {}".format(torch.distributed.get_rank(), string), flush=True)
+        print("[Rank {}] {}".format(torch.distributed.get_rank(), string),
+              flush=True)
 def print_params_min_max_norm(optimizer, iteration):
@@ -59,9 +60,7 @@ def print_params_min_max_norm(optimizer, iteration):
     index = 0
     rank = torch.distributed.get_rank()
     string = 'iteration, rank, index, tensor-model-parallel, min, max, norm\n'
-    optimizer_ = optimizer
-    if isinstance(optimizer, FP16_Optimizer):
-        optimizer_ = optimizer.optimizer
+    optimizer_ = optimizer.optimizer
     for param_group in optimizer_.param_groups:
         for param in param_group['params']:
             index += 1
...