Unverified commit 91aab2a6 authored by Thomas Wolf, committed by GitHub

Merge pull request #116 from FDecaYed/deyuf/fp16_with_apex

Change to use apex for better fp16 and multi-gpu support
parents 32a227f5 3b0a14b7
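Before the file-by-file diff, here is a minimal, self-contained sketch of the mixed-precision pattern this change adopts: wrap apex's `FusedAdam` in `FP16_Optimizer` (dynamic loss scaling by default) and call `optimizer.backward(loss)` instead of `loss.backward()`. The toy linear model and the hyper-parameter values are placeholders, not part of the commit; running it needs a CUDA GPU with apex installed.

```python
# Minimal sketch of the apex fp16 training pattern adopted by this PR.
# The toy model and values are placeholders; apex and a CUDA GPU are required.
import torch

try:
    from apex.optimizers import FP16_Optimizer, FusedAdam
except ImportError:
    raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this.")

device = torch.device("cuda")
model = torch.nn.Linear(16, 2).to(device).half()          # fp16 weights, as with model.half()

optimizer = FusedAdam(model.parameters(), lr=5e-5,
                      bias_correction=False, max_grad_norm=1.0)
# loss_scale == 0 in the scripts below maps to dynamic loss scaling;
# a positive power of 2 would be passed as static_loss_scale instead.
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)

inputs = torch.randn(4, 16, device=device).half()
labels = torch.randint(0, 2, (4,), device=device)

loss = torch.nn.functional.cross_entropy(model(inputs).float(), labels)
optimizer.backward(loss)    # replaces loss.backward(); scaling/unscaling handled internally
optimizer.step()
optimizer.zero_grad()
```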
...@@ -338,7 +338,7 @@ The optimizer accepts the following arguments:
 - `b1` : Adams b1. Default : `0.9`
 - `b2` : Adams b2. Default : `0.999`
 - `e` : Adams epsilon. Default : `1e-6`
-- `weight_decay_rate:` Weight decay. Default : `0.01`
+- `weight_decay:` Weight decay. Default : `0.01`
 - `max_grad_norm` : Maximum norm for the gradients (`-1` means no clipping). Default : `1.0`
 ## Examples
...
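As a hedged illustration of the renamed `weight_decay` argument documented above (this example is not part of the diff), the value is typically passed per parameter group; the toy module below stands in for a BERT model:

```python
import torch
from pytorch_pretrained_bert.optimization import BertAdam

# Toy module standing in for a BERT model; any nn.Module works the same way.
model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.LayerNorm(8))

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=5e-5,           # learning rate
                     warmup=0.1,        # warmup proportion of t_total
                     t_total=1000)      # total number of optimization steps
```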
 # coding=utf-8
 # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
...@@ -35,6 +36,13 @@ from pytorch_pretrained_bert.modeling import BertForSequenceClassification
 from pytorch_pretrained_bert.optimization import BertAdam
 from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
 
+try:
+    from apex.optimizers import FP16_Optimizer
+    from apex.optimizers import FusedAdam
+    from apex.parallel import DistributedDataParallel as DDP
+except ImportError:
+    raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this.")
+
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                     datefmt = '%m/%d/%Y %H:%M:%S',
                     level = logging.INFO)
...@@ -295,34 +303,10 @@ def accuracy(out, labels):
     outputs = np.argmax(out, axis=1)
     return np.sum(outputs == labels)
 
-def copy_optimizer_params_to_model(named_params_model, named_params_optimizer):
-    """ Utility function for optimize_on_cpu and 16-bits training.
-        Copy the parameters optimized on CPU/RAM back to the model on GPU
-    """
-    for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model):
-        if name_opti != name_model:
-            logger.error("name_opti != name_model: {} {}".format(name_opti, name_model))
-            raise ValueError
-        param_model.data.copy_(param_opti.data)
-
-def set_optimizer_params_grad(named_params_optimizer, named_params_model, test_nan=False):
-    """ Utility function for optimize_on_cpu and 16-bits training.
-        Copy the gradient of the GPU parameters to the CPU/RAMM copy of the model
-    """
-    is_nan = False
-    for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model):
-        if name_opti != name_model:
-            logger.error("name_opti != name_model: {} {}".format(name_opti, name_model))
-            raise ValueError
-        if param_model.grad is not None:
-            if test_nan and torch.isnan(param_model.grad).sum() > 0:
-                is_nan = True
-            if param_opti.grad is None:
-                param_opti.grad = torch.nn.Parameter(param_opti.data.new().resize_(*param_opti.data.size()))
-            param_opti.grad.data.copy_(param_model.grad.data)
-        else:
-            param_opti.grad = None
-    return is_nan
+def warmup_linear(x, warmup=0.002):
+    if x < warmup:
+        return x/warmup
+    return 1.0 - x
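The new `warmup_linear` helper above is applied by hand in the training loop further down (the per-step learning rate is `args.learning_rate * warmup_linear(global_step/t_total, args.warmup_proportion)`). A quick illustration of the multiplier it produces, not part of the commit:

```python
# warmup_linear reproduced from the diff above, with a few sample values.
def warmup_linear(x, warmup=0.002):
    if x < warmup:
        return x / warmup      # linear ramp-up during the warmup fraction
    return 1.0 - x             # then linear decay towards 0 at the end of training

for frac in (0.0, 0.05, 0.1, 0.5, 1.0):
    print(frac, warmup_linear(frac, warmup=0.1))
# 0.0 -> 0.0, 0.05 -> 0.5, 0.1 -> 0.9, 0.5 -> 0.5, 1.0 -> 0.0
```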
 def main():
     parser = argparse.ArgumentParser()
...@@ -403,17 +387,15 @@ def main():
                         type=int,
                         default=1,
                         help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument('--optimize_on_cpu',
-                        default=False,
-                        action='store_true',
-                        help="Whether to perform optimization and keep the optimizer averages on CPU")
     parser.add_argument('--fp16',
                         default=False,
                         action='store_true',
                         help="Whether to use 16-bit float precision instead of 32-bit")
     parser.add_argument('--loss_scale',
-                        type=float, default=128,
-                        help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
+                        type=float, default=0,
+                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
+                             "0 (default value): dynamic loss scaling.\n"
+                             "Positive power of 2: static loss scaling value.\n")
 
     args = parser.parse_args()
...@@ -433,13 +415,11 @@ def main():
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
         n_gpu = torch.cuda.device_count()
     else:
+        torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
         n_gpu = 1
         # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.distributed.init_process_group(backend='nccl')
-        if args.fp16:
-            logger.info("16-bits training currently not supported in distributed training")
-            args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496)
     logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))
 
     if args.gradient_accumulation_steps < 1:
...@@ -487,32 +467,35 @@ def main():
         model.half()
     model.to(device)
     if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank)
+        model = DDP(model)
     elif n_gpu > 1:
         model = torch.nn.DataParallel(model)
 
     # Prepare optimizer
-    if args.fp16:
-        param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \
-                            for n, param in model.named_parameters()]
-    elif args.optimize_on_cpu:
-        param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \
-                            for n, param in model.named_parameters()]
-    else:
-        param_optimizer = list(model.named_parameters())
-    no_decay = ['bias', 'gamma', 'beta']
+    param_optimizer = list(model.named_parameters())
+    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
     optimizer_grouped_parameters = [
-        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
-        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
+        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
         ]
     t_total = num_train_steps
     if args.local_rank != -1:
         t_total = t_total // torch.distributed.get_world_size()
-    optimizer = BertAdam(optimizer_grouped_parameters,
-                         lr=args.learning_rate,
-                         warmup=args.warmup_proportion,
-                         t_total=t_total)
+    if args.fp16:
+        optimizer = FusedAdam(optimizer_grouped_parameters,
+                              lr=args.learning_rate,
+                              bias_correction=False,
+                              max_grad_norm=1.0)
+        if args.loss_scale == 0:
+            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+        else:
+            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+    else:
+        optimizer = BertAdam(optimizer_grouped_parameters,
+                             lr=args.learning_rate,
+                             warmup=args.warmup_proportion,
+                             t_total=t_total)
     global_step = 0
     if args.do_train:
...@@ -543,34 +526,24 @@ def main():
                 loss = model(input_ids, segment_ids, input_mask, label_ids)
                 if n_gpu > 1:
                     loss = loss.mean() # mean() to average on multi-gpu.
-                if args.fp16 and args.loss_scale != 1.0:
-                    # rescale loss for fp16 training
-                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
-                    loss = loss * args.loss_scale
                 if args.gradient_accumulation_steps > 1:
                     loss = loss / args.gradient_accumulation_steps
-                loss.backward()
+                if args.fp16:
+                    optimizer.backward(loss)
+                else:
+                    loss.backward()
                 tr_loss += loss.item()
                 nb_tr_examples += input_ids.size(0)
                 nb_tr_steps += 1
                 if (step + 1) % args.gradient_accumulation_steps == 0:
-                    if args.fp16 or args.optimize_on_cpu:
-                        if args.fp16 and args.loss_scale != 1.0:
-                            # scale down gradients for fp16 training
-                            for param in model.parameters():
-                                if param.grad is not None:
-                                    param.grad.data = param.grad.data / args.loss_scale
-                        is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
-                        if is_nan:
-                            logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
-                            args.loss_scale = args.loss_scale / 2
-                            model.zero_grad()
-                            continue
-                        optimizer.step()
-                        copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
-                    else:
-                        optimizer.step()
-                    model.zero_grad()
+                    # modify learning rate with special warm up BERT uses
+                    lr_this_step = args.learning_rate * warmup_linear(global_step/t_total, args.warmup_proportion)
+                    for param_group in optimizer.param_groups:
+                        param_group['lr'] = lr_this_step
+                    optimizer.step()
+                    optimizer.zero_grad()
                     global_step += 1
 
     if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
...
 # coding=utf-8
 # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
...@@ -38,7 +39,14 @@ from pytorch_pretrained_bert.modeling import BertForQuestionAnswering
 from pytorch_pretrained_bert.optimization import BertAdam
 from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
 
-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+try:
+    from apex.optimizers import FP16_Optimizer
+    from apex.optimizers import FusedAdam
+    from apex.parallel import DistributedDataParallel as DDP
+except ImportError:
+    raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this.")
+
+logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                     datefmt = '%m/%d/%Y %H:%M:%S',
                     level = logging.INFO)
 logger = logging.getLogger(__name__)
...@@ -669,34 +677,10 @@ def _compute_softmax(scores):
         probs.append(score / total_sum)
     return probs
 
-def copy_optimizer_params_to_model(named_params_model, named_params_optimizer):
-    """ Utility function for optimize_on_cpu and 16-bits training.
-        Copy the parameters optimized on CPU/RAM back to the model on GPU
-    """
-    for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model):
-        if name_opti != name_model:
-            logger.error("name_opti != name_model: {} {}".format(name_opti, name_model))
-            raise ValueError
-        param_model.data.copy_(param_opti.data)
-
-def set_optimizer_params_grad(named_params_optimizer, named_params_model, test_nan=False):
-    """ Utility function for optimize_on_cpu and 16-bits training.
-        Copy the gradient of the GPU parameters to the CPU/RAMM copy of the model
-    """
-    is_nan = False
-    for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model):
-        if name_opti != name_model:
-            logger.error("name_opti != name_model: {} {}".format(name_opti, name_model))
-            raise ValueError
-        if param_model.grad is not None:
-            if test_nan and torch.isnan(param_model.grad).sum() > 0:
-                is_nan = True
-            if param_opti.grad is None:
-                param_opti.grad = torch.nn.Parameter(param_opti.data.new().resize_(*param_opti.data.size()))
-            param_opti.grad.data.copy_(param_model.grad.data)
-        else:
-            param_opti.grad = None
-    return is_nan
+def warmup_linear(x, warmup=0.002):
+    if x < warmup:
+        return x/warmup
+    return 1.0 - x
 def main():
     parser = argparse.ArgumentParser()
...@@ -743,8 +727,8 @@ def main():
                         default=False,
                         action='store_true',
                         help="Whether not to use CUDA when available")
     parser.add_argument('--seed',
                         type=int,
                         default=42,
                         help="random seed for initialization")
     parser.add_argument('--gradient_accumulation_steps',
...@@ -759,17 +743,15 @@ def main():
                         type=int,
                         default=-1,
                         help="local_rank for distributed training on gpus")
-    parser.add_argument('--optimize_on_cpu',
-                        default=False,
-                        action='store_true',
-                        help="Whether to perform optimization and keep the optimizer averages on CPU")
     parser.add_argument('--fp16',
                         default=False,
                         action='store_true',
                         help="Whether to use 16-bit float precision instead of 32-bit")
     parser.add_argument('--loss_scale',
-                        type=float, default=128,
-                        help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
+                        type=float, default=0,
+                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
+                             "0 (default value): dynamic loss scaling.\n"
+                             "Positive power of 2: static loss scaling value.\n")
 
     args = parser.parse_args()
...@@ -777,13 +759,11 @@ def main():
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
         n_gpu = torch.cuda.device_count()
     else:
+        torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
         n_gpu = 1
         # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.distributed.init_process_group(backend='nccl')
-        if args.fp16:
-            logger.info("16-bits training currently not supported in distributed training")
-            args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496)
     logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits trainiing: {}".format(
         device, n_gpu, bool(args.local_rank != -1), args.fp16))
...@@ -828,36 +808,45 @@ def main():
     # Prepare model
     model = BertForQuestionAnswering.from_pretrained(args.bert_model,
                 cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank))
     if args.fp16:
         model.half()
     model.to(device)
     if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank)
+        model = DDP(model)
     elif n_gpu > 1:
         model = torch.nn.DataParallel(model)
 
     # Prepare optimizer
-    if args.fp16:
-        param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \
-                            for n, param in model.named_parameters()]
-    elif args.optimize_on_cpu:
-        param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \
-                            for n, param in model.named_parameters()]
-    else:
-        param_optimizer = list(model.named_parameters())
-    no_decay = ['bias', 'gamma', 'beta']
+    param_optimizer = list(model.named_parameters())
+
+    # hack to remove pooler, which is not used
+    # thus it produce None grad that break apex
+    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
+
+    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
     optimizer_grouped_parameters = [
-        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
-        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
+        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
         ]
     t_total = num_train_steps
     if args.local_rank != -1:
         t_total = t_total // torch.distributed.get_world_size()
-    optimizer = BertAdam(optimizer_grouped_parameters,
-                         lr=args.learning_rate,
-                         warmup=args.warmup_proportion,
-                         t_total=t_total)
+    if args.fp16:
+        optimizer = FusedAdam(optimizer_grouped_parameters,
+                              lr=args.learning_rate,
+                              bias_correction=False,
+                              max_grad_norm=1.0)
+        if args.loss_scale == 0:
+            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+        else:
+            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+    else:
+        optimizer = BertAdam(optimizer_grouped_parameters,
+                             lr=args.learning_rate,
+                             warmup=args.warmup_proportion,
+                             t_total=t_total)
     global_step = 0
     if args.do_train:
...@@ -906,31 +895,20 @@ def main():
                 loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
                 if n_gpu > 1:
                     loss = loss.mean() # mean() to average on multi-gpu.
-                if args.fp16 and args.loss_scale != 1.0:
-                    # rescale loss for fp16 training
-                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
-                    loss = loss * args.loss_scale
                 if args.gradient_accumulation_steps > 1:
                     loss = loss / args.gradient_accumulation_steps
-                loss.backward()
+                if args.fp16:
+                    optimizer.backward(loss)
+                else:
+                    loss.backward()
                 if (step + 1) % args.gradient_accumulation_steps == 0:
-                    if args.fp16 or args.optimize_on_cpu:
-                        if args.fp16 and args.loss_scale != 1.0:
-                            # scale down gradients for fp16 training
-                            for param in model.parameters():
-                                if param.grad is not None:
-                                    param.grad.data = param.grad.data / args.loss_scale
-                        is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
-                        if is_nan:
-                            logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
-                            args.loss_scale = args.loss_scale / 2
-                            model.zero_grad()
-                            continue
-                        optimizer.step()
-                        copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
-                    else:
-                        optimizer.step()
-                    model.zero_grad()
+                    # modify learning rate with special warm up BERT uses
+                    lr_this_step = args.learning_rate * warmup_linear(global_step/t_total, args.warmup_proportion)
+                    for param_group in optimizer.param_groups:
+                        param_group['lr'] = lr_this_step
+                    optimizer.step()
+                    optimizer.zero_grad()
                     global_step += 1
 
     if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
...
 # coding=utf-8
 # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
...@@ -33,7 +34,7 @@ from torch.nn import CrossEntropyLoss
 from .file_utils import cached_path
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                     datefmt = '%m/%d/%Y %H:%M:%S',
                     level = logging.INFO)
 logger = logging.getLogger(__name__)
...@@ -152,22 +153,24 @@ class BertConfig(object):
         """Serializes this instance to a JSON string."""
         return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
 
-class BertLayerNorm(nn.Module):
-    def __init__(self, config, variance_epsilon=1e-12):
-        """Construct a layernorm module in the TF style (epsilon inside the square root).
-        """
-        super(BertLayerNorm, self).__init__()
-        self.gamma = nn.Parameter(torch.ones(config.hidden_size))
-        self.beta = nn.Parameter(torch.zeros(config.hidden_size))
-        self.variance_epsilon = variance_epsilon
-
-    def forward(self, x):
-        u = x.mean(-1, keepdim=True)
-        s = (x - u).pow(2).mean(-1, keepdim=True)
-        x = (x - u) / torch.sqrt(s + self.variance_epsilon)
-        return self.gamma * x + self.beta
+try:
+    from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm
+except ImportError:
+    print("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.")
+    class BertLayerNorm(nn.Module):
+        def __init__(self, hidden_size, eps=1e-12):
+            """Construct a layernorm module in the TF style (epsilon inside the square root).
+            """
+            super(BertLayerNorm, self).__init__()
+            self.weight = nn.Parameter(torch.ones(hidden_size))
+            self.bias = nn.Parameter(torch.zeros(hidden_size))
+            self.variance_epsilon = eps
+
+        def forward(self, x):
+            u = x.mean(-1, keepdim=True)
+            s = (x - u).pow(2).mean(-1, keepdim=True)
+            x = (x - u) / torch.sqrt(s + self.variance_epsilon)
+            return self.weight * x + self.bias
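A small usage sketch of the renamed module (not part of the commit), assuming the pure-PyTorch fallback path above; with the default `weight=1` and `bias=0` the output is normalized over the hidden dimension:

```python
import torch
import torch.nn as nn

# Fallback BertLayerNorm reproduced from the diff above (used when apex is not installed).
class BertLayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-12):
        super(BertLayerNorm, self).__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))   # formerly self.gamma
        self.bias = nn.Parameter(torch.zeros(hidden_size))    # formerly self.beta
        self.variance_epsilon = eps

    def forward(self, x):
        u = x.mean(-1, keepdim=True)
        s = (x - u).pow(2).mean(-1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.variance_epsilon)
        return self.weight * x + self.bias

layer_norm = BertLayerNorm(hidden_size=768)   # new signature: hidden_size instead of config
hidden = torch.randn(2, 5, 768)
out = layer_norm(hidden)
print(out.shape)                  # torch.Size([2, 5, 768])
print(out.mean(-1).abs().max())   # close to 0: normalized along the hidden dimension
```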
 class BertEmbeddings(nn.Module):
     """Construct the embeddings from word, position and token_type embeddings.
...@@ -180,7 +183,7 @@ class BertEmbeddings(nn.Module):
         # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
         # any TensorFlow checkpoint file
-        self.LayerNorm = BertLayerNorm(config)
+        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
 
     def forward(self, input_ids, token_type_ids=None):
...@@ -255,7 +258,7 @@ class BertSelfOutput(nn.Module):
     def __init__(self, config):
         super(BertSelfOutput, self).__init__()
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
-        self.LayerNorm = BertLayerNorm(config)
+        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
 
     def forward(self, hidden_states, input_tensor):
...@@ -294,7 +297,7 @@ class BertOutput(nn.Module):
     def __init__(self, config):
         super(BertOutput, self).__init__()
         self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
-        self.LayerNorm = BertLayerNorm(config)
+        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
 
     def forward(self, hidden_states, input_tensor):
...@@ -322,7 +325,7 @@ class BertEncoder(nn.Module):
     def __init__(self, config):
         super(BertEncoder, self).__init__()
         layer = BertLayer(config)
         self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
 
     def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
         all_encoder_layers = []
...@@ -356,7 +359,7 @@ class BertPredictionHeadTransform(nn.Module):
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
         self.transform_act_fn = ACT2FN[config.hidden_act] \
             if isinstance(config.hidden_act, str) else config.hidden_act
-        self.LayerNorm = BertLayerNorm(config)
+        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
 
     def forward(self, hidden_states):
         hidden_states = self.dense(hidden_states)
...@@ -439,8 +442,8 @@ class PreTrainedBertModel(nn.Module):
             # cf https://github.com/pytorch/pytorch/pull/5617
             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
         elif isinstance(module, BertLayerNorm):
-            module.beta.data.normal_(mean=0.0, std=self.config.initializer_range)
-            module.gamma.data.normal_(mean=0.0, std=self.config.initializer_range)
+            module.bias.data.normal_(mean=0.0, std=self.config.initializer_range)
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
         if isinstance(module, nn.Linear) and module.bias is not None:
             module.bias.data.zero_()
...@@ -449,7 +452,7 @@ class PreTrainedBertModel(nn.Module):
         """
         Instantiate a PreTrainedBertModel from a pre-trained model file.
         Download and cache the pre-trained model file if needed.
 
         Params:
             pretrained_model_name: either:
                 - a str with the name of a pre-trained model to load selected in the list of:
...@@ -505,6 +508,20 @@ class PreTrainedBertModel(nn.Module):
         weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
         state_dict = torch.load(weights_path)
 
+        old_keys = []
+        new_keys = []
+        for key in state_dict.keys():
+            new_key = None
+            if 'gamma' in key:
+                new_key = key.replace('gamma','weight')
+            if 'beta' in key:
+                new_key = key.replace('beta','bias')
+            if new_key:
+                old_keys.append(key)
+                new_keys.append(new_key)
+        for old_key, new_key in zip(old_keys, new_keys):
+            state_dict[new_key]=state_dict.pop(old_key)
+
         missing_keys = []
         unexpected_keys = []
         error_msgs = []
...
...@@ -53,11 +53,11 @@ class BertAdam(Optimizer):
         b1: Adams b1. Default: 0.9
         b2: Adams b2. Default: 0.999
         e: Adams epsilon. Default: 1e-6
-        weight_decay_rate: Weight decay. Default: 0.01
+        weight_decay: Weight decay. Default: 0.01
         max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
     """
     def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear',
-                 b1=0.9, b2=0.999, e=1e-6, weight_decay_rate=0.01,
+                 b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01,
                  max_grad_norm=1.0):
         if lr is not required and lr < 0.0:
             raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
...@@ -72,7 +72,7 @@ class BertAdam(Optimizer):
         if not e >= 0.0:
             raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
         defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
-                        b1=b1, b2=b2, e=e, weight_decay_rate=weight_decay_rate,
+                        b1=b1, b2=b2, e=e, weight_decay=weight_decay,
                         max_grad_norm=max_grad_norm)
         super(BertAdam, self).__init__(params, defaults)
...@@ -140,8 +140,8 @@ class BertAdam(Optimizer):
                 # Instead we want to decay the weights in a manner that doesn't interact
                 # with the m/v parameters. This is equivalent to adding the square
                 # of the weights to the loss with plain (non-momentum) SGD.
-                if group['weight_decay_rate'] > 0.0:
-                    update += group['weight_decay_rate'] * p.data
+                if group['weight_decay'] > 0.0:
+                    update += group['weight_decay'] * p.data
 
                 if group['t_total'] != -1:
                     schedule_fct = SCHEDULES[group['schedule']]
...
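For readers skimming the rename: the `weight_decay` term in `BertAdam` is decoupled, i.e. it is added to the Adam update itself rather than folded into the gradient, and there is no bias correction (which is also why the scripts above pass `bias_correction=False` to `FusedAdam`). A simplified single-parameter sketch, not the library's exact code:

```python
import torch

def bert_adam_like_step(p, grad, m, v, lr, b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01):
    """Simplified single step mirroring BertAdam: decoupled decay, no bias correction."""
    m = b1 * m + (1 - b1) * grad             # first moment estimate
    v = b2 * v + (1 - b2) * grad * grad      # second moment estimate
    update = m / (v.sqrt() + e)              # epsilon added outside the square root
    if weight_decay > 0.0:
        update = update + weight_decay * p   # decay applied to the update, not the gradient
    return p - lr * update, m, v

p = torch.ones(3)
m = torch.zeros(3)
v = torch.zeros(3)
p, m, v = bert_adam_like_step(p, grad=torch.full((3,), 0.1), m=m, v=v, lr=1e-3)
print(p)
```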
...@@ -35,7 +35,7 @@ class OptimizationTest(unittest.TestCase):
         criterion = torch.nn.MSELoss(reduction='elementwise_mean')
         # No warmup, constant schedule, no gradient clipping
         optimizer = BertAdam(params=[w], lr=2e-1,
-                             weight_decay_rate=0.0,
+                             weight_decay=0.0,
                              max_grad_norm=-1)
         for _ in range(100):
             loss = criterion(w, target)
...