Commit 43522e63 authored by Michael Carilli

Making static loss scale the default, and clipping master grads when running with --fp16

parent aed3086a
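Both changes touch the same mechanism: multiplying the loss by a constant S before backward() multiplies every gradient by S, which keeps small fp16 gradients from underflowing to zero, and the gradients must then be divided by S again before they are clipped or applied. A minimal sketch of that arithmetic, independent of the example code in this commit (the model, S, lr, and clip values below are illustrative, not from the diff):

```python
import torch

# Illustrative static loss scaling: scale the loss up before backward(),
# then unscale the gradients before clipping and before the update.
S = 128.0   # static loss scale (a power of two, as the new default suggests)
lr = 0.1    # illustrative learning rate

net = torch.nn.Linear(4, 2)                  # stand-in fp32 model
x, y = torch.randn(8, 4), torch.randn(8, 2)

loss = torch.nn.functional.mse_loss(net(x), y)
(loss * S).backward()                        # gradients are now S times too large

for p in net.parameters():
    p.grad.data.div_(S)                      # unscale before clipping / updating
torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=0.25)
for p in net.parameters():
    p.data.add_(p.grad.data, alpha=-lr)      # plain SGD step
```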
@@ -119,7 +119,9 @@ def main():
     if args.static_loss_scale != 1.0:
         if not args.fp16:
-            print("Warning: if --fp16 is not used, static_loss_scale will be ignored.")
+            print("Warning: static_loss_scale != 1.0 is only necessary with --fp16. "
+                  "Resetting static_loss_scale to 1.0")
+            args.static_loss_scale = 1.0

     # create model
     if args.pretrained:
......
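A self-contained sketch of the guard pattern in the hunk above: a static loss scale that only matters together with --fp16 is reset to 1.0 (with a warning) otherwise. The parser here is an illustrative reduction, not the example's full argument list:

```python
import argparse

# Illustrative reconstruction of the --fp16 guard shown in the diff above.
parser = argparse.ArgumentParser()
parser.add_argument('--fp16', action='store_true')
parser.add_argument('--static-loss-scale', type=float, default=128.0)
args = parser.parse_args([])          # e.g. no flags: fp16 off, scale 128.0

if args.static_loss_scale != 1.0:
    if not args.fp16:
        print("Warning: static_loss_scale != 1.0 is only necessary with --fp16. "
              "Resetting static_loss_scale to 1.0")
        args.static_loss_scale = 1.0

print(args.static_loss_scale)         # -> 1.0 when --fp16 is absent
```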
@@ -47,7 +47,7 @@ parser.add_argument('--save', type=str, default='model.pt',
                     help='path to save the final model')
 parser.add_argument('--fp16', action='store_true',
                     help='Run model in pseudo-fp16 mode (fp16 storage fp32 math).')
-parser.add_argument('--static-loss-scale', type=float, default=1,
+parser.add_argument('--static-loss-scale', type=float, default=128.0,
                     help='Static loss scale, positive power of 2 values can improve fp16 convergence.')
 args = parser.parse_args()
@@ -118,6 +118,12 @@ if args.cuda and args.fp16:
     model_params, master_params = prep_param_lists(model)
 elif args.cuda:
     model.cuda()

+if (not args.fp16) or (not args.cuda):
+    print("Warning: static_loss_scale != 1.0 is only necessary with --fp16. "
+          "Resetting static_loss_scale to 1.0")
+    args.static_loss_scale = 1.0
+
 criterion = nn.CrossEntropyLoss()

 ###############################################################################
@@ -184,21 +190,21 @@ def train():
         loss = criterion(output.view(-1, ntokens), targets)
         loss = loss * args.static_loss_scale
         loss.backward()
-        loss = loss / args.static_loss_scale
+        loss.data = loss.data / args.static_loss_scale

-        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
-        # apex.fp16_utils.clip_grad_norm selects between "torch.nn.utils.clip_grad_norm"
-        # and "torch.nn.utils.clip_grad_norm_" based on Pytorch version.
-        # It's not FP16-specific, just a small fix to avoid deprecation warnings.
-        clip_grad_norm(model.parameters(), args.clip)
         if args.fp16 and args.cuda:
             model_grads_to_master_grads(model_params, master_params)
+            if args.static_loss_scale != 1:
+                for param in master_params:
+                    param.grad.data = param.grad.data/args.static_loss_scale
+            clip_grad_norm(master_params, args.clip)
             for param in master_params:
-                param.data = param.data - param.grad.data * (lr/args.static_loss_scale)
+                param.data = param.data - param.grad.data * lr
             master_params_to_model_params(model_params, master_params)
         else:
+            clip_grad_norm(model.parameters(), args.clip)
             for p in model.parameters():
-                p.data.add_(-lr/args.static_loss_scale, p.grad.data)
+                p.data.add_(-lr, p.grad.data)

         total_loss += loss.data
......
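Read after the patch, the fp16 branch of train() follows the standard master-weights recipe: scale the loss, run backward through the fp16 model, copy the fp16 gradients onto the fp32 master copies, unscale and clip the master gradients, take the step in fp32, then copy the updated master weights back into the fp16 model. A condensed sketch of that sequence using the same apex.fp16_utils helpers the diff calls, assuming apex is installed and a GPU is available (the model, data, criterion, and hyperparameters are illustrative stand-ins, not the example's RNN language model):

```python
import torch
from apex.fp16_utils import (prep_param_lists, model_grads_to_master_grads,
                             master_params_to_model_params, clip_grad_norm)

# Illustrative stand-ins for the example's model and data.
model = torch.nn.Linear(16, 4).cuda().half()
criterion = torch.nn.MSELoss()
static_loss_scale, lr, clip = 128.0, 0.1, 0.25

# fp16 model params plus fp32 master copies, as in the example.
model_params, master_params = prep_param_lists(model)

data = torch.randn(32, 16, device='cuda').half()
targets = torch.randn(32, 4, device='cuda').half()

model.zero_grad()
loss = criterion(model(data), targets)
(loss * static_loss_scale).backward()                    # scaled backward in fp16

model_grads_to_master_grads(model_params, master_params)  # fp16 grads -> fp32 masters
if static_loss_scale != 1:
    for param in master_params:
        param.grad.data.div_(static_loss_scale)           # unscale in fp32
clip_grad_norm(master_params, clip)                       # clip master grads (this commit)
for param in master_params:
    param.data.add_(param.grad.data, alpha=-lr)           # SGD step on fp32 masters
master_params_to_model_params(model_params, master_params)  # copy back into fp16 model
```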
@@ -47,7 +47,7 @@ parser.add_argument('--save', type=str, default='model.pt',
                     help='path to save the final model')
 parser.add_argument('--fp16', action='store_true',
                     help='Run model in pseudo-fp16 mode (fp16 storage fp32 math).')
-parser.add_argument('--static-loss-scale', type=float, default=1,
+parser.add_argument('--static-loss-scale', type=float, default=128.0,
                     help='Static loss scale, positive power of 2 values can improve fp16 convergence.')
 parser.add_argument('--dynamic-loss-scale', action='store_true',
                     help='Use dynamic loss scaling. If supplied, this argument supersedes ' +
......
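The --dynamic-loss-scale flag in this last file points at the alternative to a fixed 128.0: grow the scale while steps succeed and shrink it when the scaled gradients overflow. The diff does not show that code; the scaler class below is a generic, hypothetical illustration of the idea, not the scaler apex itself provides:

```python
import torch

class SimpleDynamicScaler:
    """Hypothetical dynamic loss scaler: halve the scale on overflow and
    double it after a long run of clean steps. Illustrates the idea behind
    --dynamic-loss-scale; it is not the apex implementation."""
    def __init__(self, init_scale=2.0 ** 15, growth_interval=1000):
        self.scale = init_scale
        self.growth_interval = growth_interval
        self.clean_steps = 0

    def grads_finite(self, params):
        # True if no gradient contains inf/nan after the scaled backward().
        return all(torch.isfinite(p.grad).all() for p in params if p.grad is not None)

    def update(self, params):
        """Call after backward(); returns True if the step should be applied."""
        if not self.grads_finite(params):
            self.scale /= 2.0          # overflow: shrink the scale and skip this step
            self.clean_steps = 0
            return False
        self.clean_steps += 1
        if self.clean_steps % self.growth_interval == 0:
            self.scale *= 2.0          # long clean run: try a larger scale
        return True
```

On each iteration one would multiply the loss by scaler.scale before backward(), divide the gradients by the same factor afterwards, and only take the optimizer step when update() returns True.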