Commit 48269d8d authored by Mostofa Patwary

added adam betas and eps as arguments

parent 42d21122
@@ -199,6 +199,15 @@ def _add_regularization_args(parser):
                        help='Weight decay coefficient for L2 regularization.')
     group.add_argument('--clip-grad', type=float, default=1.0,
                        help='Gradient clipping based on global L2 norm.')
+    group.add_argument('--adam-beta1', type=float, default=0.9,
+                       help='First coefficient for computing running averages '
+                            'of gradient and its square')
+    group.add_argument('--adam-beta2', type=float, default=0.999,
+                       help='Second coefficient for computing running averages '
+                            'of gradient and its square')
+    group.add_argument('--adam-eps', type=float, default=1e-08,
+                       help='Term added to the denominator to improve '
+                            'numerical stability')
     return parser
...
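A minimal standalone sketch of how the new flags behave once parsed. The flag names, types, and defaults come from the hunk above; the argument-group title and the driver code at the bottom are illustrative assumptions, not part of the commit.

import argparse

def _add_regularization_args(parser):
    # Mirrors the added arguments; the group title 'regularization' is assumed.
    group = parser.add_argument_group('regularization')
    group.add_argument('--adam-beta1', type=float, default=0.9,
                       help='First coefficient for computing running averages '
                            'of gradient and its square')
    group.add_argument('--adam-beta2', type=float, default=0.999,
                       help='Second coefficient for computing running averages '
                            'of gradient and its square')
    group.add_argument('--adam-eps', type=float, default=1e-08,
                       help='Term added to the denominator to improve '
                            'numerical stability')
    return parser

parser = _add_regularization_args(argparse.ArgumentParser())
# The defaults match torch's Adam defaults; any of them can be overridden on the CLI.
args = parser.parse_args(['--adam-beta2', '0.95'])
print(args.adam_beta1, args.adam_beta2, args.adam_eps)  # 0.9 0.95 1e-08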
@@ -164,7 +164,8 @@ def get_optimizer(model):
         param.model_parallel = False
     # Use Adam.
-    optimizer = Adam(param_groups, lr=args.lr, weight_decay=args.weight_decay)
+    optimizer = Adam(param_groups, lr=args.lr, weight_decay=args.weight_decay,
+                     betas=(args.adam_beta1, args.adam_beta2), eps=args.adam_eps)
     # Wrap into fp16 optimizer.
     if args.fp16:
...
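A hedged sketch of how the parsed values reach the optimizer. The hunk does not show which Adam implementation is imported (it may be a fused fp16-friendly variant), so this uses torch.optim.Adam, which accepts the same lr, weight_decay, betas, and eps keywords; the model and hyperparameter values are placeholders.

import torch

model = torch.nn.Linear(8, 8)        # placeholder model
param_groups = model.parameters()    # stands in for the param_groups built above

# args.adam_beta1 / args.adam_beta2 / args.adam_eps plug directly into Adam's
# betas and eps keywords; lr and weight_decay are passed exactly as before.
optimizer = torch.optim.Adam(param_groups,
                             lr=1.5e-4,           # args.lr
                             weight_decay=0.01,   # args.weight_decay
                             betas=(0.9, 0.999),  # (args.adam_beta1, args.adam_beta2)
                             eps=1e-08)           # args.adam_eps

With the first hunk in place, these values can be overridden per run on the command line, e.g. --adam-beta2 0.95 --adam-eps 1e-6, and flow through unchanged to the optimizer constructed here.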