Commit c8ca4bf4 authored by Deyu Fu

remove temporarily added lr warmup

This way the optimizer will be more general, and warmup should be handled by the user.
parent 4d9dcb57
......@@ -195,3 +195,23 @@ class FP16_Optimizer(object):
print("Using static loss scale of", self.cur_scale)
self.cur_iter +=1
return
# Promote state so it can be retrieved or set via "fp16_optimizer_instance.state"
def _get_state(self):
return self.optimizer.state
def _set_state(self, value):
self.optimizer.state = value
state = property(_get_state, _set_state)
# Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups"
# (for example, to adjust the learning rate)
def _get_param_groups(self):
return self.optimizer.param_groups
def _set_param_groups(self, value):
self.optimizer.param_groups = value
param_groups = property(_get_param_groups, _set_param_groups)
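These promoted properties let callers (and stock learning-rate schedulers) address the FP16 wrapper as if it were the optimizer it wraps. A minimal usage sketch, assuming Apex is installed with its CUDA extension and that the import path and constructor arguments below match the Apex layout of this era; the model and hyperparameter values are placeholders:

import torch
from apex.optimizers import FusedAdam, FP16_Optimizer  # assumed import path; needs the fused_adam_cuda extension

model = torch.nn.Linear(1024, 1024).half().cuda()        # placeholder FP16 model
optimizer = FP16_Optimizer(FusedAdam(model.parameters(), lr=1e-3), static_loss_scale=128.0)

# Because param_groups is promoted, the learning rate can be adjusted directly on the wrapper.
for group in optimizer.param_groups:
    group['lr'] = 5e-4

# Because state is promoted, it can be read (e.g. for checkpointing) without reaching into the inner optimizer.
checkpoint = {'optimizer_state': optimizer.state}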
-import math
import torch
import fused_adam_cuda

-def warmup_cosine(x, warmup=0.002):
-    if x < warmup:
-        return x/warmup
-    return 0.5 * (1.0 + torch.cos(math.pi * x))
-
-def warmup_constant(x, warmup=0.002):
-    if x < warmup:
-        return x/warmup
-    return 1.0
-
-def warmup_linear(x, warmup=0.002):
-    if x < warmup:
-        return x/warmup
-    return 1.0 - x
-
-SCHEDULES = {
-    'warmup_cosine':warmup_cosine,
-    'warmup_constant':warmup_constant,
-    'warmup_linear':warmup_linear,
-}
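With these schedule helpers removed from the optimizer, the same warmup moves into the training loop, as the commit message notes. A hedged sketch of user-side linear warmup; `optimizer` is assumed to be a FusedAdam (or the FP16_Optimizer wrapper) constructed elsewhere, and `base_lr`, `warmup_frac`, and `t_total` are illustrative values:

base_lr, warmup_frac, t_total = 1e-3, 0.002, 100000   # illustrative hyperparameters

def user_warmup_linear(progress, warmup=0.002):
    # progress is the fraction of total training steps completed
    if progress < warmup:
        return progress / warmup
    return 1.0 - progress

for step in range(t_total):
    # ... forward pass and backward pass go here ...
    lr_this_step = base_lr * user_warmup_linear(step / t_total, warmup_frac)
    for group in optimizer.param_groups:     # works through the promoted property as well
        group['lr'] = lr_this_step
    optimizer.step()
    optimizer.zero_grad()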
class FusedAdam(torch.optim.Optimizer):
"""Implements Adam algorithm. Currently GPU-only. Requires Apex to be installed via
......@@ -54,12 +32,12 @@ class FusedAdam(torch.optim.Optimizer):
"""
    def __init__(self, params,
-                lr=1e-3, warmup=-1, t_total=-1, schedule='warmup_linear',
+                lr=1e-3, bias_correction = True,
                 betas=(0.9, 0.999), eps=1e-8, eps_inside_sqrt = False,
                 weight_decay=0., max_grad_norm=0., amsgrad=False):
        if amsgrad:
            raise RuntimeError('FusedAdam does not support the AMSGrad variant.')
-        defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
+        defaults = dict(lr=lr, bias_correction=bias_correction,
                        betas=betas, eps=eps, weight_decay=weight_decay,
                        max_grad_norm=max_grad_norm)
        super(FusedAdam, self).__init__(params, defaults)
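A usage sketch of the revised constructor; the import assumes Apex is installed with its CUDA extension, and the model and hyperparameter values are placeholders:

import torch
from apex.optimizers import FusedAdam   # requires the fused_adam_cuda extension

model = torch.nn.Linear(1024, 1024).cuda()       # placeholder model
optimizer = FusedAdam(model.parameters(),
                      lr=1e-3,
                      bias_correction=True,      # replaces the removed schedule/warmup/t_total arguments
                      betas=(0.9, 0.999),
                      eps=1e-8,
                      weight_decay=0.01,
                      max_grad_norm=1.0)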
......@@ -117,6 +95,11 @@ class FusedAdam(torch.optim.Optimizer):
                if clip > 1:
                    combined_scale = clip * scale

+            # set bias correction for this group
+            bias_correction = 0
+            if group['bias_correction']:
+                bias_correction = 1
+
            for p, grad, output_param in zip(group['params'], grads_this_group, output_params_this_group):
                #note: p.grad should not ever be set for correct operation of mixed precision optimizer that sometimes sends None gradients
                if p.grad is None and grad is None:
......@@ -139,14 +122,6 @@ class FusedAdam(torch.optim.Optimizer):
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']

-                if group['t_total'] != -1:
-                    schedule_fct = SCHEDULES[group['schedule']]
-                    lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
-                    bias_correction = 0
-                else:
-                    lr_scheduled = group['lr']
-                    bias_correction = 1
-
                state['step'] += 1

                out_p = torch.tensor([], dtype = torch.float) if output_param is None else output_param
......
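Since step() no longer computes a schedule and simply reads group['lr'] (passing bias_correction through to the fused kernel), a stock PyTorch scheduler can drive the same warmup from outside. A sketch assuming the FusedAdam instance from the earlier example and illustrative `t_total` and `warmup_frac` values:

import torch

t_total, warmup_frac = 100000, 0.002   # illustrative; mirrors the removed warmup_linear defaults

def lr_lambda(step):
    progress = step / t_total
    if progress < warmup_frac:
        return progress / warmup_frac
    return max(0.0, 1.0 - progress)

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda)

for step in range(t_total):
    # ... forward pass and backward pass go here ...
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()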