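Some cleanup: route FusedAdam through _process_optimizer (dropping the wrap_fused_adam / FP16_Optimizer special cases) and factor the shared gradient-unscaling loop into post_backward_models_are_masters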

fc6c5a25 · Michael Carilli · 683b6e0e · fc6c5a25 · fc6c5a25 · fc6c5a25
Commit fc6c5a25 authored Apr 10, 2019 by Michael Carilli
Hide whitespace changes
Inline Side-by-side

Showing with 77 additions and 115 deletions

apex/amp/_initialize.py apex/amp/_initialize.py +1 -21

apex/amp/_process_optimizer.py apex/amp/_process_optimizer.py +72 -83

apex/amp/handle.py apex/amp/handle.py +4 -11

No files found.
--- a/apex/amp/_initialize.py
+++ b/apex/amp/_initialize.py
@@ -107,22 +107,6 @@ def check_optimizers(optimizers):
                               "on the specified opt_level (and optional overridden properties).")


-def wrap_fused_adam(optimizer, properties):
-    msg = 'Currently, the usage of FusedAdam is restricted to '\
-          'amp.initialize(..., opt_level="O2", keep_batchnorm_fp32=False, '\
-          'loss_scale=float or "dynamic").  We are working on enabling more general usage.'
-
-    assert properties.master_weights is True, msg
-    assert properties.cast_model_type is torch.float16, msg
-    assert (properties.keep_batchnorm_fp32 is False or
-            properties.keep_batchnorm_fp32 is None), msg
-
-    if properties.loss_scale == "dynamic":
-        return FP16_Optimizer_for_fused(optimizer, dynamic_loss_scale=True)
-    else:
-        return FP16_Optimizer_for_fused(optimizer, static_loss_scale=properties.loss_scale)
-
-
 def _initialize(models, optimizers, properties, num_losses=1):
    from apex.parallel import DistributedDataParallel as apex_DDP
    from .amp import init as amp_init
@@ -184,11 +168,7 @@ def _initialize(models, optimizers, properties, num_losses=1):
            optimizer.load_state_dict(optimizer.state_dict())

    for i, optimizer in enumerate(optimizers):
-        # Still need to special case this for the first pass
-        if isinstance(optimizer, FusedAdam):
-            optimizers[i] = wrap_fused_adam(optimizer, properties)
-        else:
-            optimizers[i] = _process_optimizer(optimizer, properties)
+        optimizers[i] = _process_optimizer(optimizer, properties)

    _amp_state.loss_scalers = []
    for _ in range(num_losses):

--- a/apex/amp/_process_optimizer.py
+++ b/apex/amp/_process_optimizer.py
@@ -3,6 +3,7 @@ from ..fp16_utils import master_params_to_model_params
 from ..multi_tensor_apply import multi_tensor_applier
 from ._amp_state import maybe_print
 import torch
+from ..optimizers import FusedAdam


 class AmpOptimizerState(object):
@@ -73,6 +74,40 @@ def lazy_init_with_master_weights(self):
        self.load_state_dict(self.state_dict())


+def post_backward_models_are_masters(scaler, params, stashed_grads):
+        # This is a lot of python overhead...
+        grads_needing_unscale = []
+        grads_needing_unscale_with_stash = []
+        stashed = []
+        for param, stashed_grad in zip(params, stashed_grads):
+            if param.grad is None and stashed_grad is not None:
+                param.grad = stashed_grad
+            elif param.grad is not None and stashed_grad is None:
+                grads_needing_unscale.append(param.grad)
+            elif param.grad is not None and stashed_grad is not None:
+                grads_needing_unscale_with_stash.append(param.grad)
+                stashed.append(stashed_grad)
+            else: # param.grad is None and stashed_grad is None
+                continue
+
+        if len(grads_needing_unscale) > 0:
+            scaler.unscale(
+                grads_needing_unscale,
+                grads_needing_unscale,
+                scaler.loss_scale(),
+                models_are_masters=True)
+
+        if len(grads_needing_unscale_with_stash) > 0:
+            scaler.unscale_with_stashed(
+                grads_needing_unscale_with_stash,
+                stashed,
+                grads_needing_unscale_with_stash)
+
+        # Clear the stash.
+        for i in range(len(stashed_grads)):
+            stashed_grads[i] = None
+
+
 def prepare_backward_with_master_weights(self):
    stash = self._amp_stash

@@ -106,7 +141,7 @@ def post_backward_with_master_weights(self, scaler):
        if fp16_param.grad is None and fp32_param.grad is not None:
            continue
        elif fp16_param.grad is not None and fp32_param.grad is None:
-            fp32_param.grad = torch.empty_like(fp32_param) 
+            fp32_param.grad = torch.empty_like(fp32_param)
            fp16_grads_needing_unscale.append(fp16_param.grad)
            new_fp32_grads.append(fp32_param.grad)
        elif fp16_param.grad is not None and fp32_param.grad is not None:
@@ -129,37 +164,10 @@ def post_backward_with_master_weights(self, scaler):
            preexisting_fp32_grads)

    # fp32 params can be treated as they would be in the "no_master_weights" case.
-    grads_needing_unscale = []
-    grads_needing_unscale_with_stash = []
-    stashed = []
-    for param, stashed_grad in zip(stash.all_fp32_from_fp32_params,
-                                   stash.all_fp32_from_fp32_grad_stash):
-        if param.grad is None and stashed_grad is not None:
-            param.grad = stashed_grad
-        elif param.grad is not None and stashed_grad is None:
-            grads_needing_unscale.append(param.grad)
-        elif param.grad is not None and stashed_grad is not None:
-            grads_needing_unscale_with_stash.append(param.grad)
-            stashed.append(stashed_grad)
-        else: # param.grad is None and stashed_grad is None:
-            continue
-
-    if len(grads_needing_unscale) > 0:
-        scaler.unscale(
-            grads_needing_unscale,
-            grads_needing_unscale,
-            scaler.loss_scale(),
-            models_are_masters=True)
-
-    if len(grads_needing_unscale_with_stash) > 0:
-        scaler.unscale_with_stashed(
-            grads_needing_unscale_with_stash,
-            stashed,
-            grads_needing_unscale_with_stash)
-
-    # Clear the stash.
-    for i in range(len(stash.all_fp32_from_fp32_grad_stash)):
-        stash.all_fp32_from_fp32_grad_stash[i] = None
+    post_backward_models_are_masters(
+        scaler,
+        stash.all_fp32_from_fp32_params,
+        stash.all_fp32_from_fp32_grad_stash)


 def lazy_init_no_master_weights(self):
@@ -176,7 +184,7 @@ def lazy_init_no_master_weights(self):
                raise TypeError("Optimizer's parameters must be either "
                                "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
                                "Received {}".format(param.type()))
-    
+
    stash.all_fp16_grad_stash = [None for _ in stash.all_fp16_params]
    stash.all_fp32_grad_stash = [None for _ in stash.all_fp32_params]

@@ -206,37 +214,7 @@ def post_backward_no_master_weights(self, scaler):
             (stash.all_fp32_params, stash.all_fp32_grad_stash))

    for params, stashed_grads in split_types:
-        # This is a lot of python overhead...
-        grads_needing_unscale = []
-        grads_needing_unscale_with_stash = []
-        stashed = []
-        for param, stashed_grad in zip(params, stashed_grads):
-            if param.grad is None and stashed_grad is not None:
-                param.grad = stashed_grad
-            elif param.grad is not None and stashed_grad is None:
-                grads_needing_unscale.append(param.grad)
-            elif param.grad is not None and stashed_grad is not None:
-                grads_needing_unscale_with_stash.append(param.grad)
-                stashed.append(stashed_grad)
-            else: # param.grad is None and stashed_grad is None
-                continue
-
-        if len(grads_needing_unscale) > 0:
-            scaler.unscale(
-                grads_needing_unscale,
-                grads_needing_unscale,
-                scaler.loss_scale(),
-                models_are_masters=True)
-
-        if len(grads_needing_unscale_with_stash) > 0:
-            scaler.unscale_with_stashed(
-                grads_needing_unscale_with_stash,
-                stashed,
-                grads_needing_unscale_with_stash)
-
-        # Clear the stash.
-        for i in range(len(stashed_grads)):
-            stashed_grads[i] = None
+        post_backward_models_are_masters(scaler, params, stashed_grads)


 def _master_params_to_model_params(self):
@@ -283,15 +261,16 @@ def _process_optimizer(optimizer, properties):
        optimizer._master_params_to_model_params = types.MethodType(
            _master_params_to_model_params, optimizer)

-        old_step = optimizer.step
-        def new_step(self):
-            retval = old_step()
-            self._master_params_to_model_params()
-            # Clear the master grads that wouldn't be zeroed by model.zero_grad()
-            for param in self._amp_stash.all_fp32_from_fp16_params:
-                param.grad = None
-            return retval
-        optimizer.step = types.MethodType(new_step, optimizer)
+        if not isinstance(optimizer, FusedAdam):
+            old_step = optimizer.step
+            def new_step(self):
+                retval = old_step()
+                self._master_params_to_model_params()
+                # Clear the master grads that wouldn't be zeroed by model.zero_grad()
+                for param in self._amp_stash.all_fp32_from_fp16_params:
+                    param.grad = None
+                return retval
+            optimizer.step = types.MethodType(new_step, optimizer)

        old_zero_grad = optimizer.zero_grad
        def new_zero_grad(self):
@@ -313,19 +292,29 @@ def _process_optimizer(optimizer, properties):
                param.grad = None
        optimizer.zero_grad = types.MethodType(new_zero_grad, optimizer)

-        optimizer._prepare_amp_backward = types.MethodType(
-            prepare_backward_with_master_weights, optimizer)
-
-        optimizer._post_amp_backward = types.MethodType(
-            post_backward_with_master_weights, optimizer)
+        if isinstance(optimizer, FusedAdam):
+            optimizer._prepare_amp_backward = types.MethodType(
+                prepare_backward_with_master_weights_fused, optimizer)
+            optimizer._post_amp_backward = types.MethodType(
+                post_backward_with_master_weights_fused, optimizer)
+        else:
+            optimizer._prepare_amp_backward = types.MethodType(
+                prepare_backward_with_master_weights, optimizer)
+            optimizer._post_amp_backward = types.MethodType(
+                post_backward_with_master_weights, optimizer)
    else:
        optimizer._lazy_init_maybe_master_weights = types.MethodType(
            lazy_init_no_master_weights, optimizer)

-        optimizer._prepare_amp_backward = types.MethodType(
-            prepare_backward_no_master_weights, optimizer)
-
-        optimizer._post_amp_backward = types.MethodType(
-            post_backward_no_master_weights, optimizer)
+        if isinstance(optimizer, FusedAdam):
+            optimizer._prepare_amp_backward = types.MethodType(
+                prepare_backward_no_master_weights_fused, optimizer)
+            optimizer._post_amp_backward = types.MethodType(
+                post_backward_no_master_weights_fused, optimizer)
+        else:
+            optimizer._prepare_amp_backward = types.MethodType(
+                prepare_backward_no_master_weights, optimizer)
+            optimizer._post_amp_backward = types.MethodType(
+                post_backward_no_master_weights, optimizer)

    return optimizer
--- a/apex/amp/handle.py
+++ b/apex/amp/handle.py
@@ -6,8 +6,6 @@ from . import utils
 from .opt import OptimWrapper
 from .scaler import LossScaler
 from ._amp_state import _amp_state, master_params, maybe_print
-from ..fp16_utils import FP16_Optimizer as FP16_Optimizer_general
-from ..optimizers import FP16_Optimizer as FP16_Optimizer_for_fused


 # There's no reason to expose the notion of a "handle". Everything can happen through amp.* calls.
@@ -82,13 +80,8 @@ def scale_loss(loss,
    if isinstance(optimizers, torch.optim.Optimizer):
        optimizers = [optimizers]

-    # this is what happens when i have to support tools from different sources under the same API...
-    # TODO:  Rewrite FusedAdam to use multi-tensor apply and the same loss scaler.
-    if isinstance(optimizers, FP16_Optimizer_for_fused):
-        loss_scale = optimizers.cur_scale
-    else:
-        loss_scaler = _amp_state.loss_scalers[loss_id]
-        loss_scale = loss_scaler.loss_scale()
+    loss_scaler = _amp_state.loss_scalers[loss_id]
+    loss_scale = loss_scaler.loss_scale()

    if ((not _amp_state.opt_properties.master_weights)
        and (not loss_scaler.dynamic)
@@ -113,8 +106,8 @@ def scale_loss(loss,
        for optimizer in optimizers:
            optimizer._amp_stash.params_have_scaled_gradients = True
    else:
-        # FusedAdam and FusedSGD will take care of unscaling as part of their step() methods.
-        if not isinstance(optimizers, FP16_Optimizer_for_fused):
+        # FusedAdam and FusedSGD may take care of unscaling as part of their step() methods.
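+        # The old "if not isinstance(optimizers, FP16_Optimizer_for_fused):" guard is disabled, so the (still extra-indented) statements below now run for every optimizer.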
            loss_scaler.clear_overflow_state()
            for optimizer in optimizers:
                optimizer._post_amp_backward(loss_scaler)