static_loss_scale (float, optional, default=1.0): Loss scale used internally to scale gradients computed by the model. Any fp16 gradients will be copied to fp32, then downscaled before being applied to the fp32 master params, so ``static_loss_scale`` should not affect learning rate.
dynamic_loss_scale (bool, optional, default=False): Use dynamic loss scaling. If True, this will override any ``static_loss_scale`` option.
dynamic_loss_args (dict, optional, default=None): Dict of kwargs that will be forwarded to the internal :class:`DynamicLossScaler` instance's constructor. Keys of this dict must match kwargs accepted by :class:`DynamicLossScaler`'s constructor. If ``dynamic_loss_args`` is unspecified, :class:`DynamicLossScaler`'s defaults will be used.
verbose (bool, optional, default=True): By default, FP16_Optimizer's constructor prints out the parameters and parameter groups it is ingesting, as a sanity check. If this becomes annoying (e.g. for large models), it can be disabled by passing ``verbose=False``. ``verbose=False`` will not disable printing when the loss scale is readjusted during dynamic loss scaling.
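For illustration, a minimal sketch of how these constructor options might be combined. The SGD settings, loss-scale values, ``dynamic_loss_args`` keys, and the assumption that ``FP16_Optimizer`` and ``model`` are already in scope are placeholders, not prescribed choices::

    import torch
    # The import path for FP16_Optimizer depends on where this class lives in your tree.

    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    # Fixed loss scale: fp16 grads are copied to fp32 and unscaled before the
    # master-param update, so the value should not change the effective learning rate.
    optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0)

    # ...or, wrapping a fresh inner optimizer, dynamic loss scaling instead
    # (this overrides static_loss_scale). Keys in dynamic_loss_args must match
    # DynamicLossScaler's constructor; 'scale_window' here is illustrative.
    # verbose=False silences the constructor's parameter printout.
    optimizer = FP16_Optimizer(torch.optim.SGD(model.parameters(), lr=0.01),
                               dynamic_loss_scale=True,
                               dynamic_loss_args={'scale_window': 500},
                               verbose=False)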
``init_optimizer`` is expected to have been constructed in the ordinary way.
It is recommended (although not required) that the newly constructed :class:`FP16_Optimizer` instance be
...
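A short sketch of the construction pattern recommended above, reusing the wrapped optimizer's name so later references do not need to change. The ``backward``/``step`` calls assume the wrapper's usual loss-scaling interface, documented elsewhere in this class; ``model``, ``inputs``, and the hyperparameters are placeholders::

    # Construct the inner optimizer in the ordinary way...
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    # ...then give the wrapper the same name.
    optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0)

    optimizer.zero_grad()
    loss = model(inputs).sum()   # placeholder forward pass and loss
    optimizer.backward(loss)     # the wrapper applies and later undoes the loss scale
    optimizer.step()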
@@ -105,10 +106,13 @@ class FP16_Optimizer(object):
                  init_optimizer,
                  static_loss_scale=1.0,
                  dynamic_loss_scale=False,
-                 dynamic_loss_args=None):
+                 dynamic_loss_args=None,
+                 verbose=True):
         if not torch.cuda.is_available:
             raise SystemError("Cannot use fp16 without CUDA.")
+        self.verbose = verbose
         self.optimizer = init_optimizer
         # init_state_dict sets up an alternative way to cast per-param state tensors.
         # Stashing here in case https://github.com/pytorch/pytorch/issues/7733 makes it necessary.
...
@@ -118,14 +122,14 @@ class FP16_Optimizer(object):
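The second hunk (body elided here) replaces as many lines as it removes, which is consistent with the constructor's existing print statements being gated on the new flag rather than rewritten. If that is the intent, the gating pattern might look like the standalone sketch below; the class and method names are hypothetical, not taken from the diff::

    class _PrintGate(object):
        # Illustrative only: gate constructor chatter on a verbose flag.
        def __init__(self, verbose=True):
            self.verbose = verbose

        def maybe_print(self, msg):
            # Print only if verbose=True was requested at construction time.
            if self.verbose:
                print(msg)

    _PrintGate(verbose=False).maybe_print("ingesting param group 0")  # suppressed
    _PrintGate(verbose=True).maybe_print("ingesting param group 0")   # printed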