Commit d137b800 authored by Michael Carilli

Stashing work

parent 80a3f3ca
@@ -4,23 +4,9 @@ import functools
 from apex.fp16_utils import convert_network
 from ._amp_state import _amp_state
 from .scaler import LossScaler
-from ..fp16_utils import FP16_Optimizer
+from ..fp16_utils import FP16_Optimizer as FP16_Optimizer_general
+from ..optimizers import FP16_Optimizer as FP16_Optimizer_for_fused
+from ..optimizers import FusedAdam
-def check_params_fp32(model):
-    for name, param in model.named_parameters():
-        if param.is_floating_point() and param.type() != "torch.cuda.FloatTensor":
-            print("Warning: Found param {} with type {}, expected torch.cuda.FloatTensor.\n"
-                  "When using amp.initialize, you do not need to call .half() on your model\n"
-                  "before passing it, no matter what optimization level you choose.".format(
-                  name, param.type()))
-    for name, buf in model.named_buffers():
-        if buf.is_floating_point() and buf.type() != "torch.cuda.FloatTensor":
-            print("Warning: Found buffer {} with type {}, expected torch.cuda.FloatTensor.\n"
-                  "When using amp.initialize, you do not need to call .half() on your model\n"
-                  "before passing it, no matter what optimization level you choose.".format(
-                  name, buf.type()))
 def to_type(dtype, t):
@@ -48,6 +34,56 @@ def applier(value, fn):
     return value
+def check_models(models):
+    for model in models:
+        parallel_type = None
+        if isinstance(model, torch.nn.parallel.DistributedDataParallel):
+            parallel_type = "torch.nn.parallel.DistributedDataParallel"
+        if isinstance(model, apex_DDP):
+            parallel_type = "apex.parallel.DistributedDataParallel"
+        if isinstance(model, torch.nn.parallel.DataParallel):
+            parallel_type = "torch.nn.parallel.DataParallel"
+        if parallel_type is not None:
+            raise RuntimeError("Incoming model is an instance of {}. ".format(parallel_type) +
+                "Parallel wrappers should only be applied to the model(s) AFTER \n"
+                "the model(s) have been returned from amp.initialize.")
+def check_params_fp32(models):
+    for model in models:
+        for name, param in model.named_parameters():
+            if param.is_floating_point() and param.type() != "torch.cuda.FloatTensor":
+                print("Warning: Found param {} with type {}, expected torch.cuda.FloatTensor.\n"
+                      "When using amp.initialize, you do not need to call .half() on your model\n"
+                      "before passing it, no matter what optimization level you choose.".format(
+                      name, param.type()))
+        for name, buf in model.named_buffers():
+            if buf.is_floating_point() and buf.type() != "torch.cuda.FloatTensor":
+                print("Warning: Found buffer {} with type {}, expected torch.cuda.FloatTensor.\n"
+                      "When using amp.initialize, you do not need to call .half() on your model\n"
+                      "before passing it, no matter what optimization level you choose.".format(
+                      name, buf.type()))
+def check_optimizers(optimizers):
+    for optim in optimizers:
+        bad_optim_type = None
+        if isinstance(optim, FP16_Optimizer_general):
+            bad_optim_type = "apex.fp16_utils.FP16_Optimizer"
+        if isinstance(optim, FP16_Optimizer_for_fused):
+            bad_optim_type = "apex.optimizers.FP16_Optimizer"
+        if bad_optim_type is not None:
+            raise RuntimeError("An incoming optimizer is an instance of {}. ".format(bad_optim_type) +
+                "The optimizer(s) passed to amp.initialize() should be bare \n"
+                "instances of either ordinary Pytorch optimizers, or Apex fused \n"
+                "optimizers (currently just FusedAdam, but FusedSGD will be added \n"
+                "soon). You should not manually wrap your optimizer in either \n"
+                "apex.fp16_utils.FP16_Optimizer or apex.optimizers.FP16_Optimizer. \n"
+                "amp.initialize will take care of that for you (if necessary) based \n"
+                "on the specified opt_level (and optional overridden properties).")
 def _initialize(models, optimizers, properties):
     from apex.parallel import DistributedDataParallel as apex_DDP
     from .amp import init as amp_init
@@ -68,21 +104,11 @@ def _initialize(models, optimizers, properties):
     else:
         raise TypeError("models must be either a single model or a list of models.")
-    for model in models:
-        parallel_type = None
-        if isinstance(model, torch.nn.parallel.DistributedDataParallel):
-            parallel_type = "torch.nn.parallel.DistributedDataParallel"
-        if isinstance(model, apex_DDP):
-            parallel_type = "apex.parallel.DistributedDataParallel"
-        if isinstance(model, torch.nn.parallel.DataParallel):
-            parallel_type = "torch.nn.parallel.DataParallel"
-        if parallel_type is not None:
-            raise RuntimeError("Incoming model is an instance of {}. ".format(parallel_type) +
-                "Parallel wrappers should only be applied AFTER the model(s) have been "
-                "returned from amp.initialize.")
-    for model in models:
-        check_params_fp32(model)
+    check_models(models)
+    check_params_fp32(models)
+    check_optimizers(optimizers)
     # Stash master weights before casting the model.
     # if properties.master_weights:
@@ -112,8 +138,10 @@ def _initialize(models, optimizers, properties):
     if properties.master_weights:
         for i, optimizer in enumerate(optimizers):
+            if isinstance(optimizer, FusedAdam):
+                optimizers[i] = wrap_fused_adam(optimizer, properties)
             if properties.loss_scale == "dynamic":
-                optimizers[i] = FP16_Optimizer(optimizers[i], dynamic_loss_scale=True)
+                optimizers[i] = FP16_Optimizer_general(optimizers[i], dynamic_loss_scale=True)
             else:
-                optimizers[i] = FP16_Optimizer(optimizers[i], static_loss_scale=properties.loss_scale)
+                optimizers[i] = FP16_Optimizer_general(optimizers[i], static_loss_scale=properties.loss_scale)
     else:
@@ -121,6 +149,7 @@ def _initialize(models, optimizers, properties):
             optimizer.loss_scaler = LossScaler(properties.loss_scale)
     if properties.patch_torch_functions:
+        # handle is unused here. It's accessible later through a global value anyway.
         handle = amp_init(loss_scale=properties.loss_scale)
     if optimizers_was_list:
......
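The three checks added above encode a specific calling convention: models arrive in FP32 and unwrapped, optimizers arrive bare, and parallel wrappers go on only after initialization. A minimal sketch of that intended flow, assuming the eventual amp.initialize(model, optimizer, opt_level=...) entry point (the opt_level value and keyword are assumptions, not taken from this commit):

import torch
from apex import amp
from apex.parallel import DistributedDataParallel as DDP

model = torch.nn.Linear(1024, 16).cuda()                   # FP32, no .half(), no DDP yet
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)   # bare PyTorch optimizer

# amp.initialize casts the model and wraps the optimizer as the chosen
# optimization level dictates, so neither is done manually beforehand.
model, optimizer = amp.initialize(model, optimizer, opt_level="O2")  # assumed signature

# Parallel wrappers are applied only AFTER initialize returns, as check_models requires
# (this also presumes the distributed process group is already set up).
model = DDP(model)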
@@ -26,7 +26,7 @@ except TypeError as err:
 class FP16_Optimizer(object):
     """
     :class:`FP16_Optimizer` A cutdown version of apex.fp16_utils.FP16_Optimizer.
-    Design to be used in the same way but support only fused optimizers in apex.
+    Designed only to wrap apex.optimizers.FusedAdam.
     Refer to apex.fp16_utils documents for more information.
     Example::
@@ -179,7 +179,7 @@ class FP16_Optimizer(object):
     def backward(self, loss):
         """
-        :attr:`backward` performs the following conceptual steps:
+        :attr:`backward` performs the following steps:
         1. fp32_loss = loss.float()
         2. scaled_loss = fp32_loss*loss_scale
......
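The two docstring steps quoted above are the standard loss-scaling recipe. A rough standalone sketch of the idea in plain PyTorch (not this class's actual implementation):

import torch

def scaled_backward(loss, loss_scale):
    # 1. Promote the (possibly fp16) loss to fp32 before scaling.
    fp32_loss = loss.float()
    # 2. Multiply by loss_scale so small fp16 gradients do not underflow to zero.
    scaled_loss = fp32_loss * loss_scale
    # 3. Backpropagate the scaled loss; the optimizer later divides the gradients
    #    by loss_scale (e.g. while copying them into fp32 master weights) before stepping.
    scaled_loss.backward()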
@@ -112,9 +112,9 @@ void multi_tensor_apply(
       else
       {
         // std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3 << std::endl;
-        tl.sizes[0] = tl.sizes[loc_tensor_info-1];
         for(int d = 0; d < depth; d++)
           tl.addresses[d][0] = tl.addresses[d][loc_tensor_info-1];
+        tl.sizes[0] = tl.sizes[loc_tensor_info-1];
         loc_tensor_info = 1;
       }
     }
......
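The relocated line sits in the bookkeeping that carries a partially processed tensor over to the next kernel launch: its size and per-depth pointers are copied into slot 0 and the local tensor count resets to 1. A rough Python illustration of that carry-over pattern (names and structure are simplifications, not the CUDA code):

def carry_over_last_tensor(sizes, addresses, loc_tensor_info):
    # After a full chunk list has been launched, keep only the tensor whose
    # chunks are not yet exhausted so the next launch continues from it.
    last = loc_tensor_info - 1
    for d in range(len(addresses)):        # one pointer list per depth (e.g. inputs/outputs)
        addresses[d][0] = addresses[d][last]
    sizes[0] = sizes[last]                 # the assignment the diff moves below the loop
    return 1                               # new loc_tensor_info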
import unittest
import functools as ft
import itertools as it

import torch

from apex.fp16_utils import FP16_Optimizer

# Currently no-ops (tested via examples).
# FP16_Optimizer to be deprecated and moved under unified Amp API.
class TestFP16Optimizer(unittest.TestCase):
    def setUp(self):
        N, D_in, D_out = 64, 1024, 16
        self.N = N
        self.D_in = D_in
        self.D_out = D_out
        self.x = torch.randn((N, D_in), dtype=torch.float16, device='cuda')
        self.y = torch.randn((N, D_out), dtype=torch.float16, device='cuda')
        self.model = torch.nn.Linear(D_in, D_out).cuda().half()

    # def tearDown(self):
    #     pass

    def test_minimal(self):
        pass

    def test_minimal_static(self):
        pass

    def test_minimal_dynamic(self):
        pass

    def test_closure(self):
        pass

    def test_closure_dynamic(self):
        pass

    def test_save_load(self):
        pass


if __name__ == '__main__':
    unittest.main()
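The test methods above are intentionally empty for now. As a hedged sketch only (not part of this commit), one of them could eventually exercise the wrapper along these lines, reusing the fixtures from setUp:

    def test_minimal_static(self):
        optimizer = torch.optim.SGD(self.model.parameters(), lr=1e-3)
        optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0)
        loss = torch.nn.functional.mse_loss(self.model(self.x).float(), self.y.float())
        optimizer.zero_grad()
        optimizer.backward(loss)   # scales the loss before backprop
        optimizer.step()           # unscales grads and updates via fp32 master weights
        self.assertTrue(all(p.dtype == torch.float16 for p in self.model.parameters()))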