Commit d44ce75a authored by Michael Carilli

Documentation updates

parent 7f39db93
......@@ -2,3 +2,4 @@ from .amp import init, half_function, float_function, promote_function,\
register_half_function, register_float_function, register_promote_function
from .handle import scale_loss, disable_casts
from .frontend import initialize
from ._amp_state import master_params
......@@ -15,3 +15,19 @@ def warn_or_err(msg):
else:
raise RuntimeError(msg + " If you're sure you know what you're doing, supply " +
"hard_override=True to amp.initialize.")
# def iter_params(param_groups):
# for group in param_groups:
# for p in group['params']:
# yield p
def master_params(optimizer):
"""
Generator that iterates over the params owned by ``optimizer``.
Args:
optimizer: An optimizer previously returned from ``amp.initialize``.
"""
for group in optimizer.param_groups:
for p in group['params']:
yield p
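``master_params`` is also the natural iterable for gradient clipping when FP32 master weights are in use (see the "Gradient clipping" notes further down this page). A minimal usage sketch, assuming ``loss``, ``model``, and ``optimizer`` come from a script already set up with ``amp.initialize``, and with ``max_norm=1.0`` as an arbitrary placeholder::

    import torch
    from apex import amp

    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    # On context manager exit the FP32 master gradients have been unscaled,
    # so clip those (not the raw FP16 .grad attributes) before stepping.
    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_norm=1.0)
    optimizer.step()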
......@@ -169,13 +169,60 @@ opt_levels = {"O3": O3(),
# allow user to directly pass Properties struct as well?
def initialize(models, optimizers, enabled=True, opt_level=None, **kwargs):
"""
Expected kwargs:
opt_level=None,
cast_model_type=None,
patch_torch_functions=None,
keep_batchnorm_fp32=None,
master_weights=None,
loss_scale=None
Initialize your models, optimizers, and the Torch tensor and functional namespace according to the
chosen ``opt_level`` and overridden properties, if any.
To avoid having to rewrite anything else in your script, assign the returned models/optimizers
to the same names as the models/optimizers you passed in, as in the Usage below.
Args:
models (torch.nn.Module or list of torch.nn.Modules): Models to modify/cast.
optimizers (torch.optim.Optimizer or list of torch.optim.Optimizers): Optimizers to modify/cast.
enabled (bool, optional, default=True): If False, renders all Amp calls no-ops, so your script
should run as if Amp were not present.
opt_level (str, required): Pure or mixed precision optimization level. Accepted values are
"O0", "O1", "O2", and "O3", which are explained in detail above.
cast_model_type (torch.dtype, optional, default=None): Optional property override, see
above.
patch_torch_functions (bool, optional, default=None): Optional property override.
keep_batchnorm_fp32 (bool or str, optional, default=None): Optional property override. If
passed as a string, must be the string "True" or "False".
master_weights (bool, optional, default=None): Optional property override.
loss_scale (float or str, optional, default=None): Optional property override. If passed as a string,
must be a string representing a number, e.g., "128.0", or the string "dynamic".
Returns:
Model(s) and optimizer(s) modified according to the ``opt_level``.
If either the ``models`` or ``optimizers`` args were lists, the corresponding return value will
also be a list.
Usage::
model, optim = amp.initialize(model, optim,...)
model, [optim1, optim2] = amp.initialize(model, [optim1, optim2],...)
[model1, model2], optim = amp.initialize([model1, model2], optim,...)
[model1, model2], [optim1, optim2] = amp.initialize([model1, model2], [optim1, optim2],...)
# This is not an exhaustive list of the cross product of options that are possible,
# just a set of examples.
model, optim = amp.initialize(model, optim, opt_level="O0")
model, optim = amp.initialize(model, optim, opt_level="O0", loss_scale="dynamic"|128.0|"128.0")
model, optim = amp.initialize(model, optim, opt_level="O1") # uses "loss_scale="dynamic" default
model, optim = amp.initialize(model, optim, opt_level="O1", loss_scale=128.0|"128.0")
model, optim = amp.initialize(model, optim, opt_level="O2") # uses "loss_scale="dynamic" default
model, optim = amp.initialize(model, optim, opt_level="O2", loss_scale=128.0|"128.0")
model, optim = amp.initialize(model, optim, opt_level="O2", keep_batchnorm_fp32=True|False|"True"|"False")
model, optim = amp.initialize(model, optim, opt_level="O3") # uses loss_scale=1.0 default
model, optim = amp.initialize(model, optim, opt_level="O3", loss_scale="dynamic"|128.0|"128.0")
model, optim = amp.initialize(model, optim, opt_level="O3", keep_batchnorm_fp32=True|False|"True"|"False")
The `Imagenet example`_ demonstrates live use of various opt_levels and overrides.
.. _`Imagenet example`:
https://github.com/NVIDIA/apex/tree/master/examples/imagenet
"""
if not enabled:
if "hard_override" in kwargs:
......
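To put the ``initialize`` signature above in context, here is a minimal end-to-end sketch; the layer sizes, learning rate, and ``loader`` are placeholders rather than values prescribed anywhere in this commit::

    import torch
    from apex import amp

    model = torch.nn.Linear(512, 10).cuda()                   # built in default FP32
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

    # Reassign the returned objects, as recommended in the docstring above.
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    for input, target in loader:                               # ``loader`` assumed to exist
        optimizer.zero_grad()
        loss = torch.nn.functional.cross_entropy(model(input), target)
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        optimizer.step()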
......@@ -5,8 +5,8 @@ import torch
from . import utils
from .opt import OptimWrapper
from .scaler import LossScaler, iter_params
from ._amp_state import _amp_state
from .scaler import LossScaler
from ._amp_state import _amp_state, master_params
from ..fp16_utils import FP16_Optimizer as FP16_Optimizer_general
from ..optimizers import FP16_Optimizer as FP16_Optimizer_for_fused
......@@ -18,17 +18,44 @@ def scale_loss(loss,
model=None,
delay_unscale=False):
"""
On context manager entrance, scale the loss in a way consistent with the current loss scale.
Yield the loss
On context manager entrance, creates ``scaled_loss = (loss.float())*current loss scale``.
``scaled_loss`` is yielded so that the user can call ``scaled_loss.backward()``::
On context manager exit (if ``delay_unscale=False``), unscale the gradients so that
``optimizer.step()`` can be called.
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
On context manager exit (if ``delay_unscale=False``), the gradients are checked for infs/NaNs
and unscaled, so that ``optimizer.step()`` can be called.
.. note::
If Amp is using explicit FP32 master params (which is the default for ``opt_level=O2``, and
can also be manually enabled by supplying ``master_weights=True`` to ``amp.initialize``)
any FP16 gradients are copied to FP32 master gradients before being unscaled. ``optimizer.step()``
will then apply the unscaled master gradients to the master params.
If Amp is using explicit FP32 master params (which is the default for ``opt_level=O2``, and
can also be manually enabled by supplying ``master_weights=True`` to ``amp.initialize``),
any FP16 gradients are copied to FP32 master gradients before being unscaled.
``optimizer.step()`` will then apply the unscaled master gradients to the master params.
.. warning::
If Amp is using explicit FP32 master params, only the FP32 master gradients will be
unscaled. The direct ``.grad`` attributes of any FP16
model params will remain scaled after context manager exit.
This subtlety affects gradient clipping. See "Gradient clipping" under
"Advanced use cases" for best practices.
Args:
loss (Tensor): Typically a scalar Tensor. The ``scaled_loss`` that the context
manager yields is simply ``loss.float()*loss_scale``, so in principle
``loss`` could have more than one element, as long as you call
``backward()`` on ``scaled_loss`` appropriately within the context manager body.
optimizer: **Must** be an optimizer returned from an earlier call to ``amp.initialize``.
model (torch.nn.Module, optional, default=None): Currently unused, reserved to enable future
optimizations.
delay_unscale (bool, optional, default=False): Don't unscale the gradients or perform model->master
gradient copies on context manager exit. "Advanced use cases" illustrates
situations where this is necessary.
.. warning:: If ``True``, ``optimizer.step()`` cannot be called immediately after context manager
exit; it must wait for a later invocation of the backward context manager with
``delay_unscale`` left as ``False``.
See "Advanced use cases" for examples.
"""
if not _amp_state.opt_properties.enabled:
yield loss
......@@ -70,8 +97,8 @@ def scale_loss(loss,
else:
optimizer.loss_scaler.clear_overflow_state()
optimizer.loss_scaler.unscale(
iter_params(optimizer.param_groups),
iter_params(optimizer.param_groups),
master_params(optimizer),
master_params(optimizer),
loss_scale)
# For future fused optimizers that enable sync-free dynamic loss scaling,
# should_skip will always be False.
......@@ -137,8 +164,8 @@ class AmpHandle(object):
self._default_scaler.clear_overflow_state()
self._default_scaler.unscale(
iter_params(optimizer.param_groups),
iter_params(optimizer.param_groups),
master_params(optimizer),
master_params(optimizer),
loss_scale)
should_skip = self._default_scaler.update_scale()
if should_skip:
......
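As a sketch of the ``delay_unscale`` behavior documented in ``scale_loss`` above, gradient accumulation can skip the unscale/copy work on iterations that do not step; ``iters_to_accumulate``, ``loader``, ``model``, and ``loss_fn`` are hypothetical names, and "Advanced use cases" remains the authoritative reference (including caveats about the dynamic loss scale changing mid-accumulation)::

    from apex import amp

    iters_to_accumulate = 4                       # hypothetical accumulation interval
    for i, (input, target) in enumerate(loader):
        loss = loss_fn(model(input), target)
        will_step = (i + 1) % iters_to_accumulate == 0
        # Only unscale (and do model->master gradient copies) on iterations that will step.
        with amp.scale_loss(loss, optimizer, delay_unscale=not will_step) as scaled_loss:
            scaled_loss.backward()
        if will_step:
            optimizer.step()
            optimizer.zero_grad()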
......@@ -2,7 +2,7 @@ import contextlib
import logging
import warnings
from .scaler import LossScaler, iter_params
from .scaler import LossScaler, master_params
import numpy as np
......@@ -27,26 +27,26 @@ class OptimWrapper(object):
# all mixed together.
cached_grads = []
if self._loss_idx > 0:
for p in iter_params(self._optimizer.param_groups):
for p in master_params(self._optimizer):
if p.grad is not None:
cached_grads.append(p.grad.data.detach().clone())
else:
cached_grads.append(None)
self._optimizer.zero_grad()
loss_scale = self._cur_loss_scaler().loss_scale()
yield loss * loss_scale
self._cur_loss_scaler().clear_overflow_state()
self._cur_loss_scaler().unscale(
iter_params(self._optimizer.param_groups),
iter_params(self._optimizer.param_groups),
master_params(self._optimizer),
master_params(self._optimizer),
loss_scale)
self._skip_next[self._loss_idx] = self._cur_loss_scaler().update_scale()
self._loss_idx += 1
if len(cached_grads) > 0:
for p, cached_grad in zip(iter_params(self._optimizer.param_groups),
for p, cached_grad in zip(master_params(self._optimizer),
cached_grads):
if cached_grad is not None:
p.grad.data.add_(cached_grad)
......
import torch
import logging
from ..multi_tensor_apply import multi_tensor_applier
from ._amp_state import _amp_state
from ._amp_state import _amp_state, master_params
from itertools import product
# from apex_C import scale_check_overflow
......@@ -172,8 +172,3 @@ class LossScaler(object):
self._unskipped = 0
return should_skip
def iter_params(param_groups):
for group in param_groups:
for p in group['params']:
yield p
......@@ -14,5 +14,3 @@ from .fp16util import (
from .fp16_optimizer import FP16_Optimizer
from .loss_scaler import LossScaler, DynamicLossScaler
test = 1
......@@ -4,7 +4,10 @@
apex.amp
===================================
This page documents the update API for Amp (Automatic Mixed Precision),
Unified API
-----------
This page documents the updated API for Amp (Automatic Mixed Precision),
a tool to enable Tensor Core-accelerated training in only 3 lines of Python.
Amp allows users to easily experiment with different pure and mixed precision modes, including
......@@ -23,8 +26,12 @@ Amp can also be disabled, in which case the original script will behave exactly
In this way, there's no risk in adhering to the Amp API, and a lot of potential performance benefit.
Example::
# Declare model and optimizer as usual
model = torch.nn.Linear(D_in, D_out).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
# Allow Amp to perform casts as required by the opt_level
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
...
# loss.backward() becomes:
......@@ -35,11 +42,18 @@ Example::
A `runnable, comprehensive Imagenet example`_ demonstrating good practices can be found
on the Github page.
DCGAN is a tricky case that many people have requested. A comprehensive example is under construction.
GANs are a tricky case that many people have requested. A `comprehensive DCGAN example`_
is under construction.
``opt_level``\ s and Properties
-------------------------------
.. _`runnable, comprehensive Imagenet example`:
https://github.com/NVIDIA/apex/tree/master/examples/imagenet
.. _`comprehensive DCGAN example`:
https://github.com/NVIDIA/apex/tree/master/examples/dcgan
.. automodule:: apex.amp
.. currentmodule:: apex.amp
......@@ -47,19 +61,23 @@ DCGAN is a tricky case that many people have requested. A comprehensive example
.. autofunction:: scale_loss
.. autofunction:: master_params
Advanced use cases
------------------
The new Amp API supports gradient accumulation across iterations,
multiple backward passes per iteration, multiple models/optimizers,
and forcing layers to a particular type. Further details can be found here:
and custom/user-defined autograd functions. Gradient clipping and GANs also
require special treatment, but this treatment does not need to change
for different ``opt_level``\ s. Further details can be found here:
.. toctree::
:maxdepth: 1
advanced
Transition Guide for Old API Users
Transition guide for old API users
----------------------------------
We strongly encourage moving to the new Amp API, because it's more versatile, easier to use, and future-proof. The original :class:`FP16_Optimizer` and the old "Amp" API are deprecated, and subject to removal at any time.
......@@ -77,6 +95,7 @@ The functions formerly exposed through ``amp_handle`` are now free
functions accessible through the ``amp`` module.
The backward context manager must be changed accordingly::
# old API
with amp_handle.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
......@@ -96,6 +115,7 @@ with a particular precision are still honored by the new API.
``opt_level="O2"`` is equivalent to :class:`FP16_Optimizer` with ``dynamic_loss_scale=True``.
Once again, the backward pass must be changed to the unified version::
optimizer.backward(loss)
->
with amp.scale_loss(loss, optimizer) as scaled_loss:
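Putting both halves together, a hedged sketch of that transition (``Net``, the learning rate, and the ``.half()`` call on the old path are illustrative, and the old path assumes the ``apex.fp16_utils.FP16_Optimizer`` wrapping with ``dynamic_loss_scale=True`` described above)::

    import torch
    from apex import amp
    from apex.fp16_utils import FP16_Optimizer

    # old, deprecated path
    model = Net().cuda().half()
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
    ...
    optimizer.backward(loss)

    # new, unified API: opt_level "O2" is the equivalent mode
    model = Net().cuda()                          # stay in FP32; Amp handles the casts
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    model, optimizer = amp.initialize(model, optimizer, opt_level="O2")
    ...
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()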
......@@ -108,11 +128,11 @@ necessary in the new API. No matter what --opt-level
you choose, you can and should simply build your model in the default FP32 format.** The new Amp
API will perform the right conversions during
``model, optimizer = amp.initialize(model, optimizer, opt_level=....)`` based on the ``--opt-level``
and any overridden flags. Floating point input data may be float or half, but you may as well just
let it be float, because the ``model`` returned by ``amp.initialize`` will have its ``forward``
and any overridden flags. Floating point input data may be FP32 or FP16, but you may as well just
let it be FP16, because the ``model`` returned by ``amp.initialize`` will have its ``forward``
method patched to cast the input data appropriately.
.. note::
Aside from the call to ``amp.initialize`` itself, it's never necessary to manually cast
your model or data with the new API. Therefore, a script that adheres to the new API
can switch between different ``opt-level``s without having to make any other changes.
can switch between different ``opt_level``\ s without having to make any other changes.
......@@ -37,11 +37,10 @@ Installation instructions can be found here: https://github.com/NVIDIA/apex#qui
layernorm
.. toctree::
:maxdepth: 1
:caption: Deprecated mixed precision utilities
fp16_utils
.. .. toctree::
:maxdepth: 1
:caption: Deprecated mixed precision API
fp16_util
.. reparameterization
.. RNN
......