Commit d44ce75a authored by Michael Carilli

Documentation updates

parent 7f39db93
......@@ -2,3 +2,4 @@ from .amp import init, half_function, float_function, promote_function,\
register_half_function, register_float_function, register_promote_function
from .handle import scale_loss, disable_casts
from .frontend import initialize
from ._amp_state import master_params
......@@ -15,3 +15,19 @@ def warn_or_err(msg):
else:
raise RuntimeError(msg + " If you're sure you know what you're doing, supply " +
"hard_override=True to amp.initialize.")
# def iter_params(param_groups):
# for group in param_groups:
# for p in group['params']:
# yield p
def master_params(optimizer):
"""
Generator that iterates over the params owned by ``optimizer``.
Args:
optimizer: An optimizer previously returned from ``amp.initialize``.
"""
for group in optimizer.param_groups:
for p in group['params']:
yield p
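``master_params`` is also the natural iterable for gradient clipping when FP32 master weights are in use (see the "Gradient clipping" notes further down this page). A minimal usage sketch, assuming ``loss``, ``model``, and ``optimizer`` come from a script already set up with ``amp.initialize``, and with ``max_norm=1.0`` as an arbitrary placeholder::

    import torch
    from apex import amp

    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    # On context manager exit the FP32 master gradients have been unscaled,
    # so clip those (not the raw FP16 .grad attributes) before stepping.
    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_norm=1.0)
    optimizer.step()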
......@@ -169,13 +169,60 @@ opt_levels = {"O3": O3(),
# allow user to directly pass Properties struct as well?
def initialize(models, optimizers, enabled=True, opt_level=None, **kwargs):
"""
Expected kwargs:
opt_level=None,
cast_model_type=None,
patch_torch_functions=None,
keep_batchnorm_fp32=None,
master_weights=None,
loss_scale=None
Initialize your models, optimizers, and the Torch tensor and functional namespace according to the
chosen ``opt_level`` and overridden properties, if any.
To avoid having to rewrite anything else in your script, assign the returned models/optimizers
to the same names as the models/optimizers you passed in, as in the Usage below.
Args:
models (torch.nn.Module or list of torch.nn.Modules): Models to modify/cast.
optimizers (torch.optim.Optimizer or list of torch.optim.Optimizers): Optimizers to modify/cast.
enabled (bool, optional, default=True): If False, renders all Amp calls no-ops, so your script
should run as if Amp were not present.
opt_level (str, required): Pure or mixed precision optimization level. Accepted values are
"O0", "O1", "O2", and "O3", which are explained in detail above.
cast_model_type (torch.dtype, optional, default=None): Optional property override, see
above.
patch_torch_functions (bool, optional, default=None): Optional property override.
keep_batchnorm_fp32 (bool or str, optional, default=None): Optional property override. If
passed as a string, must be the string "True" or "False".
master_weights (bool, optional, default=None): Optional property override.
loss_scale (float or str, optional, default=None): Optional property override. If passed as a string,
must be a string representing a number, e.g., "128.0", or the string "dynamic".
Returns:
Model(s) and optimizer(s) modified according to the ``opt_level``.
If either the ``models`` or ``optimizers`` args were lists, the corresponding return value will
also be a list.
Usage::
model, optim = amp.initialize(model, optim,...)
model, [optim1, optim2] = amp.initialize(model, [optim1, optim2],...)
[model1, model2], optim = amp.initialize([model1, model2], optim,...)
[model1, model2], [optim1, optim2] = amp.initialize([model1, model2], [optim1, optim2],...)
# This is not an exhaustive list of the cross product of options that are possible,
# just a set of examples.
model, optim = amp.initialize(model, optim, opt_level="O0")
model, optim = amp.initialize(model, optim, opt_level="O0", loss_scale="dynamic"|128.0|"128.0")
model, optim = amp.initialize(model, optim, opt_level="O1") # uses "loss_scale="dynamic" default
model, optim = amp.initialize(model, optim, opt_level="O1", loss_scale=128.0|"128.0")
model, optim = amp.initialize(model, optim, opt_level="O2") # uses "loss_scale="dynamic" default
model, optim = amp.initialize(model, optim, opt_level="O2", loss_scale=128.0|"128.0")
model, optim = amp.initialize(model, optim, opt_level="O2", keep_batchnorm_fp32=True|False|"True"|"False")
model, optim = amp.initialize(model, optim, opt_level="O3") # uses loss_scale=1.0 default
model, optim = amp.initialize(model, optim, opt_level="O3", loss_scale="dynamic"|128.0|"128.0")
model, optim = amp.initialize(model, optim, opt_level="O3", keep_batchnorm_fp32=True|False|"True"|"False")
The `Imagenet example`_ demonstrates live use of various opt_levels and overrides.
.. _`Imagenet example`:
https://github.com/NVIDIA/apex/tree/master/examples/imagenet
"""
if not enabled:
if "hard_override" in kwargs:
......
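To put the ``initialize`` signature above in context, here is a minimal end-to-end sketch; the layer sizes, learning rate, and ``loader`` are placeholders rather than values prescribed anywhere in this commit::

    import torch
    from apex import amp

    model = torch.nn.Linear(512, 10).cuda()                   # built in default FP32
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

    # Reassign the returned objects, as recommended in the docstring above.
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    for input, target in loader:                               # ``loader`` assumed to exist
        optimizer.zero_grad()
        loss = torch.nn.functional.cross_entropy(model(input), target)
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        optimizer.step()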
......@@ -5,8 +5,8 @@ import torch
from . import utils
from .opt import OptimWrapper
from .scaler import LossScaler, iter_params
from ._amp_state import _amp_state
from .scaler import LossScaler
from ._amp_state import _amp_state, master_params
from ..fp16_utils import FP16_Optimizer as FP16_Optimizer_general
from ..optimizers import FP16_Optimizer as FP16_Optimizer_for_fused
......@@ -18,17 +18,44 @@ def scale_loss(loss,
model=None,
delay_unscale=False):
"""
On context manager entrance, scale the loss in a way consistent with the current loss scale.
Yield the loss
On context manager entrance, creates ``scaled_loss = (loss.float())*current loss scale``.
``scaled_loss`` is yielded so that the user can call ``scaled_loss.backward()``::
On context manager exit (if ``delay_unscale=False``), unscale the gradients so that
``optimizer.step()`` can be called.
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
On context manager exit (if ``delay_unscale=False``), the gradients are checked for infs/NaNs
and unscaled, so that ``optimizer.step()`` can be called.
.. note::
If Amp is using explicit FP32 master params (which is the default for ``opt_level=O2``, and
can also be manually enabled by supplying ``master_weights=True`` to ``amp.initialize``)
any FP16 gradients are copied to FP32 master gradients before being unscaled. ``optimizer.step()``
will then apply the unscaled master gradients to the master params.
If Amp is using explicit FP32 master params (which is the default for ``opt_level=O2``, and
can also be manually enabled by supplying ``master_weights=True`` to ``amp.initialize``),
any FP16 gradients are copied to FP32 master gradients before being unscaled.
``optimizer.step()`` will then apply the unscaled master gradients to the master params.
.. warning::
If Amp is using explicit FP32 master params, only the FP32 master gradients will be
unscaled. The direct ``.grad`` attributes of any FP16
model params will remain scaled after context manager exit.
This subtlety affects gradient clipping. See "Gradient clipping" under
"Advanced use cases" for best practices.
Args:
loss (Tensor): Typically a scalar Tensor. The ``scaled_loss`` that the context
manager yields is simply ``loss.float()*loss_scale``, so in principle
``loss`` could have more than one element, as long as you call
``backward()`` on ``scaled_loss`` appropriately within the context manager body.
optimizer: **Must** be an optimizer returned from an earlier call to ``amp.initialize``.
model (torch.nn.Module, optional, default=None): Currently unused, reserved to enable future
optimizations.
delay_unscale (bool, optional, default=False): Don't unscale the gradients or perform model->master
gradient copies on context manager exit. "Advanced use cases" illustrates
situations where this is necessary.
.. warning:: If ``True``, ``optimizer.step()`` cannot be called immediately after context manager
exit; it must wait for a later invocation of the backward context manager with
``delay_unscale`` left as ``False``.
See "Advanced use cases" for examples.
"""
if not _amp_state.opt_properties.enabled:
yield loss
......@@ -70,8 +97,8 @@ def scale_loss(loss,
else:
optimizer.loss_scaler.clear_overflow_state()
optimizer.loss_scaler.unscale(
iter_params(optimizer.param_groups),
iter_params(optimizer.param_groups),
master_params(optimizer),
master_params(optimizer),
loss_scale)
# For future fused optimizers that enable sync-free dynamic loss scaling,
# should_skip will always be False.
......@@ -137,8 +164,8 @@ class AmpHandle(object):
self._default_scaler.clear_overflow_state()
self._default_scaler.unscale(
iter_params(optimizer.param_groups),
iter_params(optimizer.param_groups),
master_params(optimizer),
master_params(optimizer),
loss_scale)
should_skip = self._default_scaler.update_scale()
if should_skip:
......
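As a sketch of the ``delay_unscale`` behavior documented in ``scale_loss`` above, gradient accumulation can skip the unscale/copy work on iterations that do not step; ``iters_to_accumulate``, ``loader``, ``model``, and ``loss_fn`` are hypothetical names, and "Advanced use cases" remains the authoritative reference (including caveats about the dynamic loss scale changing mid-accumulation)::

    from apex import amp

    iters_to_accumulate = 4                       # hypothetical accumulation interval
    for i, (input, target) in enumerate(loader):
        loss = loss_fn(model(input), target)
        will_step = (i + 1) % iters_to_accumulate == 0
        # Only unscale (and do model->master gradient copies) on iterations that will step.
        with amp.scale_loss(loss, optimizer, delay_unscale=not will_step) as scaled_loss:
            scaled_loss.backward()
        if will_step:
            optimizer.step()
            optimizer.zero_grad()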
......@@ -2,7 +2,7 @@ import contextlib
import logging
import warnings
from .scaler import LossScaler, iter_params
from .scaler import LossScaler, master_params
import numpy as np
......@@ -27,26 +27,26 @@ class OptimWrapper(object):
# all mixed together.
cached_grads = []
if self._loss_idx > 0:
for p in iter_params(self._optimizer.param_groups):
for p in master_params(self._optimizer):
if p.grad is not None:
cached_grads.append(p.grad.data.detach().clone())
else:
cached_grads.append(None)
self._optimizer.zero_grad()
loss_scale = self._cur_loss_scaler().loss_scale()
yield loss * loss_scale
self._cur_loss_scaler().clear_overflow_state()
self._cur_loss_scaler().unscale(
iter_params(self._optimizer.param_groups),
iter_params(self._optimizer.param_groups),
master_params(self._optimizer),
master_params(self._optimizer),
loss_scale)
self._skip_next[self._loss_idx] = self._cur_loss_scaler().update_scale()
self._loss_idx += 1
if len(cached_grads) > 0:
for p, cached_grad in zip(iter_params(self._optimizer.param_groups),
for p, cached_grad in zip(master_params(self._optimizer),
cached_grads):
if cached_grad is not None:
p.grad.data.add_(cached_grad)
......
import torch
import logging
from ..multi_tensor_apply import multi_tensor_applier
from ._amp_state import _amp_state
from ._amp_state import _amp_state, master_params
from itertools import product
# from apex_C import scale_check_overflow
......@@ -172,8 +172,3 @@ class LossScaler(object):
self._unskipped = 0
return should_skip
def iter_params(param_groups):
for group in param_groups:
for p in group['params']:
yield p
......@@ -14,5 +14,3 @@ from .fp16util import (
from .fp16_optimizer import FP16_Optimizer
from .loss_scaler import LossScaler, DynamicLossScaler
test = 1
......@@ -4,7 +4,10 @@
apex.amp
===================================
This page documents the update API for Amp (Automatic Mixed Precision),
Unified API
-----------
This page documents the updated API for Amp (Automatic Mixed Precision),
a tool to enable Tensor Core-accelerated training in only 3 lines of Python.
Amp allows users to easily experiment with different pure and mixed precision modes, including
......@@ -23,8 +26,12 @@ Amp can also be disabled, in which case the original script will behave exactly
In this way, there's no risk in adhering to the Amp API, and a lot of potential performance benefit.
Example::
# Declare model and optimizer as usual
model = torch.nn.Linear(D_in, D_out).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
# Allow Amp to perform casts as required by the opt_level
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
...
# loss.backward() becomes:
......@@ -35,11 +42,18 @@ Example::
A `runnable, comprehensive Imagenet example`_ demonstrating good practices can be found
on the Github page.
DCGAN is a tricky case that many people have requested. A comprehensive example is under construction.
GANs are a tricky case that many people have requested. A `comprehensive DCGAN example`_
is under construction.
``opt_level``\ s and Properties
-------------------------------
.. _`runnable, comprehensive Imagenet example`:
https://github.com/NVIDIA/apex/tree/master/examples/imagenet
.. _`comprehensive DCGAN example`:
https://github.com/NVIDIA/apex/tree/master/examples/dcgan
.. automodule:: apex.amp
.. currentmodule:: apex.amp
......@@ -47,19 +61,23 @@ DCGAN is a tricky case that many people have requested. A comprehensive example
.. autofunction:: scale_loss
.. autofunction:: master_params
Advanced use cases
------------------
The new Amp API supports gradient accumulation across iterations,
multiple backward passes per iteration, multiple models/optimizers,
and forcing layers to a particular type. Further details can be found here:
and custom/user-defined autograd functions. Gradient clipping and GANs also
require special treatment, but this treatment does not need to change
for different ``opt_level``\ s. Further details can be found here:
.. toctree::
:maxdepth: 1
advanced
Transition Guide for Old API Users
Transition guide for old API users
----------------------------------
We strongly encourage moving to the new Amp API, because it's more versatile, easier to use, and future-proof. The original :class:`FP16_Optimizer` and the old "Amp" API are deprecated, and subject to removal at any time.
......@@ -77,6 +95,7 @@ The functions formerly exposed through ``amp_handle`` are now free
functions accessible through the ``amp`` module.
The backward context manager must be changed accordingly::
# old API
with amp_handle.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
......@@ -96,6 +115,7 @@ with a particular precision are still honored by the new API.
``opt_level="O2"`` is equivalent to :class:`FP16_Optimizer` with ``dynamic_loss_scale=True``.
Once again, the backward pass must be changed to the unified version::
optimizer.backward(loss)
->
with amp.scale_loss(loss, optimizer) as scaled_loss:
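Putting both halves together, a hedged sketch of that transition (``Net``, the learning rate, and the ``.half()`` call on the old path are illustrative, and the old path assumes the ``apex.fp16_utils.FP16_Optimizer`` wrapping with ``dynamic_loss_scale=True`` described above)::

    import torch
    from apex import amp
    from apex.fp16_utils import FP16_Optimizer

    # old, deprecated path
    model = Net().cuda().half()
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
    ...
    optimizer.backward(loss)

    # new, unified API: opt_level "O2" is the equivalent mode
    model = Net().cuda()                          # stay in FP32; Amp handles the casts
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    model, optimizer = amp.initialize(model, optimizer, opt_level="O2")
    ...
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()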
......@@ -108,11 +128,11 @@ necessary in the new API. No matter what --opt-level
you choose, you can and should simply build your model in the default FP32 format.** The new Amp
API will perform the right conversions during
``model, optimizer = amp.initialize(model, optimizer, opt_level=....)`` based on the ``--opt-level``
and any overridden flags. Floating point input data may be float or half, but you may as well just
let it be float, because the ``model`` returned by ``amp.initialize`` will have its ``forward``
and any overridden flags. Floating point input data may be FP32 or FP16, but you may as well just
let it be FP16, because the ``model`` returned by ``amp.initialize`` will have its ``forward``
method patched to cast the input data appropriately.
.. note::
Aside from the call to ``amp.initialize`` itself, it's never necessary to manually cast
your model or data with the new API. Therefore, a script that adheres to the new API
can switch between different ``opt-level``s without having to make any other changes.
can switch between different ``opt_level``\ s without having to make any other changes.
......@@ -37,11 +37,10 @@ Installation instructions can be found here: https://github.com/NVIDIA/apex#qui
layernorm
.. toctree::
:maxdepth: 1
:caption: Deprecated mixed precision utilities
fp16_utils
.. .. toctree::
:maxdepth: 1
:caption: Deprecated mixed precision API
fp16_util
.. reparameterization
.. RNN
......