Commit d44ce75a authored by Michael Carilli

Documentation updates

parent 7f39db93
@@ -2,3 +2,4 @@ from .amp import init, half_function, float_function, promote_function,\
    register_half_function, register_float_function, register_promote_function
from .handle import scale_loss, disable_casts
from .frontend import initialize
from ._amp_state import master_params
@@ -15,3 +15,19 @@ def warn_or_err(msg):
    else:
        raise RuntimeError(msg + " If you're sure you know what you're doing, supply " +
                           "hard_override=True to amp.initialize.")
# def iter_params(param_groups):
#     for group in param_groups:
#         for p in group['params']:
#             yield p
def master_params(optimizer):
    """
    Generator that iterates over the params owned by ``optimizer``.

    Args:
        optimizer: An optimizer previously returned from ``amp.initialize``.
    """
    for group in optimizer.param_groups:
        for p in group['params']:
            yield p
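For illustration, ``master_params`` gives a convenient handle for operations such as gradient clipping; a minimal sketch (``loss``, ``optimizer``, and ``max_norm`` are assumed to exist, with ``optimizer`` returned by ``amp.initialize``)::

    import torch
    from apex import amp

    # Backward under the scaling context, then clip the (unscaled) gradients
    # of the params the optimizer actually owns before stepping.
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_norm)
    optimizer.step()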
@@ -169,13 +169,60 @@ opt_levels = {"O3": O3(),
# allow user to directly pass Properties struct as well?
def initialize(models, optimizers, enabled=True, opt_level=None, **kwargs):
""" """
Expected kwargs: Initialize your models, optimizers, and the Torch tensor and functional namespace according to the
opt_level=None, chosen ``opt_level`` and overridden properties, if any.
cast_model_type=None,
patch_torch_functions=None, To prevent having to rewrite anything else in your script, name the returned models/optimizers
keep_batchnorm_fp32=None, to replace the passed models/optimizers, as in the Usage below.
master_weights=None,
loss_scale=None,) Args:
models (torch.nn.Module or list of torch.nn.Modules): Models to modify/cast.
optimizers (torch.optim.Optimizer or list of torch.optim.Optimizers): Optimizers to modify/cast.
enabled (bool, optional, default=True): If False, renders all Amp calls no-ops, so your script
should run as if Amp were not present.
opt_level(str, required): Pure or mixed precision optimization level. Accepted values are
"O0", "O1", "O2", and "O3", which are explained in detail above.
cast_model_type (torch.dtype, optional, default=None): Optional property override, see
above.
patch_torch_functions (bool, optional, default=None): Optional property override.
keep_batchnorm_fp32 (bool or str, optional, default=None): Optional property override. If
passed as a string, must be the string "True" or "False".
master_weights (bool, optional, default=None): Optional property override.
loss_scale(float or str, default=None): Optional property override. If passed as a string,
must be a string representing a number, e.g., "128.0", or the string "dynamic".
Returns:
Model(s) and optimizer(s) modified according to the ``opt_level``.
If either the ``models`` or ``optimizers`` args were lists, the corresponding return value will
also be a list.
Usage::
model, optim = amp.initialize(model, optim,...)
model, [optim1, optim2] = amp.initialize(model, [optim1, optim2],...)
[model1, model2], optim = amp.initialize([model1, model2], optim,...)
[model1, model2], [optim1, optim2] = amp.initialize([model1, model2], [optim1, optim2],...)
# This is not an exhaustive list of the cross product of options that are possible,
# just a set of examples.
model, optim = amp.initialize(model, optim, opt_level="O0")
model, optim = amp.initialize(model, optim, opt_level="O0", loss_scale="dynamic"|128.0|"128.0")
model, optim = amp.initialize(model, optim, opt_level="O1") # uses "loss_scale="dynamic" default
model, optim = amp.initialize(model, optim, opt_level="O1", loss_scale=128.0|"128.0")
model, optim = amp.initialize(model, optim, opt_level="O2") # uses "loss_scale="dynamic" default
model, optim = amp.initialize(model, optim, opt_level="O2", loss_scale=128.0|"128.0")
model, optim = amp.initialize(model, optim, opt_level="O2", keep_batchnorm_fp32=True|False|"True"|"False")
model, optim = amp.initialize(model, optim, opt_level="O3") # uses loss_scale=1.0 default
model, optim = amp.initialize(model, optim, opt_level="O3", loss_scale="dynamic"|128.0|"128.0")
model, optim = amp.initialize(model, optim, opt_level="O3", keep_batchnorm_fp32=True|False|"True"|"False")
The `Imagenet example`_ demonstrates live use of various opt_levels and overrides.
.. _`Imagenet example`:
https://github.com/NVIDIA/apex/tree/master/examples/imagenet
""" """
    if not enabled:
        if "hard_override" in kwargs:
...
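For reference, a minimal end-to-end sketch tying ``amp.initialize`` to the backward context manager (the sizes ``N``, ``D_in``, ``D_out`` and the random data are placeholders, not part of this commit)::

    import torch
    from apex import amp

    N, D_in, D_out = 64, 1024, 512
    model = torch.nn.Linear(D_in, D_out).cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

    # Let Amp apply whatever casts the chosen opt_level requires.
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    for _ in range(10):
        x = torch.randn(N, D_in, device="cuda")
        y = torch.randn(N, D_out, device="cuda")
        loss = torch.nn.functional.mse_loss(model(x), y)
        optimizer.zero_grad()
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        optimizer.step()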
@@ -5,8 +5,8 @@ import torch
from . import utils
from .opt import OptimWrapper
from .scaler import LossScaler
from ._amp_state import _amp_state, master_params
from ..fp16_utils import FP16_Optimizer as FP16_Optimizer_general
from ..optimizers import FP16_Optimizer as FP16_Optimizer_for_fused
@@ -18,17 +18,44 @@ def scale_loss(loss,
               model=None,
               delay_unscale=False):
""" """
On context manager entrance, scale the loss in a way consistent with the current loss scale. On context manager entrance, creates ``scaled_loss = (loss.float())*current loss scale``.
Yield the loss ``scaled_loss`` is yielded so that the user can call ``scaled_loss.backward()``::
On context manager exit (if ``delay_unscale=False``), unscale the gradients so that with amp.scale_loss(loss, optimizer) as scaled_loss:
``optimizer.step()`` can be called. scaled_loss.backward()
On context manager exit (if ``delay_unscale=False``), the gradients are checked for infs/NaNs
and unscaled, so that ``optimizer.step()`` can be called.
.. note:: .. note::
If Amp is using explicit FP32 master params (which is the default for ``opt_level=O2``, and If Amp is using explicit FP32 master params (which is the default for ``opt_level=O2``, and
can also be manually enabled by supplying ``master_weights=True`` to ``amp.initialize``) can also be manually enabled by supplying ``master_weights=True`` to ``amp.initialize``)
any FP16 gradients are copied to FP32 master gradients before being unscaled. ``optimizer.step()`` any FP16 gradients are copied to FP32 master gradients before being unscaled.
will then apply the unscaled master gradients to the master params. ``optimizer.step()`` will then apply the unscaled master gradients to the master params.
.. warning::
If Amp is using explicit FP32 master params, only the FP32 master gradients will be
unscaled. The direct ``.grad`` attributes of any FP16
model params will remain scaled after context manager exit.
This subtlety affects gradient clipping. See "Gradient clipping" under
"Advanced use cases" for best practices.
Args:
loss(Tensor): Typically a scalar Tensor. The ``scaled_loss`` that the context
manager yields is simply ``loss.float()*loss_scale``, so in principle
`loss` could have more than one element, as long as you call
``backward()`` on ``scaled_loss`` appropriately within the context manager body.
optimizer: **Must** be an optimizer returned from an earlier call to ``amp.initialize``.
model(torch.nn.Module, optional, default=None): Currently unused, reserved to enable future
optimizations.
delay_unscale(bool, default=False): Don't unscale the gradients or perform model->master
gradient copies on context manager exit. "Advanced use cases" illustrates
situations where this is necessary.
.. warning::If ``True``, ``optimizer.step()`` cannot be
called yet after context manager exit, and must wait for another, later backward context
manager invocation with ``delay_unscale`` left to False.
See "Advanced use cases" for examples.
""" """
    if not _amp_state.opt_properties.enabled:
        yield loss
@@ -70,8 +97,8 @@ def scale_loss(loss,
    else:
        optimizer.loss_scaler.clear_overflow_state()
        optimizer.loss_scaler.unscale(
            master_params(optimizer),
            master_params(optimizer),
            loss_scale)
    # For future fused optimizers that enable sync-free dynamic loss scaling,
    # should_skip will always be False.
@@ -137,8 +164,8 @@ class AmpHandle(object):
        self._default_scaler.clear_overflow_state()
        self._default_scaler.unscale(
            master_params(optimizer),
            master_params(optimizer),
            loss_scale)
        should_skip = self._default_scaler.update_scale()
        if should_skip:
...
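As an illustration of ``delay_unscale`` (a sketch only, following the docstring above; ``loss1``, ``loss2``, and ``optimizer`` are assumed to exist, with ``optimizer`` returned by ``amp.initialize``), unscaling can be deferred until the final backward pass before the step::

    # First backward: leave the gradients scaled on exit.
    with amp.scale_loss(loss1, optimizer, delay_unscale=True) as scaled_loss:
        scaled_loss.backward()
    # Final backward: gradients are unscaled on exit, so step() may be called.
    with amp.scale_loss(loss2, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()
    optimizer.zero_grad()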
@@ -2,7 +2,7 @@ import contextlib
import logging
import warnings
from .scaler import LossScaler, master_params
import numpy as np
@@ -27,7 +27,7 @@ class OptimWrapper(object):
        # all mixed together.
        cached_grads = []
        if self._loss_idx > 0:
            for p in master_params(self._optimizer):
                if p.grad is not None:
                    cached_grads.append(p.grad.data.detach().clone())
                else:
@@ -39,14 +39,14 @@ class OptimWrapper(object):
        self._cur_loss_scaler().clear_overflow_state()
        self._cur_loss_scaler().unscale(
            master_params(self._optimizer),
            master_params(self._optimizer),
            loss_scale)
        self._skip_next[self._loss_idx] = self._cur_loss_scaler().update_scale()
        self._loss_idx += 1

        if len(cached_grads) > 0:
            for p, cached_grad in zip(master_params(self._optimizer),
                                      cached_grads):
                if cached_grad is not None:
                    p.grad.data.add_(cached_grad)
...
import torch
import logging
from ..multi_tensor_apply import multi_tensor_applier
from ._amp_state import _amp_state, master_params
from itertools import product

# from apex_C import scale_check_overflow
@@ -172,8 +172,3 @@ class LossScaler(object):
            self._unskipped = 0
        return should_skip
def iter_params(param_groups):
    for group in param_groups:
        for p in group['params']:
            yield p
@@ -14,5 +14,3 @@ from .fp16util import (
from .fp16_optimizer import FP16_Optimizer
from .loss_scaler import LossScaler, DynamicLossScaler
test = 1
@@ -4,7 +4,10 @@
apex.amp
===================================

Unified API
-----------

This page documents the updated API for Amp (Automatic Mixed Precision),
a tool to enable Tensor Core-accelerated training in only 3 lines of Python.

Amp allows users to easily experiment with different pure and mixed precision modes, including
@@ -23,8 +26,12 @@ Amp can also be disabled, in which case the original script will behave exactly
In this way, there's no risk adhering to the Amp API, and a lot of potential performance benefit.

Example::

    # Declare model and optimizer as usual
    model = torch.nn.Linear(D_in, D_out).cuda().half()
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

    # Allow Amp to perform casts as required by the opt_level
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
    ...
    # loss.backward() becomes:
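    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    ...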
@@ -35,11 +42,18 @@ Example::
A `runnable, comprehensive Imagenet example`_ demonstrating good practices can be found
on the Github page.

GANs are a tricky case that many people have requested.  A `comprehensive DCGAN example`_
is under construction.

``opt_level``\ s and Properties
-------------------------------

.. _`runnable, comprehensive Imagenet example`:
    https://github.com/NVIDIA/apex/tree/master/examples/imagenet

.. _`comprehensive DCGAN example`:
    https://github.com/NVIDIA/apex/tree/master/examples/dcgan
.. automodule:: apex.amp
.. currentmodule:: apex.amp

@@ -47,19 +61,23 @@ DCGAN is a tricky case that many people have requested. A comprehensive example
.. autofunction:: scale_loss
.. autofunction:: master_params
Advanced use cases
------------------

The new Amp API supports gradient accumulation across iterations,
multiple backward passes per iteration, multiple models/optimizers,
and custom/user-defined autograd functions.  Gradient clipping and GANs also
require special treatment, but this treatment does not need to change
for different ``opt_level``\ s.  Further details can be found here:

.. toctree::
   :maxdepth: 1

   advanced
Transition guide for old API users
----------------------------------

We strongly encourage moving to the new Amp API, because it's more versatile, easier to use, and future-proof.  The original :class:`FP16_Optimizer` and the old "Amp" API are deprecated, and subject to removal at any time.
@@ -77,6 +95,7 @@ The functions formerly exposed through ``amp_handle`` are now free
functions accessible through the ``amp`` module.

The backward context manager must be changed accordingly::

    # old API
    with amp_handle.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
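    # new API
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()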
@@ -96,6 +115,7 @@ with a particular precision are still honored by the new API.

``opt-level O2`` is equivalent to :class:`FP16_Optimizer` with ``dynamic_loss_scale=True``.
Once again, the backward pass must be changed to the unified version::

    optimizer.backward(loss)
    ->
    with amp.scale_loss(loss, optimizer) as scaled_loss:
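        scaled_loss.backward()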
@@ -108,11 +128,11 @@ necessary in the new API.  No matter what --opt-level
you choose, you can and should simply build your model in the default FP32 format.**  The new Amp
API will perform the right conversions during
``model, optimizer = amp.initialize(model, optimizer, opt_level=....)`` based on the ``--opt-level``
and any overridden flags.  Floating point input data may be FP32 or FP16, but you may as well just
let it be FP32, because the ``model`` returned by ``amp.initialize`` will have its ``forward``
method patched to cast the input data appropriately.

.. note::
    Aside from the call to ``amp.initialize`` itself, it's never necessary to manually cast
    your model or data with the new API.  Therefore, a script that adheres to the new API
    can switch between different ``opt-level``\ s without having to make any other changes.
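For example (a minimal sketch; the sizes ``N``, ``D_in``, ``D_out`` and the ``opt_level`` choice are illustrative)::

    model = torch.nn.Linear(D_in, D_out).cuda()       # built in FP32; no manual .half()
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

    x = torch.randn(N, D_in, device="cuda")           # FP32 input is fine; the patched
    output = model(x)                                 # forward casts it as needed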
@@ -37,11 +37,10 @@ Installation instructions can be found here: https://github.com/NVIDIA/apex#qui
   layernorm

.. .. toctree::
   :maxdepth: 1
   :caption: Deprecated mixed precision API

   fp16_util
   fp16_utils

.. reparameterization
.. RNN
...