Commit 17e8a552 authored by Michael Carilli

Docstring updates

parent ea7c2098
@@ -6,14 +6,15 @@ class FusedAdam(torch.optim.Optimizer):
     """Implements Adam algorithm.

     Currently GPU-only. Requires Apex to be installed via
     ``pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./``.

-    This version of fused Adam implements 2 fusions:
-    - Fusion of the Adam update's elementwise operations
-    - A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.
-    :class:`apex.optimizers.FusedAdam` may be used as a drop-in replacement for torch.optim.Adam::
+    This version of fused Adam implements 2 fusions.
+
+    * Fusion of the Adam update's elementwise operations
+    * A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.
+
+    :class:`apex.optimizers.FusedAdam` may be used as a drop-in replacement for ``torch.optim.Adam``::

         opt = apex.optimizers.FusedAdam(model.parameters(), lr = ....)
         ...
@@ -21,16 +22,17 @@ class FusedAdam(torch.optim.Optimizer):
     :class:`apex.optimizers.FusedAdam` may be used with or without Amp. If you wish to use :class:`FusedAdam` with Amp,
     you may choose any `opt_level`::

         opt = apex.optimizers.FusedAdam(model.parameters(), lr = ....)
         model, opt = amp.initialize(model, opt, opt_level="O0" or "O1" or "O2")
         ...
         opt.step()

-    In general, `opt_level="O1"` is recommended.
+    In general, ``opt_level="O1"`` is recommended.

     .. warning::
-        A previous version of :class:`FusedAdam` allowed a number of additional arguments to `step`. These additional arguments
+        A previous version of :class:`FusedAdam` allowed a number of additional arguments to ``step``. These additional arguments
         are now deprecated and unnecessary.

     Adam was proposed in `Adam: A Method for Stochastic Optimization`_.
...
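As a companion to the updated FusedAdam docstring, here is a minimal, hedged sketch of the usage pattern it describes, combining FusedAdam with Amp. The toy model, tensor shapes, and hyperparameters are illustrative assumptions, not values taken from the commit::

    import torch
    import apex
    from apex import amp

    # Hypothetical toy model; any torch.nn.Module on a CUDA device works the same way.
    model = torch.nn.Linear(10, 1).cuda()

    # FusedAdam accepts the same core arguments as torch.optim.Adam.
    opt = apex.optimizers.FusedAdam(model.parameters(), lr=1e-3)

    # Any opt_level is allowed; "O1" is the generally recommended default.
    model, opt = amp.initialize(model, opt, opt_level="O1")

    x = torch.randn(4, 10, device="cuda")
    loss = model(x).sum()

    # amp.scale_loss scales the loss so low-precision gradients stay representable.
    with amp.scale_loss(loss, opt) as scaled_loss:
        scaled_loss.backward()
    opt.step()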
@@ -8,9 +8,10 @@ class FusedLAMB(torch.optim.Optimizer):
     Currently GPU-only. Requires Apex to be installed via
     ``pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./``.

-    This version of fused LAMB implements 2 fusions:
-    - Fusion of the LAMB update's elementwise operations
-    - A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.
+    This version of fused LAMB implements 2 fusions.
+
+    * Fusion of the LAMB update's elementwise operations
+    * A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.

     :class:`apex.optimizers.FusedLAMB`'s usage is identical to any ordinary PyTorch optimizer::
@@ -20,12 +21,13 @@ class FusedLAMB(torch.optim.Optimizer):
     :class:`apex.optimizers.FusedLAMB` may be used with or without Amp. If you wish to use :class:`FusedLAMB` with Amp,
     you may choose any `opt_level`::

         opt = apex.optimizers.FusedLAMB(model.parameters(), lr = ....)
         model, opt = amp.initialize(model, opt, opt_level="O0" or "O1" or "O2")
         ...
         opt.step()

-    In general, `opt_level="O1"` is recommended.
+    In general, ``opt_level="O1"`` is recommended.

     LAMB was proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_.
@@ -50,7 +52,7 @@ class FusedLAMB(torch.optim.Optimizer):
         max_grad_norm (float, optional): value used to clip global grad norm
             (default: 1.0)

-    .. _Large Batch Optimization for Deep Learning\: Training BERT in 76 minutes
+    .. _Large Batch Optimization for Deep Learning\: Training BERT in 76 minutes:
        https://arxiv.org/abs/1904.00962
     .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
...
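For the FusedLAMB docstring above, a short sketch of plain (non-Amp) usage may help; the model, data, and hyperparameter values, including ``max_grad_norm``, are illustrative assumptions rather than recommendations::

    import torch
    import apex

    # Hypothetical toy model and data, for illustration only.
    model = torch.nn.Linear(10, 1).cuda()

    # FusedLAMB follows the ordinary PyTorch optimizer interface;
    # max_grad_norm clips the global gradient norm (default 1.0).
    opt = apex.optimizers.FusedLAMB(model.parameters(), lr=1e-3,
                                    betas=(0.9, 0.999), weight_decay=0.01,
                                    max_grad_norm=1.0)

    for _ in range(3):
        opt.zero_grad()
        loss = model(torch.randn(32, 10, device="cuda")).pow(2).mean()
        loss.backward()
        opt.step()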
@@ -8,9 +8,10 @@ class FusedNovoGrad(torch.optim.Optimizer):
     Currently GPU-only. Requires Apex to be installed via
     ``pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./``.

-    This version of fused NovoGrad implements 2 fusions:
-    - Fusion of the NovoGrad update's elementwise operations
-    - A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.
+    This version of fused NovoGrad implements 2 fusions.
+
+    * Fusion of the NovoGrad update's elementwise operations
+    * A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.

     :class:`apex.optimizers.FusedNovoGrad`'s usage is identical to any PyTorch optimizer::
@@ -20,12 +21,13 @@ class FusedNovoGrad(torch.optim.Optimizer):
     :class:`apex.optimizers.FusedNovoGrad` may be used with or without Amp. If you wish to use :class:`FusedNovoGrad` with Amp,
     you may choose any `opt_level`::

         opt = apex.optimizers.FusedNovoGrad(model.parameters(), lr = ....)
         model, opt = amp.initialize(model, opt, opt_level="O0" or "O1" or "O2")
         ...
         opt.step()

-    In general, `opt_level="O1"` is recommended.
+    In general, ``opt_level="O1"`` is recommended.

     It has been proposed in `Jasper: An End-to-End Convolutional Neural Acoustic Model`_.
     More info: https://nvidia.github.io/OpenSeq2Seq/html/optimizers.html#novograd
...
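Similarly, a minimal sketch of FusedNovoGrad in a standard training step; the toy model and the values here (including the ``betas`` pair) are assumptions for illustration, not required settings::

    import torch
    import apex

    model = torch.nn.Linear(10, 1).cuda()

    # FusedNovoGrad is driven exactly like a built-in PyTorch optimizer.
    opt = apex.optimizers.FusedNovoGrad(model.parameters(), lr=1e-2,
                                        betas=(0.95, 0.98))

    opt.zero_grad()
    loss = model(torch.randn(16, 10, device="cuda")).abs().mean()
    loss.backward()
    opt.step()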
@@ -6,14 +6,15 @@ from apex.multi_tensor_apply import multi_tensor_applier

 class FusedSGD(Optimizer):
     r"""Implements stochastic gradient descent (optionally with momentum).

     Currently GPU-only. Requires Apex to be installed via
     ``pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./``.

-    This version of fused SGD implements 2 fusions:
-    - Fusion of the SGD update's elementwise operations
-    - A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.
-    :class:`apex.optimizers.FusedSGD` may be used as a drop-in replacement for torch.optim.SGD::
+    This version of fused SGD implements 2 fusions.
+
+    * Fusion of the SGD update's elementwise operations
+    * A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.
+
+    :class:`apex.optimizers.FusedSGD` may be used as a drop-in replacement for ``torch.optim.SGD``::

         opt = apex.optimizers.FusedSGD(model.parameters(), lr = ....)
         ...
@@ -21,12 +22,13 @@ class FusedSGD(Optimizer):
     :class:`apex.optimizers.FusedSGD` may be used with or without Amp. If you wish to use :class:`FusedSGD` with Amp,
     you may choose any `opt_level`::

         opt = apex.optimizers.FusedSGD(model.parameters(), lr = ....)
         model, opt = amp.initialize(model, opt, opt_level="O0" or "O1" or "O2")
         ...
         opt.step()

-    In general, `opt_level="O1"` is recommended.
+    In general, ``opt_level="O1"`` is recommended.

     Nesterov momentum is based on the formula from
     `On the importance of initialization and momentum in deep learning`__.
...
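Finally, a hedged sketch of FusedSGD used as a drop-in replacement for ``torch.optim.SGD`` with Nesterov momentum under Amp; the model, data, and hyperparameters are placeholders chosen for this example::

    import torch
    import apex
    from apex import amp

    model = torch.nn.Linear(10, 1).cuda()

    # Same constructor arguments as torch.optim.SGD.
    opt = apex.optimizers.FusedSGD(model.parameters(), lr=0.1,
                                   momentum=0.9, nesterov=True)

    # Any opt_level works; "O1" is the generally recommended default.
    model, opt = amp.initialize(model, opt, opt_level="O1")

    opt.zero_grad()
    loss = model(torch.randn(8, 10, device="cuda")).sum()
    with amp.scale_loss(loss, opt) as scaled_loss:
        scaled_loss.backward()
    opt.step()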