Unverified Commit 3f240fbb authored by Min Xu, committed by GitHub

[docs]: add checkpoint_wrapper and many small fixes (#403)

* [docs]: add checkpoint_wrapper and many small fixes

* update copyright year
parent 175fdeb0
@@ -32,7 +32,7 @@
 <meta property="og:description" content="{{ theme_variables.og['description'] }}">
 <!--<meta property="og:image" content="https://mmf.sh/img/logo.png">-->
 <!--<meta property="twitter:image" content="https://mmf.sh/img/logo.png">-->
-<meta name="twitter:image:alt" content="Image for fairscale">
+<meta name="twitter:image:alt" content="Image for FairScale">
 <meta name="twitter:card" content="summary_large_image">

 {# CSS #}
@@ -90,7 +90,7 @@
 <div class="container-fluid header-holder tutorials-header" id="header-holder">
   <div class="container">
     <div class="header-container">
-      <a class="header-logo" href="{{ theme_variables.external_urls['home'] }}" aria-label="fairscale">fairscale</a>
+      <a class="header-logo" href="{{ theme_variables.external_urls['home'] }}" aria-label="FairScale">FairScale</a>
       <div class="main-menu">
         <ul>
@@ -11,6 +11,6 @@ set external_urls = {
 -%}
 {%-
 set og = {
-    'description': 'API docs for fairscale. fairscale is a PyTorch extension library for high performance and large scale training.'
+    'description': 'API docs for FairScale. FairScale is a PyTorch extension library for high performance and large scale training.'
 }
 -%}
@@ -9,3 +9,4 @@ API Reference
 optim/grad_scaler
 nn/pipe
 nn/sharded_ddp
+nn/misc/checkpoint_activations
@@ -0,0 +1,6 @@
+checkpoint_wrapper
+==================
+
+.. autoclass:: fairscale.nn.misc.checkpoint_wrapper
+    :members:
+    :undoc-members:
@@ -24,12 +24,12 @@ sys.path.insert(0, os.path.abspath("../.."))

 # -- Project information -----------------------------------------------------

-project = "fairscale"
-copyright = "2020, Facebook AI Research"
+project = "FairScale"
+copyright = "2020-2021, Facebook AI Research"
 author = "Facebook AI Research"

 # The full version, including alpha/beta/rc tags
-release = "0.0.2"
+release = "0.1.6"

 # -- General configuration ---------------------------------------------------
@@ -1,21 +1,13 @@
-.. fairscale documentation master file, created by
+.. FairScale documentation master file, created by
    sphinx-quickstart on Tue Sep 8 16:19:17 2020.
    You can adapt this file completely to your liking,
    but it should at least contain the root `toctree`
    directive.

-Welcome to fairscale's documentation!
+Welcome to FairScale's documentation!
 =====================================

-.. toctree::
-   :maxdepth: 3
-   :caption: Contents:
-   :hidden:
-
-   tutorials/index
-   api/index
-
-*fairscale* is a PyTorch extension library for high performance and
+*FairScale* is a PyTorch extension library for high performance and
 large scale training for optimizing training on one or across multiple
 machines/nodes. This library extends basic PyTorch capabilities while
 adding new experimental ones.
@@ -35,6 +27,9 @@ Components
 * Optimization at scale:

   * `AdaScale SGD <../../en/latest/api/optim/adascale.html>`_

+* GPU memory optimization:
+
+  * `Activation checkpointing wrapper <../../en/latest/api/nn/misc/checkpoint_activations.html>`_
+
 * `Tutorials <../../en/latest/tutorials/index.html>`_
@@ -45,6 +40,14 @@ Components
 `issue <https://github.com/facebookresearch/fairscale/issues>`_
 if you have any trouble and/or suggestion.

+.. toctree::
+   :maxdepth: 5
+   :caption: Contents:
+   :hidden:
+
+   tutorials/index
+   api/index
+
 Reference
 =========
@@ -3,6 +3,7 @@
 # This source code is licensed under the BSD license found in the
 # LICENSE file in the root directory of this source tree.

+# Please update the doc version in docs/source/conf.py as well.
 __version__ = "0.1.6"

 ################################################################################
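The comment added above keeps __version__ and the Sphinx release value in sync by hand. A sketch of an alternative, not part of this commit and assuming the docs build can import the package, would derive one from the other in docs/source/conf.py::

    # Hypothetical alternative for docs/source/conf.py: derive the Sphinx
    # release string from the package itself so the two values cannot drift.
    import fairscale

    release = fairscale.__version__
    version = ".".join(release.split(".")[:2])  # short X.Y version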
@@ -19,10 +19,11 @@ def checkpoint_wrapper(module: nn.Module, offload_to_cpu: bool = False) -> nn.Module:
     A friendlier wrapper for performing activation checkpointing.

     Compared to the PyTorch version, this version:
-    - wraps an nn.Module, so that all subsequent calls will use checkpointing
-    - handles keyword arguments in the forward
-    - handles non-Tensor outputs from the forward
-    - supports offloading activations to CPU
+
+    - wraps an nn.Module, so that all subsequent calls will use checkpointing
+    - handles keyword arguments in the forward
+    - handles non-Tensor outputs from the forward
+    - supports offloading activations to CPU

     Usage::

@@ -30,8 +31,14 @@ def checkpoint_wrapper(module: nn.Module, offload_to_cpu: bool = False) -> nn.Module:
         a, b = checkpointed_module(x, y=3, z=torch.Tensor([1]))

     Args:
-        module (nn.Module): module to wrap
-        offload_to_cpu (Optional, bool): whether to offload activations to CPU
+        module (nn.Module):
+            module to wrap
+        offload_to_cpu (Optional, bool):
+            whether to offload activations to CPU
+
+    Returns:
+        (nn.Module):
+            wrapped module
     """
     module.forward = functools.partial(_checkpointed_forward, module.forward, offload_to_cpu)  # type: ignore
     return module
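For readers of the new docs page, a minimal usage sketch of this wrapper; the fairscale.nn.misc import path matches the autoclass directive above, and the toy module and tensor shapes are illustrative only::

    import torch
    import torch.nn as nn
    from fairscale.nn.misc import checkpoint_wrapper

    # Wrapping replaces module.forward with a checkpointed version: forward
    # activations are dropped and recomputed during backward, trading compute
    # for memory; offload_to_cpu=True additionally parks saved inputs in host RAM.
    block = checkpoint_wrapper(
        nn.Sequential(nn.Linear(32, 32), nn.ReLU()),
        offload_to_cpu=True,
    )

    x = torch.randn(4, 32, requires_grad=True)
    out = block(x)         # checkpointed forward
    out.sum().backward()   # activations are recomputed here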
@@ -58,7 +58,7 @@ class AdaScale(Optimizer):
     work with it. In other words, AdaScale is intended to be a complete wrapper of a
     torch Optimizer.

-    Note that, AdaScale does _not_ help increase per-GPU batch size.
+    Note that, AdaScale does *not* help increase per-GPU batch size.

     There are several ways to integrate AdaScale with your training loop.
     We show two examples below.
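The docstring above points to fuller integration examples; as a hedged illustration, here is a minimal single-process sketch. The synthetic data and hyperparameters are invented for the example, and real use is typically combined with DistributedDataParallel so the gradient statistics reflect the scaled-up global batch::

    import torch
    from fairscale.optim import AdaScale

    model = torch.nn.Linear(16, 2)
    # AdaScale wraps a plain torch optimizer and rescales the effective
    # learning rate based on observed gradient statistics as batch size grows.
    optim = AdaScale(torch.optim.SGD(model.parameters(), lr=0.1))

    # Synthetic classification batches, for illustration only.
    data = [(torch.randn(8, 16), torch.randint(0, 2, (8,))) for _ in range(10)]
    for x, y in data:
        optim.zero_grad()
        loss = torch.nn.functional.cross_entropy(model(x), y)
        loss.backward()
        optim.step()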
@@ -68,9 +68,9 @@ if __name__ == "__main__":
         long_description="FairScale is a PyTorch extension library for high performance and large scale training on one or multiple machines/nodes. This library extends basic PyTorch capabilities while adding new experimental ones.",
         long_description_content_type="text/markdown",
         classifiers=[
-            "Programming Language :: Python :: 3.6",
             "Programming Language :: Python :: 3.7",
             "Programming Language :: Python :: 3.8",
+            "Programming Language :: Python :: 3.9",
             "License :: OSI Approved :: BSD License",
             "Topic :: Scientific/Engineering :: Artificial Intelligence",
             "Operating System :: OS Independent",