Unverified Commit dcfb7a99 authored by anj-s, committed by GitHub

[docs] Revamp FairScale documentation (#698)

* add tutorials

* add new context, modify and delete existing docs

* remove duplicate labels

* modify layout and more nits

* address comments

* fix merge conflicts
parent 29aae007
/* Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. */
/*
* some extra css to make markdown look similar between github/sphinx
*/
.tutorials-header .header-logo {
background-image: url("../images/fairscale-logo-dark.svg");
background-repeat: no-repeat;
background-position: center;
}
.header-logo {
/* .header-logo {
background-image: url("../images/fairscale-logo.svg");
}
} */
.footer-logo {
/* .footer-logo {
background-image: url("../images/fairscale-logo-icon.svg");
}
} */
......@@ -90,12 +90,12 @@
<div class="container-fluid header-holder tutorials-header" id="header-holder">
<div class="container">
<div class="header-container">
<a class="header-logo" href="{{ theme_variables.external_urls['home'] }}" aria-label="FairScale">FairScale</a>
<a class="header-logo" href="{{ theme_variables.external_urls['home'] }}"><img src="{{ pathto('_static/img/fairscale-logo.png', 1) }}" height="55px" width="250px"></a>
<div class="main-menu">
<ul>
<li>
<a href="{{ theme_variables.external_urls['github'] }}">Github</a>
<a href="{{ theme_variables.external_urls['github'] }}"> FairScale Github</a>
</li>
</ul>
</div>
......@@ -224,13 +224,13 @@
{%- endblock %}
</div>
<div class="pytorch-content-right" id="pytorch-content-right">
<!-- <div class="pytorch-content-right" id="pytorch-content-right">
<div class="pytorch-right-menu" id="pytorch-right-menu">
<div class="pytorch-side-scroll" id="pytorch-side-scroll-right">
{{ toc }}
</div>
</div>
</div>
</div> -->
</section>
</div>
......
OffloadModel
============
Offload Model
==============
.. autoclass:: fairscale.experimental.nn.OffloadModel
:members:
......
......@@ -6,11 +6,9 @@ API Reference
optim/adascale
optim/oss
optim/grad_scaler
nn/moe
nn/pipe
nn/sharded_ddp
nn/fsdp
nn/fsdp_tips
nn/checkpoint/checkpoint_activations
experimental/nn/offload_model
checkpoint_wrapper
==================
Activation Checkpoint
======================
.. autoclass:: fairscale.nn.checkpoint.checkpoint_wrapper
:members:
......
Fully Sharded Data Parallel
=======================================================
See :doc:`FSDP Notes <fsdp_tips>` for a discussion of the principles behind ``FSDP`` and advanced usage.
.. autoclass:: fairscale.nn.FullyShardedDataParallel
:members:
:undoc-members:
Fully Sharded Data Parallel Notes
=======================================================
This document describes how ``FSDP`` works, including subtle behaviors that can change performance significantly.
See :doc:`this page <fsdp>` for the Python docstrings.
Overview
---------
Recent work by `Microsoft <https://arxiv.org/abs/1910.02054>`__ and
`Google <https://arxiv.org/abs/2004.13336>`__ has shown that data
parallel training can be made significantly more efficient by sharding
the model parameters and optimizer state across data parallel workers.
These ideas are encapsulated in the new ``FullyShardedDataParallel``
(FSDP) wrapper, which is a drop-in replacement for PyTorch's
``DistributedDataParallel`` (DDP) wrapper.
Compared to PyTorch ``DistributedDataParallel``:
* FSDP shards parameters (FP16 + FP32) and optimizer state across data parallel GPUs
* FSDP with ``reshard_after_forward=False`` has the same communication cost as PyTorch DDP and is similar to ZeRO-2
* FSDP with ``reshard_after_forward=True`` increases total communication by 50% and is similar to ZeRO-3:

  * all-gather parameters at the start of the forward pass and the start of the backward pass
  * reduce-scatter gradients at the end of the backward pass
* In practice, FSDP is faster than DDP because the optimizer step is sharded, and the extra communication can be overlapped with the forward pass.
* FSDP enables training 13B parameter models on 8 GPUs and 175B parameter models on 128 GPUs. When using the ``cpu_offload=True`` option, it's possible to train 1T parameter models on 256 GPUs.
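Below is a minimal sketch of the drop-in swap, assuming the default process group has already been initialized and ``MyModel`` is a placeholder for your own module. Note that the optimizer should be created after wrapping, so that it references the flattened, sharded parameters.

.. code-block:: python

    import torch
    from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP

    # Assumes torch.distributed.init_process_group(...) has already been called.
    model = MyModel().cuda()  # placeholder module
    model = FSDP(model)       # instead of torch.nn.parallel.DistributedDataParallel(model)

    # Create the optimizer after wrapping, so it sees the sharded parameters.
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)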
General usage notes
--------------------
- For best memory efficiency use ``auto_wrap`` to wrap each layer in your network with ``FSDP`` and set ``reshard_after_forward=True``
- For best training speed set ``reshard_after_forward=False`` (wrapping each layer is not required, but will improve speed further)
- If you're using ``torch.cuda.amp.autocast`` for mixed precision, it is fully compatible with the FSDP wrapper; just set ``mixed_precision=True``
- If combining with `activation checkpointing <https://github.com/facebookresearch/fairscale/blob/master/fairscale/nn/misc/checkpoint_activations.py>`__,
  prefer ``FSDP(checkpoint_wrapper(module))`` over ``checkpoint_wrapper(FSDP(module))``; the latter will result in more communication and will be slower (a short sketch follows this list)
- Results should be identical to DDP with pointwise optimizers, e.g.,
  Adam, AdamW, Adadelta, Adamax, SGD, etc. However, the sharding will
  result in slightly different results when using non-pointwise
  optimizers, e.g., Adagrad, Adafactor, LAMB, etc.
- In `fairseq <https://github.com/pytorch/fairseq>`_, FSDP is activated by the command line option ``--ddp-backend=fully_sharded``.
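As referenced above, here is a minimal sketch of combining activation checkpointing with FSDP. ``TransformerLayer`` is a hypothetical module, the import path follows the API reference in these docs, and the process group is assumed to be initialized already.

.. code-block:: python

    from fairscale.nn.checkpoint import checkpoint_wrapper
    from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP

    # Checkpoint on the inside, FSDP on the outside: FSDP(checkpoint_wrapper(module))
    layer = TransformerLayer()                       # hypothetical layer
    layer = checkpoint_wrapper(layer)                # recompute activations during the backward pass
    layer = FSDP(layer, reshard_after_forward=True)  # shard params and free them after the forward pass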
How it works
------------
In standard distributed data parallel (DDP) training every worker processes a separate batch and the gradients are
summed across workers using an `all-reduce operation <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/collectives.html#allreduce>`__.
While DDP has become very popular, it wastes GPU memory because the model weights and optimizer states are replicated across all DDP workers.
The key insight to unlock full parameter sharding is that we can decompose the
`all-reduce <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/collectives.html#allreduce>`__
operation in DDP into separate
`all-gather <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/collectives.html#allgather>`__
and
`reduce-scatter <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/collectives.html#reducescatter>`__
operations:
.. |Figure 1| image:: https://user-images.githubusercontent.com/23240128/110170085-a67b6280-7dc7-11eb-9128-88d813fc7037.png
|Figure 1|
Then, we can rearrange the reduce-scatter + all-gather so that each DDP worker only needs to store a single shard of parameters and optimizer state. The figure below illustrates standard DDP training (left) and fully sharded training (right):
.. |Figure 2| image:: https://user-images.githubusercontent.com/231798/109069252-f9199800-76be-11eb-96f8-86767edf1eb9.png
|Figure 2|
To maximize memory efficiency we can discard the full weights after each
layer's forward pass, saving memory for subsequent layers. This can be
implemented by applying the FSDP wrapper to every layer in your network
(with ``reshard_after_forward=True``). In pseudo-code:
::
    FSDP forward pass:
        for layer_i in layers:
            all-gather full weights for layer_i
            forward pass for layer_i
            discard full weights for layer_i

    FSDP backward pass:
        for layer_i in layers:
            all-gather full weights for layer_i
            backward pass for layer_i
            discard full weights for layer_i
            reduce-scatter gradients for layer_i
Saving and Loading
------------------
There are two ways to save and load FSDP instances (both are sketched below):

- ``state_dict()`` returns a dictionary containing the full (unsharded) parameters, which can be loaded with ``load_state_dict()``
- ``local_state_dict()`` returns a dictionary containing only this shard's parameters, which can be loaded with ``load_local_state_dict()``
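A minimal sketch of both paths, assuming ``fsdp_model`` is an FSDP-wrapped module and ``rank`` is this worker's rank:

.. code-block:: python

    import torch

    # Consolidated checkpoint: the full parameters are gathered on every rank.
    torch.save(fsdp_model.state_dict(), "model_full.pt")
    fsdp_model.load_state_dict(torch.load("model_full.pt"))

    # Sharded checkpoint: each rank saves and loads only its own shard.
    torch.save(fsdp_model.local_state_dict(), f"model_shard{rank}.pt")
    fsdp_model.load_local_state_dict(torch.load(f"model_shard{rank}.pt"))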
Mixed Precision
---------------
When ``mixed_precision=True``:
- Sharded parameters are cast down to ``fp16`` before the forward pass and promoted back to ``fp32`` after it.
- Buffers are kept in ``fp16`` unless ``buffer_dtype=torch.float32`` is passed. Buffers are never sharded, regardless of these arguments.
- By default, gradients are computed and reduced in FP16. If FP32 reductions are important, set ``fp32_reduce_scatter=True``.
- If ``torch.cuda.amp.autocast`` is enabled, it will override the output dtypes of some operations, such as ``BatchNorm2d``.
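The options above map directly onto constructor arguments. A minimal construction sketch, assuming ``my_module`` is a placeholder for your own module:

.. code-block:: python

    import torch
    from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP

    model = FSDP(
        my_module,                   # placeholder module
        mixed_precision=True,        # FP16 parameters and gradients for compute and communication
        fp32_reduce_scatter=True,    # reduce-scatter gradients in FP32 if accuracy is a concern
        buffer_dtype=torch.float32,  # keep the (unsharded) buffers in FP32
    )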
Auto-wrap
~~~~~~~~~
Auto wrapping sub-modules with ``FSDP`` is a convenient way to improve training speed by overlapping the all-gather step across the forward passes of different submodules.
It also improves memory efficiency by freeing gathered parameters after each layer finishes executing.
.. code-block:: python
    import torch
    from fairscale.nn.wrap import auto_wrap, enable_wrap, wrap
    from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP
    from fairscale.utils.testing import DummyProcessGroup

    tfmr = torch.nn.Transformer(num_encoder_layers=2, num_decoder_layers=2)
    group = DummyProcessGroup(rank=0, size=1)
    fsdp_params = dict(mixed_precision=True, flatten_parameters=True)
    with enable_wrap(wrapper_cls=FSDP, process_group=group, **fsdp_params):
        # Wraps layer in FSDP by default if within context
        l1 = wrap(torch.nn.Linear(5, 5))
        assert isinstance(l1, FSDP)
        assert l1.mixed_precision and l1.flatten_parameters

        # Separately wraps children modules with more than 1e6 params
        tfmr_auto_wrapped = auto_wrap(tfmr, min_num_params=1e6)
        assert isinstance(tfmr_auto_wrapped, torch.nn.Transformer)
        for layer in tfmr_auto_wrapped.encoder.layers:
            assert isinstance(layer, FSDP)
            assert layer.mixed_precision and layer.flatten_parameters
            assert isinstance(layer.linear1, FSDP)
            assert isinstance(layer.linear2, FSDP)
            assert not isinstance(layer.self_attn, FSDP)  # self attention is not auto-wrapped
.. warning:: It is not recommended to use :func:`auto_wrap` with
:class:`FullyShardedDataParallel` on modules that have shared
parameters, as the parameter sharing may be broken (i.e. end up not
shared) if the shared parameters are not (auto-)wrapped under the same
FSDP wrapper instance.
Using CPU RAM
-------------
``move_grads_to_cpu`` and ``cpu_offload`` control which tensors get
moved to CPU.
- ``cpu_offload`` moves weights to CPU when they are not being used.
- ``move_grads_to_cpu`` moves gradients to CPU so that the optimizer step also happens on CPU. This option requires ``cpu_offload=True``.
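A minimal sketch combining the two options, assuming ``my_module`` is a placeholder for your own module (``mixed_precision`` is enabled here as well, since CPU offload is typically used together with FP16 compute):

.. code-block:: python

    from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP

    model = FSDP(
        my_module,               # placeholder module
        mixed_precision=True,    # FP16 compute; the FP32 master weights live on CPU
        cpu_offload=True,        # keep the FP32 weight shard in CPU RAM when not in use
        move_grads_to_cpu=True,  # move gradients to CPU so the optimizer step runs there too
    )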
Gradient Clipping
-----------------
By default, clipping gradients as follows

.. code-block:: python

    sharded_module = FullyShardedDataParallel(my_module)
    torch.nn.utils.clip_grad_norm_(sharded_module.parameters(), max_norm=1.0)

will use an incorrect norm (the norm over only the parameters in the local shard). To overcome this, either call ``sharded_module.clip_grad_norm(1.0)``, which does the extra computation required to compute the norm properly, or use ``torch.nn.utils.clip_grad_value_``.
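A minimal sketch of the recommended call, following the note above (``optimizer``, ``loss``, and ``my_module`` are assumed to come from your own training setup, and the exact method name may vary between versions):

.. code-block:: python

    sharded_module = FullyShardedDataParallel(my_module)

    # forward pass and loss computation elided
    loss.backward()
    sharded_module.clip_grad_norm(1.0)  # accounts for sharding when computing the global norm
    optimizer.step()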
State Management with extra parameter attributes
------------------------------------------------
We manage several attributes on each Parameter instance. The first two
are set by :func:`_shard_parameters_`:
- ``_is_sharded``: ``True`` if the Parameter is sharded or ``False``
if the Parameter is intentionally not sharded (in which case we
will all-reduce grads for this param).
- ``_orig_size``: the size of the original Parameter (before sharding)
The remaining attributes are set in ``_init_param_attributes()``:
- ``_fp32_shard``: a single shard of the parameters in full precision
  (typically FP32, but this depends on the dtype of the model as it is
  passed in by the user). This can be on CPU or GPU depending on the value of ``cpu_offload``.
- ``_fp16_shard``: if ``mixed_precision`` is ``True``, this will be
a single shard of the parameters in FP16, used for all-gather.
- ``_full_param_padded``: the full weight (padded to be evenly divisible by ``world_size``), used for computation in the
forward and backward pass. This will be resized in place and only materialized (via all-gather) as needed.
Misc
----
- We don't start the FP32 -> FP16 transfer until after the optimization step completes.
- Any direct weight access outside of the forward/backward pass should happen inside the ``_summon_full_params`` context (see the sketch below).
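A minimal sketch of the second point, assuming ``fsdp_model`` is an FSDP-wrapped module and using the context manager named above (the exact name, with or without the leading underscore, may differ between versions):

.. code-block:: python

    # Gather the full (unsharded) weights before touching them directly.
    with fsdp_model._summon_full_params():
        full_norm = sum(p.norm() for p in fsdp_model.parameters())  # sees the full weights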
ShardedDataParallel
====================
Sharded Data Parallel
=====================
.. autoclass:: fairscale.nn.ShardedDataParallel
:members:
:undoc-members:
Performance tips
====================
Using OSS and ShardedDDP changes the communication pattern compared to DDP, and, depending on the training hardware, a couple of configuration changes can be beneficial:

* If using multiple nodes, make sure that the reduce buckets are activated. This mitigates some of the communication latency cost.
* If using Torch AMP, the forward and backward passes are mostly computed in FP16, but by default the communications will still be in FP32.
* ShardedDDP can compress the gradients back to FP16, using the ``reduce_fp16`` option.
* OSS can compress the model shards to FP16 when broadcasting, using the ``broadcast_fp16`` option. This could have a major effect on performance.
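A minimal configuration sketch of the options above, assuming ``model`` and the process group are already set up (the bucket size value is only illustrative):

.. code-block:: python

    import torch
    from fairscale.optim.oss import OSS
    from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP

    optimizer = OSS(
        params=model.parameters(),
        optim=torch.optim.SGD,
        broadcast_fp16=True,  # compress the shard broadcasts to FP16
        lr=1e-4,
    )
    model = ShardedDDP(
        model,
        optimizer,
        reduce_buffer_size=2 ** 23,  # bucketize gradient reductions (size in elements)
        reduce_fp16=True,            # compress gradient reductions to FP16
    )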
Sharded Grad Scaler
========================
Enabling PyTorch's automatic mixed precision usually means using a ``GradScaler`` to detect gradient underflow.
The default grad scaler is not aware of the state sharding when FairScale's OSS is involved, and will lead to deadlocks.
Make sure that you use ``ShardedGradScaler`` in that case, which is a shard-aware wrapper of PyTorch's implementation.
.. code-block:: python
    import torch
    from fairscale.optim.oss import OSS
    from fairscale.optim.grad_scaler import ShardedGradScaler
    from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP

    def train(
        rank: int,
        world_size: int,
        epochs: int):

        # DDP: initialize the process group (placeholder helper)
        dist_init(rank, world_size)

        # Problem statement: model, dataloader and loss are placeholders
        model = myAwesomeModel().to(rank)
        dataloader = mySuperFastDataloader()
        loss_fn = myVeryRelevantLoss()

        # optimizer specific arguments e.g. LR, momentum, etc...
        base_optimizer_arguments = {"lr": 1e-4}

        # ** NEW ** Wrap a base optimizer into OSS
        base_optimizer = torch.optim.SGD  # any pytorch compliant optimizer
        optimizer = OSS(
            params=model.parameters(),
            optim=base_optimizer,
            **base_optimizer_arguments)

        # ** NEW ** Wrap the model into ShardedDDP
        model = ShardedDDP(model, optimizer)

        # ** NEW ** Use a ShardedGradScaler instead of the default PyTorch GradScaler
        scaler = ShardedGradScaler()

        # Any relevant training loop, nothing specific to OSS. For example:
        model.train()
        for e in range(epochs):
            for (data, target) in dataloader:
                data, target = data.to(rank), target.to(rank)
                model.zero_grad()

                # Automatically computes the FW pass in half precision
                with torch.cuda.amp.autocast():
                    outputs = model(data)
                    loss = loss_fn(outputs, target)

                # Scale the loss, then step the optimizer through the scaler
                # so that gradients are unscaled and skipped on overflow
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
OSS
====
Optimizer State Sharding
========================
.. autoclass:: fairscale.optim.OSS
:members:
......
Blogs and Press
=================
1. `Hugging Face with ZeRO <https://huggingface.co/blog/zero-deepspeed-fairscale>`_
2. `PyTorch Lightning <https://medium.com/pytorch/pytorch-lightning-1-1-model-parallelism-training-and-more-logging-options-7d1e47db7b0b>`_
3. `MMT <https://about.fb.com/news/2020/10/first-multilingual-machine-translation-model/>`_
4. `SEER <https://ai.facebook.com/blog/seer-the-start-of-a-more-powerful-flexible-and-accessible-era-for-computer-vision/>`_
......@@ -41,8 +41,38 @@ extensions = [
"sphinx.ext.autodoc",
"sphinx.ext.autosectionlabel",
"sphinx.ext.napoleon", # support NumPy and Google style docstrings
"recommonmark",
"sphinx.ext.intersphinx",
"sphinx.ext.todo",
"sphinx.ext.coverage",
"sphinx.ext.mathjax",
"sphinx.ext.viewcode",
"sphinx.ext.githubpages",
"sphinx.ext.doctest",
"sphinx.ext.ifconfig",
]
# autosectionlabel throws warnings if section names are duplicated.
# The following tells autosectionlabel to not throw a warning for
# duplicated section names that are in different documents.
autosectionlabel_prefix_document = True
# -- Configurations for plugins ------------
napoleon_google_docstring = True
napoleon_include_init_with_doc = True
napoleon_include_special_with_doc = True
napoleon_numpy_docstring = False
napoleon_use_rtype = False
autodoc_inherit_docstrings = False
autodoc_member_order = "bysource"
intersphinx_mapping = {
"python": ("https://docs.python.org/3.6", None),
"numpy": ("https://docs.scipy.org/doc/numpy/", None),
"torch": ("https://pytorch.org/docs/master/", None),
}
# -------------------------
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
......@@ -51,6 +81,16 @@ templates_path = ["_templates"]
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns: List[Any] = []
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
source_suffix = [".rst", ".md"]
# The master toctree document.
master_doc = "index"
# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = True
# -- Options for HTML output -------------------------------------------------
......@@ -58,6 +98,7 @@ exclude_patterns: List[Any] = []
html_theme = "pytorch_sphinx_theme"
templates_path = ["_templates"]
# Add any paths that contain custom static files (such as style sheets) here,
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
......@@ -67,17 +108,34 @@ html_theme_options = {
"includehidden": True,
"canonical_url": "https://fairscale.readthedocs.io",
"pytorch_project": "docs",
"logo_only": True, # default = False
}
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]
# setting custom stylesheets https://stackoverflow.com/a/34420612
html_context = {"css_files": ["_static/css/customize.css"]}
# -- Options for HTMLHelp output ------------------------------------------
# Output file base name for HTML help builder.
htmlhelp_basename = "fairscaledocs"
# Over-ride PyTorch Sphinx css
def setup(app):
app.add_config_value(
"recommonmark_config",
{"url_resolver": lambda url: github_doc_root + url, "auto_toc_tree_section": "Contents"},
{
"url_resolver": lambda url: github_doc_root + url,
"auto_toc_tree_section": "Contents",
"enable_math": True,
"enable_inline_math": True,
"enable_eval_rst": True,
"enable_auto_toc_tree": True,
},
True,
)
app.add_transform(AutoStructify)
......