fairscale commit b202804a (unverified)
Authored Jan 08, 2021 by Benjamin Lefaudeux; committed by GitHub on Jan 08, 2021.
[doc] Minor additions to ShardedDDP docs (#299)
Parent: 11beea69

Showing 3 changed files, with 20 additions and 6 deletions (+20 −6):
- fairscale/nn/data_parallel/sharded_ddp.py (+16 −2)
- fairscale/optim/adascale.py (+4 −3)
- fairscale/optim/oss.py (+0 −1)
fairscale/nn/data_parallel/sharded_ddp.py
@@ -23,8 +23,7 @@ from fairscale.optim.utils import Workhandle
 class ShardedDataParallel(nn.Module):
-    """
-    Wrap the model, and reduce the gradients to the right rank during the backward pass.
+    """ Wrap the model, and reduce the gradients to the right rank during the backward pass.

     - the partition is given by the sharded optimizer
     - wrap the base model with a model which knows where to reduce each gradient
@@ -46,6 +45,21 @@ class ShardedDataParallel(nn.Module):
         Synchronize the models in between the ranks when starting up. Not needed if each rank has the same seed,
         or the training restarts from a saved state

+    .. warning:
+        ShardedDDP implements gradient sharding, meaning that each rank only owns a unique shard of the model gradients
+        after the backward pass, in order to save memory and some communication bandwidth.
+
+    .. warning:
+        As a consequence of sharding, in case of gradient clipping one has to use the `clip_grad_norm` exposed by
+        the `optimizer state sharding wrapper <fairscale.optim.OSS>`.
+
+    .. warning:
+        As a consequence of sharding, after loss.backward() (or equivalent) each rank will have `None` in place of some param.grad.
+
+    .. warning:
+        As a consequence of sharding, the PyTorch and Apex AMP implementations will hang when used in conjunction with `ShardedDDP`.
+        One needs a `shard-aware grad scaler <ShardedGradScaler>`, which is provided in `fairscale.optim.grad_scaler` and is compatible with PyTorch AMP.
+
     """
     def __init__(
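Taken together, the warnings above describe a usage pattern: shard the optimizer state with OSS, wrap the model with ShardedDDP so each gradient is reduced to its owning rank, and swap the stock AMP grad scaler for the shard-aware one. Below is a minimal sketch of that pattern, not the library's official example; it assumes the process group is already initialized, one GPU per rank, and placeholder model/dataloader names.

import torch
from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP
from fairscale.optim.grad_scaler import ShardedGradScaler
from fairscale.optim.oss import OSS

# Assumes torch.distributed.init_process_group(...) has already run on every rank.
model = torch.nn.Linear(16, 4).cuda()

# OSS shards the optimizer state across ranks; it also defines the gradient partition.
optimizer = OSS(params=model.parameters(), optim=torch.optim.SGD, lr=1e-3)

# ShardedDDP reduces each gradient to the rank that owns the corresponding shard.
model = ShardedDDP(model, optimizer)

# Shard-aware replacement for torch.cuda.amp.GradScaler (see the AMP warning above).
scaler = ShardedGradScaler()

for batch, target in dataloader:  # placeholder dataloader
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        loss = torch.nn.functional.mse_loss(model(batch), target)
    scaler.scale(loss).backward()

    # Clipping must go through OSS rather than torch.nn.utils.clip_grad_norm_,
    # since each rank only holds a shard of the gradients.
    scaler.unscale_(optimizer)
    optimizer.clip_grad_norm(max_norm=1.0)

    scaler.step(optimizer)
    scaler.update()

After the backward pass, parameters whose gradients are owned by another rank will show `param.grad is None` locally, as the third warning notes; this is expected rather than a bug.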
fairscale/optim/adascale.py
@@ -554,6 +554,7 @@ class AdaScale(Optimizer):
     `set_scale` needs to be called to update the scale as well.

     TODO (min): need a way to determine how much to increase the step size?

     TODO (min): having both `set_scale` and `set_num_gradients_to_accumulate`
     is hard to use and easy to get wrong. I think it is better
     to specify a `base_scale`. But more discussion is
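For context on the TODO above: the docstring states that when `set_num_gradients_to_accumulate` changes, `set_scale` has to be updated to match. A minimal sketch of those two calls follows, assuming a distributed run with the process group already initialized, a placeholder model and dataloader, and that the scale is simply world size times the accumulation factor; exact constructor arguments should be checked against the installed fairscale version.

import torch
import torch.distributed as dist
from fairscale.optim import AdaScale

# Assumes dist.init_process_group(...) has already run on every rank.
world_size = dist.get_world_size()

model = torch.nn.Linear(16, 4)
optimizer = AdaScale(torch.optim.SGD(model.parameters(), lr=0.1))

# Switch to 4-step gradient accumulation: per the docstring above, `set_scale`
# must be called as well so the scale stays consistent with the new setting
# (here assumed to be world_size * accumulation steps).
optimizer.set_num_gradients_to_accumulate(4)
optimizer.set_scale(4.0 * world_size)

for step, (batch, target) in enumerate(dataloader):  # placeholder dataloader
    loss = torch.nn.functional.mse_loss(model(batch), target)
    loss.backward()
    if (step + 1) % 4 == 0:  # step only after 4 accumulated micro-batches
        optimizer.step()
        optimizer.zero_grad()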
fairscale/optim/oss.py
@@ -239,7 +239,6 @@ class OSS(Optimizer):
         .. warning: This needs to be called on all ranks, since synchronization primitives will be used
-        .. warning: Model parallelism (groups other than world) is not yet supported
         """

         # Compute the max norm for this shard's worth of gradients
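The warning that remains after this change matters in practice: `clip_grad_norm` combines the per-shard gradient norms with collective communication, so every rank has to call it, even if only one rank uses the result. A hedged sketch, assuming `optimizer` is the OSS wrapper from the ShardedDDP sketch above:

import torch.distributed as dist

# Called on every rank: skipping it on any rank would stall the collectives
# used to combine the per-shard gradient norms.
total_norm = optimizer.clip_grad_norm(max_norm=1.0)

# The returned total norm is identical across ranks; log it on rank 0 only.
if dist.get_rank() == 0:
    print(f"grad norm before clipping: {float(total_norm):.4f}")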