Allow sharded grad scaler to cpu offload with FSDP (#831)

* first commit * sharded scaler hitting nan assertions * adding test for sharded grad scaler without cpu offload * ddp grad scaler and fsdp sharded grad scaler test failing * removing test_output * fix no cpu offload test * changing optimizer from OSS to SGD * all tests passing, code cleanup pending * code cleanup * fix pyproject.toml * removing .isort.cfg * running isort linter * resolving isort issues * resolving black linter issue * resolving mypy issues * fix import statement * fix mypy error * modifying import statement * adding pytorch version requirement * fixing pytest skip test decorator * apply version guard for ShardedGradScaler * removing test_fsdp_grad_scaler * increasing num_epochs for ShardedGradScaler so that updates are not skipped * adding support for torch 1.8 * minor edit * [skip ci] more torch 1.8 changes * parametrizing the tests * cleanup code with linters * [skip ci] update doc string * [skip ci] addressing some more comments

Allow sharded grad scaler to cpu offload with FSDP (#831)
* first commit * sharded scaler hitting nan assertions * adding test for sharded grad scaler without cpu offload * ddp grad scaler and fsdp sharded grad scaler test failing * removing test_output * fix no cpu offload test * changing optimizer from OSS to SGD * all tests passing, code cleanup pending * code cleanup * fix pyproject.toml * removing .isort.cfg * running isort linter * resolving isort issues * resolving black linter issue * resolving mypy issues * fix import statement * fix mypy error * modifying import statement * adding pytorch version requirement * fixing pytest skip test decorator * apply version guard for ShardedGradScaler * removing test_fsdp_grad_scaler * increasing num_epochs for ShardedGradScaler so that updates are not skipped * adding support for torch 1.8 * minor edit * [skip ci] more torch 1.8 changes * parametrizing the tests * cleanup code with linters * [skip ci] update doc string * [skip ci] addressing some more comments
ba5785f7 · Anupam Bhatnagar · GitHub · 7d7edf6d · ba5785f7 · ba5785f7
Unverified Commit ba5785f7 authored Nov 15, 2021 by Anupam Bhatnagar Committed by GitHub Nov 15, 2021
10 changed files
--- a/fairscale/experimental/nn/distributed_pipeline/pipeline.py
+++ b/fairscale/experimental/nn/distributed_pipeline/pipeline.py
@@ -21,8 +21,8 @@ Device = Union[torch.device, int, str]
 def check_pytorch_version() -> None:
+    """Raise an ``Exception`` when the installed PyTorch is older than the minimum release checked below."""
-    if torch_version() < (1, 9, 0):
+    if torch_version() < (1, 8, 0):
-        raise Exception("DistributedPipeline requires PyTorch version 1.9 or higher")
+        raise Exception("DistributedPipeline requires PyTorch version 1.8 or higher")
 MOVING_DENIED = TypeError(

--- a/fairscale/optim/grad_scaler.py
+++ b/fairscale/optim/grad_scaler.py
@@ -3,31 +3,75 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
+from collections import abc, defaultdict
+from enum import Enum
 import logging
-from typing import Any, Dict
+from typing import Any, Dict, List, Optional, Union
+import warnings
 import torch
+from torch.cuda import FloatTensor  # type: ignore
 from torch.cuda.amp import GradScaler as TorchGradScaler
+from torch.cuda.amp.common import amp_definitely_not_available
 import torch.distributed as dist
 from torch.optim import Optimizer
+from torch.optim.sgd import SGD
-from .oss import OSS
+from fairscale.utils import torch_version
+class _GeneralMultiDeviceReplicator(object):
+    """
+    Hands out per-device copies of one source tensor, creating each copy on first request and
+    reusing it afterwards. Compared to PyTorch's _MultiDeviceReplicator in grad_scaler.py, the
+    source tensor is also allowed to live on the cpu.
+    https://pytorch.org/docs/stable/_modules/torch/cuda/amp/grad_scaler.html#GradScaler
+    """
+    def __init__(self, master_tensor: torch.Tensor) -> None:
+        # Only cuda, xla and cpu source tensors are accepted.
+        assert master_tensor.is_cuda or master_tensor.device.type in ("xla", "cpu")
+        self.master = master_tensor
+        self._per_device_tensors: Dict[torch.device, torch.Tensor] = {}
+    def get(self, device: torch.device) -> torch.Tensor:
+        """Return the copy of the source tensor that lives on ``device``, creating it if needed."""
+        cached = self._per_device_tensors.get(device)
+        if cached is not None:
+            return cached
+        # copy=True forces a fresh tensor even when ``device`` is the source tensor's own device.
+        replica = self.master.to(device=device, non_blocking=True, copy=True)
+        self._per_device_tensors[device] = replica
+        return replica
+# Defines default_factory for GradScaler's _per_optimizer_states defaultdict,
+# as well as associated "enum" values.  Prefers defining these at top level because
+# - Lambdas can't be pickled, so we don't want to supply a lambda as the factory.
+# - Defining READY, UNSCALED, STEPPED and _refresh_per_optimizer_state within GradScaler
+#   causes a circular reference, which we'd rather avoid.
+class OptState(Enum):
+    """Stage of one optimizer's bookkeeping entry within the current scaler iteration."""
+    READY = 0  # stage of a freshly created entry (see _refresh_per_optimizer_state)
+    UNSCALED = 1  # recorded by ShardedGradScaler.unscale_ once gradients have been unscaled
+    STEPPED = 2  # recorded by ShardedGradScaler.step once the optimizer step has been handled
+def _refresh_per_optimizer_state() -> Dict:
+    """Build the initial bookkeeping entry for an optimizer: READY stage, no inf checks recorded."""
+    fresh_state: Dict = {"stage": OptState.READY, "found_inf_per_device": {}}
+    return fresh_state
 class GradScaler(TorchGradScaler):
+    """Variant of the PyTorch grad scaler whose ``_unscale_grads_`` always lets fp16 gradients through."""
    def _unscale_grads_(
-        self, optimizer: Optimizer, inv_scale: torch.Tensor, found_inf: torch.Tensor, allow_fp16: bool
+        self,
+        optimizer: Optimizer,
+        inv_scale: torch.Tensor,
+        found_inf: torch.Tensor,
+        allow_fp16: bool,
    ) -> Dict[torch.device, torch.Tensor]:
+        # The caller's ``allow_fp16`` is deliberately ignored: True is always forwarded to the parent.
        return super()._unscale_grads_(optimizer, inv_scale, found_inf, True)
 class ShardedGradScaler(TorchGradScaler):
    """
-    A shard-aware :class:`GradScaler<torch.cuda.amp.GradScaler>`, to be used in conjunction with
+    A shard aware Grad Scaler which enables loss scaling with/without cpu_offload. This is a
-    :class:`OSS` and :class:`ShardedOptimizer`.
+    slight modification of the pytorch grad scaler.
+    https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler
-    Interface and usecases are not changed, more explanations can be found in the corresponding pytorch
-    documentation https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler
    """
    def __init__(
@@ -38,7 +82,7 @@ class ShardedGradScaler(TorchGradScaler):
        growth_interval: int = 2000,
        enabled: bool = True,
        process_group: Any = dist.group.WORLD,
-    ) -> None:
+    ):
        super().__init__(
            init_scale=init_scale,
            growth_factor=growth_factor,
@@ -46,28 +90,325 @@ class ShardedGradScaler(TorchGradScaler):
            growth_interval=growth_interval,
            enabled=enabled,
        )
-        self.display_warning = True
+        if enabled and amp_definitely_not_available():
+            warnings.warn("torch.cuda.amp.GradScaler is enabled, but CUDA is not available.  Disabling.")
+            self._enabled = False
+        else:
+            self._enabled = enabled
+        if self._enabled:
+            self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)
            self.group = process_group
-    def unscale_(self, optimizer: Optimizer) -> None:
+    def scale(self, outputs: Union[torch.Tensor, List[torch.Tensor]]) -> Union[torch.Tensor, abc.Iterable]:
-        # Could be a mistake, this scaler is supposed to work with ZeroRedundancyOptimizer only
+        """
-        if self.display_warning and not isinstance(optimizer, OSS):
+        Multiplies ('scales') a tensor or list of tensors by the scale factor.
-            logging.warning(
-                "ShardedGradScaler is to be used in combination with a sharded optimizer, this could not be checked"
+        Returns scaled outputs.  If this instance of :class:`GradScaler` is not enabled, outputs are returned
+        unmodified.
+        Args:
+            outputs (Tensor or iterable of Tensors):  Outputs to scale.
+        Returns:
+            The scaled tensor for a tensor input. For an iterable input, lists and tuples are rebuilt
+            as the same type; any other iterable is returned as a lazy ``map`` object.
+        Raises:
+            ValueError: If ``outputs`` (or a nested element) is neither a tensor nor an iterable.
+        """
+        if not self._enabled:
+            return outputs
+        # Short-circuit for the common case.
+        if isinstance(outputs, torch.Tensor):
+            assert outputs.is_cuda or outputs.device.type == "xla" or outputs.device.type == "cpu"
+            # The scale tensor is created lazily, on the device of the first output seen.
+            if self._scale is None:
+                self._lazy_init_scale_growth_tracker(outputs.device)  # type: ignore
+            assert self._scale is not None
+            return outputs * self._scale.to(device=outputs.device, non_blocking=True)
+        # Invoke the more complex machinery only if we're treating multiple outputs.
+        stash: List[_GeneralMultiDeviceReplicator] = []  # holds a reference that can be overwritten by apply_scale
+        def apply_scale(val: Union[torch.Tensor, abc.Iterable]) -> Union[torch.Tensor, abc.Iterable]:
+            # Recursively walks ``val``; tensors are multiplied by a per-device copy of the scale.
+            if isinstance(val, torch.Tensor):
+                assert val.is_cuda or val.device.type == "xla" or val.device.type == "cpu"
+                if len(stash) == 0:
+                    if self._scale is None:
+                        self._lazy_init_scale_growth_tracker(val.device)  # type: ignore
+                    assert self._scale is not None
+                    # One replicator is shared by every tensor in this call, so each device gets one copy.
+                    stash.append(_GeneralMultiDeviceReplicator(self._scale))
+                return val * stash[0].get(val.device)
+            elif isinstance(val, abc.Iterable):
+                iterable = map(apply_scale, val)
+                if isinstance(val, list) or isinstance(val, tuple):
+                    return type(val)(iterable)
+                else:
+                    return iterable
+            else:
+                raise ValueError("outputs must be a Tensor or an iterable of Tensors")
+        return apply_scale(outputs)
+    # This function is required to enable the cpu based grad scaler. It is inspired by its corresponding CUDA
+    # implementation which can be found here
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda/AmpKernels.cu#L88
+    def _foreach_non_finite_check_and_unscale_cpu_(
+        self, grads: List, found_inf: torch.Tensor, inv_scale: torch.Tensor
+    ) -> None:
+        """
+        Checks cpu gradients for inf/NaN and unscales them in place.
+        Every tensor in ``grads`` is visited in order. A finite tensor is multiplied in place by
+        ``inv_scale``. On the first tensor holding an inf or NaN, ``found_inf`` is set to 1.0 and the
+        walk stops, so that tensor and the ones after it are left still scaled.
+        Args:
+            grads (list of Tensors):  Gradients to check and unscale; an empty list is a no-op.
+            found_inf (Tensor):  1-element flag tensor, set to 1.0 in place when a non-finite value is seen.
+            inv_scale (Tensor):  1-element tensor holding the reciprocal of the loss scale.
+        """
+        if len(grads) == 0:
+            return
+        assert inv_scale.numel() == 1, "inv_scale must be a 1-element tensor."
+        assert found_inf.numel() == 1, "found_inf must be a 1-element tensor."
+        expected_device = grads[0].device
+        inv_scale_value = inv_scale.item()  # loop-invariant, read once
+        # Iterate over the gradients themselves. Iterating over grads[0] would only walk slices of
+        # the first gradient: the remaining gradients would be skipped and 0-dim gradients would raise.
+        for tensor in grads:
+            if tensor.device != expected_device:
+                # Best effort: a device mismatch is reported but does not abort the unscale.
+                logging.error("tensor device is %s and expected device is %s", tensor.device, expected_device)
+            # check for non_overlapping_and_dense doesn't exist in the python world
+            # as remarked here https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda/AmpKernels.cu#L108
+            # we assume tensor is not MTA(multi tensor apply) safe. iterate through each item regardless of dtype
+            if not torch.isfinite(tensor).all().item():
+                # In-place fill keeps the flag tensor's identity, dtype and device for the caller.
+                found_inf.fill_(1.0)
+                break
+            tensor.data *= inv_scale_value
+    def _unscale_grads_(  # type: ignore
+        self, optimizer: SGD, inv_scale: torch.Tensor, found_inf: torch.Tensor, allow_fp16: bool = True
+    ) -> Dict[torch.device, torch.Tensor]:
+        """
+        Unscales, in place, the gradients of every parameter held by ``optimizer`` and records
+        whether any of them contains an inf/NaN.
+        Gradients are grouped by device and dtype. Groups living on the cpu go through
+        ``_foreach_non_finite_check_and_unscale_cpu_``; all other groups go through
+        ``torch._amp_foreach_non_finite_check_and_unscale_``.
+        Args:
+            optimizer:  Optimizer whose ``param_groups`` supply the parameters.
+            inv_scale (Tensor):  Reciprocal of the loss scale; copied lazily to each gradient device.
+            found_inf (Tensor):  Flag tensor; copied lazily to each gradient device.
+            allow_fp16 (bool):  When False, an fp16 gradient raises instead of being unscaled.
+        Returns:
+            The per-device ``found_inf`` copies that were created, keyed by device.
+        Raises:
+            ValueError: If ``allow_fp16`` is False and a gradient has dtype ``torch.float16``.
+        """
+        per_device_inv_scale = _GeneralMultiDeviceReplicator(inv_scale)
+        per_device_found_inf = _GeneralMultiDeviceReplicator(found_inf)
+        # To set up _amp_foreach_non_finite_check_and_unscale_, split grads by device and dtype.
+        # There could be hundreds of grads, so we'd like to iterate through them just once.
+        # However, we don't know their devices or dtypes in advance.
+        # https://stackoverflow.com/questions/5029934/defaultdict-of-defaultdict
+        # Google says mypy struggles with defaultdicts type annotations.
+        per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list))  # type: ignore[var-annotated]
+        with torch.no_grad():
+            for group in optimizer.param_groups:
+                for param in group["params"]:
+                    if param.grad is None:
+                        continue
+                    if (not allow_fp16) and param.grad.dtype == torch.float16:
+                        raise ValueError("Attempting to unscale FP16 gradients.")
+                    if param.grad.is_sparse:
+                        # is_coalesced() == False means the sparse grad has values with duplicate indices.
+                        # coalesce() deduplicates indices and adds all values that have the same index.
+                        # For scaled fp16 values, there's a good chance coalescing will cause overflow,
+                        # so we should check the coalesced _values().
+                        if param.grad.dtype is torch.float16:
+                            param.grad = param.grad.coalesce()
+                        to_unscale = param.grad._values()
+                    else:
+                        to_unscale = param.grad
+                    # TODO: is there a way to split by device and dtype without appending in the inner loop?
+                    per_device_and_dtype_grads[to_unscale.device][to_unscale.dtype].append(to_unscale)
+            for device, per_dtype_grads in per_device_and_dtype_grads.items():
+                for grads in per_dtype_grads.values():
+                    # All grads in one bucket share a device, so the first one decides the code path.
+                    if grads[0].device.type == "cpu":
+                        self._foreach_non_finite_check_and_unscale_cpu_(
+                            grads,
+                            per_device_found_inf.get(device),
+                            per_device_inv_scale.get(device),
                        )
+                    else:
+                        torch._amp_foreach_non_finite_check_and_unscale_(  # type: ignore
+                            grads,
+                            per_device_found_inf.get(device),
+                            per_device_inv_scale.get(device),
+                        )
+        return per_device_found_inf._per_device_tensors
+    def unscale_(self, optimizer: SGD) -> None:  # type: ignore
+        """
+        Unscales the gradients of ``optimizer`` in place and shares the inf/NaN flags across ranks.
+        The gradients are multiplied by the reciprocal of the current scale, the resulting per-device
+        ``found_inf`` flags are stored in this optimizer's state, and each flag is then all-reduced
+        over ``self.group`` so every rank sees the same outcome. Does nothing when the scaler is
+        disabled.
+        Args:
+            optimizer:  Optimizer that owns the gradients to unscale.
+        Raises:
+            RuntimeError: If called twice for ``optimizer`` since the last ``update()``, or after ``step()``.
+        """
+        if not self._enabled:
+            return
+        super()._check_scale_growth_tracker("unscale_")  # type: ignore
+        optimizer_state = self._per_optimizer_states[id(optimizer)]
-        self.display_warning = False  # Only warn once
+        if optimizer_state["stage"] is OptState.UNSCALED:
+            raise RuntimeError("unscale_() has already been called on this optimizer since the last update().")
+        elif optimizer_state["stage"] is OptState.STEPPED:
+            raise RuntimeError("unscale_() is being called after step().")
-        # Call the upstream unscale_ method which will only act on this rank's gradients
+        # FP32 division can be imprecise for certain compile options, so we carry out the reciprocal in FP64.
-        super().unscale_(optimizer)
+        assert self._scale is not None
+        inv_scale = self._scale.double().reciprocal().float()
+        found_inf = torch.full((1,), 0.0, dtype=torch.float32, device=self._scale.device)
+        optimizer_state["found_inf_per_device"] = self._unscale_grads_(optimizer, inv_scale, found_inf, True)
+        optimizer_state["stage"] = OptState.UNSCALED
        # Synchronize the detected inf across the ranks
        optimizer_state = self._per_optimizer_states[id(optimizer)]
        last_handle = None
        for v in optimizer_state["found_inf_per_device"].values():
+            if v.device.type == "cpu":
+                # The cpu flag is reduced through a CUDA copy (presumably because the process group
+                # cannot reduce cpu tensors -- confirm against the backend in use). The reduction is
+                # synchronous and its result is copied back into ``v``; otherwise the cross-rank
+                # result would be thrown away and this rank would only ever see its local flag.
+                v_on_cuda = v.cuda()
+                dist.all_reduce(v_on_cuda, group=self.group)
+                v.copy_(v_on_cuda)
+            else:
                last_handle = dist.all_reduce(v, async_op=True, group=self.group)
        # Make sure that the calls are done before moving out.
        # The calls are executed in sequence, waiting for the last one is enough
        if last_handle is not None:
            last_handle.wait()
+    def step(self, optimizer: SGD, *args, **kwargs) -> Optional[float]:  # type: ignore
+        """
+        Runs ``optimizer.step()`` guarded by the inf/NaN checks of this scaler.
+        Two things happen for a regular optimizer:
+        1.  ``unscale_(optimizer)`` is invoked, unless it was already called explicitly for
+            ``optimizer`` earlier in the iteration.  Unscaling also checks the gradients for infs/NaNs.
+        2.  ``optimizer.step()`` is invoked on the unscaled gradients when no inf/NaN was found;
+            otherwise it is skipped so the params are not corrupted.
+        An optimizer that sets ``_step_supports_amp_scaling`` handles scaling itself: its ``step()``
+        is called directly with this scaler passed as the ``grad_scaler`` keyword argument.
+        When the scaler is disabled, ``optimizer.step(*args, **kwargs)`` is called unconditionally.
+        Returns the return value of ``optimizer.step(*args, **kwargs)``.
+        Args:
+            optimizer (torch.optim.Optimizer):  Optimizer that applies the gradients.
+            args:  Any arguments, forwarded to ``optimizer.step()``.
+            kwargs:  Any keyword arguments, forwarded to ``optimizer.step()``.
+        .. warning::
+            Closure use is not currently supported.
+        Note: This method follows the step function in PyTorch's grad_scaler.py. According to the
+        original author's note, removing this override makes the unittest
+        test_cpu_offload_and_cpu_grads fail, because the parent class step function ends up in the
+        parent class unscale_ function, which does not handle torch.distributed.all_reduce on cpu.
+        """
+        if not self._enabled:
+            return optimizer.step(*args, **kwargs)
+        if "closure" in kwargs:
+            raise RuntimeError("Closure use is not currently supported if GradScaler is enabled.")
+        self._check_scale_growth_tracker("step")  # type: ignore
+        state = self._per_optimizer_states[id(optimizer)]
+        if state["stage"] is OptState.STEPPED:
+            raise RuntimeError("step() has already been called since the last update().")
+        if getattr(optimizer, "_step_supports_amp_scaling", False):
+            # This optimizer has customized scale-handling logic, so its step() is called directly.
+            # The contract with custom optimizers is that their step() accepts an additional,
+            # optional grad_scaler kwarg; passing self gives the optimizer full information
+            # (it can query its own state, invoke unscale_ on itself, etc).
+            amp_kwargs = dict(kwargs, grad_scaler=self)
+            result = optimizer.step(*args, **amp_kwargs)
+        else:
+            if state["stage"] is OptState.READY:
+                self.unscale_(optimizer)
+            assert len(state["found_inf_per_device"]) > 0, "No inf checks were recorded for this optimizer."
+            result = self._maybe_opt_step(optimizer, state, *args, **kwargs)  # type: ignore
+        state["stage"] = OptState.STEPPED
+        return result
+    # This function is required to enable the cpu based grad scaler. It is inspired by its corresponding CUDA
+    # implementation which can be found here
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda/AmpKernels.cu#L219
+    def _amp_update_scale_cpu_(self, found_inf):  # type: ignore
+        """
+        Updates the scale and the growth tracker on the cpu.
+        If found_inf is non-zero (an inf/NaN was seen), scale is multiplied by backoff_factor and
+        growth_tracker is reset to zero. Otherwise growth_tracker is incremented, and once it reaches
+        the growth interval, scale is multiplied by the growth factor and growth_tracker is reset.
+        Args:
+            found_inf (Tensor):  1-element tensor; any non-zero value means an inf/NaN was found.
+        """
+        # Compare against zero, not 1.0: the flag is a sum (over optimizers/devices in update(), and
+        # over ranks after an all_reduce), so several overflows produce values greater than 1.0.
+        if found_inf.item() != 0.0:
+            self._scale *= self._backoff_factor  # type: ignore
+            # Update in place so _growth_tracker stays a tensor rather than becoming a Python int.
+            self._growth_tracker.fill_(0)
+        else:
+            successful = int(self._growth_tracker.item()) + 1
+            if successful == self._growth_interval:  # type: ignore
+                self._scale *= self._growth_factor  # type: ignore
+                self._growth_tracker.fill_(0)
+            else:
+                self._growth_tracker.fill_(successful)
+    def update(self, new_scale: Optional[Union[float, FloatTensor]] = None) -> None:
+        """
+        Refreshes the scale factor at the end of an iteration.
+        Without ``new_scale``, the inf/NaN flags recorded by the optimizers this iteration decide the
+        outcome: if any optimizer step was skipped the scale is multiplied by ``backoff_factor`` to
+        reduce it, and if ``growth_interval`` unskipped iterations occurred consecutively the scale is
+        multiplied by ``growth_factor`` to increase it.
+        With ``new_scale``, the scale is set manually. The value only fills GradScaler's internal
+        scale tensor, so if ``new_scale`` was a tensor, later in-place changes to that tensor do not
+        affect the scale GradScaler uses internally.
+        In both cases the per-optimizer state collected this iteration is cleared afterwards.
+        Args:
+            new_scale (float or :class:`torch.cuda.FloatTensor`, optional, default=None):  New scale factor.
+        .. warning::
+            :meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has
+            been invoked for all optimizers used this iteration.
+        """
+        if not self._enabled:
+            return
+        _scale, _growth_tracker = self._check_scale_growth_tracker("update")  # type: ignore
+        if new_scale is None:
+            # Consume shared inf/nan data collected from optimizers to update the scale.
+            # If all found_inf tensors are on the same device as self._scale, this operation is asynchronous.
+            found_infs = [
+                flag.to(device=_scale.device, non_blocking=True)
+                for opt_state in self._per_optimizer_states.values()
+                for flag in opt_state["found_inf_per_device"].values()
+            ]
+            assert len(found_infs) > 0, "No inf checks were recorded prior to update."
+            # Fold every flag into the first one; a non-zero total means some step saw an inf/NaN.
+            found_inf_combined = found_infs[0]
+            for extra in found_infs[1:]:
+                found_inf_combined += extra
+            if _scale.device.type == "cpu":
+                self._amp_update_scale_cpu_(found_inf_combined)  # type: ignore
+            elif torch_version() >= (1, 9, 0):
+                # torch >= 1.9 updates the scale and the growth tracker in place.
+                torch._amp_update_scale_(  # type: ignore
+                    self._scale,
+                    self._growth_tracker,
+                    found_inf_combined,
+                    self._growth_factor,  # type: ignore
+                    self._backoff_factor,  # type: ignore
+                    self._growth_interval,  # type: ignore
+                )
+            elif (1, 8, 0) <= torch_version() < (1, 9, 0):
+                # torch 1.8 returns the new scale instead of writing it in place.
+                self._scale = torch._amp_update_scale(  # type: ignore
+                    self._growth_tracker,
+                    _scale,
+                    found_inf_combined,
+                    self._growth_factor,  # type: ignore
+                    self._backoff_factor,  # type: ignore
+                    self._growth_interval,  # type: ignore
+                )
+        elif isinstance(new_scale, float):
+            # Accept a new user-defined scale given as a plain float.
+            self._scale.fill_(new_scale)
+        else:
+            # Accept a new user-defined scale given as a tensor.
+            reason = "new_scale should be a float or a 1-element torch.cuda.FloatTensor with requires_grad=False."
+            assert isinstance(new_scale, torch.cuda.FloatTensor), reason  # type: ignore[attr-defined]
+            assert new_scale.numel() == 1, reason
+            assert new_scale.requires_grad is False, reason
+            self._scale.copy_(new_scale)
+        # To prepare for next iteration, clear the data collected from optimizers this iteration.
+        self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)
--- a/tests/ci_test_list_1.txt
+++ b/tests/ci_test_list_1.txt
@@ -3,7 +3,6 @@ tests/nn/data_parallel/test_fsdp_multiple_wrapping.py
 tests/nn/data_parallel/test_fsdp_freezing_weights.py
 tests/nn/data_parallel/test_fsdp_regnet.py
 tests/nn/data_parallel/test_fsdp_uneven.py
-tests/nn/data_parallel/test_fsdp_grad_scaler.py
 tests/nn/data_parallel/test_fsdp_grad_acc.py
 tests/nn/data_parallel/test_fsdp_summon_full_params.py
 tests/nn/data_parallel/test_fsdp_input.py

--- a/tests/nn/data_parallel/test_fsdp.py
+++ b/tests/nn/data_parallel/test_fsdp.py
@@ -13,6 +13,7 @@ import unittest
 from unittest import mock
 from parameterized import parameterized
+import pytest
 import torch
 from torch import nn
 import torch.distributed
@@ -29,6 +30,9 @@ from fairscale.utils.testing import (
    spawn_for_all_world_sizes,
 )
+if torch_version() >= (1, 8, 0):
+    from fairscale.optim.grad_scaler import ShardedGradScaler
 # How to use remote-pdb: https://gist.github.com/sshleifer/9d43351957179c13606e015b072927d4
 # All helper functions called by spawn must be either @classmethod, @staticmethod
@@ -49,7 +53,9 @@ class DistributedTest(unittest.TestCase):
        model_device = next(model.parameters()).device
        # use SGD with momentum instead of Adam, since Adam is scale invariant
        # and this makes it bad for tests
-        optim = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
+        optim = torch.optim.SGD(params=model.parameters(), lr=lr, momentum=0.9)
+        scaler = ShardedGradScaler()
        for _ in range(num_steps):
            optim.zero_grad()
            with torch.cuda.amp.autocast(enabled=autocast):
@@ -57,6 +63,7 @@ class DistributedTest(unittest.TestCase):
                input = model.module.get_input(torch.device("cuda"))
                output = model(*input)
                loss = model.module.get_loss(input, output).to(model_device)
+            loss = scaler.scale(loss)
            assert loss.dtype == torch.float32
            model.module.run_backward(loss)
            if norm_type is not None:
@@ -65,10 +72,10 @@ class DistributedTest(unittest.TestCase):
                    model.clip_grad_norm_(clip_norm, norm_type)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), clip_norm, norm_type)
-            params = [p for p in model.parameters()]
+            scaler.step(optim)
-            print(f"params.device {params[0].device} param.grad.device {params[0].grad.device}")
+            scaler.update()
+        if hasattr(model, "assert_idle"):
-            optim.step()
+            model.assert_idle()
        if isinstance(model, FullyShardedDataParallel):
            model.assert_state(TrainingState.IDLE)
        return loss.detach()
@@ -308,21 +315,21 @@ class TestComparisonToPyTorchDDP(DistributedTest):
        # Test every combination of these options:
        spawn_and_init(functools.partial(self._test_identical_outputs, TransformerWithSharedParams, config))
-    def test_cpu_offload_and_cpu_grads(self):
+    # testing moving params to cpu while using full and mixed precision
-        # We don't test the False condition because that requires the optimizer to internally do
+    @parameterized.expand([(True,), (False,)], name_func=rename_test)
-        # the device transfer and PyTorch optimizers don't support this.
+    def test_cpu_offload_and_cpu_grads(self, mixed_precision):
-        config = {"mixed_precision": True, "cpu_offload": True, "move_grads_to_cpu": True}
+        config = {"mixed_precision": mixed_precision, "cpu_offload": True}
        test_fn = functools.partial(
            self._test_identical_outputs, TransformerWithSharedParams, config, use_cuda=False, lr=0.01
        )
        spawn_and_init(test_fn)
-    def test_cpu_offload_and_cpu_grads_no_mixed_precision(self):
+    # testing full and mixed precision on the gpu
-        # We don't test the False condition because that requires the optimizer to internally do
+    @parameterized.expand([(True,), (False,)], name_func=rename_test)
-        # the device transfer and PyTorch optimizers don't support this.
+    def test_no_cpu_offload_with_sharded_grad_scaler(self, mixed_precision):
-        config = {"mixed_precision": False, "cpu_offload": True, "move_grads_to_cpu": True}
+        config = {"mixed_precision": mixed_precision, "move_params_to_cpu": False}
        test_fn = functools.partial(
-            self._test_identical_outputs, TransformerWithSharedParams, config, use_cuda=False, lr=0.01
+            self._test_identical_outputs, TransformerWithSharedParams, config, use_cuda=True, lr=0.01
        )
        spawn_and_init(test_fn)
@@ -485,10 +492,10 @@ class TestSerialization(DistributedTest):
        optim.step()
+@pytest.mark.skipif(torch_version() < (1, 8, 0), reason="pytorch version >= 1.8.0 required")
 class TestHooks(DistributedTest):
    # Feel free to modify these tests as the implementation changes.
    # They aspire to make sure that backward hooks are registered and used
    @parameterized.expand([[True], [False]])
    def test_output_backward_hooks(self, cuda_first):
        fn = functools.partial(self._test_output_backward_hooks, cuda_first=cuda_first)
@@ -541,6 +548,7 @@ class TestHooks(DistributedTest):
        assert model._register_pre_backward_hooks.called
+@pytest.mark.skipif(torch_version() < (1, 8, 0), reason="pytorch version >= 1.8.0 required")
 class TestNoGrad(DistributedTest):
    @parameterized.expand(CONFIG_OPTIONS, name_func=rename_test)
    def test_transformer_parameterized(self, config):
@@ -568,6 +576,7 @@ class TestNoGrad(DistributedTest):
        assert objects_are_equal(ref_output, no_grad_output, raise_exception=True)
+@pytest.mark.skipif(torch_version() < (1, 8, 0), reason="pytorch version >= 1.8.0 required")
 class TestModuleProperties(DistributedTest):
    @parameterized.expand([[{"flatten_parameters": False}], [{"flatten_parameters": True}]], name_func=rename_test)
    def test_named_parameters(self, config):

--- a/tests/nn/data_parallel/test_fsdp_apply.py
+++ b/tests/nn/data_parallel/test_fsdp_apply.py
@@ -7,8 +7,11 @@ import functools
 import unittest
 from parameterized import parameterized
+import pytest
 import torch.nn as nn
+from fairscale.utils import torch_version
 from .test_fsdp import (
    CONFIG_OPTIONS,
    DistributedTest,
@@ -19,6 +22,7 @@ from .test_fsdp import (
 )
+@pytest.mark.skipif(torch_version() < (1, 8, 0), reason="pytorch version >= 1.8.0 required")
 class TestApply(DistributedTest):
    @parameterized.expand(CONFIG_OPTIONS, name_func=rename_test)
    def test_transformer_weight_init(self, config):

--- a/tests/nn/data_parallel/test_fsdp_grad_scaler.py
+++ b/tests/nn/data_parallel/test_fsdp_grad_scaler.py
-# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-# pylint: disable=missing-module-docstring
-# pylint: disable=missing-class-docstring
-# pylint: disable=missing-function-docstring
-""" Test FSDP with grad scaler. """
-import os
-import random
-import pytest
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from fairscale.nn import FullyShardedDataParallel
-from fairscale.optim.grad_scaler import ShardedGradScaler
-from fairscale.utils.testing import skip_if_no_cuda
-try:
-    from torch.cuda.amp import autocast
-except ImportError:
-    # Older version doesn't support autocast. Skip this file.
-    pytestmark = pytest.mark.skip
-# Mixed precision needs cuda.
-@skip_if_no_cuda
-def test_scaler_cpu_offload_breaks():
-    device = torch.device("cuda")
-    torch.cuda.set_device(0)
-    # Random port in case the next test run quickly, same port would cause conflict.
-    os.environ["MASTER_ADDR"] = "localhost"
-    os.environ["MASTER_PORT"] = str(random.randint(2000, 3000))
-    torch.distributed.init_process_group(backend="nccl", rank=0, world_size=1)
-    try:
-        scaler = ShardedGradScaler()
-        model = FullyShardedDataParallel(nn.Linear(5, 5), cpu_offload=True, mixed_precision=True)
-        optim = torch.optim.SGD(model.parameters(), lr=1e-3)
-        input = torch.rand((1, 5), dtype=torch.float).to(device)
-        optim.zero_grad()
-        with autocast():
-            output = model(input)
-            loss = F.mse_loss(input, output)
-        scaler.scale(loss).backward()
-        # TODO (Min): Need to fix. Details in issue #421.
-        with pytest.raises(RuntimeError):
-            scaler.step(optim)
-            scaler.update()
-    finally:
-        # Clean-up is important or the next test in this file may fail to init the PG.
-        torch.distributed.destroy_process_group()
-        del os.environ["MASTER_ADDR"]
-        del os.environ["MASTER_PORT"]
--- a/tests/nn/data_parallel/test_fsdp_regnet.py
+++ b/tests/nn/data_parallel/test_fsdp_regnet.py
@@ -35,7 +35,6 @@ from torch.optim import SGD
 from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP
 from fairscale.nn.data_parallel import TrainingState, auto_wrap_bn
-from fairscale.optim.grad_scaler import ShardedGradScaler
 from fairscale.utils import torch_version
 from fairscale.utils.testing import (
    dist_init,
@@ -47,6 +46,9 @@ from fairscale.utils.testing import (
    torch_cuda_version,
 )
+if torch_version() >= (1, 8, 0):
+    from fairscale.optim.grad_scaler import ShardedGradScaler
 # Const test params.
 #   Reduce iterations to 1 for debugging.
 #   Change world_size to 8 on beefy machines for better test coverage.
@@ -352,8 +354,8 @@ def _distributed_worker(
 @pytest.mark.parametrize("flatten", ["flatten", "no_flatten"])
 @pytest.mark.parametrize("sync_bn", ["none", "pytorch"])
 def test_regnet(temp_files, ddp_ref, precision, flatten, sync_bn):
-    if torch_version() < (1, 6, 0):
+    if torch_version() < (1, 8, 0):
-        pytest.skip("older pytorch doesn't support reduce_scatter")
+        pytest.skip("pytorch version >= 1.8.0 required")
    state_before, inputs, conv_bias, linear_bias, state_after = ddp_ref

--- a/tests/nn/data_parallel/test_fsdp_state_dict.py
+++ b/tests/nn/data_parallel/test_fsdp_state_dict.py
@@ -7,10 +7,12 @@ import functools
 import unittest
 from parameterized import parameterized
+import pytest
 import torch
 from torch import nn
 from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP
+from fairscale.utils import torch_version
 from fairscale.utils.testing import dist_init, objects_are_equal, skip_if_cuda, teardown, temp_files_ctx
 from .test_fsdp import (
@@ -23,6 +25,7 @@ from .test_fsdp import (
 )
+@pytest.mark.skipif(torch_version() < (1, 8, 0), reason="pytorch version >= 1.8.0 required")
 class TestLocalStateDict(DistributedTest):
    @parameterized.expand([[True, True], [False, False]], name_func=rename_test)
    def test_load_local_state_dict(self, flatten_params, mixed_precision):
@@ -50,7 +53,9 @@ class TestLocalStateDict(DistributedTest):
            state_1_module_weight = model.module.state_dict()[weight_key]
            torch.testing.assert_allclose(state_1_weight, state_1_module_weight)
            torch.testing.assert_allclose(state_1_weight, model.module.embed_tokens.weight)
-        self._train_for_several_steps(model, 1, model.mixed_precision)
+        # Train for 6 steps instead of 1 so that ShardedGradScaler applies at least one update.
+        # With fewer than 6 steps the test fails: every update is skipped because the gradients are inf.
+        self._train_for_several_steps(model, 6, model.mixed_precision)
        state_2 = model.local_state_dict()
        state_after_training = {k: v.cpu().clone() for k, v in state_2.items()}
@@ -69,6 +74,7 @@ class TestLocalStateDict(DistributedTest):
            raise AssertionError(f"params {unchanged} not changed after training")
+@pytest.mark.skipif(torch_version() < (1, 8, 0), reason="pytorch version >= 1.8.0 required")
 class TestSaveLoadStateDict(DistributedTest):
    @parameterized.expand([[False], [True]], name_func=rename_test)
    def test_calling_state_dict_twice_mixed_precision(self, mixed_precision):
@@ -178,6 +184,7 @@ class TestSaveLoadStateDict(DistributedTest):
            ), f"{key}, {ref_state_dict[key]} != {state_dict[key]}"
+@pytest.mark.skipif(torch_version() < (1, 8, 0), reason="pytorch version >= 1.8.0 required")
 class TestStateDictDeviceDtype(DistributedTest):
    @parameterized.expand([[False, False], [True, False], [True, True]], name_func=rename_test)
    def test_state_dict_device(self, mixed_precision, cpu_offload):

--- a/tests/nn/data_parallel/test_fsdp_summon_full_params.py
+++ b/tests/nn/data_parallel/test_fsdp_summon_full_params.py
@@ -8,8 +8,11 @@ import gc
 import unittest
 from parameterized import parameterized
+import pytest
 import torch
+from fairscale.utils.version import torch_version
 from .test_fsdp import CONFIG_OPTIONS, DistributedTest, rename_test, spawn_and_init
@@ -19,6 +22,7 @@ def get_cuda_mem():
    return torch.cuda.memory_allocated()
+@pytest.mark.skipif(torch_version() < (1, 8, 0), reason="pytorch version >= 1.8.0 required")
 class TestMemory(DistributedTest):
    @parameterized.expand(CONFIG_OPTIONS, name_func=rename_test)
    def test_memory(self, config):

--- a/tests/nn/data_parallel/test_sharded_ddp_pytorch_parity.py
+++ b/tests/nn/data_parallel/test_sharded_ddp_pytorch_parity.py
@@ -21,10 +21,12 @@ from torch.nn.parallel import DistributedDataParallel as DDP
 from fairscale.nn.data_parallel import ShardedDataParallel
 from fairscale.optim import OSS
-from fairscale.optim.grad_scaler import ShardedGradScaler
 from fairscale.utils import torch_version
 from fairscale.utils.testing import check_same_model_params, skip_if_no_cuda, skip_if_single_gpu, temp_files_ctx
+if torch_version() >= (1, 8, 0):
+    from fairscale.optim.grad_scaler import ShardedGradScaler
 """
 Check that ShardedDDP gets the same results as DDP in a variety of scenarii
 """
@@ -249,6 +251,8 @@ def test_ddp_parity(
    manual_reduction,
    multiple_fw,
 ):
+    if torch_version() < (1, 8, 0):
+        pytest.skip("pytorch version >= 1.8.0 required")
    if manual_reduction and change_train_graph:
        pytest.skip("Skipping changing model and grad accumulation combination, makes little sense")