Unverified Commit 429f3d31 authored by Min Xu, committed by GitHub

[fix] better handling of non-flatten in FSDP (#1072)



* [fix] better handling of non-flatten in FSDP

- see the detailed comment about that backward firing case
- also minor debugging help in FSDP
- also minor fix in FPW's state dict

* [feat] disallow reset_parameters by default

* [feat] add a get_fsdp_instances API - useful for checking wrapping from user code

* [fix] one line fix but more than a day of debugging

* fixed the case of loading a combined checkpoint with empty fsdp instances

* fixed another state-loading bug where the root/non-root modules cached full params because they were not resharded after forward

* [feat] support .half and .float better

* fixed a bug where gathering optim state lost extra keys from the original state_dict

* fixed a test failure in mixed precision

* fixed another bug affecting no_sync grad acc

* fixed a bug and a test in fsdp optim state

* fixed another corner case

* added a comment

* skip ssd offload tests

* skip the fsdp one for ssd offload
Co-authored-by: Min Xu <min.xu.public@gmail.com>
parent 47ce21ac
@@ -500,7 +500,7 @@ class LayerwiseMemoryTracker:
Indicate if x and y share the same storage, meaning that one of them
is a view, reshape or stride of the other or from a common tensor
"""
-return x.storage().data_ptr() == y.storage().data_ptr()  # type: ignore
+return x.storage().data_ptr() == y.storage().data_ptr()
@staticmethod
def _collect_tensors(module_io_tensors: Union[torch.Tensor, Sequence[torch.Tensor]]) -> List[torch.Tensor]:
...
@@ -12,6 +12,7 @@ from .fully_sharded_data_parallel import (
OffloadConfig,
TrainingState,
auto_wrap_bn,
+get_fsdp_instances,
no_pre_load_state_dict_hook,
)
...
@@ -14,9 +14,6 @@ from fairscale.nn.misc import FlattenParamsWrapper
if TYPE_CHECKING:
from fairscale.nn.data_parallel import FullyShardedDataParallel
-# These return keys are used by fairseq. To change, add @sshleifer as a reviewer.
-UNFLAT_RETURN_KEYS = {"state", "param_groups", "uncollected_local_ids", "param_id_map"}
# This function helps shard a full optimizer state dict
def flatten_optim_state_dict(sd: Dict) -> Dict:
"""Shard a full optimizer state dict (called by FSDP.get_shard_from_optim_state_dict)"""
@@ -52,20 +49,24 @@ def flatten_optim_state_dict(sd: Dict) -> Dict:
new_state[local_id][buffer_name] = torch.cat(tensors)
new_state[local_id].update(non_tensor_state)
new_state[local_id].update(singleton_state[local_id])
-new_sd = {"state": new_state, "param_groups": copy.deepcopy(sd["param_groups"])}
-for k in sd.keys():  # if there are extra keys, like loss_scale, don't delete them
-    if k not in UNFLAT_RETURN_KEYS:
-        new_sd[k] = copy.deepcopy(sd[k])
+# Now make a new param_groups copy and update it.
+new_sd_pg = copy.deepcopy(sd["param_groups"])
# add pointers from the `params` dict.
for pg_id, _ in enumerate(sd["param_groups"]):
# The values() list may look like [0,0,None,None,2,2]. We use
# groupby to remove the duplicates and then count the length of
# resulting iter.
num_local_params = sum(1 for _ in groupby(param_id_map.values()))
-new_sd["param_groups"][pg_id]["params"] = list(range(num_local_params))
+new_sd_pg[pg_id]["params"] = list(range(num_local_params))
-return new_sd
+# update the original sd so that we don't lose extra keys, like loss_scale.
+sd["state"] = new_state
+sd["param_groups"] = new_sd_pg
+# delete extra keys we have added to match the original state.
+del sd["uncollected_local_ids"]
+del sd["param_id_map"]
+return sd
def check_param_counts_before_sharding(full_optim_state_dict: Dict, n_instances: int) -> None:
@@ -202,7 +203,7 @@ def build_unflat_state_dict(
state: Dict[int, Dict[str, List[torch.Tensor]]],
singleton_state: Dict[int, Dict[str, List[torch.Tensor]]],
uncollected_opt_state: Dict[int, Dict],
-param_groups: List[Dict],
+original_sd: Dict,
) -> Dict:
"""Build an unflattened optimizer state dict given a list of flattened optimizer state dicts
from each rank. This is only called on rank 0.
@@ -213,7 +214,7 @@ def build_unflat_state_dict(
state: all-gathered combined/local/flatten state_dict
singleton_state: all-gathered singleton_state (dimensionless tensors)
uncollected_opt_state: non-tensor and not-gathered state
-param_groups: the original rank 0's sd["param_groups"]
+original_sd: the original rank 0's sd
Returns:
dict: an unflattened, nonsharded optimizer state, as if FSDP was not there.
@@ -228,19 +229,19 @@ def build_unflat_state_dict(
singleton_state[local_id] = {buffer_name: [x] for buffer_name, x in v.items() if is_singleton_tensor(x)}
# local ids are in the current state, global_ids will be in returned state.
unflat_state, global_to_local_id = _unflatten_optim_state(state, instance_list, world_pad_info, singleton_state)
# Since there are no tensors in param_groups, deepcopy is fine.
-param_groups = copy.deepcopy(param_groups)
+param_groups = copy.deepcopy(original_sd["param_groups"])
# Casting needed only for mypy.
num_params = sum([cast(int, m.num_params_managed) for m in instance_list])
param_groups[0]["params"] = list(range(num_params))
-unflat_optim_state_dict = {
-    "state": dict(sorted(unflat_state.items())),  # NOTE: this is probably already sorted
-    "param_id_map": global_to_local_id,
-    "param_groups": param_groups,
-    "uncollected_local_ids": list(uncollected_opt_state.keys()),
-}
-assert set(unflat_optim_state_dict.keys()) == UNFLAT_RETURN_KEYS
-return unflat_optim_state_dict
+# Update the original sd so we don't lose extra state like loss_scale.
+original_sd["state"] = dict(sorted(unflat_state.items()))  # NOTE: this is probably already sorted
+original_sd["param_id_map"] = global_to_local_id
+original_sd["param_groups"] = param_groups
+original_sd["uncollected_local_ids"] = list(uncollected_opt_state.keys())
+return original_sd
def is_singleton_tensor(x: Any) -> bool:
...
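The net effect of the hunks above is that build_unflat_state_dict and flatten_optim_state_dict now write back into the original state dict instead of building a fresh one filtered by UNFLAT_RETURN_KEYS, so user-added entries (for example a loss scale) survive the gather/shard round trip. Below is a minimal sketch of that round trip, assuming a process group is already initialized; fsdp_model, optim and the "loss_scale" key are placeholder names, not taken from this diff.

# Minimal sketch (placeholder names, assumes torch.distributed is initialized).
import torch
import torch.distributed as dist

def optim_state_round_trip(fsdp_model, optim, path="optim_full.pt"):
    # Rank 0 receives the consolidated, unflattened optimizer state; other ranks get None.
    full_sd = fsdp_model.gather_full_optim_state_dict(optim)
    if full_sd is not None:
        full_sd["loss_scale"] = 128.0  # hypothetical extra key; now preserved by the round trip
        torch.save(full_sd, path)
    dist.barrier()  # make sure rank 0 finished saving before everyone loads
    # Each rank recovers its own shard from the full state dict and loads it.
    full_sd = torch.load(path)
    shard_sd = fsdp_model.get_shard_from_optim_state_dict(full_sd)
    optim.load_state_dict(shard_sd)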
@@ -500,7 +500,11 @@ class FlattenParamsWrapper(nn.Module):
# Unflatten the module automatically if the state_dict is non-flat.
# Note, we check the flat_param_ prefix since custom names can be given and flat_param_0 is
# not always in the state dict's key list.
-if self.is_flattened and not any(k.startswith("flat_param_") for k in state_dict.keys()):
+if (
+    self.num_params_managed > 0
+    and self.is_flattened
+    and not any(k.startswith("flat_param_") for k in state_dict.keys())
+):
# This object is flatten but state_dict is not. So we unflatten and load.
with self.unflatten_params():
return super().load_state_dict(state_dict, strict)
...
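The new num_params_managed > 0 guard above matters for wrappers that manage no parameters; the pre-existing branch is the one that auto-unflattens when a non-flat state dict arrives. A minimal sketch of that behavior, assuming FlattenParamsWrapper's usual transparent key mapping; the variable names are placeholders.

# Minimal sketch: loading a non-flat state dict into a flattened wrapper.
# Assumes the wrapper exposes the inner module's original key names ("weight"/"bias").
import torch
import torch.nn as nn
from fairscale.nn.misc import FlattenParamsWrapper

plain = nn.Linear(4, 4)
wrapped = FlattenParamsWrapper(nn.Linear(4, 4))  # params collapsed into flat_param_0

# No "flat_param_" keys here, so load_state_dict unflattens internally before loading.
wrapped.load_state_dict(plain.state_dict())
with wrapped.unflatten_params():
    assert torch.equal(wrapped.module.weight, plain.weight)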
@@ -32,11 +32,9 @@ from . import utils as utils
from . import jit as jit
from . import fft as fft
-#MODIFIED BY TORCHGPIPE
from . import backends
from . import distributed
from . import version
-#END
class dtype:
is_floating_point: builtins.bool
@@ -67,10 +65,8 @@ class device:
type: str
index: _int
-#MODIFIED BY TORCHGPIPE
@overload
def __init__(self, device: device) -> None: ...
-#END
@overload
def __init__(self, device: Union[_int, str]) -> None: ...
@@ -78,17 +74,14 @@
@overload
def __init__(self, type: str, index: _int) -> None: ...
-#MODIFIED BY TORCHGPIPE
class Size(tuple):
def numel(self) -> _int: ...
-#END
-#MODIFIED BY TORCHGPIPE
class Storage:
def size(self) -> _int: ...
def element_size(self) -> _int: ...
def resize_(self, int) -> None: ...
-#END
+def data_ptr(self) -> _int: ...
# See https://github.com/python/mypy/issues/4146 for why these workarounds
# is necessary
@@ -935,10 +928,8 @@ class Tensor:
def unique_consecutive(self, sorted=True, return_inverse=False, return_counts=False, dim=None): ...
def lu(self, pivot=True, get_infos=False): ...
-#MODIFIED BY TORCHGPIPE
from .cuda import Stream
def record_stream(self, stream: Optional[Stream]) -> None: ...
-#END
@overload
def __and__(self: Tensor, other: Number) -> Tensor: ...
@@ -1924,7 +1915,5 @@ def clear_autocast_cache() -> None: ...
# possible to type correctly
def nonzero(input: Tensor, *, out: Optional[Tensor]=None, as_tuple: Optional[_bool]=None): ...
-#MODIFIED BY TORCHGPIPE
def is_grad_enabled() -> _bool: ...
__version__: str = ...
-#END
@@ -32,6 +32,7 @@ class Module(Generic[T_co]):
def add_module(self, name: str, module: 'Module') -> None: ...
def apply(self: T, fn: Callable[['Module'], None]) -> T: ...
+def _apply(self: T, fn: Callable[['Module'], None]) -> T: ...
def cuda(self: T, device: Optional[Union[int, str, device]] = ...) -> T: ...
...
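The _apply stub added above lines up with the "[feat] support .half and .float better" item in the commit message, since .half() and .float() are routed through nn.Module._apply. A hedged sketch of the intended usage follows; fsdp_model is a placeholder for an already-constructed FSDP instance, not something defined in this diff.

# Hedged sketch: dtype casts on an FSDP-wrapped model go through the overridden
# _apply, so the flattened/sharded parameters are cast consistently.
def cast_fsdp_model(fsdp_model, use_fp16: bool):
    return fsdp_model.half() if use_fp16 else fsdp_model.float()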
@@ -16,6 +16,8 @@ import numpy as np
import pytest
import torch
+pytestmark = pytest.mark.skip(reason="ssd offload to be removed to simplify the code")
try:
import fairscale.experimental.nn.ssd_offload as so
except ImportError as ie:
...
@@ -658,6 +658,25 @@ class TestModuleProperties(DistributedTest):
torch.testing.assert_allclose(before_nm[1].shape, after_nm_original[1].cpu().shape)
+class TestResetParameters(DistributedTest):
+    def test_reset_parameters(self):
+        """Ensure reset_parameters() works when FSDP is built with allow_reset_parameters=True."""
+        test_fn = functools.partial(self._test_reset, config={})
+        spawn_and_init(test_fn, world_sizes=[2])
+
+    @classmethod
+    def _test_reset(self, rank, group, config):
+        model = self._get_model(group, config)
+        with model.summon_full_params():
+            model.reset_parameters()
+
+    @classmethod
+    def _get_model(self, group, config):
+        with torch.no_grad():  # required for multiprocessing
+            model = nn.Linear(10, 10)
+        return FullyShardedDataParallel(model, group, allow_reset_parameters=True, **config)
class TransformerWithSharedParams(nn.Module):
def __init__(self, group, *unused_args, d_vocab=23, d_model=16, add_bn=True, **unused_kwargs):
super().__init__()
...
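The new test above exercises the allow_reset_parameters flag from the "[feat] disallow reset_parameters by default" commit item: re-initialization must be opted into at wrap time and run while the full parameters are materialized. A minimal sketch of that usage, assuming a process group is already set up; group is a placeholder.

# Minimal sketch of opting into parameter re-initialization, mirroring the test above.
import torch.nn as nn
from fairscale.nn.data_parallel import FullyShardedDataParallel

def build_and_reset(group):
    fsdp = FullyShardedDataParallel(nn.Linear(10, 10), group, allow_reset_parameters=True)
    # reset_parameters() needs the full (unsharded) params, so run it under summon_full_params().
    # Without allow_reset_parameters=True, the call is expected to be rejected.
    with fsdp.summon_full_params():
        fsdp.reset_parameters()
    return fsdp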
@@ -189,10 +189,10 @@ class TestGradAccCommunication(DistributedTest):
# the sum of the _base and public methods should stay the same.
assert (
mock_all_gather.call_count + mock_all_gather_base.call_count == expected_all_gather1
-), f"{mock_all_gather.call_count + mock_all_gather_base.call_count} != {expected_all_gather1}"
+), f"{mock_all_gather.call_count} + {mock_all_gather_base.call_count} != {expected_all_gather1}"
assert (
mock_reduce_scatter.call_count + mock_reduce_scatter_base.call_count == 0
-), f"{mock_reduce_scatter.call_count + mock_reduce_scatter_base.call_count} != 0"
+), f"{mock_reduce_scatter.call_count} + {mock_reduce_scatter_base.call_count} != 0"
output = model(*batch)
loss = model.module.get_loss(batch, output)
@@ -200,11 +200,11 @@ class TestGradAccCommunication(DistributedTest):
assert (
mock_all_gather.call_count + mock_all_gather_base.call_count == expected_all_gather2
-), f"{mock_all_gather.call_count + mock_all_gather_base.call_count} != {expected_all_gather2}"
+), f"{mock_all_gather.call_count} + {mock_all_gather_base.call_count} != {expected_all_gather2}"
assert (
mock_reduce_scatter.call_count + mock_reduce_scatter_base.call_count
== expected_reduce_scatter
-), f"{mock_reduce_scatter.call_count + mock_reduce_scatter_base.call_count} != {expected_reduce_scatter}"
+), f"{mock_reduce_scatter.call_count} + {mock_reduce_scatter_base.call_count} != {expected_reduce_scatter}"
if __name__ == "__main__":
...
@@ -16,6 +16,8 @@ import torch
from torch import nn
import torch.distributed
+pytestmark = pytest.mark.skip(reason="ssd offload to be removed to simplify the code")
try:
import fairscale.experimental.nn.ssd_offload as so
except ImportError as ie:
...
@@ -2,6 +2,7 @@
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
+import copy
import functools
from time import time
import unittest
@@ -13,7 +14,7 @@ from torch.optim import SGD, Adadelta, Adam  # type: ignore
from fair_dev.testing.testing import dist_init, objects_are_equal, spawn_for_all_world_sizes
from fairscale.internal.params import recursive_copy_to_device
-from fairscale.nn import FullyShardedDataParallel
+from fairscale.nn.data_parallel import FullyShardedDataParallel, get_fsdp_instances
from fairscale.nn.data_parallel.fsdp_optim_utils import is_singleton_tensor
from .test_fsdp import (
@@ -158,9 +159,9 @@ class TestOptimizerUtils(DistributedTest):
unwrapped_sd = optim_unwrapped.state_dict()
if not transformer and not expert_group:
-no_broadcast_children = [x for x in fsdp._fsdp_instances() if x.no_broadcast_optim_state]
+no_broadcast_children = [x for x in get_fsdp_instances(fsdp) if x.no_broadcast_optim_state]
assert len(no_broadcast_children) == 1, f"Length of non shared params {len(no_broadcast_children)}"
-assert fsdp._fsdp_instances()[-1].no_broadcast_optim_state
+assert get_fsdp_instances(fsdp)[-1].no_broadcast_optim_state
torch.cuda.empty_cache()
cuda_gb_before = torch.cuda.memory_stats(fsdp.rank)["allocated_bytes.all.current"] / 1024**3
tstart = time()
@@ -196,12 +197,15 @@ class TestOptimizerUtils(DistributedTest):
)
return
-unflat_state = sd["state"]
assert "uncollected_local_ids" in sd
-shard_sd = fsdp.get_shard_from_optim_state_dict(sd)
+sd_copy = copy.deepcopy(sd)
+unflat_state = sd_copy["state"]
+shard_sd = fsdp.get_shard_from_optim_state_dict(sd_copy)
shard_sd = recursive_copy_to_device(shard_sd, non_blocking=False, device="cpu")
-state_after_get_shard = sd["state"]
+state_after_get_shard = sd_copy["state"]
-assert objects_are_equal(unflat_state, state_after_get_shard)  # no side effects.
+# sd is changed in-place in case there are extra states.
+assert not objects_are_equal(unflat_state, state_after_get_shard)
+del sd_copy
assert_equal(len(sd["state"]), len(unwrapped_sd["state"]))
assert_equal(len(sd["param_groups"][0]["params"]), len(unwrapped_sd["param_groups"][0]["params"]))
@@ -223,8 +227,8 @@ class TestOptimizerUtils(DistributedTest):
[v for k, v in shard_sd["param_groups"][0].items()],
[v for k, v in original_shard_sd["param_groups"][0].items()],
)
-assert objects_are_equal(shard_sd["state"], original_shard_sd["state"])
+objects_are_equal(shard_sd["state"], original_shard_sd["state"], raise_exception=True)
-assert objects_are_equal({k: shard_sd[k] for k in original_shard_sd}, original_shard_sd)
+objects_are_equal({k: shard_sd[k] for k in original_shard_sd}, original_shard_sd, raise_exception=True)
@parameterized.expand(
[(True,), (False,)],
@@ -260,7 +264,7 @@ class TestOptimizerUtils(DistributedTest):
model = TransformerWithSharedParams(group)
named_pars = [p for n, p in model.named_parameters()]
for i, p in enumerate(model.parameters()):
-assert objects_are_equal(p, named_pars[i])
+objects_are_equal(p, named_pars[i], raise_exception=True)
def test_is_singleton_tensor(self):
"""Test is_singleton_tensor function"""
...
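The test changes above switch from the private fsdp._fsdp_instances() helper to the new public get_fsdp_instances() function, which the commit message describes as useful for checking wrapping from user code. A minimal sketch of such a check; fsdp_model is a placeholder for an already-wrapped model.

# Minimal sketch: inspect how a model was (auto-)wrapped using the new public API.
from fairscale.nn.data_parallel import FullyShardedDataParallel, get_fsdp_instances

def count_fsdp_instances(fsdp_model):
    instances = get_fsdp_instances(fsdp_model)
    assert all(isinstance(m, FullyShardedDataParallel) for m in instances)
    return len(instances)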
@@ -158,7 +158,8 @@ def _dist_worker(rank, world_size, files, wrap_middle, test_fn):
# We don't raise exceptions in CI since CI's T4 machine seems to be flaky with this test.
# On devel machines, we do want to catch potential errors. There could be real bugs or
# system issues behind the flakiness. One example is all-reduce vs. simulated averaging
-# below.
+# below. The check also fails on my rtx 20xx. So maybe it only works on devfair with
+# Quadro GP100 GPUs. TODO (Min): debug this.
objects_are_equal(sd_after, fsdp_model.state_dict(), raise_exception=not in_circle_ci())
elif test_fn == "eval":
_eval(fsdp_model, in_data)
...