[fix] repro+fix (#365)

Fix a broken earlier commit, which only worked for the first step.

[fix] repro+fix (#365)
Fix a broken earlier commit, which only worked for the first step.
8778fa66 · Benjamin Lefaudeux · GitHub · 4dc605c9 · 8778fa66 · 8778fa66
Unverified Commit 8778fa66 authored Feb 05, 2021 by Benjamin Lefaudeux Committed by GitHub Feb 05, 2021
Show whitespace changes
Inline Side-by-side

Showing with 17 additions and 8 deletions

fairscale/optim/oss.py fairscale/optim/oss.py +14 -8

tests/optim/test_oss.py tests/optim/test_oss.py +3 -0

No files found.
--- a/fairscale/optim/oss.py
+++ b/fairscale/optim/oss.py
@@ -8,7 +8,7 @@ import copy
 from itertools import chain
 import logging
 from math import inf
-from typing import TYPE_CHECKING, Any, Callable, Deque, Dict, Iterable, List, Optional, Type, Union
+from typing import TYPE_CHECKING, Any, Callable, Deque, Dict, List, Optional, Type, Union
 import torch
 import torch.distributed as dist
@@ -81,7 +81,7 @@ class OSS(Optimizer):
        self._partition_parameters: List[List[dict]] = []
        self._index_to_param: Dict[int, torch.Tensor] = {}
        self._param_to_index: Dict[int, int] = {}
-        self._local_params: Optional[Iterable[Any]] = None
+        self._local_params: Optional[List[torch.Tensor]] = None
        # Build the wrapped optimizer, responsible for a shard of the params
        self.group = group if group is not None else dist.group.WORLD
@@ -145,14 +145,20 @@ class OSS(Optimizer):
        return self._partition_parameters
    @property
-    def local_params(self) -> Iterable[torch.Tensor]:
+    def local_params(self) -> List[torch.Tensor]:
+        """ List of the parameters owned by this rank which currently have a gradient
+        """
        if self._local_params is None:
-            self._local_params = chain(
+            self._local_params = list(
+                chain(
                    *[
                        list(filter(lambda x: x.grad is not None, device_params[self.rank]))
                        for device_params in self.per_device_params.values()
                    ]
                )
+            )
+        # The cached value is a list, so it can be iterated repeatedly (a bare chain iterator would be exhausted after one pass)
        return self._local_params
    @property

--- a/tests/optim/test_oss.py
+++ b/tests/optim/test_oss.py
@@ -632,6 +632,9 @@ def run_gradient_clipping(rank, world_size, tempfile_name):
        print(f"Checking norm {norm}")
        check(norm)
+        # Check twice, to catch the case where a cached iterator is exhausted after its first use
+        check(norm)
    dist.destroy_process_group()