[fix] optim/oss: fix state cast (#56)

Work around a PyTorch bug that casts the optimizer state when it is loaded (pytorch/pytorch#43706). The workaround is copied from https://github.com/pytorch/fairseq/blob/v0.9.0/fairseq/optim/fp16_optimizer.py#L251-L268

[fix] optim/oss: fix state cast (#56)
Work around a PyTorch bug that casts the optimizer state when it is loaded (pytorch/pytorch#43706). The workaround is copied from https://github.com/pytorch/fairseq/blob/v0.9.0/fairseq/optim/fp16_optimizer.py#L251-L268
fb49b515 · msbaines · GitHub · e4a0804c · fb49b515 · fb49b515
Unverified Commit fb49b515 authored Aug 27, 2020 by msbaines Committed by GitHub Aug 27, 2020
Hide whitespace changes
Inline Side-by-side

Showing with 15 additions and 0 deletions

fairscale/optim/oss.py fairscale/optim/oss.py +14 -0

stubs/torch/optim/optimizer.pyi stubs/torch/optim/optimizer.pyi +1 -0

No files found.
--- a/fairscale/optim/oss.py
+++ b/fairscale/optim/oss.py
@@ -4,6 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 import copy
+from itertools import chain
 import logging
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Type
@@ -140,6 +141,19 @@ class OSS(Optimizer):
        self.optim.load_state_dict(state_dict)
+        # Workaround PyTorch bug that casts state (https://github.com/pytorch/pytorch/issues/43706)
+        # Copied from https://github.com/pytorch/fairseq/blob/v0.9.0/fairseq/optim/fp16_optimizer.py#L251-L268
+        groups = self.optim.param_groups
+        saved_groups = state_dict["param_groups"]
+        id_map = {
+            old_id: p
+            for old_id, p in zip(chain(*(g["params"] for g in saved_groups)), chain(*(g["params"] for g in groups)))
+        }
+        for k, v in state_dict["state"].items():
+            if k in id_map:
+                param = id_map[k]
+                self.optim.state[param] = recursive_copy_to_device(v, non_blocking=True, device=param.device)
    def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
        """ Restore the global parameter groups as well as the shard """
        # Dispatch this rank's state dictionary to the wrapped shard optimizer

--- a/stubs/torch/optim/optimizer.pyi
+++ b/stubs/torch/optim/optimizer.pyi
@@ -7,6 +7,7 @@ _params_t = Union[Iterable[Tensor], Iterable[dict]]
 class Optimizer(object):
    param_groups: List[dict]
+    state: dict
    def __init__(self, params: _params_t, defaults: dict) -> None: ...
    def state_dict(self) -> dict: ...
    def load_state_dict(self, state_dict: dict) -> None: ...