Unverified Commit 09028a0d authored by msbaines, committed by GitHub

[fix] optim/oss: support optimizers with additional step kwargs (#53)

* [fix] optim/oss: support optimizers with additional step kwargs

Some of the optimizers in apex support additional kwargs to step(),
such as scale.
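
As a rough, hedged sketch of what this enables (not part of the commit): a wrapped
optimizer whose step() accepts an extra keyword argument can now receive it through
OSS.step(). SGDWithScale, its scale kwarg, and the single-process gloo bootstrap below
are illustrative assumptions, loosely modelled on the loss-scale argument of apex's
fused optimizers; only the OSS([x], optimizer_class, lr=...) call mirrors the test
added in this commit.

import os

import torch
import torch.distributed as dist
from fairscale.optim import OSS


class SGDWithScale(torch.optim.SGD):
    # Hypothetical optimizer subclass: step() takes an extra `scale` kwarg,
    # loosely mimicking apex's fused optimizers.
    def step(self, closure=None, scale=1.0):
        # Un-scale gradients before the regular SGD update.
        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is not None:
                    p.grad.div_(scale)
        return super().step(closure)


if __name__ == "__main__":
    # OSS shards optimizer state across a process group, so one must exist;
    # a single-process gloo group keeps the sketch self-contained.
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group(backend="gloo", rank=0, world_size=1)

    x = torch.ones(4, requires_grad=True)
    opt = OSS([x], SGDWithScale, lr=0.1)
    x.sum().backward()
    # With this fix, the extra kwarg is forwarded to SGDWithScale.step().
    opt.step(scale=2.0)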
parent 5251a69a
 [settings]
-known_third_party =numpy,pytest,setuptools,torch,torchtext
+known_third_party =numpy,pytest,setuptools,torch,torchtext,torchvision
......
@@ -91,9 +91,11 @@ class OSS(Optimizer):
                 param_groups[rank].append(param_group_rank)
         return param_groups
 
-    def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]:
+    # NOTE(msb) We add a kwargs in order to support Optimizer sub-classes that support extra kwargs.
+    # For example, the apex library contains fused optimizers with a step that supports extra kwargs.
+    def step(self, closure: Optional[Callable[[], float]] = None, **kwargs: Any) -> Optional[float]:
         # Run the optimizer step on this shard only
-        loss = self.optim.step(closure=closure)
+        loss = self.optim.step(closure=closure, **kwargs)  # type: ignore
 
         # Sync all the states
         for rank, param_groups in enumerate(self.partition_parameters()):
......
@@ -66,6 +66,22 @@ def test_state_dict():
     assert o.param_groups[0]["params"][0].device == x.device
 
 
+class SGDWithStepKWArg(torch.optim.SGD):
+    def step(self, closure=None, kwarg=[]):
+        super().step()
+        kwarg.append(5)
+
+
+def test_step_with_kwargs():
+    kwarg = []
+    x = torch.tensor([1.0], device=DEVICE, requires_grad=True)
+    o = optim.OSS([x], SGDWithStepKWArg, lr=0.1)
+    x.backward()
+    o.step(0, kwarg=kwarg)
+    assert kwarg == [5]
+    assert x == torch.tensor([0.9], device=DEVICE)
+
+
 def test_local_state_dict():
     x = torch.tensor([1.0], device=DEVICE, requires_grad=True)
     o = optim.OSS([x], lr=0.1)
......