[fix] OSS: Eager gradient release - free memory (#120)

* Minor change, but it gives some memory back.
* Adjust the CI and regression checks to run on 4 GPUs.

[fix] OSS: Eager gradient release - free memory (#120)
* Minor change, but it gives some memory back.
* Adjust the CI and regression checks to run on 4 GPUs.
1c2a6f6b · Benjamin Lefaudeux · GitHub · 79ded821 · 1c2a6f6b · 1c2a6f6b
Unverified Commit 1c2a6f6b authored Sep 30, 2020 by Benjamin Lefaudeux Committed by GitHub Sep 30, 2020
Hide whitespace changes
Inline Side-by-side

Showing with 14 additions and 1 deletion

.circleci/config.yml .circleci/config.yml +1 -1

fairscale/optim/oss.py fairscale/optim/oss.py +13 -0

No files found.
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -100,7 +100,7 @@ run_oss_benchmark: &run_oss_benchmark
  - run:
      name: Run OSS Benchmark
      command: |
-        python benchmarks/oss.py --check_regression
+        python benchmarks/oss.py --check_regression --world_size 4 --reference_speed 21.2 --reference_memory 4220 --reference_loss 0.63
        python benchmarks/oss.py --gloo --optim_type oss



--- a/fairscale/optim/oss.py
+++ b/fairscale/optim/oss.py
@@ -157,6 +157,8 @@ class OSS(Optimizer):
        self._sync_param_groups()

        # Run the optimizer step on this shard only:
+        self._free_other_grads()
+
        if closure is not None:
            loss = self.optim.step(closure=closure, **kwargs)  # type: ignore
        else:
@@ -367,3 +369,14 @@ class OSS(Optimizer):
                # Discard this tensor/rank, broadcast necessary for syncing
                logging.debug("Discarding broadcast from rank %s", rank)
                broadcast_object(empty_buffer, src_rank=rank, group=self.group, dist_device=self._device)
+
+    def _free_other_grads(self) -> None:
+        """Free all the gradients that are only useful for the other ranks.
+
+        Walks every partition returned by ``partition_parameters()`` and, for
+        each partition whose index differs from ``self.rank``, sets ``.grad``
+        to ``None`` on every tensor listed under the ``"params"`` key.
+        The partition at index ``self.rank`` is left untouched.
+        """
+        for i, partition in enumerate(self.partition_parameters()):
+            # Skip this rank's own partition so its gradients are kept.
+            if i == self.rank:
+                continue

+            # NOTE(review): each entry looks like an optimizer param-group dict
+            # (it is indexed with "params") - confirm against partition_parameters().
+            for p in partition:
+                for t in p["params"]:
+                    # Drop the reference to the gradient instead of zeroing it in place.
+                    t.grad = None