Unverified commit 1c2a6f6b, authored by Benjamin Lefaudeux and committed by GitHub
Browse files

[fix] OSS: Eager gradient release - free memory (#120)

* minor, but gives some memory back
* adjust CI and regression checks to 4 gpu
parent 79ded821
......@@ -100,7 +100,7 @@ run_oss_benchmark: &run_oss_benchmark
- run:
name: Run OSS Benchmark
command: |
python benchmarks/oss.py --check_regression
python benchmarks/oss.py --check_regression --world_size 4 --reference_speed 21.2 --reference_memory 4220 --reference_loss 0.63
python benchmarks/oss.py --gloo --optim_type oss
......
......@@ -157,6 +157,8 @@ class OSS(Optimizer):
self._sync_param_groups()
# Run the optimizer step on this shard only:
self._free_other_grads()
if closure is not None:
loss = self.optim.step(closure=closure, **kwargs) # type: ignore
else:
......@@ -367,3 +369,14 @@ class OSS(Optimizer):
# Discard this tensor/rank, broadcast necessary for syncing
logging.debug("Discarding broadcast from rank %s", rank)
broadcast_object(empty_buffer, src_rank=rank, group=self.group, dist_device=self._device)
def _free_other_grads(self) -> None:
    """Release the gradients that belong to every rank except this one.

    Iterates over the per-rank partitions returned by
    ``self.partition_parameters()``. For each partition whose index differs
    from ``self.rank``, every entry in each group's ``"params"`` list gets its
    ``grad`` attribute reset to ``None``. The partition at index
    ``self.rank`` is left untouched.
    """
    # Lazily enumerate the parameter groups held by the *other* ranks, in the
    # same order a nested loop with a `continue` on the local rank would.
    foreign_groups = (
        group
        for owner, shard in enumerate(self.partition_parameters())
        if owner != self.rank
        for group in shard
    )
    for group in foreign_groups:
        for param in group["params"]:
            # Dropping the reference lets the gradient storage be reclaimed.
            param.grad = None
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment