OpenDAS / fairscale · Commits · 4396ef4a

Unverified commit 4396ef4a, authored Feb 18, 2021 by Min Xu, committed by GitHub on Feb 18, 2021.
[bug]: fix a bug on custom smoothing factor (#401)
Parent: 535eb011
Showing 2 changed files with 15 additions and 0 deletions:

fairscale/optim/adascale.py              +8 -0
tests/optim/test_single_node_adascale.py +7 -0
fairscale/optim/adascale.py
@@ -122,6 +122,9 @@ class AdaScale(Optimizer):
         smoothing (float):
             Smoothing factor for moving average.
             If None, it defaults to ``max(1 - (world_size * num_gradients_to_accumulate)/1000, 0)``.
+            Note: for very high-scale training, a higher smoothing value might be needed,
+            especially at the beginning of training. Therefore, if your scale is close to
+            or larger than 1000, try experimenting with smoothing > 0 if the final accuracy is poor.
         num_gradients_to_accumulate (int):
             Number of passes that we accumulate gradients locally
             between each optimizer step. This can be changed during
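To make the default concrete, the formula from the docstring can be evaluated directly. A minimal sketch (plain Python; the helper name and the example values are chosen here for illustration, not taken from the commit):

    def default_smoothing(world_size, num_gradients_to_accumulate):
        # Default from the AdaScale docstring:
        # max(1 - (world_size * num_gradients_to_accumulate)/1000, 0)
        return max(1 - (world_size * num_gradients_to_accumulate) / 1000, 0)

    print(default_smoothing(8, 1))    # 0.992 (small scale)
    print(default_smoothing(128, 4))  # 0.488 (medium scale)
    print(default_smoothing(256, 4))  # 0     (scale of 1024: default drops to zero)

At a scale at or above 1000 the default collapses to 0, so the only way to keep any smoothing is to pass a value explicitly, which is exactly the code path this commit fixes.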
@@ -159,6 +162,11 @@ class AdaScale(Optimizer):
         self.set_num_gradients_to_accumulate(num_gradients_to_accumulate, update_smoothing=True)
+        # The previous function call sets smoothing to its default value.
+        # Override that here if smoothing was passed as an argument.
+        if smoothing is not None:
+            self._smoothing = smoothing
+
         if self._world_size * self._num_grads_to_accum <= 1:
             # gain will be NaN since we will be dividing by zero in paper's B.3 where (S-1) == 0.
             raise RuntimeError("AdaScale does not support a single worker without grad accumulation.")
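The fix itself is about ordering inside __init__: set_num_gradients_to_accumulate(..., update_smoothing=True) recomputes _smoothing from the default formula, so a caller-supplied value must be reapplied after that call. A stripped-down sketch of the two orderings (the class below is a hypothetical stand-in, not the real AdaScale, and the "buggy" branch is a plausible reconstruction of the pre-fix ordering, not the exact original code):

    class Sketch:
        """Hypothetical stand-in for AdaScale.__init__ to illustrate the ordering bug."""

        def __init__(self, smoothing=None, num_gradients_to_accumulate=1, world_size=8, buggy=False):
            self._world_size = world_size
            if buggy:
                # Plausible pre-fix order: store the custom value first...
                self._smoothing = smoothing
                # ...then the setter recomputes the default and clobbers it.
                self.set_num_gradients_to_accumulate(num_gradients_to_accumulate, update_smoothing=True)
            else:
                # Post-fix order (what this commit does): let the setter compute
                # the default, then override it if the caller passed a value.
                self.set_num_gradients_to_accumulate(num_gradients_to_accumulate, update_smoothing=True)
                if smoothing is not None:
                    self._smoothing = smoothing

        def set_num_gradients_to_accumulate(self, num, update_smoothing=True):
            self._num_grads_to_accum = num
            if update_smoothing:
                # Same default formula as the docstring.
                self._smoothing = max(1 - (self._world_size * num) / 1000, 0)

    print(Sketch(smoothing=0.12345, num_gradients_to_accumulate=3, buggy=True)._smoothing)   # 0.976 (custom value lost)
    print(Sketch(smoothing=0.12345, num_gradients_to_accumulate=3, buggy=False)._smoothing)  # 0.12345 (custom value kept)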
tests/optim/test_single_node_adascale.py
@@ -375,3 +375,10 @@ def test_unhook():
     del optim
     torch.cuda.empty_cache()
     assert not find_tensor_by_shape(target_shape), "tensor should have been released"
+
+
+def test_custom_smoothing_factor():
+    """Test custom smoothing since we had a bug around it."""
+    model = Linear(1, 1)
+    optim = AdaScale(SGD(model.parameters(), lr=0.1), smoothing=0.12345, num_gradients_to_accumulate=3)
+    assert optim._smoothing == 0.12345
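For completeness, a usage sketch of the fixed behavior in a single-process run (assuming fairscale is installed; the training loop is the standard gradient-accumulation pattern, not part of this commit):

    import torch
    from torch.nn import Linear
    from torch.optim import SGD

    from fairscale.optim import AdaScale

    model = Linear(1, 1)
    # With this commit, the custom smoothing factor survives construction;
    # before it, the constructor reset it to the default (0.997 for
    # world_size=1 and num_gradients_to_accumulate=3).
    optim = AdaScale(SGD(model.parameters(), lr=0.1), smoothing=0.12345, num_gradients_to_accumulate=3)
    assert optim._smoothing == 0.12345

    for step in range(6):
        loss = model(torch.rand(4, 1)).sum()
        loss.backward()
        # AdaScale counts backward passes: step and zero the gradients only
        # after each group of num_gradients_to_accumulate backwards.
        if (step + 1) % 3 == 0:
            optim.step()
            optim.zero_grad()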