Unverified commit 4396ef4a authored by Min Xu, committed by GitHub

[bug]: fix a bug on custom smoothing factor (#401)

parent 535eb011
@@ -122,6 +122,9 @@ class AdaScale(Optimizer):
        smoothing (float):
            Smoothing factor for moving average.
            If None, it defaults to ``max(1 - (world_size * num_gradients_to_accumulate)/1000, 0)``.
            Note that for very high scale training, a higher smoothing value might be needed,
            especially at the beginning of training. Therefore, if your scale is close to or larger
            than 1000, try experimenting with a smoothing value > 0 if the final accuracy is poor.
        num_gradients_to_accumulate (int):
            Number of passes that we accumulate gradients locally
            between each optimizer step. This can be changed during
@@ -159,6 +162,11 @@ class AdaScale(Optimizer):
        self.set_num_gradients_to_accumulate(num_gradients_to_accumulate, update_smoothing=True)
        # The previous function call sets smoothing to its default value.
        # Override that here if smoothing was passed as an argument.
        if smoothing is not None:
            self._smoothing = smoothing
        if self._world_size * self._num_grads_to_accum <= 1:
            # gain will be NaN since we will be dividing by zero in paper's B.3 where (S-1) == 0.
            raise RuntimeError("AdaScale does not support a single worker without grad accumulation.")
...
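For context, the docstring note above follows from the default smoothing formula. A quick back-of-the-envelope sketch (the helper name default_smoothing is only for illustration and is not part of AdaScale's API) shows that the default bottoms out at 0 once the effective scale reaches 1000:

def default_smoothing(world_size: int, num_gradients_to_accumulate: int) -> float:
    # Default from the docstring: decays linearly with the effective scale
    # (world_size * num_gradients_to_accumulate) and bottoms out at 0.
    return max(1 - (world_size * num_gradients_to_accumulate) / 1000, 0)

print(default_smoothing(8, 4))    # 0.968 -> heavy smoothing at small scale
print(default_smoothing(256, 4))  # 0.0   -> no smoothing once scale >= 1000

This is why the docstring suggests trying an explicit smoothing value > 0 when the scale is near or above 1000.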
@@ -375,3 +375,10 @@ def test_unhook():
    del optim
    torch.cuda.empty_cache()
    assert not find_tensor_by_shape(target_shape), "tensor should have been released"

def test_custom_smoothing_factor():
    """Test custom smoothing since we had a bug around it."""
    model = Linear(1, 1)
    optim = AdaScale(SGD(model.parameters(), lr=0.1), smoothing=0.12345, num_gradients_to_accumulate=3)
    assert optim._smoothing == 0.12345
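The test above exercises the constructor path this commit changes. The ordering issue it guards against looks roughly like the following simplified sketch; this is not the real fairscale constructor, and only smoothing, set_num_gradients_to_accumulate, and _smoothing mirror names from the diff above:

class AdaScaleSketch:
    """Minimal sketch of the constructor ordering fixed in this commit."""

    def __init__(self, smoothing=None, num_gradients_to_accumulate=1, world_size=2):
        self._world_size = world_size
        # This call unconditionally resets _smoothing to the scale-based default ...
        self.set_num_gradients_to_accumulate(num_gradients_to_accumulate)
        # ... so a user-supplied value has to be re-applied afterwards (the fix).
        if smoothing is not None:
            self._smoothing = smoothing

    def set_num_gradients_to_accumulate(self, num_grads: int) -> None:
        self._num_grads_to_accum = num_grads
        self._smoothing = max(1 - (self._world_size * num_grads) / 1000, 0)


assert AdaScaleSketch(smoothing=0.12345, num_gradients_to_accumulate=3)._smoothing == 0.12345

Without the re-application step, the user-supplied smoothing would be silently overwritten by the default, which is the behavior the new test catches.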