Unverified commit f74afebb authored by Min Xu, committed by GitHub

[fix] more adascale gradient accumulation tests and smoothing factor fix (#235)

* better DDP AdaScale tests

* make sure the single-node test uses the same test cases and expected gains

* added a unit test that covers the smoothing factor

- tested by re-introducing the bug and seeing the test fail as expected.
parent 2eef71b9
@@ -72,7 +72,8 @@ class AdaScale(object):
             larger batch size (summed across all world_size) means a scale of
             10. If None, defaults to ``world_size``.
         smoothing (float):
-            Smoothing factor between batches. Default value: 0.9999
+            Smoothing factor for moving average. If None, it defaults to
+            max(1 - (world_size * num_gradients_to_accumulate)/1000, 0).
         num_gradients_to_accumulate (int):
             Number of passes that we accumulate gradients locally.
             Default to 1, which does not accumulate gradients.
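For intuition (an illustrative sketch, not part of the diff): the documented default simply shrinks the smoothing factor as the effective number of samples averaged per update grows, clamping at zero. The helper name below is hypothetical:

    # Illustrative only: mirrors the documented default formula.
    def default_smoothing(world_size: int, num_gradients_to_accumulate: int) -> float:
        return max(1 - (world_size * num_gradients_to_accumulate) / 1000, 0)

    print(default_smoothing(2, 1))     # 0.998
    print(default_smoothing(2, 2))     # 0.996
    print(default_smoothing(1024, 1))  # 0 (clamped for very large effective batch counts)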
@@ -83,7 +84,7 @@ class AdaScale(object):
         optimizer: torch.optim.Optimizer,
         world_size: Optional[int] = None,
         scale: Optional[float] = None,
-        smoothing: float = 0.999,
+        smoothing: float = None,
         num_gradients_to_accumulate: int = 1,
     ):
         self._optimizer = optimizer
@@ -91,7 +92,6 @@ class AdaScale(object):
         self._world_size: int = (
             world_size if world_size is not None else dist.get_world_size() if dist.is_initialized() else 1
         )
-        self._smoothing = smoothing
         self._num_backward_calls = 0
         self._num_grads_to_accum = num_gradients_to_accumulate
@@ -110,6 +110,12 @@ class AdaScale(object):
         self.set_scale(self._world_size * self._num_grads_to_accum if scale is None else scale)
+        # Set smoothing based on effective world_size rather than scale here, since world_size
+        # determines the number of samples being averaged over at every update
+        self._smoothing = (
+            max(1 - (self._world_size * self._num_grads_to_accum) / 1000, 0) if smoothing is None else smoothing
+        )
         # Register the gradient hooks. Note, don't assume every param will generate
         # a gradient (i.e. triggering the hook) in every backward pass.
         for idx, param_group in enumerate(self._optimizer.param_groups):
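A minimal sketch, assuming a debiased exponential moving average of the kind AdaScale's _update_avg maintains (the exact fairscale internals may differ), showing how the factor computed here is used and why it only influences results from the second update onward, which is what the new multi-iteration test cases below are meant to cover:

    import numpy as np

    def update_avg(state: dict, name: str, value: np.ndarray, factor: float) -> None:
        # Debiased exponential moving average: avg = biased / unbias.
        biased = factor * state.get(name + "_biased", np.zeros_like(value)) + (1 - factor) * value
        unbias = factor * state.get(name + "_unbias", 0.0) + (1 - factor)
        state[name + "_biased"], state[name + "_unbias"] = biased, unbias
        state[name] = biased / unbias

    state: dict = {}
    update_avg(state, "grad_var_avg", np.array([4.0]), factor=0.996)
    print(state["grad_var_avg"])  # [4.] -- the factor cancels out after a single update
    update_avg(state, "grad_var_avg", np.array([2.0]), factor=0.996)
    print(state["grad_var_avg"])  # ~[2.998] -- smoothing first matters on the second update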
@@ -295,7 +301,7 @@ class AdaScale(object):
         grad_sqr = total_grad_sqr - grad_var / S
         grad_var = np.maximum(grad_var, 1e-6)
         grad_sqr = np.maximum(grad_sqr, 0.0)
-        theta = self._smoothing ** S
+        theta = self._smoothing
         self._update_avg("grad_sqr_avg", grad_sqr, theta)
         self._update_avg("grad_var_avg", grad_var, theta)
@@ -32,33 +32,59 @@ def _dist_init(rank, world_size, tempfile_name, backend):
     torch.cuda.set_device(rank)
-def _test_basic_func(rank, world_size, tempfile_name):
+def _test_basic_func(rank, world_size, tempfile_name, test_case):
     _dist_init(rank, world_size, tempfile_name, backend="nccl")  # Covers nccl
     model = Linear(2, 2, bias=False)
     model.to("cuda")
     model = DDP(model, device_ids=[rank])
     optim = AdaScale(SGD(model.parameters(), lr=0.1))
-    # iter 1
-    in_data = Tensor([0.0, 0.0])
-    in_data[rank] = 1.0
-    in_data = in_data.cuda()
-    out = model(in_data)
-    out.sum().backward()
-    assert np.allclose(optim.gain(), 2.0), optim.gain()
-    optim.step()
-    optim.zero_grad()
+    if "input" in test_case:
+        # single iter
+        in_data = Tensor(test_case["input"][rank])
+        in_data = in_data.cuda()
+        out = model(in_data)
+        out.sum().backward()
+        assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()
+        optim.step()
+        optim.zero_grad()
+    else:
+        # multiple iters
+        for in_data in test_case["inputs"]:
+            in_data = Tensor(in_data[rank]).cuda()
+            out = model(in_data)
+            out.sum().backward()
+            optim.step()
+            optim.zero_grad()
+        assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()
     dist.destroy_process_group()
+# IMPORTANT: make sure these test_cases values are sync'ed with the non-DDP
+# test in test_single_node_adascale.py. This way, we make sure gradient accumulation
+# works exactly like that in DDP.
 @skip_if_single_gpu
-def test_basic():
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        # "input" value is a list of input tensors for rank 0 and rank 1.
+        {"input": [[1.0, 0], [0, 1.0]], "expected_gain": 2.0},
+        {"input": [[1.0, 1.0], [1.0, 1.0]], "expected_gain": 1.0000001249999846},
+        {"input": [[-1.0, 1.0], [1.0, -1.0]], "expected_gain": 2.0},
+        {"input": [[1.0, 4.0], [5.0, 0.5]], "expected_gain": 1.5022222222222221},
+        {"input": [[-0.2, 3.0], [5.0, 0.5]], "expected_gain": 1.9433267229211089},
+        # "inputs" to trigger multiple iteration tests, which make sure the
+        # smoothing factor calculation is also covered.
+        {"inputs": [[[-0.2, 3.3], [5.2, 0.7]], [[1.0, 4.0], [3.1, 0.1]]], "expected_gain": 1.744159431359284},
+    ],
+)
+def test_basic(test_case):
     """Test adascale with DDP without gradient accumulation"""
     world_size = 2
     temp_file_name = tempfile.mkstemp()[1]
-    mp.spawn(_test_basic_func, args=(world_size, temp_file_name), nprocs=world_size, join=True)
+    mp.spawn(_test_basic_func, args=(world_size, temp_file_name, test_case), nprocs=world_size, join=True)
 def _test_grad_accum_func(rank, world_size, tempfile_name):
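The IMPORTANT comment above is the crux of these sync'ed cases: each DDP test case should yield the same gain as a single-process run that feeds the two per-rank inputs as accumulated micro-batches, which is exactly what the non-DDP test in the next hunk does. A minimal sketch of that equivalence for the first case, assuming fairscale is installed and AdaScale is importable from fairscale.optim:

    import numpy as np
    from torch import Tensor
    from torch.nn import Linear
    from torch.optim import SGD
    from fairscale.optim import AdaScale  # import path assumed from fairscale's public API

    # The two per-rank inputs of the first DDP case, replayed as two accumulated micro-batches.
    inputs = [[1.0, 0.0], [0.0, 1.0]]
    model = Linear(2, 2, bias=False)
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)
    for in_data in inputs:
        model(Tensor(in_data)).sum().backward()
    assert np.allclose(optim.gain(), 2.0), optim.gain()  # same expected_gain as the DDP test
    optim.step()
    optim.zero_grad()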
@@ -57,39 +57,61 @@ def test_loss_accum_cpu():
     assert np.allclose(optim.gain(), 1.0), optim.gain()
-def test_grad_accum_cpu(cpu=True):
-    """Test the basic functionality on CPU with gradient accumulation without DDP"""
+# IMPORTANT: make sure these test_cases values are sync'ed with the DDP
+# test in test_ddp_adascale.py. This way, we make sure gradient accumulation
+# works exactly like that in DDP.
+@pytest.mark.parametrize("cpu", [True, False])
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        # "input" value is a list of input tensors for micro-batch 0 and micro-batch 1.
+        {"input": [[1.0, 0], [0, 1.0]], "expected_gain": 2.0},
+        {"input": [[1.0, 1.0], [1.0, 1.0]], "expected_gain": 1.0000001249999846},
+        {"input": [[-1.0, 1.0], [1.0, -1.0]], "expected_gain": 2.0},
+        {"input": [[1.0, 4.0], [5.0, 0.5]], "expected_gain": 1.5022222222222221},
+        {"input": [[-0.2, 3.0], [5.0, 0.5]], "expected_gain": 1.9433267229211089},
+        # "inputs" to trigger multiple iteration tests, which make sure the
+        # smoothing factor calculation is also covered.
+        {"inputs": [[[-0.2, 3.3], [5.2, 0.7]], [[1.0, 4.0], [3.1, 0.1]]], "expected_gain": 1.744159431359284},
+    ],
+)
+def test_grad_accum(test_case, cpu):
+    """Test the basic functionality on CPU/GPU with gradient accumulation without DDP"""
     model = Linear(2, 2, bias=False)
     if not cpu:
         if torch.cuda.device_count() < 1:
             pytest.skip("1 GPU is required")
         model = model.cuda()
     optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)
-    for expected_gain in [2.0, 2.0]:  # test 2 iterations catch more corner cases.
+    expected_gain = test_case["expected_gain"]
+    if "input" in test_case:
+        data = [test_case["input"]] * 2
+        gains = [expected_gain] * 2
+    else:
+        data = test_case["inputs"]
+        gains = [None, expected_gain]
+    for in_data, exp_gain in zip(data, gains):  # test 2 iterations catch more corner cases.
         # grad pass 1
-        in_data = Tensor([0.0, 1.0])
+        in_data_0 = Tensor(in_data[0])
         if not cpu:
-            in_data = in_data.cuda()
-        out = model(in_data)
+            in_data_0 = in_data_0.cuda()
+        out = model(in_data_0)
         out.sum().backward()
         # grad pass 2
-        in_data = Tensor([1.0, 0.0])
+        in_data_1 = Tensor(in_data[1])
         if not cpu:
-            in_data = in_data.cuda()
-        out = model(in_data)
+            in_data_1 = in_data_1.cuda()
+        out = model(in_data_1)
         out.sum().backward()
+        if exp_gain is not None:
+            assert np.allclose(optim.gain(), exp_gain), optim.gain()
         # stepping it. Note that if we did more than 2 passes as promised by the
         # num_gradients_to_accumulate argument above, AdaScale is not able to
         # detect that mistake for now. The result will just be wrong in that case.
-        assert np.allclose(optim.gain(), expected_gain), optim.gain()
         optim.step()
         optim.zero_grad()
-@skip_if_no_gpu
-def test_grad_accum_gpu():
-    """Test the basic functionality on GPU with gradient accumulation without DDP"""
-    test_grad_accum_cpu(cpu=False)
 @skip_if_no_gpu
 def test_state_checkpointing():
     """ Test state checkpointing on GPU since that's the common case.
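Note on the multi-iteration "inputs" cases: the gain is asserted only after the final step (gains = [None, expected_gain]), presumably because the moving averages are still filling up on the first step and the smoothing factor first comes into play on the second update (see the moving-average sketch earlier). These parametrized tests run under plain pytest, e.g. python -m pytest -k test_grad_accum; the exact test file paths depend on the repository layout.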