Unverified Commit f5e727cc authored by Changyu Gao, committed by GitHub

Fix gradient accumulation (#1086)

* Fix gradient accumulation

- Add ``is_scaled_loss`` flag to support both scaled and unscaled loss
- Add a method ``scale_grad_by_num_grads_to_accum`` to handle gradient accumulation with unscaled loss more explicitly (see the usage sketch below)
- Fix ``test_grad_accum`` and ``test_set_num_gradients_to_accumulate``
- Add tests for gradient values
parent b0b92e70
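
The intended training-loop contract is sketched below. This is an illustration based on the updated tests, not code from the patch; it assumes `AdaScale` is importable as `from fairscale.optim import AdaScale` and shows the two supported modes: scale each micro-batch loss by the accumulation count (`is_scaled_loss=True`, the default), or backward the raw loss and call `scale_grad_by_num_grads_to_accum()` once before stepping (`is_scaled_loss=False`).

    # Minimal usage sketch (illustration only, mirrors the updated tests).
    import torch
    from torch.nn import Linear
    from torch.optim import SGD

    from fairscale.optim import AdaScale


    def run_accumulation(is_scaled_loss: bool, accum: int = 2) -> None:
        model = Linear(2, 2)
        optim = AdaScale(
            SGD(model.parameters(), lr=0.1),
            num_gradients_to_accumulate=accum,
            is_scaled_loss=is_scaled_loss,
        )
        for _ in range(accum):
            loss = model(torch.rand(2)).sum()
            if is_scaled_loss:
                loss = loss / accum  # scale each micro-batch loss up front
            loss.backward()
        if not is_scaled_loss:
            # with an unscaled loss, scale the accumulated gradients down once before stepping
            optim.scale_grad_by_num_grads_to_accum()
        optim.step()
        optim.zero_grad()


    run_accumulation(is_scaled_loss=True)
    run_accumulation(is_scaled_loss=False)
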
@@ -8,12 +8,42 @@
 adascale_test_data = [
     # "input" value is a list of input tensors for micro-batch/rank 0 and micro-batch/rank 1.
-    {"input": [[1.0, 0], [0, 1.0]], "expected_gain": 4.0 / 3},
-    {"input": [[1.0, 1.0], [1.0, 1.0]], "expected_gain": 1.0000001249999846},
-    {"input": [[-1.0, 1.0], [1.0, -1.0]], "expected_gain": 2.0},
-    {"input": [[1.0, 4.0], [5.0, 0.5]], "expected_gain": 1.4688796680497926},
-    {"input": [[-0.2, 3.0], [5.0, 0.5]], "expected_gain": 1.8472893901708},
+    {
+        "input": [[1.0, 0], [0, 1.0]],
+        "expected_gain": 4.0 / 3,
+        "expected_grad": [[0.5, 0.5], [0.5, 0.5]],
+        "expected_bias_grad": [1.0, 1.0],
+    },
+    {
+        "input": [[1.0, 1.0], [1.0, 1.0]],
+        "expected_gain": 1.0000001249999846,
+        "expected_grad": [[1.0, 1.0], [1.0, 1.0]],
+        "expected_bias_grad": [1.0, 1.0],
+    },
+    {
+        "input": [[-1.0, 1.0], [1.0, -1.0]],
+        "expected_gain": 2.0,
+        "expected_grad": [[0.0, 0.0], [0.0, 0.0]],
+        "expected_bias_grad": [1.0, 1.0],
+    },
+    {
+        "input": [[1.0, 4.0], [5.0, 0.5]],
+        "expected_gain": 1.4688796680497926,
+        "expected_grad": [[3.0, 2.25], [3.0, 2.25]],
+        "expected_bias_grad": [1.0, 1.0],
+    },
+    {
+        "input": [[-0.2, 3.0], [5.0, 0.5]],
+        "expected_gain": 1.8472893901708,
+        "expected_grad": [[2.4000000953674316, 1.75], [2.4000000953674316, 1.75]],
+        "expected_bias_grad": [1.0, 1.0],
+    },
     # "inputs" to trigger multiple iteration tests, which make sure the
     # smoothing factor calculation is also covered.
-    {"inputs": [[[-0.2, 3.3], [5.2, 0.7]], [[1.0, 4.0], [3.1, 0.1]]], "expected_gain": 1.6720968158031417},
+    {
+        "inputs": [[[-0.2, 3.3], [5.2, 0.7]], [[1.0, 4.0], [3.1, 0.1]]],
+        "expected_gain": 1.6720968158031417,
+        "expected_grad": [[2.049999952316284, 2.049999952316284], [2.049999952316284, 2.049999952316284]],
+        "expected_bias_grad": [1.0, 1.0],
+    },
 ]
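
As a side note, the new `expected_grad` / `expected_bias_grad` values follow directly from the test model: for a `Linear(2, 2)` layer with `loss = out.sum()`, every row of the weight gradient equals the input and the bias gradient is all ones, so averaging the two micro-batches of the first case gives rows of `[0.5, 0.5]`. The snippet below is an illustration, not part of the patch.

    # Illustration only: reproduce the first case's expected_grad with
    # scaled-loss accumulation over the two micro-batches [1, 0] and [0, 1].
    import torch
    from torch.nn import Linear

    model = Linear(2, 2, bias=True)
    for x in ([1.0, 0.0], [0.0, 1.0]):
        loss = model(torch.tensor(x)).sum() / 2  # divide by the number of micro-batches
        loss.backward()
    print(model.weight.grad)  # tensor([[0.5, 0.5], [0.5, 0.5]])
    print(model.bias.grad)    # tensor([1., 1.])
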
@@ -130,7 +130,12 @@ class AdaScale(Optimizer):
             between each optimizer step. This can be changed during
             training as long as the train loop changes gradient accumulation
             accordingly.
+            The loss in each pass can be either scaled or unscaled. See `is_scaled_loss` below.
             Default to 1, which does not accumulate gradients.
+        is_scaled_loss (bool):
+            If True, assume that the loss is scaled by `num_gradients_to_accumulate`.
+            If False, the loss is not scaled.
+            Default: True.
         debias_ewma (bool):
             (experimental) Use debias exponential moving average
             for smoothing and mu and sigma variables. False will
@@ -145,6 +150,7 @@ class AdaScale(Optimizer):
         scale: Optional[float] = None,
         smoothing: float = None,
         num_gradients_to_accumulate: int = 1,
+        is_scaled_loss: bool = True,
         debias_ewma: bool = True,
     ):
         # Init hook_handles list, otherwise, a partial init'ed object may fail in ``__del__``.
@@ -160,6 +166,7 @@ class AdaScale(Optimizer):
         self._last_final_backward_call = 0
         self._num_grads_to_accum = num_gradients_to_accumulate
         self._debias_ewma = debias_ewma
+        self._is_scaled_loss = is_scaled_loss
         # Proxy the param_groups so that `torch.optim.lr_scheduler` can work.
         self.param_groups = self._optimizer.param_groups
@@ -453,17 +460,22 @@ class AdaScale(Optimizer):
         total_grad_sqr = np.array(
             [sum(param.grad.pow(2).sum().item() for param in group["params"]) for group in self._optimizer.param_groups]
         )
-        # Divide by (_num_grads_to_accum ** 2) to account for gradient
-        # accumulation.
-        if self._num_grads_to_accum > 1:
-            # np array doesn't support /=.
-            total_grad_sqr = total_grad_sqr / (self._num_grads_to_accum**2)

         # Wait for all_reduce to be done and move it to cpu & np.
         if work:
             work.wait()
         local_grad_sqr = self._local_grad_sqr.cpu().numpy()

+        if self._num_grads_to_accum > 1:
+            # Handle scaling for gradient accumulation.
+            if self._is_scaled_loss:
+                # If the loss is scaled down, we need to scale the local gradients back by a factor of
+                # _num_grads_to_accum squared; total_grad_sqr is already scaled by _num_grads_to_accum squared.
+                local_grad_sqr *= self._num_grads_to_accum**2
+            else:
+                # If the loss is not scaled, the local gradients are correct, but we need to scale
+                # total_grad_sqr down to account for gradient accumulation.
+                total_grad_sqr /= self._num_grads_to_accum**2
+
         # See appendix B.3 of the paper.
         # Modified to handle cases where scale != world_size
         #
@@ -509,7 +521,7 @@ class AdaScale(Optimizer):
         original_lr = []
         for idx, param_group in enumerate(self._optimizer.param_groups):
             original_lr.append(param_group["lr"])
-            param_group["lr"] = self.gain(pg_idx=idx) * param_group["lr"]
+            param_group["lr"] *= self.gain(pg_idx=idx)

         # Step it.
         res = self._optimizer.step(*args, **kwargs)
@@ -606,6 +618,18 @@ class AdaScale(Optimizer):
         # not needed, so the smoothing factor is 0.
         self._smoothing = max(1 - self._world_size * self._num_grads_to_accum / 1000, 0)

+    def scale_grad_by_num_grads_to_accum(self) -> None:
+        """Scale the gradient down by the number of gradients to accumulate.
+
+        This should be called after the gradient accumulation is done and the unscaled loss is used.
+        """
+        assert self._local_grad_sqr is None, "Only call this after backward"
+        assert self._num_grads_to_accum > 1, "Must be accumulating gradients"
+        assert not self._is_scaled_loss, "Must use unscaled loss"
+        for group in self._optimizer.param_groups:
+            for param in group["params"]:
+                param.grad.div_(self._num_grads_to_accum)
+
     def __getattr__(self, name: str) -> Any:
         """Forward missing attributes to wrapped optimizer."""
         try:
...
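
The rescaling above can be sanity-checked with plain numbers. The snippet below is an illustration, not part of the patch: with `N = num_gradients_to_accumulate`, a scaled loss makes each backward contribute `g_i / N`, so the locally accumulated sum of squared micro-batch gradient norms is `N**2` too small; an unscaled loss leaves `.grad` holding `sum_i g_i`, so its squared norm is `N**2` too large relative to the averaged gradient.

    # Numeric illustration only of the two branches above.
    import numpy as np

    N = 4  # num_gradients_to_accumulate
    g = [np.array([1.0, 2.0]) * (i + 1) for i in range(N)]  # per-micro-batch gradients

    # Scaled loss: each backward sees g_i / N, so the accumulated per-micro-batch
    # squared norms are N**2 too small and get multiplied back up.
    local_scaled = sum(np.sum((gi / N) ** 2) for gi in g)
    assert np.isclose(local_scaled * N**2, sum(np.sum(gi**2) for gi in g))

    # Unscaled loss: .grad accumulates sum_i g_i, so its squared norm is N**2 too
    # large relative to the averaged gradient and gets divided down.
    total_unscaled = np.sum(sum(g) ** 2)
    assert np.isclose(total_unscaled / N**2, np.sum((sum(g) / N) ** 2))
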
@@ -75,18 +75,25 @@ def _test_basic_func(rank, ddp_cls, world_size, tempfile_name, test_case):
         out.sum().backward()
         if ddp_cls is DDP:
             assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()
+            w, b = model.parameters()
+            assert np.allclose(w.grad.cpu(), test_case["expected_grad"]), w.grad
+            assert np.allclose(b.grad.cpu(), test_case["expected_bias_grad"]), b.grad
         optim.step()
         optim.zero_grad()
     else:
         # multiple iters
-        for in_data in test_case["inputs"]:
+        n = len(test_case["inputs"])
+        for i, in_data in enumerate(test_case["inputs"]):
             in_data = Tensor(in_data[rank]).cuda()
             out = model(in_data)
             out.sum().backward()
+            if i == n - 1 and ddp_cls is DDP:
+                assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()
+                w, b = model.parameters()
+                assert np.allclose(w.grad.cpu(), test_case["expected_grad"]), w.grad
+                assert np.allclose(b.grad.cpu(), test_case["expected_bias_grad"]), b.grad
             optim.step()
             optim.zero_grad()
-        if ddp_cls is DDP:
-            assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()

     dist.destroy_process_group()
...
@@ -61,7 +61,8 @@ def test_loss_accum_cpu():
 @pytest.mark.parametrize("cpu", [True, False])
 @pytest.mark.parametrize("test_case", adascale_test_data)
-def test_grad_accum(test_case, cpu):
+@pytest.mark.parametrize("is_scaled_loss", [True, False])
+def test_grad_accum(test_case, cpu, is_scaled_loss):
     """Test the basic functionality on CPU/GPU with gradient accumulation without DDP"""
     make_cudnn_deterministic()
     model = Linear(2, 2, bias=True)
@@ -69,7 +70,7 @@ def test_grad_accum(test_case, cpu):
         if torch.cuda.device_count() < 1:
             pytest.skip("1 GPU is required")
         model = model.cuda()
-    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)
+    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2, is_scaled_loss=is_scaled_loss)
     expected_gain = test_case["expected_gain"]
     if "input" in test_case:
         data = [test_case["input"]] * 2
@@ -82,16 +83,27 @@ def test_grad_accum(test_case, cpu):
         in_data_0 = Tensor(in_data[0])
         if not cpu:
             in_data_0 = in_data_0.cuda()
-        out = model(in_data_0)
-        out.sum().backward()
+        loss = model(in_data_0).sum()
+        if is_scaled_loss:
+            loss = loss / 2
+        loss.backward()
         # grad pass 2
         in_data_1 = Tensor(in_data[1])
         if not cpu:
             in_data_1 = in_data_1.cuda()
-        out = model(in_data_1)
-        out.sum().backward()
+        loss = model(in_data_1).sum()
+        if is_scaled_loss:
+            loss = loss / 2
+        loss.backward()
+        if not is_scaled_loss:
+            optim.scale_grad_by_num_grads_to_accum()
         if exp_gain is not None:
             assert np.allclose(optim.gain(), exp_gain), optim.gain()
+        w, b = model.parameters()
+        assert np.allclose(w.grad.cpu(), test_case["expected_grad"]), w.grad
+        assert np.allclose(b.grad.cpu(), test_case["expected_bias_grad"]), b.grad
         # stepping it. Note that if we did more than 2 passes as promised by the
         # num_gradients_to_accumulate argument above, AdaScale is not be able to
         # detect that mistake for now. The result will just be wrong in that case.
@@ -110,14 +122,14 @@ def test_state_checkpointing():
     a unit test for checkpointing with DDP.
     """
     # Constants.
-    accum_steps = 3
+    num_grads_to_accum = 3
     in_dim = 5

     # Setup.
     def make_model_and_optim():
         model = Linear(in_dim, 2, bias=False)
         model = model.cuda()
-        optim = AdaScale(SGD(model.parameters(), lr=0.1, momentum=0.9), num_gradients_to_accumulate=accum_steps)
+        optim = AdaScale(SGD(model.parameters(), lr=0.1, momentum=0.9), num_gradients_to_accumulate=num_grads_to_accum)
         return model, optim

     model, optim = make_model_and_optim()
@@ -127,7 +139,7 @@ def test_state_checkpointing():
         data = []
         replay_data_idx = 0
         for _ in range(6):  # run some steps
-            for i in range(accum_steps):
+            for i in range(num_grads_to_accum):
                 if replay_data is None:
                     in_data = torch.rand(in_dim).cuda()
                     data.append(in_data)
@@ -136,7 +148,7 @@ def test_state_checkpointing():
                 replay_data_idx += 1
                 out = model(in_data)
                 out.sum().backward()
-                if i == accum_steps - 1:
+                if i == num_grads_to_accum - 1:
                     optim.step()
                     optim.zero_grad()
         return out, data
@@ -172,13 +184,14 @@ def test_state_checkpointing():
 def test_lr_scheduler():
     """Test AdaScale working with torch.optim.lr_scheduler."""
+    num_grads_to_accum = 3
     model = Linear(2, 2, bias=False)
-    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=3)
+    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=num_grads_to_accum)
     # We use 1, not 0.1 here since scheduler.step() is called here first.
     scheduler = LambdaLR(optim, lr_lambda=lambda epoch: 1 / 10**epoch)
     for epoch in range(3):
         for data_idx in range(10):
-            for accumulation in range(3):
+            for accumulation in range(num_grads_to_accum):
                 in_data = torch.rand(2)
                 loss = model(in_data).sum()
                 loss.backward()
@@ -194,8 +207,10 @@ def test_lr_scheduler():
 @skip_if_no_cuda
 @pytest.mark.parametrize("debias_ewma", [True, False])
-def test_add_param_group(debias_ewma):
-    """Test AdaScale supports add_param_group() API."""
+@pytest.mark.parametrize("is_scaled_loss", [True, False])
+def test_add_param_group(debias_ewma, is_scaled_loss):
+    """Test AdaScale supports add_param_group() API for both scaled and unscaled loss."""
+    num_grads_to_accum = 2
     model1 = Linear(2, 2, bias=True)
     with torch.no_grad():
         # make weights and bias deterministic, which is needed for
@@ -203,7 +218,12 @@ def test_add_param_group(debias_ewma):
         # parameters from other layers.
         model1.weight.copy_(Tensor([1.0, 2.0, 3.0, 4.0]).reshape(2, 2))
         model1.bias.fill_(0.1)
-    optim = AdaScale(SGD(model1.parameters(), lr=0.1), num_gradients_to_accumulate=2, debias_ewma=debias_ewma)
+    optim = AdaScale(
+        SGD(model1.parameters(), lr=0.1),
+        num_gradients_to_accumulate=2,
+        is_scaled_loss=is_scaled_loss,
+        debias_ewma=debias_ewma,
+    )
     assert len(optim._hook_handles) == 2, len(optim._hook_handles)

     model2 = Linear(2, 3, bias=True)
@@ -217,12 +237,19 @@ def test_add_param_group(debias_ewma):
     # make sure we can run the model.
     model = Sequential(model1, model2).cuda()
     in_data_0 = Tensor([1.0, 2.0]).cuda()
-    out = model(in_data_0)
-    out.sum().backward()
+    loss = model(in_data_0).sum()
+    if is_scaled_loss:
+        loss = loss / num_grads_to_accum
+    loss.backward()
     in_data_1 = Tensor([3.0, 4.0]).cuda()
-    out = model(in_data_1)
-    out.sum().backward()
+    loss = model(in_data_1).sum()
+    if is_scaled_loss:
+        loss = loss / num_grads_to_accum
+    loss.backward()
+    if not is_scaled_loss:
+        optim.scale_grad_by_num_grads_to_accum()

     # make sure the gains are right and we can step.
     # since this is the first step, debias_ewma doesn't affect the value.
@@ -244,19 +271,26 @@ def test_add_param_group(debias_ewma):
     # make sure we can run the model.
     model = Sequential(model1, model2, model3).cuda()
     in_data_0 = Tensor([1.0, 2.0]).cuda()
-    out = model(in_data_0)
-    out.sum().backward()
+    loss = model(in_data_0).sum()
+    if is_scaled_loss:
+        loss = loss / num_grads_to_accum
+    loss.backward()
    in_data_1 = Tensor([3.0, 4.0]).cuda()
-    out = model(in_data_1)
-    out.sum().backward()
+    loss = model(in_data_1).sum()
+    if is_scaled_loss:
+        loss = loss / num_grads_to_accum
+    loss.backward()
+    if not is_scaled_loss:
+        optim.scale_grad_by_num_grads_to_accum()

     # make sure gains are right and we can step.
     # the last PG's gain is not affected by debias_ewma since it is the first step for that PG.
-    assert np.allclose(optim.gain(), 1.1191193589460822 if debias_ewma else 1.1192783954732368), optim.gain()
-    assert np.allclose(optim.gain(0), 1.1428571880897151 if debias_ewma else 1.142857188085096), optim.gain(0)
-    assert np.allclose(optim.gain(1), 1.1167103578364508 if debias_ewma else 1.1167104954034948), optim.gain(1)
-    assert np.allclose(optim.gain(2), 1.117381091722702), optim.gain(2)
+    assert np.allclose(optim.gain(), 1.1382937715383077 if debias_ewma else 1.1391959826562015), optim.gain()
+    assert np.allclose(optim.gain(0), 1.142857206008338 if debias_ewma else 1.142857206006931), optim.gain(0)
+    assert np.allclose(optim.gain(1), 1.1116875516387468 if debias_ewma else 1.1116906378271827), optim.gain(1)
+    assert np.allclose(optim.gain(2), 1.0749164095196344), optim.gain(2)
     optim.step()
     optim.zero_grad()
@@ -264,31 +298,45 @@ def test_add_param_group(debias_ewma):
 @pytest.mark.parametrize(
     "test_case",
     [
-        {"new_accum": 3, "exp_gain": 1.2573902104603087},
-        {"new_accum": 6, "exp_gain": 1.0903738977361481},
-        {"new_accum": 9, "exp_gain": 1.0432658660558123},
+        {"num_grads_to_accum": 3, "exp_gain": 2.141385737279438},
+        {"num_grads_to_accum": 6, "exp_gain": 2.9927880097754036},
+        {"num_grads_to_accum": 9, "exp_gain": 3.4461759591877312},
     ],
 )
-def test_set_num_gradients_to_accumulate(test_case):
+@pytest.mark.parametrize("is_scaled_loss", [True, False])
+def test_set_num_gradients_to_accumulate(test_case, is_scaled_loss):
     """Test set_num_gradients_to_accumulate experimental feature."""
-    new_accum = test_case["new_accum"]
+    num_grads_to_accum = test_case["num_grads_to_accum"]
     exp_gain = test_case["exp_gain"]

     model = Linear(2, 2, bias=False)
-    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)
+    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2, is_scaled_loss=is_scaled_loss)

-    out = model(Tensor([0.0, 1.0]))
-    out.sum().backward()
-    out = model(Tensor([1.0, 0.0]))
-    out.sum().backward()
+    loss = model(Tensor([0.0, 1.0])).sum()
+    if is_scaled_loss:
+        loss = loss / 2
+    loss.backward()
+    loss = model(Tensor([1.0, 0.0])).sum()
+    if is_scaled_loss:
+        loss = loss / 2
+    loss.backward()
+    if not is_scaled_loss:
+        optim.scale_grad_by_num_grads_to_accum()
     assert np.allclose(optim.gain(), 2.0)
     optim.step()
     optim.zero_grad()

-    optim.set_scale(float(new_accum))
-    optim.set_num_gradients_to_accumulate(new_accum)
-    for _ in range(new_accum):
-        out = model(Tensor([0.0, 1.0]))
-        out.sum().backward()
+    optim.set_scale(float(num_grads_to_accum))
+    optim.set_num_gradients_to_accumulate(num_grads_to_accum)
+    for _ in range(num_grads_to_accum):
+        loss = model(Tensor([0.0, 1.0])).sum()
+        if is_scaled_loss:
+            loss = loss / num_grads_to_accum
+        loss.backward()
+    if not is_scaled_loss:
+        optim.scale_grad_by_num_grads_to_accum()
     assert np.allclose(optim.gain(), exp_gain), optim.gain()
     optim.step()
...