Unverified commit 3932a1f6, authored by Min Xu, committed by GitHub

[feat] sync adascale from internal repo, support add_param_group (#266)

* [feat] sync adascale from internal repo

- tbd

testing: tbd

* Update the argument documentation of __init__

* update documentation around set_num_gradients_to_accumulate

* added checks that the APIs are called from the proper places

* renamed internal APIs to mark them as internal

* updated changelog

* added support for add_param_group and its unit test (see the usage sketch below)

* added unit test for set_num_gradients_to_accumulate

* added debias_ewma unit test

* fixed test_set_num_gradients_to_accumulate (need zero_grad() call)

* added missing zero_grad() to test_lr_scheduler

* fixed test_add_param_group with respect to optim.zero_grad()

* added test_gradient_value

* added test_scale_not_equal_default for scale != world_size * grad_accum

* added test_unhook()

* removed print statements

* fixed a typo

* addressed Ben's comment
parent 84a3bdbe
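As context for the add_param_group support mentioned above, here is a minimal single-process sketch of the usage pattern that the new test_add_param_group unit test (in the test diff further down) exercises. The import path fairscale.optim.AdaScale is an assumption here, since the test file's fairscale imports fall outside the hunks shown, and the printed gain values are illustrative rather than asserted.

from torch import Tensor
from torch.nn import Linear, Sequential
from torch.optim import SGD

from fairscale.optim import AdaScale  # assumed import path

# Wrap the base optimizer; AdaScale hooks the initial parameters.
model1 = Linear(2, 2, bias=True)
optim = AdaScale(SGD(model1.parameters(), lr=0.1), num_gradients_to_accumulate=2)

# Parameters added later via add_param_group() are hooked as well.
model2 = Linear(2, 3, bias=True)
optim.add_param_group({"params": model2.parameters()})

model = Sequential(model1, model2)
for in_data in (Tensor([1.0, 2.0]), Tensor([3.0, 4.0])):  # one accumulation window
    model(in_data).sum().backward()

print(optim.gain())                   # overall gain
print(optim.gain(0), optim.gain(1))   # per-param-group gains, as asserted in the test
optim.step()
optim.zero_grad()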
@@ -7,8 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [next rel] - TBD
 ### Added
-- AdaScale: Added gradient accumulation feature (#202)
-- AdaScale: Added support of torch.lr_scheduler (#229)
+- AdaScale:
+  . Added gradient accumulation feature (#202)
+  . Added support of torch.lr_scheduler (#229)
+  . Added support for add_param_groups (#266)
+  . Added support for scale != world_size (#266)
 ### Fixed
 - AdaScale: smoothing factor value fixed when using gradient accumulation (#235)
This diff is collapsed.
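The changelog entry "Added support for scale != world_size (#266)" refers to passing an explicit scale argument when it should not default to world_size * num_gradients_to_accumulate. Below is a minimal single-process sketch, again assuming the fairscale.optim.AdaScale import path, of the situation exercised by test_scale_not_equal_default in the diff that follows: the baseline run is declared to have used 2-step gradient accumulation, so the gain is reported relative to that baseline.

import torch
from torch.nn import Linear
from torch.optim import SGD

from fairscale.optim import AdaScale  # assumed import path

model = Linear(4, 2, bias=False)
# The default scale would be world_size (1) * num_gradients_to_accumulate (4) = 4.
# Passing scale=4/2 declares that the baseline already accumulated 2 gradients per step.
optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=4, scale=4.0 / 2)

for i in range(4):  # four orthogonal micro-batches, as in test_scale_not_equal_default
    model(torch.eye(4)[i]).sum().backward()

print(optim.gain())  # ~2.0 here; the corresponding test case asserts exp_gain == 2.0
optim.step()
optim.zero_grad()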
@@ -9,13 +9,14 @@
 """ Test AdaScale with a single node (1 CPU or 1 GPU). """

+import gc
 import tempfile

 import numpy as np
 import pytest
 import torch
 from torch import Tensor
-from torch.nn import Linear
+from torch.nn import Linear, Sequential
 from torch.optim import SGD
 from torch.optim.lr_scheduler import LambdaLR
@@ -41,7 +42,7 @@ def test_loss_accum_cpu():
     """
     model = Linear(2, 2, bias=False)
     # num_gradients_to_accumulate value doesn't matter in this negative test.
-    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=123)
+    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=3)
     # data 1
     in_data = Tensor([0.0, 1.0])
     loss = model(in_data).sum()
@@ -53,9 +54,9 @@ def test_loss_accum_cpu():
     loss += model(in_data).sum()
     # backward, but gradient is only produced once by the autograd engine.
     loss.backward()
-    # therefore, the gain will always be 1, which renders adascale as noop.
-    optim.step()
+    # The gain will always be 1, which renders adascale as noop.
     assert np.allclose(optim.gain(), 1.0), optim.gain()
+    # We don't call optim.step(), since it will detect that backward is not yet done.


 # IMPORTANT: make sure these test_cases values are sync'ed with the DDP
@@ -138,7 +139,6 @@ def test_state_checkpointing():
     # Run a bit.
     def run_a_bit(replay_data=None):
-        print("running")
         data = []
         replay_data_idx = 0
         for _ in range(6):  # run some steps
@@ -151,8 +151,6 @@ def test_state_checkpointing():
                     replay_data_idx += 1
                 out = model(in_data)
                 out.sum().backward()
-                # print(out.sum().item())
-                print(model.weight.grad)
                 if i == accum_steps - 1:
                     optim.step()
                     optim.zero_grad()
@@ -188,7 +186,7 @@ def test_state_checkpointing():
 def test_lr_scheduler():
-    """Test AdaScale working with torch.optim.lr_scheduler """
+    """Test AdaScale working with torch.optim.lr_scheduler."""
     model = Linear(2, 2, bias=False)
     optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=3)
     # We use 1, not 0.1 here since scheduler.step() is called here first.
@@ -201,8 +199,211 @@ def test_lr_scheduler():
                 loss.backward()
             assert optim.gain() <= 3, optim.gain()
             optim.step()
+            optim.zero_grad()
             # asserting LR is right
             assert np.allclose(optim.param_groups[0]["lr"], 0.1 / 10 ** epoch), optim.param_groups[0]["lr"]
         scheduler.step()
         # asserting LR is right
         assert np.allclose(optim.param_groups[0]["lr"], 0.1 / 10 ** (epoch + 1)), optim.param_groups[0]["lr"]

@skip_if_no_gpu
@pytest.mark.parametrize("debias_ewma", [True, False])
def test_add_param_group(debias_ewma):
    """Test AdaScale supports add_param_group() API."""
    model1 = Linear(2, 2, bias=True)
    with torch.no_grad():
        # make weights and bias deterministic, which is needed for
        # multi-layer models. For them, adascale gain is affected by
        # parameters from other layers.
        model1.weight.copy_(Tensor([1.0, 2.0, 3.0, 4.0]).reshape(2, 2))
        model1.bias.fill_(0.1)
    optim = AdaScale(SGD(model1.parameters(), lr=0.1), num_gradients_to_accumulate=2, debias_ewma=debias_ewma)
    assert len(optim._hook_handles) == 2

    model2 = Linear(2, 3, bias=True)
    with torch.no_grad():
        # make weights and bias deterministic
        model2.weight.copy_(Tensor([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).reshape(3, 2))
        model2.bias.fill_(0.2)
    optim.add_param_group({"params": model2.parameters()})
    assert len(optim._hook_handles) == 4

    # make sure we can run the model.
    model = Sequential(model1, model2).cuda()
    in_data_0 = Tensor([1.0, 2.0]).cuda()
    out = model(in_data_0)
    out.sum().backward()
    in_data_1 = Tensor([3.0, 4.0]).cuda()
    out = model(in_data_1)
    out.sum().backward()

    # make sure the gains are right and we can step.
    # since this is the first step, debias_ewma doesn't affect the value.
    assert np.allclose(optim.gain(), 1.1440223454935758), optim.gain()
    assert np.allclose(optim.gain(0), 1.1428571428571428), optim.gain(0)
    assert np.allclose(optim.gain(1), 1.1471258476157762), optim.gain(1)
    optim.step()
    optim.zero_grad()

    # make sure we can add a PG again after stepping.
    model3 = Linear(3, 4, bias=True)
    with torch.no_grad():
        # make weights and bias deterministic
        model3.weight.copy_(Tensor([1.0, 2.0, 3.0, 4.0, 5.0, 6.0] * 2).reshape(4, 3))
        model3.bias.fill_(0.2)
    optim.add_param_group({"params": model3.parameters()})
    assert len(optim._hook_handles) == 6

    # make sure we can run the model.
    model = Sequential(model1, model2, model3).cuda()
    in_data_0 = Tensor([1.0, 2.0]).cuda()
    out = model(in_data_0)
    out.sum().backward()
    in_data_1 = Tensor([3.0, 4.0]).cuda()
    out = model(in_data_1)
    out.sum().backward()

    # make sure gains are right and we can step.
    # the last PG's gain is not affected by debias_ewma since it is the first step for that PG.
    assert np.allclose(optim.gain(), 1.1191193589460822 if debias_ewma else 1.1192783954732368), optim.gain()
    assert np.allclose(optim.gain(0), 1.1428571880897151 if debias_ewma else 1.142857188085096), optim.gain(0)
    assert np.allclose(optim.gain(1), 1.1167103578364508 if debias_ewma else 1.1167104954034948), optim.gain(1)
    assert np.allclose(optim.gain(2), 1.117381091722702), optim.gain(2)
    optim.step()
    optim.zero_grad()


@pytest.mark.parametrize(
    "test_case",
    [
        {"new_accum": 3, "exp_gain": 1.2573902104603087},
        {"new_accum": 6, "exp_gain": 1.0903738977361481},
        {"new_accum": 9, "exp_gain": 1.0432658660558123},
    ],
)
def test_set_num_gradients_to_accumulate(test_case):
    """Test set_num_gradients_to_accumulate experimental feature."""
    new_accum = test_case["new_accum"]
    exp_gain = test_case["exp_gain"]
    model = Linear(2, 2, bias=False)
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)
    out = model(Tensor([0.0, 1.0]))
    out.sum().backward()
    out = model(Tensor([1.0, 0.0]))
    out.sum().backward()
    assert np.allclose(optim.gain(), 2.0)
    optim.step()
    optim.zero_grad()
    optim.set_scale(float(new_accum))
    optim.set_num_gradients_to_accumulate(new_accum)
    for _ in range(new_accum):
        out = model(Tensor([0.0, 1.0]))
        out.sum().backward()
    assert np.allclose(optim.gain(), exp_gain), optim.gain()
    optim.step()
    optim.zero_grad()


def test_debias_ewma():
    """Test debias_ewma experimental feature"""
    model = Linear(2, 2, bias=False)
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2, debias_ewma=True)
    for _ in range(4):
        out = model(Tensor([0.0, 1.0]))
        out.sum().backward()
        out = model(Tensor([1.0, 0.0]))
        out.sum().backward()
        assert np.allclose(optim.gain(), 2.0), optim.gain()
        optim.step()
        optim.zero_grad()


def test_gradient_value():
    """Test that we don't mutate the gradients during backward"""
    model = Linear(2, 2, bias=False)
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)
    # fwd 1
    out = model(Tensor([0.0, 1.0]))
    out.sum().backward()
    assert np.allclose(model.weight.grad.numpy(), [[0.0, 1.0], [0.0, 1.0]]), model.weight.grad
    # fwd 2, grad is accumulated
    out = model(Tensor([0.0, 1.0]))
    out.sum().backward()
    assert np.allclose(model.weight.grad.numpy(), [[0.0, 2.0], [0.0, 2.0]]), model.weight.grad
    # assert gain and grad value before/after step/zero_grad
    assert np.allclose(optim.gain(), 1.0000002499999376), optim.gain()
    optim.step()
    assert np.allclose(model.weight.grad.numpy(), [[0.0, 2.0], [0.0, 2.0]]), model.weight.grad
    optim.zero_grad()
    assert np.allclose(model.weight.grad.numpy(), [[0.0, 0.0], [0.0, 0.0]]), model.weight.grad


@pytest.mark.parametrize(
    "test_case",
    [
        {"scale": None, "exp_gain": 4.0},  # default, baseline is single batch
        {"scale": 4.0 / 3, "exp_gain": 4.0 / 3},  # baseline is grad_accum = 3
        {"scale": 4.0 / 2, "exp_gain": 2.0},  # baseline is grad_accum = 2
        {"scale": 4.0 / 1, "exp_gain": 4.0},  # baseline is single batch
    ],
)
def test_scale_not_equal_default(test_case):
    """Test gain value when scale doesn't equal world_size * grad_accum."""
    scale = test_case["scale"]
    exp_gain = test_case["exp_gain"]
    model = Linear(4, 2, bias=False)
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=4, scale=scale)
    data = [
        [1.0, 0.0, 0.0, 0.0],
        [0.0, 1.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 0.0],
        [0.0, 0.0, 0.0, 1.0],
    ]
    for i in range(4):
        out = model(Tensor(data[i]))
        out.sum().backward()
    # Since the inputs are perfectly orthogonal, the gain should be at the scale.
    assert np.allclose(optim.gain(), exp_gain), optim.gain()


@skip_if_no_gpu
def test_unhook():
    """Test unhook that frees the tensor from CUDA memory."""
    model = Linear(123, 456, bias=False).cuda()  # unique shape so that it can be found
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)

    def find_tensor():
        """Find the weight tensor from the heap.

        Return True if found.
        """
        for obj in gc.get_objects():
            try:
                # Only need to check parameter type objects
                if "torch.nn.parameter.Parameter" not in str(type(obj)):
                    continue
                if torch.is_tensor(obj) or (hasattr(obj, "data") and torch.is_tensor(obj.data)):
                    if obj.shape == (456, 123):
                        return True
            except Exception:
                pass
        return False

    torch.cuda.empty_cache()
    assert find_tensor(), "something wrong with gc-based method to find the tensor"
    optim.unhook()
    del model
    del optim
    torch.cuda.empty_cache()
    assert not find_tensor(), "tensor should have been released"