Unverified commit 3932a1f6, authored by Min Xu, committed by GitHub

[feat] sync adascale from internal repo, support add_param_group (#266)

* [feat] sync adascale from internal repo

- tbd

testing: tbd

* Update the argument documentation of __init__

* update documentation around set_num_gradients_to_accumulate

* added checks that the APIs are called from the proper places

* renamed internal APIs to mark them as internal

* updated changelog

* added support for add_param_group and its unit test (see the usage sketch below)

* added unit test for set_num_gradients_to_accumulate

* added debias_ewma unit test

* fixed test_set_num_gradients_to_accumulate (need zero_grad() call)

* added missing zero_grad() to test_lr_scheduler

* fixed test_add_param_group with respect to optim.zero_grad()

* added test_gradient_value

* added test_scale_not_equal_default for scale != world_size * grad_accum

* added test_unhook()

* removed print statements

* fixed a typo

* addressed Ben's comment
parent 84a3bdbe
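As context for the add_param_group support mentioned above, here is a minimal single-process sketch of the usage pattern that the new test_add_param_group unit test (in the test diff further down) exercises. The import path fairscale.optim.AdaScale is an assumption here, since the test file's fairscale imports fall outside the hunks shown, and the printed gain values are illustrative rather than asserted.

from torch import Tensor
from torch.nn import Linear, Sequential
from torch.optim import SGD

from fairscale.optim import AdaScale  # assumed import path

# Wrap the base optimizer; AdaScale hooks the initial parameters.
model1 = Linear(2, 2, bias=True)
optim = AdaScale(SGD(model1.parameters(), lr=0.1), num_gradients_to_accumulate=2)

# Parameters added later via add_param_group() are hooked as well.
model2 = Linear(2, 3, bias=True)
optim.add_param_group({"params": model2.parameters()})

model = Sequential(model1, model2)
for in_data in (Tensor([1.0, 2.0]), Tensor([3.0, 4.0])):  # one accumulation window
    model(in_data).sum().backward()

print(optim.gain())                   # overall gain
print(optim.gain(0), optim.gain(1))   # per-param-group gains, as asserted in the test
optim.step()
optim.zero_grad()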
@@ -7,8 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [next rel] - TBD
 ### Added
-- AdaScale: Added gradient accumulation feature (#202)
-- AdaScale: Added support of torch.lr_scheduler (#229)
+- AdaScale:
+  . Added gradient accumulation feature (#202)
+  . Added support of torch.lr_scheduler (#229)
+  . Added support for add_param_groups (#266)
+  . Added support for scale != world_size (#266)
 ### Fixed
 - AdaScale: smoothing factor value fixed when using gradient accumulation (#235)
This diff is collapsed.
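The changelog entry "Added support for scale != world_size (#266)" refers to passing an explicit scale argument when it should not default to world_size * num_gradients_to_accumulate. Below is a minimal single-process sketch, again assuming the fairscale.optim.AdaScale import path, of the situation exercised by test_scale_not_equal_default in the diff that follows: the baseline run is declared to have used 2-step gradient accumulation, so the gain is reported relative to that baseline.

import torch
from torch.nn import Linear
from torch.optim import SGD

from fairscale.optim import AdaScale  # assumed import path

model = Linear(4, 2, bias=False)
# The default scale would be world_size (1) * num_gradients_to_accumulate (4) = 4.
# Passing scale=4/2 declares that the baseline already accumulated 2 gradients per step.
optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=4, scale=4.0 / 2)

for i in range(4):  # four orthogonal micro-batches, as in test_scale_not_equal_default
    model(torch.eye(4)[i]).sum().backward()

print(optim.gain())  # ~2.0 here; the corresponding test case asserts exp_gain == 2.0
optim.step()
optim.zero_grad()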
@@ -9,13 +9,14 @@
 """ Test AdaScale with a single node (1 CPU or 1 GPU). """

+import gc
 import tempfile

 import numpy as np
 import pytest
 import torch
 from torch import Tensor
-from torch.nn import Linear
+from torch.nn import Linear, Sequential
 from torch.optim import SGD
 from torch.optim.lr_scheduler import LambdaLR
@@ -41,7 +42,7 @@ def test_loss_accum_cpu():
     """
     model = Linear(2, 2, bias=False)
     # num_gradients_to_accumulate value doesn't matter in this negative test.
-    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=123)
+    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=3)
     # data 1
     in_data = Tensor([0.0, 1.0])
     loss = model(in_data).sum()
@@ -53,9 +54,9 @@ def test_loss_accum_cpu():
     loss += model(in_data).sum()
     # backward, but gradient is only produced once by the autograd engine.
     loss.backward()
-    # therefore, the gain will always be 1, which renders adascale as noop.
-    optim.step()
+    # The gain will always be 1, which renders adascale as noop.
     assert np.allclose(optim.gain(), 1.0), optim.gain()
+    # We don't call optim.step(), since it will detect that backward is not yet done.


 # IMPORTANT: make sure these test_cases values are sync'ed with the DDP
@@ -138,7 +139,6 @@ def test_state_checkpointing():
     # Run a bit.
     def run_a_bit(replay_data=None):
-        print("running")
         data = []
         replay_data_idx = 0
         for _ in range(6):  # run some steps
@@ -151,8 +151,6 @@ def test_state_checkpointing():
                     replay_data_idx += 1
                 out = model(in_data)
                 out.sum().backward()
-                # print(out.sum().item())
-                print(model.weight.grad)
                 if i == accum_steps - 1:
                     optim.step()
                     optim.zero_grad()
@@ -188,7 +186,7 @@ def test_state_checkpointing():
 def test_lr_scheduler():
-    """Test AdaScale working with torch.optim.lr_scheduler """
+    """Test AdaScale working with torch.optim.lr_scheduler."""
     model = Linear(2, 2, bias=False)
     optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=3)
     # We use 1, not 0.1 here since scheduler.step() is called here first.
@@ -201,8 +199,211 @@ def test_lr_scheduler():
                 loss.backward()
             assert optim.gain() <= 3, optim.gain()
             optim.step()
+            optim.zero_grad()
             # asserting LR is right
             assert np.allclose(optim.param_groups[0]["lr"], 0.1 / 10 ** epoch), optim.param_groups[0]["lr"]
         scheduler.step()
         # asserting LR is right
         assert np.allclose(optim.param_groups[0]["lr"], 0.1 / 10 ** (epoch + 1)), optim.param_groups[0]["lr"]

@skip_if_no_gpu
@pytest.mark.parametrize("debias_ewma", [True, False])
def test_add_param_group(debias_ewma):
    """Test AdaScale supports add_param_group() API."""
    model1 = Linear(2, 2, bias=True)
    with torch.no_grad():
        # make weights and bias deterministic, which is needed for
        # multi-layer models. For them, adascale gain is affected by
        # parameters from other layers.
        model1.weight.copy_(Tensor([1.0, 2.0, 3.0, 4.0]).reshape(2, 2))
        model1.bias.fill_(0.1)
    optim = AdaScale(SGD(model1.parameters(), lr=0.1), num_gradients_to_accumulate=2, debias_ewma=debias_ewma)
    assert len(optim._hook_handles) == 2

    model2 = Linear(2, 3, bias=True)
    with torch.no_grad():
        # make weights and bias deterministic
        model2.weight.copy_(Tensor([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).reshape(3, 2))
        model2.bias.fill_(0.2)
    optim.add_param_group({"params": model2.parameters()})
    assert len(optim._hook_handles) == 4

    # make sure we can run the model.
    model = Sequential(model1, model2).cuda()
    in_data_0 = Tensor([1.0, 2.0]).cuda()
    out = model(in_data_0)
    out.sum().backward()
    in_data_1 = Tensor([3.0, 4.0]).cuda()
    out = model(in_data_1)
    out.sum().backward()

    # make sure the gains are right and we can step.
    # since this is the first step, debias_ewma doesn't affect the value.
    assert np.allclose(optim.gain(), 1.1440223454935758), optim.gain()
    assert np.allclose(optim.gain(0), 1.1428571428571428), optim.gain(0)
    assert np.allclose(optim.gain(1), 1.1471258476157762), optim.gain(1)
    optim.step()
    optim.zero_grad()

    # make sure we can add a PG again after stepping.
    model3 = Linear(3, 4, bias=True)
    with torch.no_grad():
        # make weights and bias deterministic
        model3.weight.copy_(Tensor([1.0, 2.0, 3.0, 4.0, 5.0, 6.0] * 2).reshape(4, 3))
        model3.bias.fill_(0.2)
    optim.add_param_group({"params": model3.parameters()})
    assert len(optim._hook_handles) == 6

    # make sure we can run the model.
    model = Sequential(model1, model2, model3).cuda()
    in_data_0 = Tensor([1.0, 2.0]).cuda()
    out = model(in_data_0)
    out.sum().backward()
    in_data_1 = Tensor([3.0, 4.0]).cuda()
    out = model(in_data_1)
    out.sum().backward()

    # make sure gains are right and we can step.
    # the last PG's gain is not affected by debias_ewma since it is the first step for that PG.
    assert np.allclose(optim.gain(), 1.1191193589460822 if debias_ewma else 1.1192783954732368), optim.gain()
    assert np.allclose(optim.gain(0), 1.1428571880897151 if debias_ewma else 1.142857188085096), optim.gain(0)
    assert np.allclose(optim.gain(1), 1.1167103578364508 if debias_ewma else 1.1167104954034948), optim.gain(1)
    assert np.allclose(optim.gain(2), 1.117381091722702), optim.gain(2)
    optim.step()
    optim.zero_grad()


@pytest.mark.parametrize(
    "test_case",
    [
        {"new_accum": 3, "exp_gain": 1.2573902104603087},
        {"new_accum": 6, "exp_gain": 1.0903738977361481},
        {"new_accum": 9, "exp_gain": 1.0432658660558123},
    ],
)
def test_set_num_gradients_to_accumulate(test_case):
    """Test set_num_gradients_to_accumulate experimental feature."""
    new_accum = test_case["new_accum"]
    exp_gain = test_case["exp_gain"]
    model = Linear(2, 2, bias=False)
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)
    out = model(Tensor([0.0, 1.0]))
    out.sum().backward()
    out = model(Tensor([1.0, 0.0]))
    out.sum().backward()
    assert np.allclose(optim.gain(), 2.0)
    optim.step()
    optim.zero_grad()
    optim.set_scale(float(new_accum))
    optim.set_num_gradients_to_accumulate(new_accum)
    for _ in range(new_accum):
        out = model(Tensor([0.0, 1.0]))
        out.sum().backward()
    assert np.allclose(optim.gain(), exp_gain), optim.gain()
    optim.step()
    optim.zero_grad()


def test_debias_ewma():
    """Test debias_ewma experimental feature"""
    model = Linear(2, 2, bias=False)
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2, debias_ewma=True)
    for _ in range(4):
        out = model(Tensor([0.0, 1.0]))
        out.sum().backward()
        out = model(Tensor([1.0, 0.0]))
        out.sum().backward()
        assert np.allclose(optim.gain(), 2.0), optim.gain()
        optim.step()
        optim.zero_grad()


def test_gradient_value():
    """Test that we don't mutate the gradients during backward"""
    model = Linear(2, 2, bias=False)
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)
    # fwd 1
    out = model(Tensor([0.0, 1.0]))
    out.sum().backward()
    assert np.allclose(model.weight.grad.numpy(), [[0.0, 1.0], [0.0, 1.0]]), model.weight.grad
    # fwd 2, grad is accumulated
    out = model(Tensor([0.0, 1.0]))
    out.sum().backward()
    assert np.allclose(model.weight.grad.numpy(), [[0.0, 2.0], [0.0, 2.0]]), model.weight.grad
    # assert gain and grad value before/after step/zero_grad
    assert np.allclose(optim.gain(), 1.0000002499999376), optim.gain()
    optim.step()
    assert np.allclose(model.weight.grad.numpy(), [[0.0, 2.0], [0.0, 2.0]]), model.weight.grad
    optim.zero_grad()
    assert np.allclose(model.weight.grad.numpy(), [[0.0, 0.0], [0.0, 0.0]]), model.weight.grad


@pytest.mark.parametrize(
    "test_case",
    [
        {"scale": None, "exp_gain": 4.0},  # default, baseline is single batch
        {"scale": 4.0 / 3, "exp_gain": 4.0 / 3},  # baseline is grad_accum = 3
        {"scale": 4.0 / 2, "exp_gain": 2.0},  # baseline is grad_accum = 2
        {"scale": 4.0 / 1, "exp_gain": 4.0},  # baseline is single batch
    ],
)
def test_scale_not_equal_default(test_case):
    """Test gain value when scale doesn't equal world_size * grad_accum."""
    scale = test_case["scale"]
    exp_gain = test_case["exp_gain"]
    model = Linear(4, 2, bias=False)
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=4, scale=scale)
    data = [
        [1.0, 0.0, 0.0, 0.0],
        [0.0, 1.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 0.0],
        [0.0, 0.0, 0.0, 1.0],
    ]
    for i in range(4):
        out = model(Tensor(data[i]))
        out.sum().backward()
    # Since the inputs are perfectly orthogonal, the gain should be at the scale.
    assert np.allclose(optim.gain(), exp_gain), optim.gain()


@skip_if_no_gpu
def test_unhook():
    """Test unhook that frees the tensor from CUDA memory."""
    model = Linear(123, 456, bias=False).cuda()  # unique shape so that it can be found
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)

    def find_tensor():
        """Find the weight tensor from the heap.

        Return True if found.
        """
        for obj in gc.get_objects():
            try:
                # Only need to check parameter type objects
                if "torch.nn.parameter.Parameter" not in str(type(obj)):
                    continue
                if torch.is_tensor(obj) or (hasattr(obj, "data") and torch.is_tensor(obj.data)):
                    if obj.shape == (456, 123):
                        return True
            except Exception:
                pass
        return False

    torch.cuda.empty_cache()
    assert find_tensor(), "something wrong with gc-based method to find the tensor"
    optim.unhook()
    del model
    del optim
    torch.cuda.empty_cache()
    assert not find_tensor(), "tensor should have been released"