Unverified commit f5e727cc authored by Changyu Gao, committed by GitHub

Fix gradient accumulation (#1086)

* Fix gradient accumulation

- Add an ``is_scaled_loss`` flag to support both scaled and unscaled loss
- Add a method ``scale_grad_by_num_grads_to_accum`` to handle gradient accumulation with unscaled loss more explicitly (see the usage sketch below)
- Fix ``test_grad_accum`` and ``test_set_num_gradients_to_accumulate``
- Add tests for expected gradient values
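
For context, a minimal sketch of how the two modes are intended to be used. This is not part of this commit; the model, data, and learning rate are placeholders, and the import path is the one used elsewhere in fairscale's tests.

```python
import torch
from torch.nn import Linear
from torch.optim import SGD

from fairscale.optim import AdaScale

N = 2  # num_gradients_to_accumulate
data = [torch.rand(2) for _ in range(N)]

# Mode 1 (default, is_scaled_loss=True): divide each micro-batch loss by N.
model = Linear(2, 2)
optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=N)
for x in data:
    (model(x).sum() / N).backward()
optim.step()
optim.zero_grad()

# Mode 2 (is_scaled_loss=False): backprop the raw loss, then rescale the
# accumulated gradients once before stepping.
model = Linear(2, 2)
optim = AdaScale(
    SGD(model.parameters(), lr=0.1),
    num_gradients_to_accumulate=N,
    is_scaled_loss=False,
)
for x in data:
    model(x).sum().backward()
optim.scale_grad_by_num_grads_to_accum()
optim.step()
optim.zero_grad()
```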
parent b0b92e70
......@@ -8,12 +8,42 @@
adascale_test_data = [
# "input" value is a list of input tensors for micro-batch/rank 0 and micro-batch/rank 1.
{"input": [[1.0, 0], [0, 1.0]], "expected_gain": 4.0 / 3},
{"input": [[1.0, 1.0], [1.0, 1.0]], "expected_gain": 1.0000001249999846},
{"input": [[-1.0, 1.0], [1.0, -1.0]], "expected_gain": 2.0},
{"input": [[1.0, 4.0], [5.0, 0.5]], "expected_gain": 1.4688796680497926},
{"input": [[-0.2, 3.0], [5.0, 0.5]], "expected_gain": 1.8472893901708},
{
"input": [[1.0, 0], [0, 1.0]],
"expected_gain": 4.0 / 3,
"expected_grad": [[0.5, 0.5], [0.5, 0.5]],
"expected_bias_grad": [1.0, 1.0],
},
{
"input": [[1.0, 1.0], [1.0, 1.0]],
"expected_gain": 1.0000001249999846,
"expected_grad": [[1.0, 1.0], [1.0, 1.0]],
"expected_bias_grad": [1.0, 1.0],
},
{
"input": [[-1.0, 1.0], [1.0, -1.0]],
"expected_gain": 2.0,
"expected_grad": [[0.0, 0.0], [0.0, 0.0]],
"expected_bias_grad": [1.0, 1.0],
},
{
"input": [[1.0, 4.0], [5.0, 0.5]],
"expected_gain": 1.4688796680497926,
"expected_grad": [[3.0, 2.25], [3.0, 2.25]],
"expected_bias_grad": [1.0, 1.0],
},
{
"input": [[-0.2, 3.0], [5.0, 0.5]],
"expected_gain": 1.8472893901708,
"expected_grad": [[2.4000000953674316, 1.75], [2.4000000953674316, 1.75]],
"expected_bias_grad": [1.0, 1.0],
},
# "inputs" to trigger multiple iteration tests, which make sure the
# smoothing factor calculation is also covered.
{"inputs": [[[-0.2, 3.3], [5.2, 0.7]], [[1.0, 4.0], [3.1, 0.1]]], "expected_gain": 1.6720968158031417},
{
"inputs": [[[-0.2, 3.3], [5.2, 0.7]], [[1.0, 4.0], [3.1, 0.1]]],
"expected_gain": 1.6720968158031417,
"expected_grad": [[2.049999952316284, 2.049999952316284], [2.049999952316284, 2.049999952316284]],
"expected_bias_grad": [1.0, 1.0],
},
]
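
For reference, the `expected_grad` / `expected_bias_grad` values above are the average of the two per-micro-batch gradients for the `Linear(2, 2, bias=True)` model used in these tests: for `out.sum()`, every row of the weight gradient equals the input vector and the bias gradient is all ones. A quick stand-alone check of the first entry (illustrative only, not part of the test suite):

```python
# Verify the first test case: inputs [1, 0] and [0, 1], averaged via a scaled loss.
import torch

model = torch.nn.Linear(2, 2, bias=True)
for x in ([1.0, 0.0], [0.0, 1.0]):
    (model(torch.tensor(x)).sum() / 2).backward()  # scale each micro-batch loss by 1/2

print(model.weight.grad)  # tensor([[0.5, 0.5], [0.5, 0.5]])  -> "expected_grad"
print(model.bias.grad)    # tensor([1., 1.])                  -> "expected_bias_grad"
```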
......@@ -130,7 +130,12 @@ class AdaScale(Optimizer):
between each optimizer step. This can be changed during
training as long as the train loop changes gradient accumulation
accordingly.
The loss in each pass can be either scaled or unscaled. See `is_scaled_loss` below.
Defaults to 1, which does not accumulate gradients.
is_scaled_loss (bool):
If True, assume the loss is divided (scaled down) by `num_gradients_to_accumulate`.
If False, assume the raw, unscaled loss is used.
Default: True.
debias_ewma (bool):
(experimental) Use debias exponential moving average
for smoothing and mu and sigma variables. False will
......@@ -145,6 +150,7 @@ class AdaScale(Optimizer):
scale: Optional[float] = None,
smoothing: float = None,
num_gradients_to_accumulate: int = 1,
is_scaled_loss: bool = True,
debias_ewma: bool = True,
):
# Init hook_handles list, otherwise, a partial init'ed object may fail in ``__del__``.
......@@ -160,6 +166,7 @@ class AdaScale(Optimizer):
self._last_final_backward_call = 0
self._num_grads_to_accum = num_gradients_to_accumulate
self._debias_ewma = debias_ewma
self._is_scaled_loss = is_scaled_loss
# Proxy the param_groups so that `torch.optim.lr_scheduler` can work.
self.param_groups = self._optimizer.param_groups
......@@ -453,17 +460,22 @@ class AdaScale(Optimizer):
total_grad_sqr = np.array(
[sum(param.grad.pow(2).sum().item() for param in group["params"]) for group in self._optimizer.param_groups]
)
# Divide by (_num_grads_to_accum ** 2) to account for gradient
# accumulation.
if self._num_grads_to_accum > 1:
# np array doesn't support /=.
total_grad_sqr = total_grad_sqr / (self._num_grads_to_accum**2)
# Wait for all_reduce to be done and move it to cpu & np.
if work:
work.wait()
local_grad_sqr = self._local_grad_sqr.cpu().numpy()
if self._num_grads_to_accum > 1:
# Handle scaling for gradient accumulation.
if self._is_scaled_loss:
# If the loss is scaled down, the locally accumulated squared gradient norms need to be
# scaled back up by a factor of _num_grads_to_accum squared;
# total_grad_sqr is already scaled down by that factor.
local_grad_sqr *= self._num_grads_to_accum**2
else:
# If loss is not scaled, local gradients are correct, but we need to scale the total_grad_sqr down to account for gradient accumulation.
total_grad_sqr /= self._num_grads_to_accum**2
# See appendix B.3 of the paper.
# Modified to handle cases where scale != world_size
#
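
As an aside, the factor of `_num_grads_to_accum` squared in both branches follows from gradients scaling linearly with the loss: dividing the loss by N divides every gradient by N, so any sum of squared gradient norms shrinks by N squared. A small stand-alone illustration (the names here are illustrative, not from this file):

```python
# Show that scaling the loss by 1/N scales squared-gradient statistics by 1/N**2.
import torch

N = 4
model = torch.nn.Linear(2, 2)
x = torch.tensor([1.0, 2.0])

model.zero_grad()
model(x).sum().backward()
unscaled_sqr = sum(p.grad.pow(2).sum().item() for p in model.parameters())

model.zero_grad()
(model(x).sum() / N).backward()
scaled_sqr = sum(p.grad.pow(2).sum().item() for p in model.parameters())

# scaled_sqr * N**2 recovers the unscaled value (up to floating-point error).
assert abs(scaled_sqr * N**2 - unscaled_sqr) < 1e-5
```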
......@@ -509,7 +521,7 @@ class AdaScale(Optimizer):
original_lr = []
for idx, param_group in enumerate(self._optimizer.param_groups):
original_lr.append(param_group["lr"])
param_group["lr"] = self.gain(pg_idx=idx) * param_group["lr"]
param_group["lr"] *= self.gain(pg_idx=idx)
# Step it.
res = self._optimizer.step(*args, **kwargs)
......@@ -606,6 +618,18 @@ class AdaScale(Optimizer):
# not needed, so the smoothing factor is 0.
self._smoothing = max(1 - self._world_size * self._num_grads_to_accum / 1000, 0)
def scale_grad_by_num_grads_to_accum(self) -> None:
"""Scale the gradient down by the number of gradients to accumulate.
This should be called after the gradient accumulation is done and the unscaled loss is used.
"""
assert self._local_grad_sqr is None, "Only call this after backward"
assert self._num_grads_to_accum > 1, "Must be accumulating gradients"
assert not self._is_scaled_loss, "Must use unscaled loss"
for group in self._optimizer.param_groups:
for param in group["params"]:
param.grad.div_(self._num_grads_to_accum)
def __getattr__(self, name: str) -> Any:
"""Forward missing attributes to wrapped optimizer."""
try:
......
......@@ -75,18 +75,25 @@ def _test_basic_func(rank, ddp_cls, world_size, tempfile_name, test_case):
out.sum().backward()
if ddp_cls is DDP:
assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()
w, b = model.parameters()
assert np.allclose(w.grad.cpu(), test_case["expected_grad"]), w.grad
assert np.allclose(b.grad.cpu(), test_case["expected_bias_grad"]), b.grad
optim.step()
optim.zero_grad()
else:
# multiple iters
for in_data in test_case["inputs"]:
n = len(test_case["inputs"])
for i, in_data in enumerate(test_case["inputs"]):
in_data = Tensor(in_data[rank]).cuda()
out = model(in_data)
out.sum().backward()
if i == n - 1 and ddp_cls is DDP:
assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()
w, b = model.parameters()
assert np.allclose(w.grad.cpu(), test_case["expected_grad"]), w.grad
assert np.allclose(b.grad.cpu(), test_case["expected_bias_grad"]), b.grad
optim.step()
optim.zero_grad()
if ddp_cls is DDP:
assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()
dist.destroy_process_group()
......
......@@ -61,7 +61,8 @@ def test_loss_accum_cpu():
@pytest.mark.parametrize("cpu", [True, False])
@pytest.mark.parametrize("test_case", adascale_test_data)
def test_grad_accum(test_case, cpu):
@pytest.mark.parametrize("is_scaled_loss", [True, False])
def test_grad_accum(test_case, cpu, is_scaled_loss):
"""Test the basic functionality on CPU/GPU with gradient accumulation without DDP"""
make_cudnn_deterministic()
model = Linear(2, 2, bias=True)
......@@ -69,7 +70,7 @@ def test_grad_accum(test_case, cpu):
if torch.cuda.device_count() < 1:
pytest.skip("1 GPU is required")
model = model.cuda()
optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)
optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2, is_scaled_loss=is_scaled_loss)
expected_gain = test_case["expected_gain"]
if "input" in test_case:
data = [test_case["input"]] * 2
......@@ -82,16 +83,27 @@ def test_grad_accum(test_case, cpu):
in_data_0 = Tensor(in_data[0])
if not cpu:
in_data_0 = in_data_0.cuda()
out = model(in_data_0)
out.sum().backward()
loss = model(in_data_0).sum()
if is_scaled_loss:
loss = loss / 2
loss.backward()
# grad pass 2
in_data_1 = Tensor(in_data[1])
if not cpu:
in_data_1 = in_data_1.cuda()
out = model(in_data_1)
out.sum().backward()
loss = model(in_data_1).sum()
if is_scaled_loss:
loss = loss / 2
loss.backward()
if not is_scaled_loss:
optim.scale_grad_by_num_grads_to_accum()
if exp_gain is not None:
assert np.allclose(optim.gain(), exp_gain), optim.gain()
w, b = model.parameters()
assert np.allclose(w.grad.cpu(), test_case["expected_grad"]), w.grad
assert np.allclose(b.grad.cpu(), test_case["expected_bias_grad"]), b.grad
# stepping it. Note that if we did more than the 2 passes promised by the
# num_gradients_to_accumulate argument above, AdaScale is not able to
# detect that mistake for now. The result will just be wrong in that case.
......@@ -110,14 +122,14 @@ def test_state_checkpointing():
a unit test for checkpointing with DDP.
"""
# Constants.
accum_steps = 3
num_grads_to_accum = 3
in_dim = 5
# Setup.
def make_model_and_optim():
model = Linear(in_dim, 2, bias=False)
model = model.cuda()
optim = AdaScale(SGD(model.parameters(), lr=0.1, momentum=0.9), num_gradients_to_accumulate=accum_steps)
optim = AdaScale(SGD(model.parameters(), lr=0.1, momentum=0.9), num_gradients_to_accumulate=num_grads_to_accum)
return model, optim
model, optim = make_model_and_optim()
......@@ -127,7 +139,7 @@ def test_state_checkpointing():
data = []
replay_data_idx = 0
for _ in range(6): # run some steps
for i in range(accum_steps):
for i in range(num_grads_to_accum):
if replay_data is None:
in_data = torch.rand(in_dim).cuda()
data.append(in_data)
......@@ -136,7 +148,7 @@ def test_state_checkpointing():
replay_data_idx += 1
out = model(in_data)
out.sum().backward()
if i == accum_steps - 1:
if i == num_grads_to_accum - 1:
optim.step()
optim.zero_grad()
return out, data
......@@ -172,13 +184,14 @@ def test_state_checkpointing():
def test_lr_scheduler():
"""Test AdaScale working with torch.optim.lr_scheduler."""
num_grads_to_accum = 3
model = Linear(2, 2, bias=False)
optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=3)
optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=num_grads_to_accum)
# We use 1, not 0.1 here since scheduler.step() is called here first.
scheduler = LambdaLR(optim, lr_lambda=lambda epoch: 1 / 10**epoch)
for epoch in range(3):
for data_idx in range(10):
for accumulation in range(3):
for accumulation in range(num_grads_to_accum):
in_data = torch.rand(2)
loss = model(in_data).sum()
loss.backward()
......@@ -194,8 +207,10 @@ def test_lr_scheduler():
@skip_if_no_cuda
@pytest.mark.parametrize("debias_ewma", [True, False])
def test_add_param_group(debias_ewma):
"""Test AdaScale supports add_param_group() API."""
@pytest.mark.parametrize("is_scaled_loss", [True, False])
def test_add_param_group(debias_ewma, is_scaled_loss):
"""Test AdaScale supports add_param_group() API for both scaled and unscaled loss."""
num_grads_to_accum = 2
model1 = Linear(2, 2, bias=True)
with torch.no_grad():
# make weights and bias deterministic, which is needed for
......@@ -203,7 +218,12 @@ def test_add_param_group(debias_ewma):
# parameters from other layers.
model1.weight.copy_(Tensor([1.0, 2.0, 3.0, 4.0]).reshape(2, 2))
model1.bias.fill_(0.1)
optim = AdaScale(SGD(model1.parameters(), lr=0.1), num_gradients_to_accumulate=2, debias_ewma=debias_ewma)
optim = AdaScale(
SGD(model1.parameters(), lr=0.1),
num_gradients_to_accumulate=2,
is_scaled_loss=is_scaled_loss,
debias_ewma=debias_ewma,
)
assert len(optim._hook_handles) == 2, len(optim._hook_handles)
model2 = Linear(2, 3, bias=True)
......@@ -217,12 +237,19 @@ def test_add_param_group(debias_ewma):
# make sure we can run the model.
model = Sequential(model1, model2).cuda()
in_data_0 = Tensor([1.0, 2.0]).cuda()
out = model(in_data_0)
out.sum().backward()
loss = model(in_data_0).sum()
if is_scaled_loss:
loss = loss / num_grads_to_accum
loss.backward()
in_data_1 = Tensor([3.0, 4.0]).cuda()
out = model(in_data_1)
out.sum().backward()
loss = model(in_data_1).sum()
if is_scaled_loss:
loss = loss / num_grads_to_accum
loss.backward()
if not is_scaled_loss:
optim.scale_grad_by_num_grads_to_accum()
# make sure the gains are right and we can step.
# since this is the first step, debias_ewma doesn't affect the value.
......@@ -244,19 +271,26 @@ def test_add_param_group(debias_ewma):
# make sure we can run the model.
model = Sequential(model1, model2, model3).cuda()
in_data_0 = Tensor([1.0, 2.0]).cuda()
out = model(in_data_0)
out.sum().backward()
loss = model(in_data_0).sum()
if is_scaled_loss:
loss = loss / num_grads_to_accum
loss.backward()
in_data_1 = Tensor([3.0, 4.0]).cuda()
out = model(in_data_1)
out.sum().backward()
loss = model(in_data_1).sum()
if is_scaled_loss:
loss = loss / num_grads_to_accum
loss.backward()
if not is_scaled_loss:
optim.scale_grad_by_num_grads_to_accum()
# make sure gains are right and we can step.
# the last PG's gain is not affected by debias_ewma since it is the first step for that PG.
assert np.allclose(optim.gain(), 1.1191193589460822 if debias_ewma else 1.1192783954732368), optim.gain()
assert np.allclose(optim.gain(0), 1.1428571880897151 if debias_ewma else 1.142857188085096), optim.gain(0)
assert np.allclose(optim.gain(1), 1.1167103578364508 if debias_ewma else 1.1167104954034948), optim.gain(1)
assert np.allclose(optim.gain(2), 1.117381091722702), optim.gain(2)
assert np.allclose(optim.gain(), 1.1382937715383077 if debias_ewma else 1.1391959826562015), optim.gain()
assert np.allclose(optim.gain(0), 1.142857206008338 if debias_ewma else 1.142857206006931), optim.gain(0)
assert np.allclose(optim.gain(1), 1.1116875516387468 if debias_ewma else 1.1116906378271827), optim.gain(1)
assert np.allclose(optim.gain(2), 1.0749164095196344), optim.gain(2)
optim.step()
optim.zero_grad()
......@@ -264,31 +298,45 @@ def test_add_param_group(debias_ewma):
@pytest.mark.parametrize(
"test_case",
[
{"new_accum": 3, "exp_gain": 1.2573902104603087},
{"new_accum": 6, "exp_gain": 1.0903738977361481},
{"new_accum": 9, "exp_gain": 1.0432658660558123},
{"num_grads_to_accum": 3, "exp_gain": 2.141385737279438},
{"num_grads_to_accum": 6, "exp_gain": 2.9927880097754036},
{"num_grads_to_accum": 9, "exp_gain": 3.4461759591877312},
],
)
def test_set_num_gradients_to_accumulate(test_case):
@pytest.mark.parametrize("is_scaled_loss", [True, False])
def test_set_num_gradients_to_accumulate(test_case, is_scaled_loss):
"""Test set_num_gradients_to_accumulate experimental feature."""
new_accum = test_case["new_accum"]
num_grads_to_accum = test_case["num_grads_to_accum"]
exp_gain = test_case["exp_gain"]
model = Linear(2, 2, bias=False)
optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)
out = model(Tensor([0.0, 1.0]))
out.sum().backward()
out = model(Tensor([1.0, 0.0]))
out.sum().backward()
optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2, is_scaled_loss=is_scaled_loss)
loss = model(Tensor([0.0, 1.0])).sum()
if is_scaled_loss:
loss = loss / 2
loss.backward()
loss = model(Tensor([1.0, 0.0])).sum()
if is_scaled_loss:
loss = loss / 2
loss.backward()
if not is_scaled_loss:
optim.scale_grad_by_num_grads_to_accum()
assert np.allclose(optim.gain(), 2.0)
optim.step()
optim.zero_grad()
optim.set_scale(float(new_accum))
optim.set_num_gradients_to_accumulate(new_accum)
for _ in range(new_accum):
out = model(Tensor([0.0, 1.0]))
out.sum().backward()
optim.set_scale(float(num_grads_to_accum))
optim.set_num_gradients_to_accumulate(num_grads_to_accum)
for _ in range(num_grads_to_accum):
loss = model(Tensor([0.0, 1.0])).sum()
if is_scaled_loss:
loss = loss / num_grads_to_accum
loss.backward()
if not is_scaled_loss:
optim.scale_grad_by_num_grads_to_accum()
assert np.allclose(optim.gain(), exp_gain), optim.gain()
optim.step()
......