Unverified Commit f5e727cc authored by Changyu Gao, committed by GitHub

Fix gradient accumulation (#1086)

* Fix gradient accumulation

- Add ``is_scaled_loss`` flag to support both scaled and unscaled loss
- Add a method ``scale_grad_by_num_grads_to_accum`` to handle gradient accumulation with unscaled loss more explicitly (see the usage sketch below)
- Fix ``test_grad_accum`` and ``test_set_num_gradients_to_accumulate``
- Add tests for gradient values
parent b0b92e70
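
The intended training-loop contract is sketched below. This is an illustration based on the updated tests, not code from the patch; it assumes `AdaScale` is importable as `from fairscale.optim import AdaScale` and shows the two supported modes: scale each micro-batch loss by the accumulation count (`is_scaled_loss=True`, the default), or backward the raw loss and call `scale_grad_by_num_grads_to_accum()` once before stepping (`is_scaled_loss=False`).

    # Minimal usage sketch (illustration only, mirrors the updated tests).
    import torch
    from torch.nn import Linear
    from torch.optim import SGD

    from fairscale.optim import AdaScale


    def run_accumulation(is_scaled_loss: bool, accum: int = 2) -> None:
        model = Linear(2, 2)
        optim = AdaScale(
            SGD(model.parameters(), lr=0.1),
            num_gradients_to_accumulate=accum,
            is_scaled_loss=is_scaled_loss,
        )
        for _ in range(accum):
            loss = model(torch.rand(2)).sum()
            if is_scaled_loss:
                loss = loss / accum  # scale each micro-batch loss up front
            loss.backward()
        if not is_scaled_loss:
            # with an unscaled loss, scale the accumulated gradients down once before stepping
            optim.scale_grad_by_num_grads_to_accum()
        optim.step()
        optim.zero_grad()


    run_accumulation(is_scaled_loss=True)
    run_accumulation(is_scaled_loss=False)
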
@@ -8,12 +8,42 @@
 adascale_test_data = [
     # "input" value is a list of input tensors for micro-batch/rank 0 and micro-batch/rank 1.
-    {"input": [[1.0, 0], [0, 1.0]], "expected_gain": 4.0 / 3},
-    {"input": [[1.0, 1.0], [1.0, 1.0]], "expected_gain": 1.0000001249999846},
-    {"input": [[-1.0, 1.0], [1.0, -1.0]], "expected_gain": 2.0},
-    {"input": [[1.0, 4.0], [5.0, 0.5]], "expected_gain": 1.4688796680497926},
-    {"input": [[-0.2, 3.0], [5.0, 0.5]], "expected_gain": 1.8472893901708},
+    {
+        "input": [[1.0, 0], [0, 1.0]],
+        "expected_gain": 4.0 / 3,
+        "expected_grad": [[0.5, 0.5], [0.5, 0.5]],
+        "expected_bias_grad": [1.0, 1.0],
+    },
+    {
+        "input": [[1.0, 1.0], [1.0, 1.0]],
+        "expected_gain": 1.0000001249999846,
+        "expected_grad": [[1.0, 1.0], [1.0, 1.0]],
+        "expected_bias_grad": [1.0, 1.0],
+    },
+    {
+        "input": [[-1.0, 1.0], [1.0, -1.0]],
+        "expected_gain": 2.0,
+        "expected_grad": [[0.0, 0.0], [0.0, 0.0]],
+        "expected_bias_grad": [1.0, 1.0],
+    },
+    {
+        "input": [[1.0, 4.0], [5.0, 0.5]],
+        "expected_gain": 1.4688796680497926,
+        "expected_grad": [[3.0, 2.25], [3.0, 2.25]],
+        "expected_bias_grad": [1.0, 1.0],
+    },
+    {
+        "input": [[-0.2, 3.0], [5.0, 0.5]],
+        "expected_gain": 1.8472893901708,
+        "expected_grad": [[2.4000000953674316, 1.75], [2.4000000953674316, 1.75]],
+        "expected_bias_grad": [1.0, 1.0],
+    },
     # "inputs" to trigger multiple iteration tests, which make sure the
     # smoothing factor calculation is also covered.
-    {"inputs": [[[-0.2, 3.3], [5.2, 0.7]], [[1.0, 4.0], [3.1, 0.1]]], "expected_gain": 1.6720968158031417},
+    {
+        "inputs": [[[-0.2, 3.3], [5.2, 0.7]], [[1.0, 4.0], [3.1, 0.1]]],
+        "expected_gain": 1.6720968158031417,
+        "expected_grad": [[2.049999952316284, 2.049999952316284], [2.049999952316284, 2.049999952316284]],
+        "expected_bias_grad": [1.0, 1.0],
+    },
 ]
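
As a side note, the new `expected_grad` / `expected_bias_grad` values follow directly from the test model: for a `Linear(2, 2)` layer with `loss = out.sum()`, every row of the weight gradient equals the input and the bias gradient is all ones, so averaging the two micro-batches of the first case gives rows of `[0.5, 0.5]`. The snippet below is an illustration, not part of the patch.

    # Illustration only: reproduce the first case's expected_grad with
    # scaled-loss accumulation over the two micro-batches [1, 0] and [0, 1].
    import torch
    from torch.nn import Linear

    model = Linear(2, 2, bias=True)
    for x in ([1.0, 0.0], [0.0, 1.0]):
        loss = model(torch.tensor(x)).sum() / 2  # divide by the number of micro-batches
        loss.backward()
    print(model.weight.grad)  # tensor([[0.5, 0.5], [0.5, 0.5]])
    print(model.bias.grad)    # tensor([1., 1.])
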
@@ -130,7 +130,12 @@ class AdaScale(Optimizer):
             between each optimizer step. This can be changed during
             training as long as the train loop changes gradient accumulation
             accordingly.
+            The loss in each pass can be either scaled or unscaled. See `is_scaled_loss` below.
             Default to 1, which does not accumulate gradients.
+        is_scaled_loss (bool):
+            If True, assume that the loss is scaled by `num_gradients_to_accumulate`.
+            If False, the loss is not scaled.
+            Default: True.
         debias_ewma (bool):
             (experimental) Use debias exponential moving average
             for smoothing and mu and sigma variables. False will
@@ -145,6 +150,7 @@ class AdaScale(Optimizer):
         scale: Optional[float] = None,
         smoothing: float = None,
         num_gradients_to_accumulate: int = 1,
+        is_scaled_loss: bool = True,
         debias_ewma: bool = True,
     ):
         # Init hook_handles list, otherwise, a partial init'ed object may fail in ``__del__``.
@@ -160,6 +166,7 @@ class AdaScale(Optimizer):
         self._last_final_backward_call = 0
         self._num_grads_to_accum = num_gradients_to_accumulate
         self._debias_ewma = debias_ewma
+        self._is_scaled_loss = is_scaled_loss
         # Proxy the param_groups so that `torch.optim.lr_scheduler` can work.
         self.param_groups = self._optimizer.param_groups
@@ -453,17 +460,22 @@ class AdaScale(Optimizer):
         total_grad_sqr = np.array(
             [sum(param.grad.pow(2).sum().item() for param in group["params"]) for group in self._optimizer.param_groups]
         )
-        # Divide by (_num_grads_to_accum ** 2) to account for gradient
-        # accumulation.
-        if self._num_grads_to_accum > 1:
-            # np array doesn't support /=.
-            total_grad_sqr = total_grad_sqr / (self._num_grads_to_accum**2)

         # Wait for all_reduce to be done and move it to cpu & np.
         if work:
             work.wait()
         local_grad_sqr = self._local_grad_sqr.cpu().numpy()

+        if self._num_grads_to_accum > 1:
+            # Handle scaling for gradient accumulation.
+            if self._is_scaled_loss:
+                # If the loss is scaled down, we need to scale the local gradients back by a factor of
+                # _num_grads_to_accum squared; total_grad_sqr is already scaled by _num_grads_to_accum squared.
+                local_grad_sqr *= self._num_grads_to_accum**2
+            else:
+                # If the loss is not scaled, the local gradients are correct, but we need to scale
+                # total_grad_sqr down to account for gradient accumulation.
+                total_grad_sqr /= self._num_grads_to_accum**2
+
         # See appendix B.3 of the paper.
         # Modified to handle cases where scale != world_size
         #
@@ -509,7 +521,7 @@ class AdaScale(Optimizer):
         original_lr = []
         for idx, param_group in enumerate(self._optimizer.param_groups):
             original_lr.append(param_group["lr"])
-            param_group["lr"] = self.gain(pg_idx=idx) * param_group["lr"]
+            param_group["lr"] *= self.gain(pg_idx=idx)

         # Step it.
         res = self._optimizer.step(*args, **kwargs)
@@ -606,6 +618,18 @@ class AdaScale(Optimizer):
         # not needed, so the smoothing factor is 0.
         self._smoothing = max(1 - self._world_size * self._num_grads_to_accum / 1000, 0)

+    def scale_grad_by_num_grads_to_accum(self) -> None:
+        """Scale the gradient down by the number of gradients to accumulate.
+
+        This should be called after the gradient accumulation is done and the unscaled loss is used.
+        """
+        assert self._local_grad_sqr is None, "Only call this after backward"
+        assert self._num_grads_to_accum > 1, "Must be accumulating gradients"
+        assert not self._is_scaled_loss, "Must use unscaled loss"
+        for group in self._optimizer.param_groups:
+            for param in group["params"]:
+                param.grad.div_(self._num_grads_to_accum)
+
     def __getattr__(self, name: str) -> Any:
         """Forward missing attributes to wrapped optimizer."""
         try:
...
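
The rescaling above can be sanity-checked with plain numbers. The snippet below is an illustration, not part of the patch: with `N = num_gradients_to_accumulate`, a scaled loss makes each backward contribute `g_i / N`, so the locally accumulated sum of squared micro-batch gradient norms is `N**2` too small; an unscaled loss leaves `.grad` holding `sum_i g_i`, so its squared norm is `N**2` too large relative to the averaged gradient.

    # Numeric illustration only of the two branches above.
    import numpy as np

    N = 4  # num_gradients_to_accumulate
    g = [np.array([1.0, 2.0]) * (i + 1) for i in range(N)]  # per-micro-batch gradients

    # Scaled loss: each backward sees g_i / N, so the accumulated per-micro-batch
    # squared norms are N**2 too small and get multiplied back up.
    local_scaled = sum(np.sum((gi / N) ** 2) for gi in g)
    assert np.isclose(local_scaled * N**2, sum(np.sum(gi**2) for gi in g))

    # Unscaled loss: .grad accumulates sum_i g_i, so its squared norm is N**2 too
    # large relative to the averaged gradient and gets divided down.
    total_unscaled = np.sum(sum(g) ** 2)
    assert np.isclose(total_unscaled / N**2, np.sum((sum(g) / N) ** 2))
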
@@ -75,18 +75,25 @@ def _test_basic_func(rank, ddp_cls, world_size, tempfile_name, test_case):
         out.sum().backward()
         if ddp_cls is DDP:
             assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()
+            w, b = model.parameters()
+            assert np.allclose(w.grad.cpu(), test_case["expected_grad"]), w.grad
+            assert np.allclose(b.grad.cpu(), test_case["expected_bias_grad"]), b.grad
         optim.step()
         optim.zero_grad()
     else:
         # multiple iters
-        for in_data in test_case["inputs"]:
+        n = len(test_case["inputs"])
+        for i, in_data in enumerate(test_case["inputs"]):
             in_data = Tensor(in_data[rank]).cuda()
             out = model(in_data)
             out.sum().backward()
+            if i == n - 1 and ddp_cls is DDP:
+                assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()
+                w, b = model.parameters()
+                assert np.allclose(w.grad.cpu(), test_case["expected_grad"]), w.grad
+                assert np.allclose(b.grad.cpu(), test_case["expected_bias_grad"]), b.grad
             optim.step()
             optim.zero_grad()
-        if ddp_cls is DDP:
-            assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()

     dist.destroy_process_group()
...
@@ -61,7 +61,8 @@ def test_loss_accum_cpu():
 @pytest.mark.parametrize("cpu", [True, False])
 @pytest.mark.parametrize("test_case", adascale_test_data)
-def test_grad_accum(test_case, cpu):
+@pytest.mark.parametrize("is_scaled_loss", [True, False])
+def test_grad_accum(test_case, cpu, is_scaled_loss):
     """Test the basic functionality on CPU/GPU with gradient accumulation without DDP"""
     make_cudnn_deterministic()
     model = Linear(2, 2, bias=True)
@@ -69,7 +70,7 @@ def test_grad_accum(test_case, cpu):
         if torch.cuda.device_count() < 1:
             pytest.skip("1 GPU is required")
         model = model.cuda()
-    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)
+    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2, is_scaled_loss=is_scaled_loss)
     expected_gain = test_case["expected_gain"]
     if "input" in test_case:
         data = [test_case["input"]] * 2
@@ -82,16 +83,27 @@ def test_grad_accum(test_case, cpu):
         in_data_0 = Tensor(in_data[0])
         if not cpu:
             in_data_0 = in_data_0.cuda()
-        out = model(in_data_0)
-        out.sum().backward()
+        loss = model(in_data_0).sum()
+        if is_scaled_loss:
+            loss = loss / 2
+        loss.backward()
         # grad pass 2
         in_data_1 = Tensor(in_data[1])
         if not cpu:
             in_data_1 = in_data_1.cuda()
-        out = model(in_data_1)
-        out.sum().backward()
+        loss = model(in_data_1).sum()
+        if is_scaled_loss:
+            loss = loss / 2
+        loss.backward()
+        if not is_scaled_loss:
+            optim.scale_grad_by_num_grads_to_accum()
         if exp_gain is not None:
             assert np.allclose(optim.gain(), exp_gain), optim.gain()
+        w, b = model.parameters()
+        assert np.allclose(w.grad.cpu(), test_case["expected_grad"]), w.grad
+        assert np.allclose(b.grad.cpu(), test_case["expected_bias_grad"]), b.grad
         # stepping it. Note that if we did more than 2 passes as promised by the
         # num_gradients_to_accumulate argument above, AdaScale is not be able to
         # detect that mistake for now. The result will just be wrong in that case.
@@ -110,14 +122,14 @@ def test_state_checkpointing():
     a unit test for checkpointing with DDP.
     """
     # Constants.
-    accum_steps = 3
+    num_grads_to_accum = 3
     in_dim = 5

     # Setup.
     def make_model_and_optim():
         model = Linear(in_dim, 2, bias=False)
         model = model.cuda()
-        optim = AdaScale(SGD(model.parameters(), lr=0.1, momentum=0.9), num_gradients_to_accumulate=accum_steps)
+        optim = AdaScale(SGD(model.parameters(), lr=0.1, momentum=0.9), num_gradients_to_accumulate=num_grads_to_accum)
         return model, optim

     model, optim = make_model_and_optim()
@@ -127,7 +139,7 @@ def test_state_checkpointing():
         data = []
         replay_data_idx = 0
         for _ in range(6):  # run some steps
-            for i in range(accum_steps):
+            for i in range(num_grads_to_accum):
                 if replay_data is None:
                     in_data = torch.rand(in_dim).cuda()
                     data.append(in_data)
@@ -136,7 +148,7 @@ def test_state_checkpointing():
                 replay_data_idx += 1
                 out = model(in_data)
                 out.sum().backward()
-                if i == accum_steps - 1:
+                if i == num_grads_to_accum - 1:
                     optim.step()
                     optim.zero_grad()
         return out, data
@@ -172,13 +184,14 @@ def test_state_checkpointing():
 def test_lr_scheduler():
     """Test AdaScale working with torch.optim.lr_scheduler."""
+    num_grads_to_accum = 3
     model = Linear(2, 2, bias=False)
-    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=3)
+    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=num_grads_to_accum)
     # We use 1, not 0.1 here since scheduler.step() is called here first.
     scheduler = LambdaLR(optim, lr_lambda=lambda epoch: 1 / 10**epoch)
     for epoch in range(3):
         for data_idx in range(10):
-            for accumulation in range(3):
+            for accumulation in range(num_grads_to_accum):
                 in_data = torch.rand(2)
                 loss = model(in_data).sum()
                 loss.backward()
@@ -194,8 +207,10 @@ def test_lr_scheduler():
 @skip_if_no_cuda
 @pytest.mark.parametrize("debias_ewma", [True, False])
-def test_add_param_group(debias_ewma):
-    """Test AdaScale supports add_param_group() API."""
+@pytest.mark.parametrize("is_scaled_loss", [True, False])
+def test_add_param_group(debias_ewma, is_scaled_loss):
+    """Test AdaScale supports add_param_group() API for both scaled and unscaled loss."""
+    num_grads_to_accum = 2
     model1 = Linear(2, 2, bias=True)
     with torch.no_grad():
         # make weights and bias deterministic, which is needed for
@@ -203,7 +218,12 @@ def test_add_param_group(debias_ewma):
         # parameters from other layers.
         model1.weight.copy_(Tensor([1.0, 2.0, 3.0, 4.0]).reshape(2, 2))
         model1.bias.fill_(0.1)
-    optim = AdaScale(SGD(model1.parameters(), lr=0.1), num_gradients_to_accumulate=2, debias_ewma=debias_ewma)
+    optim = AdaScale(
+        SGD(model1.parameters(), lr=0.1),
+        num_gradients_to_accumulate=2,
+        is_scaled_loss=is_scaled_loss,
+        debias_ewma=debias_ewma,
+    )
     assert len(optim._hook_handles) == 2, len(optim._hook_handles)

     model2 = Linear(2, 3, bias=True)
@@ -217,12 +237,19 @@ def test_add_param_group(debias_ewma):
     # make sure we can run the model.
     model = Sequential(model1, model2).cuda()
     in_data_0 = Tensor([1.0, 2.0]).cuda()
-    out = model(in_data_0)
-    out.sum().backward()
+    loss = model(in_data_0).sum()
+    if is_scaled_loss:
+        loss = loss / num_grads_to_accum
+    loss.backward()
     in_data_1 = Tensor([3.0, 4.0]).cuda()
-    out = model(in_data_1)
-    out.sum().backward()
+    loss = model(in_data_1).sum()
+    if is_scaled_loss:
+        loss = loss / num_grads_to_accum
+    loss.backward()
+    if not is_scaled_loss:
+        optim.scale_grad_by_num_grads_to_accum()

     # make sure the gains are right and we can step.
     # since this is the first step, debias_ewma doesn't affect the value.
@@ -244,19 +271,26 @@ def test_add_param_group(debias_ewma):
     # make sure we can run the model.
     model = Sequential(model1, model2, model3).cuda()
     in_data_0 = Tensor([1.0, 2.0]).cuda()
-    out = model(in_data_0)
-    out.sum().backward()
+    loss = model(in_data_0).sum()
+    if is_scaled_loss:
+        loss = loss / num_grads_to_accum
+    loss.backward()
    in_data_1 = Tensor([3.0, 4.0]).cuda()
-    out = model(in_data_1)
-    out.sum().backward()
+    loss = model(in_data_1).sum()
+    if is_scaled_loss:
+        loss = loss / num_grads_to_accum
+    loss.backward()
+    if not is_scaled_loss:
+        optim.scale_grad_by_num_grads_to_accum()

     # make sure gains are right and we can step.
     # the last PG's gain is not affected by debias_ewma since it is the first step for that PG.
-    assert np.allclose(optim.gain(), 1.1191193589460822 if debias_ewma else 1.1192783954732368), optim.gain()
-    assert np.allclose(optim.gain(0), 1.1428571880897151 if debias_ewma else 1.142857188085096), optim.gain(0)
-    assert np.allclose(optim.gain(1), 1.1167103578364508 if debias_ewma else 1.1167104954034948), optim.gain(1)
-    assert np.allclose(optim.gain(2), 1.117381091722702), optim.gain(2)
+    assert np.allclose(optim.gain(), 1.1382937715383077 if debias_ewma else 1.1391959826562015), optim.gain()
+    assert np.allclose(optim.gain(0), 1.142857206008338 if debias_ewma else 1.142857206006931), optim.gain(0)
+    assert np.allclose(optim.gain(1), 1.1116875516387468 if debias_ewma else 1.1116906378271827), optim.gain(1)
+    assert np.allclose(optim.gain(2), 1.0749164095196344), optim.gain(2)
     optim.step()
     optim.zero_grad()
@@ -264,31 +298,45 @@ def test_add_param_group(debias_ewma):
 @pytest.mark.parametrize(
     "test_case",
     [
-        {"new_accum": 3, "exp_gain": 1.2573902104603087},
-        {"new_accum": 6, "exp_gain": 1.0903738977361481},
-        {"new_accum": 9, "exp_gain": 1.0432658660558123},
+        {"num_grads_to_accum": 3, "exp_gain": 2.141385737279438},
+        {"num_grads_to_accum": 6, "exp_gain": 2.9927880097754036},
+        {"num_grads_to_accum": 9, "exp_gain": 3.4461759591877312},
     ],
 )
-def test_set_num_gradients_to_accumulate(test_case):
+@pytest.mark.parametrize("is_scaled_loss", [True, False])
+def test_set_num_gradients_to_accumulate(test_case, is_scaled_loss):
     """Test set_num_gradients_to_accumulate experimental feature."""
-    new_accum = test_case["new_accum"]
+    num_grads_to_accum = test_case["num_grads_to_accum"]
     exp_gain = test_case["exp_gain"]

     model = Linear(2, 2, bias=False)
-    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)
+    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2, is_scaled_loss=is_scaled_loss)

-    out = model(Tensor([0.0, 1.0]))
-    out.sum().backward()
-    out = model(Tensor([1.0, 0.0]))
-    out.sum().backward()
+    loss = model(Tensor([0.0, 1.0])).sum()
+    if is_scaled_loss:
+        loss = loss / 2
+    loss.backward()
+    loss = model(Tensor([1.0, 0.0])).sum()
+    if is_scaled_loss:
+        loss = loss / 2
+    loss.backward()
+    if not is_scaled_loss:
+        optim.scale_grad_by_num_grads_to_accum()
     assert np.allclose(optim.gain(), 2.0)
     optim.step()
     optim.zero_grad()

-    optim.set_scale(float(new_accum))
-    optim.set_num_gradients_to_accumulate(new_accum)
-    for _ in range(new_accum):
-        out = model(Tensor([0.0, 1.0]))
-        out.sum().backward()
+    optim.set_scale(float(num_grads_to_accum))
+    optim.set_num_gradients_to_accumulate(num_grads_to_accum)
+    for _ in range(num_grads_to_accum):
+        loss = model(Tensor([0.0, 1.0])).sum()
+        if is_scaled_loss:
+            loss = loss / num_grads_to_accum
+        loss.backward()
+    if not is_scaled_loss:
+        optim.scale_grad_by_num_grads_to_accum()
     assert np.allclose(optim.gain(), exp_gain), optim.gain()
     optim.step()
...