"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "27d0e01d755dc14309617c06eb4d55c246183c98"
Unverified Commit 8672bcda authored by CeShine Lee's avatar CeShine Lee Committed by GitHub
Browse files

Adafactor: avoid updating group["lr"] attributes (#9751)

This affects Adafactor with relative_step=False and scale_parameter=True.
Updating group["lr"] makes the result of ._get_lr() depend on the previous call,
i.e., on the scale of other parameters. This isn't supposed to happen.
parent 115d97dd
......@@ -546,7 +546,7 @@ class Adafactor(Optimizer):
state["step"] += 1
state["RMS"] = self._rms(p_data_fp32)
group["lr"] = self._get_lr(group, state)
lr = self._get_lr(group, state)
beta2t = 1.0 - math.pow(state["step"], group["decay_rate"])
update = (grad ** 2) + group["eps"][0]
......@@ -567,7 +567,7 @@ class Adafactor(Optimizer):
update = exp_avg_sq.rsqrt().mul_(grad)
update.div_((self._rms(update) / group["clip_threshold"]).clamp_(min=1.0))
update.mul_(group["lr"])
update.mul_(lr)
if use_first_moment:
exp_avg = state["exp_avg"]
......@@ -575,7 +575,7 @@ class Adafactor(Optimizer):
update = exp_avg
if group["weight_decay"] != 0:
p_data_fp32.add_(-group["weight_decay"] * group["lr"], p_data_fp32)
p_data_fp32.add_(-group["weight_decay"] * lr, p_data_fp32)
p_data_fp32.add_(-update)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment