Commit 886cb497 authored by thomwolf

updating readme and notebooks

parent fd647e8c
@@ -42,7 +42,7 @@ SCHEDULES = {
 class BERTAdam(Optimizer):
-    """Implements BERT version of Adam algorithm with weight decay fix (and no ).
+    """Implements BERT version of Adam algorithm with weight decay fix.
     Params:
         lr: learning rate
         warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1
@@ -136,7 +136,7 @@ class BERTAdam(Optimizer):
                 # the correct way of using L2 regularization/weight decay with Adam,
                 # since that will interact with the m and v parameters in strange ways.
                 #
-                # Instead we want ot decay the weights in a manner that doesn't interact
+                # Instead we want to decay the weights in a manner that doesn't interact
                 # with the m/v parameters. This is equivalent to adding the square
                 # of the weights to the loss with plain (non-momentum) SGD.
                 if group['weight_decay_rate'] > 0.0:
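The comment in this hunk describes decoupled weight decay: the L2 penalty is applied directly to the parameter update instead of being added to the gradient, where Adam's m/v moving averages would smooth it. A minimal sketch of that difference in plain PyTorch; the tensors and hyperparameter values below are illustrative stand-ins, not the optimizer's actual state:

import torch

# Illustrative parameter, gradient, moment buffers, and hyperparameters.
p = torch.randn(10)
grad = torch.randn(10)
exp_avg = torch.zeros(10)
exp_avg_sq = torch.zeros(10)
lr, beta1, beta2, eps, weight_decay = 1e-3, 0.9, 0.999, 1e-6, 0.01

# Classic L2 regularization folds the penalty into the gradient, so it
# passes through the m/v averages (the interaction the comment warns about):
#     grad = grad + weight_decay * p

# Decoupled decay: compute the Adam update first, then add the penalty
# to the update itself, bypassing the moment estimates.
exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)               # m
exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)  # v
update = exp_avg / (exp_avg_sq.sqrt() + eps)
update = update + weight_decay * p
p = p - lr * update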
@@ -154,6 +154,7 @@ class BERTAdam(Optimizer):
                 state['step'] += 1
                 # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
+                # No bias correction
                 # bias_correction1 = 1 - beta1 ** state['step']
                 # bias_correction2 = 1 - beta2 ** state['step']
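The line added in this hunk notes that BERTAdam omits Adam's bias correction; the commented-out lines show exactly what it would have computed. A short sketch comparing the two step sizes, assuming lr_scheduled stands for the warmup-scheduled learning rate this optimizer computes; the concrete values are made up for illustration:

import math

beta1, beta2, lr_scheduled = 0.9, 0.999, 1e-4
step = 100  # plays the role of state['step'] in the optimizer

# Standard Adam rescales the raw step to undo the zero-initialization
# bias of the moment estimates:
bias_correction1 = 1 - beta1 ** step
bias_correction2 = 1 - beta2 ** step
adam_step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1

# BERTAdam skips the correction and uses the scheduled rate directly:
bert_step_size = lr_scheduled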