Commit 886cb497 authored by thomwolf
Browse files

updating readme and notebooks

parent fd647e8c
This diff is collapsed.
This diff is collapsed.
...@@ -42,7 +42,7 @@ SCHEDULES = { ...@@ -42,7 +42,7 @@ SCHEDULES = {
class BERTAdam(Optimizer): class BERTAdam(Optimizer):
"""Implements BERT version of Adam algorithm with weight decay fix (and no ). """Implements BERT version of Adam algorithm with weight decay fix.
Params: Params:
lr: learning rate lr: learning rate
warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1
...@@ -136,7 +136,7 @@ class BERTAdam(Optimizer): ...@@ -136,7 +136,7 @@ class BERTAdam(Optimizer):
# the correct way of using L2 regularization/weight decay with Adam, # the correct way of using L2 regularization/weight decay with Adam,
# since that will interact with the m and v parameters in strange ways. # since that will interact with the m and v parameters in strange ways.
# #
# Instead we want ot decay the weights in a manner that doesn't interact # Instead we want to decay the weights in a manner that doesn't interact
# with the m/v parameters. This is equivalent to adding the square # with the m/v parameters. This is equivalent to adding the square
# of the weights to the loss with plain (non-momentum) SGD. # of the weights to the loss with plain (non-momentum) SGD.
if group['weight_decay_rate'] > 0.0: if group['weight_decay_rate'] > 0.0:
...@@ -154,6 +154,7 @@ class BERTAdam(Optimizer): ...@@ -154,6 +154,7 @@ class BERTAdam(Optimizer):
state['step'] += 1 state['step'] += 1
# step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
# No bias correction
# bias_correction1 = 1 - beta1 ** state['step'] # bias_correction1 = 1 - beta1 ** state['step']
# bias_correction2 = 1 - beta2 ** state['step'] # bias_correction2 = 1 - beta2 ** state['step']
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment