chenpangpang / transformers · Commit e1bb7904

Commit e1bb7904, authored Nov 01, 2018 by thomwolf
parent d3a8df6b

    optimization_pytorch from OpenAI with adapted HP - check if there is weights decay on bias

Showing 1 changed file with 143 additions and 0 deletions.

optimization_pytorch.py  (new file, mode 100644, +143 / -0)
import math
import torch
from torch.optim import Optimizer
from torch.nn.utils import clip_grad_norm_


def warmup_cosine(x, warmup=0.002):
    s = 1 if x <= warmup else 0
    return s*(x/warmup) + (1-s)*(0.5 * (1 + math.cos(math.pi * x)))

def warmup_constant(x, warmup=0.002):
    s = 1 if x <= warmup else 0
    return s*(x/warmup) + (1-s)*1

def warmup_linear(x, warmup=0.002):
    s = 1 if x <= warmup else 0
    return (s*(x/warmup) + (1-s))*(1-x)

SCHEDULES = {
    'warmup_cosine':   warmup_cosine,
    'warmup_constant': warmup_constant,
    'warmup_linear':   warmup_linear,
}
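# Illustrative note (not in the original file): each schedule maps
# x = state['step'] / t_total (the fraction of training completed) to a
# learning-rate multiplier, with `warmup` the fraction of training spent
# ramping up from 0 to 1. For the default warmup=0.002:
#   warmup_linear(0.001)  ~= 0.5   (halfway through warm-up)
#   warmup_linear(0.5)     = 0.5   (after warm-up: linear decay, 1 - x)
#   warmup_cosine(0.5)     = 0.5   (after warm-up: 0.5 * (1 + cos(pi * x)))
#   warmup_constant(0.5)   = 1.0   (after warm-up: flat)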
class OpenAIAdam(Optimizer):
    """Implements Open AI version of Adam algorithm with weight decay fix.
    """
    def __init__(self, params, lr, schedule, warmup, t_total,
                 b1=0.9, b2=0.999, e=1e-6, l2=0,
                 vector_l2=False, max_grad_norm=-1, **kwargs):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if schedule not in SCHEDULES:
            raise ValueError("Invalid schedule parameter: {}".format(schedule))
        if not 0 <= warmup:
            raise ValueError("Invalid warmup: {}".format(warmup))
        if not 0.0 <= b1 < 1.0:
            raise ValueError("Invalid b1 parameter: {}".format(b1))
        if not 0.0 <= b2 < 1.0:
            raise ValueError("Invalid b2 parameter: {}".format(b2))
        if not 0.0 <= e:
            raise ValueError("Invalid epsilon value: {}".format(e))
        defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
                        b1=b1, b2=b2, e=e, l2=l2, vector_l2=vector_l2,
                        max_grad_norm=max_grad_norm)
        super(OpenAIAdam, self).__init__(params, defaults)
    def get_lr(self):
        lr = []
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                if len(state) == 0:
                    return [0]
                schedule_fct = SCHEDULES[group['schedule']]
                lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
                lr.append(lr_scheduled)
        return lr
    def to(self, device):
        """ Move the optimizer state to a specified device"""
        for state in self.state.values():
            # Tensor.to() is not in-place, so the moved tensors are stored back
            state['exp_avg'] = state['exp_avg'].to(device)
            state['exp_avg_sq'] = state['exp_avg_sq'].to(device)
    def initialize_step(self, initial_step):
        """Initialize state with a defined step (but without the stored averages).
        Arguments:
            initial_step (int): Initial step number.
        """
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                # State initialization
                state['step'] = initial_step
                # Exponential moving average of gradient values
                state['exp_avg'] = torch.zeros_like(p.data)
                # Exponential moving average of squared gradient values
                state['exp_avg_sq'] = torch.zeros_like(p.data)
    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['b1'], group['b2']

                state['step'] += 1

                # Add grad clipping
                if group['max_grad_norm'] > 0:
                    clip_grad_norm_(p, group['max_grad_norm'])

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(1 - beta1, grad)
                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
                denom = exp_avg_sq.sqrt().add_(group['e'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                schedule_fct = SCHEDULES[group['schedule']]
                lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])

                step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1

                p.data.addcdiv_(-step_size, exp_avg, denom)

                # Just adding the square of the weights to the loss function is *not*
                # the correct way of using L2 regularization/weight decay with Adam,
                # since that will interact with the m and v parameters in strange ways.
                #
                # Instead we want to decay the weights in a manner that doesn't interact
                # with the m/v parameters. This is equivalent to adding the square
                # of the weights to the loss with plain (non-momentum) SGD.
                if (len(p.size()) > 1 or group['vector_l2']) and group['l2'] > 0:
                    p.data.add_(-lr_scheduled * group['l2'], p.data)

        return loss
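A minimal usage sketch, not part of the commit: the toy model, batch counts, and hyperparameter values below are illustrative assumptions, chosen only to show how t_total, the warm-up fraction, and the l2 / vector_l2 flags fit together. Note that with vector_l2=False (the default), weight decay is applied only to parameters with more than one dimension, so biases are left undecayed, which is the question raised in the commit message.

import torch
import torch.nn as nn

# Toy model and training sizes (assumptions for illustration only)
model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 4))
num_epochs, batches_per_epoch = 3, 100
num_train_steps = num_epochs * batches_per_epoch

optimizer = OpenAIAdam(model.parameters(),
                       lr=6.25e-5,               # assumed base learning rate
                       schedule='warmup_linear',
                       warmup=0.002,             # fraction of steps spent warming up
                       t_total=num_train_steps,  # so step / t_total is progress in [0, 1]
                       l2=0.01,                  # decoupled weight decay strength
                       vector_l2=False,          # 1-D params (e.g. biases) are not decayed
                       max_grad_norm=1)

# One illustrative training step on random data
inputs, targets = torch.randn(8, 16), torch.randint(0, 4, (8,))
loss = nn.functional.cross_entropy(model(inputs), targets)
loss.backward()
optimizer.step()       # Adam update with scheduled lr, then decoupled weight decay
optimizer.zero_grad()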