Project: chenpangpang/transformers

Commit 088ad458, authored Nov 03, 2018 by thomwolf
fixing optimization
Parent: 852e4b3c

Showing 4 changed files with 85 additions and 49 deletions:
    optimization_pytorch.py                 +60  -39
    optimization_test_pytorch.py             +8   -3
    run_classifier_pytorch.py                +8   -4
    tensorflow_code/optimization_test.py     +9   -3
optimization_pytorch.py

@@ -4,16 +4,19 @@ from torch.optim import Optimizer
 from torch.nn.utils import clip_grad_norm_

 def warmup_cosine(x, warmup=0.002):
-    s = 1 if x <= warmup else 0
-    return s*(x/warmup) + (1-s)*(0.5 * (1 + torch.cos(math.pi * x)))
+    if x < warmup:
+        return x/warmup
+    return 0.5 * (1.0 + torch.cos(math.pi * x))

 def warmup_constant(x, warmup=0.002):
-    s = 1 if x <= warmup else 0
-    return s*(x/warmup) + (1-s)*1
+    if x < warmup:
+        return x/warmup
+    return 1.0

 def warmup_linear(x, warmup=0.002):
-    s = 1 if x <= warmup else 0
-    return (s*(x/warmup) + (1-s))*(1-x)
+    if x < warmup:
+        return x/warmup
+    return 1.0 - x

 SCHEDULES = {
     'warmup_cosine': warmup_cosine,
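Each schedule maps training progress x = step / t_total to a multiplier on the base learning rate: a linear ramp over the warmup fraction, then the named decay shape. A minimal sketch of the new warmup_linear behaviour, using a standalone copy that runs on plain floats (the step values below are illustrative, not from the commit):

def warmup_linear(x, warmup=0.002):
    # Linear ramp to the full LR during warmup, then linear decay to 0 at x = 1.
    if x < warmup:
        return x / warmup
    return 1.0 - x

t_total, warmup = 100, 0.1
for step in (1, 5, 10, 50, 100):
    print(step, warmup_linear(step / t_total, warmup))
# 1   -> 0.10  (ramping up)
# 5   -> 0.50
# 10  -> 0.90  (warmup just finished, decay has started)
# 50  -> 0.50
# 100 -> 0.00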
@@ -24,24 +27,34 @@ SCHEDULES = {

 class BERTAdam(Optimizer):
     """Implements Open AI version of Adam algorithm with weight decay fix.
+    Params:
+        lr,
+        warmup=-1,
+        t_total=-1,
+        schedule='warmup_linear',
+        b1=0.9,
+        b2=0.999,
+        e=1e-6,
+        weight_decay_rate=0.01,
+        max_grad_norm=1.0
     """
-    def __init__(self, params, lr, schedule, warmup, t_total,
-                 b1=0.9, b2=0.999, e=1e-6, l2=0, vector_l2=False,
-                 max_grad_norm=-1, **kwargs):
-        if not 0.0 <= lr:
-            raise ValueError("Invalid learning rate: {}".format(lr))
+    def __init__(self, params, lr, warmup=-1, t_total=-1, schedule='warmup_linear',
+                 b1=0.9, b2=0.999, e=1e-6, weight_decay_rate=0.01,
+                 max_grad_norm=1.0):
+        if not lr >= 0.0:
+            raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
         if schedule not in SCHEDULES:
             raise ValueError("Invalid schedule parameter: {}".format(schedule))
-        if not 0 <= warmup:
-            raise ValueError("Invalid warmup: {}".format(warmup))
+        if not 0.0 <= warmup < 1.0 and not warmup == -1:
+            raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
         if not 0.0 <= b1 < 1.0:
-            raise ValueError("Invalid b1 parameter: {}".format(b1))
+            raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
         if not 0.0 <= b2 < 1.0:
-            raise ValueError("Invalid b2 parameter: {}".format(b2))
-        if not 0.0 <= e:
-            raise ValueError("Invalid epsilon value: {}".format(e))
+            raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
+        if not e >= 0.0:
+            raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
         defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
-                        b1=b1, b2=b2, e=e, l2=l2, vector_l2=vector_l2,
+                        b1=b1, b2=b2, e=e, weight_decay_rate=weight_decay_rate,
                         max_grad_norm=max_grad_norm)
         super(BERTAdam, self).__init__(params, defaults)
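With the reworked signature, warmup and t_total default to -1 (no warmup, constant learning rate) and schedule defaults to 'warmup_linear', while the OpenAI-style l2 / vector_l2 knobs are folded into a single weight_decay_rate. A short usage sketch, assuming BERTAdam is importable from optimization_pytorch.py; the toy Linear model is purely illustrative:

import torch
from optimization_pytorch import BERTAdam  # import path assumed from this repo layout

model = torch.nn.Linear(10, 2)  # hypothetical stand-in for the real model

# All new defaults: constant LR, weight_decay_rate=0.01, gradient clipping at norm 1.0.
optimizer = BERTAdam(model.parameters(), lr=5e-5)

# Turning the schedule on requires both warmup (a fraction of training) and t_total (steps).
optimizer = BERTAdam(model.parameters(), lr=5e-5,
                     warmup=0.1, t_total=1000, schedule='warmup_linear')

# Out-of-range values now fail with the more explicit messages added above, e.g.
# BERTAdam(model.parameters(), lr=5e-5, warmup=1.5) raises
# ValueError: Invalid warmup: 1.5 - should be in [0.0, 1.0[ or -1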
@@ -52,8 +65,11 @@ class BERTAdam(Optimizer):
                 state = self.state[p]
+                if len(state) == 0:
+                    return [0]
+
                 if group['t_total'] != -1:
                     schedule_fct = SCHEDULES[group['schedule']]
                     lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
                 else:
                     lr_scheduled = group['lr']
                 lr.append(lr_scheduled)
         return lr
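get_lr() now returns [0] before the first update instead of reading from an empty state dict, and afterwards reports one scheduled learning rate per parameter. A sketch of using it to watch the schedule during a toy fit (import path assumed, tensor values mirror the unit test, the loop itself is illustrative):

import torch
from optimization_pytorch import BERTAdam  # import path assumed

w = torch.zeros(3, requires_grad=True)
target = torch.tensor([0.4, 0.2, -0.5])
optimizer = BERTAdam([w], lr=0.1, warmup=0.1, t_total=50)

print(optimizer.get_lr())  # -> [0], no step taken yet

for step in range(50):
    loss = (w - target).pow(2).mean()
    loss.backward()
    optimizer.step()
    w.grad.detach_()  # plain tensors have no zero_grad(); reset manually like the test does
    w.grad.zero_()
    if step % 10 == 0:
        print(step, optimizer.get_lr()[0])  # rises during warmup, then decays linearly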
@@ -103,32 +119,22 @@ class BERTAdam(Optimizer):
                 if len(state) == 0:
                     state['step'] = 0
                     # Exponential moving average of gradient values
-                    state['exp_avg'] = torch.zeros_like(p.data)
+                    state['next_m'] = torch.zeros_like(p.data)
                     # Exponential moving average of squared gradient values
-                    state['exp_avg_sq'] = torch.zeros_like(p.data)
+                    state['next_v'] = torch.zeros_like(p.data)

-                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
+                next_m, next_v = state['next_m'], state['next_v']
                 beta1, beta2 = group['b1'], group['b2']

-                state['step'] += 1
-
                 # Add grad clipping
                 if group['max_grad_norm'] > 0:
                     clip_grad_norm_(p, group['max_grad_norm'])

                 # Decay the first and second moment running average coefficient
-                exp_avg.mul_(beta1).add_(1 - beta1, grad)
-                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
-                denom = exp_avg_sq.sqrt().add_(group['e'])
-
-                bias_correction1 = 1 - beta1 ** state['step']
-                bias_correction2 = 1 - beta2 ** state['step']
-
-                schedule_fct = SCHEDULES[group['schedule']]
-                lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
-                step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
-
-                p.data.addcdiv_(-step_size, exp_avg, denom)
+                # In-place operations to update the averages at the same time
+                next_m.mul_(beta1).add_(1 - beta1, grad)
+                next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad)
+                update = next_m / (next_v.sqrt() + group['e'])

                 # Just adding the square of the weights to the loss function is *not*
                 # the correct way of using L2 regularization/weight decay with Adam,
@@ -137,7 +143,22 @@ class BERTAdam(Optimizer):
                 # Instead we want to decay the weights in a manner that doesn't interact
                 # with the m/v parameters. This is equivalent to adding the square
                 # of the weights to the loss with plain (non-momentum) SGD.
-                if (len(p.size()) > 1 or group['vector_l2']) and group['l2'] > 0:
-                    p.data.add_(-lr_scheduled * group['l2'], p.data)
+                if group['weight_decay_rate'] > 0.0:
+                    update += group['weight_decay_rate'] * p.data
+
+                if group['t_total'] != -1:
+                    schedule_fct = SCHEDULES[group['schedule']]
+                    lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
+                else:
+                    lr_scheduled = group['lr']
+
+                update_with_lr = lr_scheduled * update
+                p.data.add_(-update_with_lr)
+
+                state['step'] += 1
+
+                # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
+                # bias_correction1 = 1 - beta1 ** state['step']
+                # bias_correction2 = 1 - beta2 ** state['step']

         return loss
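Put together, the rewritten step() forms update = next_m / (next_v.sqrt() + e) + weight_decay_rate * p, scales it by the scheduled learning rate, and deliberately skips Adam's bias correction (kept only as a comment). A standalone sketch of one such update on a single tensor, with hypothetical names, and with the modern alpha/value keywords in place of the older positional add_/addcmul_ form the file uses:

import torch

def bert_adam_single_step(p, grad, next_m, next_v, step, lr=1e-3, b1=0.9, b2=0.999,
                          e=1e-6, weight_decay_rate=0.01, warmup=0.1, t_total=1000):
    # Moment estimates, updated in place like next_m / next_v above.
    next_m.mul_(b1).add_(grad, alpha=1 - b1)
    next_v.mul_(b2).addcmul_(grad, grad, value=1 - b2)
    update = next_m / (next_v.sqrt() + e)

    # Decoupled weight decay: added to the update, not folded into the gradient/loss.
    if weight_decay_rate > 0.0:
        update += weight_decay_rate * p

    # warmup_linear schedule on the fraction of training completed; no bias correction.
    x = step / t_total
    lr_scheduled = lr * (x / warmup if x < warmup else 1.0 - x)

    p.add_(-lr_scheduled * update)
    return p

p = torch.zeros(3)
m, v = torch.zeros(3), torch.zeros(3)
p = bert_adam_single_step(p, torch.tensor([0.1, -0.2, 0.3]), m, v, step=1)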
optimization_test_pytorch.py

@@ -31,13 +31,18 @@ class OptimizationTest(unittest.TestCase):
     def test_adam(self):
         w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True)
-        x = torch.tensor([0.4, 0.2, -0.5])
+        target = torch.tensor([0.4, 0.2, -0.5])
         criterion = torch.nn.MSELoss(reduction='elementwise_mean')
-        optimizer = optimization.BERTAdam(params={w}, lr=0.2, schedule='warmup_linear', warmup=0.1, t_total=100)
+        # No warmup, constant schedule, no gradient clipping
+        optimizer = optimization.BERTAdam(params=[w], lr=2e-1,
+                                          weight_decay_rate=0.0, max_grad_norm=-1)
         for _ in range(100):
-            loss = criterion(w, x)
+            loss = criterion(w, target)
             loss.backward()
             optimizer.step()
+            w.grad.detach_()
+            # No zero_grad() function on simple tensors. we do it ourselves.
+            w.grad.zero_()
         self.assertListAlmostEqual(w.tolist(), [0.4, 0.2, -0.5], tol=1e-2)
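The updated test covers the default path only: constant learning rate, no weight decay, no gradient clipping. A possible companion test that switches the warmup_linear schedule and clipping back on might look like the sketch below; the test name and looser tolerance are illustrative, not part of this commit:

    def test_adam_with_schedule(self):
        w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True)
        target = torch.tensor([0.4, 0.2, -0.5])
        criterion = torch.nn.MSELoss(reduction='elementwise_mean')
        optimizer = optimization.BERTAdam(params=[w], lr=2e-1, weight_decay_rate=0.0,
                                          schedule='warmup_linear', warmup=0.1,
                                          t_total=100, max_grad_norm=1.0)
        for _ in range(100):
            loss = criterion(w, target)
            loss.backward()
            optimizer.step()
            w.grad.detach_()
            w.grad.zero_()
        self.assertListAlmostEqual(w.tolist(), [0.4, 0.2, -0.5], tol=1e-1)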
run_classifier_pytorch.py

@@ -483,10 +483,14 @@ def main():
     model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
     model.to(device)

-    optimizer = BERTAdam([{'params': [p for n, p in model.named_parameters() if n != 'bias'], 'l2': 0.01},
-                          {'params': [p for n, p in model.named_parameters() if n == 'bias'], 'l2': 0.}
-                          ], lr=args.learning_rate, schedule='warmup_linear',
+    no_decay = ['bias', 'gamma', 'beta']
+    optimizer_parameters = [
+        {'params': [p for n, p in model.named_parameters() if n not in no_decay], 'weight_decay_rate': 0.01},
+        {'params': [p for n, p in model.named_parameters() if n in no_decay], 'weight_decay_rate': 0.0}
+        ]
+    optimizer = BERTAdam(optimizer_parameters,
+                         lr=args.learning_rate,
                          warmup=args.warmup_proportion,
                          t_total=num_train_steps)
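Since named_parameters() yields dotted, module-qualified names rather than bare names like 'bias', a substring match is the robust way to route biases and LayerNorm parameters into the no-decay group. The sketch below shows that grouping pattern end to end on a toy model; the model, the data, and the import path are assumptions, not part of the script:

import torch
from optimization_pytorch import BERTAdam  # import path assumed

# Toy stand-in for the classifier, just to show the grouping pattern.
model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.LayerNorm(8), torch.nn.Linear(8, 2))

no_decay = ['bias', 'gamma', 'beta']
optimizer_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
    ]
optimizer = BERTAdam(optimizer_parameters, lr=5e-5, warmup=0.1, t_total=100)

for step in range(100):
    logits = model(torch.randn(4, 8))
    loss = torch.nn.functional.cross_entropy(logits, torch.randint(0, 2, (4,)))
    loss.backward()
    optimizer.step()   # warmup_linear scheduling and per-group weight decay applied inside
    model.zero_grad()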
tensorflow_code/optimization_test.py

@@ -38,10 +38,16 @@ class OptimizationTest(tf.test.TestCase):
       init_op = tf.group(tf.global_variables_initializer(),
                          tf.local_variables_initializer())
       sess.run(init_op)
-      for _ in range(100):
+      np_w = sess.run(w)
+      np_loss = sess.run(loss)
+      np_grad = sess.run(grads)[0]
+      for i in range(100):
+        print(i)
         sess.run(train_op)
-      w_np = sess.run(w)
-      self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2)
+      np_w = sess.run(w)
+      np_loss = sess.run(loss)
+      np_grad = sess.run(grads)[0]
+      self.assertAllClose(np_w.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2)

 if __name__ == "__main__":