Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
c9885903
Commit
c9885903
authored
Jun 25, 2019
by
tonianelope
Browse files
update betas to follow pytorch convention
parent
98dc30b2
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
16 additions
and
18 deletions
+16
-18
README.md
README.md
+1
-2
pytorch_pretrained_bert/optimization.py
pytorch_pretrained_bert/optimization.py
+8
-9
pytorch_pretrained_bert/optimization_openai.py
pytorch_pretrained_bert/optimization_openai.py
+7
-7
No files found.
README.md
View file @
c9885903
...
@@ -1094,8 +1094,7 @@ The optimizer accepts the following arguments:
...
@@ -1094,8 +1094,7 @@ The optimizer accepts the following arguments:
Can be
`'warmup_linear'`
,
`'warmup_constant'`
,
`'warmup_cosine'`
,
`'none'`
,
`None`
or a
`_LRSchedule`
object (see below).
Can be
`'warmup_linear'`
,
`'warmup_constant'`
,
`'warmup_cosine'`
,
`'none'`
,
`None`
or a
`_LRSchedule`
object (see below).
If
`None`
or
`'none'`
, learning rate is always kept constant.
If
`None`
or
`'none'`
, learning rate is always kept constant.
Default :
`'warmup_linear'`
Default :
`'warmup_linear'`
-
`b1`
: Adams b1. Default :
`0.9`
-
`betas`
: Adam's betas, given as a tuple (beta1, beta2). Default :
`(0.9, 0.999)`
-
`b2`
: Adams b2. Default :
`0.999`
-
`e`
: Adam's epsilon. Default :
`1e-6`
-
`e`
: Adam's epsilon. Default :
`1e-6`
-
`weight_decay`
: Weight decay. Default :
`0.01`
-
`weight_decay`
: Weight decay. Default :
`0.01`
-
`max_grad_norm`
: Maximum norm for the gradients (
`-1`
means no clipping). Default :
`1.0`
-
`max_grad_norm`
: Maximum norm for the gradients (
`-1`
means no clipping). Default :
`1.0`
...
...
pytorch_pretrained_bert/optimization.py
View file @
c9885903
...
@@ -191,22 +191,21 @@ class BertAdam(Optimizer):
...
@@ -191,22 +191,21 @@ class BertAdam(Optimizer):
Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object (see below).
Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object (see below).
If `None` or `'none'`, learning rate is always kept constant.
If `None` or `'none'`, learning rate is always kept constant.
Default : `'warmup_linear'`
Default : `'warmup_linear'`
b1: Adams b1. Default: 0.9
betas: Adam's betas as a tuple (beta1, beta2). Default: (0.9, 0.999)
b2: Adams b2. Default: 0.999
e: Adam's epsilon. Default: 1e-6
e: Adam's epsilon. Default: 1e-6
weight_decay: Weight decay. Default: 0.01
weight_decay: Weight decay. Default: 0.01
max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
"""
"""
def
__init__
(
self
,
params
,
lr
=
required
,
warmup
=-
1
,
t_total
=-
1
,
schedule
=
'warmup_linear'
,
def
__init__
(
self
,
params
,
lr
=
required
,
warmup
=-
1
,
t_total
=-
1
,
schedule
=
'warmup_linear'
,
b
1
=
0.9
,
b2
=
0.999
,
e
=
1e-6
,
weight_decay
=
0.01
,
max_grad_norm
=
1.0
,
**
kwargs
):
b
etas
=
(
0.9
,
0.999
)
,
e
=
1e-6
,
weight_decay
=
0.01
,
max_grad_norm
=
1.0
,
**
kwargs
):
if
lr
is
not
required
and
lr
<
0.0
:
if
lr
is
not
required
and
lr
<
0.0
:
raise
ValueError
(
"Invalid learning rate: {} - should be >= 0.0"
.
format
(
lr
))
raise
ValueError
(
"Invalid learning rate: {} - should be >= 0.0"
.
format
(
lr
))
if
not
isinstance
(
schedule
,
_LRSchedule
)
and
schedule
not
in
SCHEDULES
:
if
not
isinstance
(
schedule
,
_LRSchedule
)
and
schedule
not
in
SCHEDULES
:
raise
ValueError
(
"Invalid schedule parameter: {}"
.
format
(
schedule
))
raise
ValueError
(
"Invalid schedule parameter: {}"
.
format
(
schedule
))
if
not
0.0
<=
b
1
<
1.0
:
if
not
0.0
<=
b
etas
[
0
]
<
1.0
:
raise
ValueError
(
"Invalid b
1
parameter: {} - should be in [0.0, 1.0["
.
format
(
b
1
))
raise
ValueError
(
"Invalid b
eta
parameter
at index 0
: {} - should be in [0.0, 1.0["
.
format
(
b
etas
[
0
]
))
if
not
0.0
<=
b
2
<
1.0
:
if
not
0.0
<=
b
etas
[
1
]
<
1.0
:
raise
ValueError
(
"Invalid b
2
parameter: {} - should be in [0.0, 1.0["
.
format
(
b
2
))
raise
ValueError
(
"Invalid b
eta
parameter
at index 1
: {} - should be in [0.0, 1.0["
.
format
(
b
etas
[
1
]
))
if
not
e
>=
0.0
:
if
not
e
>=
0.0
:
raise
ValueError
(
"Invalid epsilon value: {} - should be >= 0.0"
.
format
(
e
))
raise
ValueError
(
"Invalid epsilon value: {} - should be >= 0.0"
.
format
(
e
))
# initialize schedule object
# initialize schedule object
...
@@ -218,7 +217,7 @@ class BertAdam(Optimizer):
...
@@ -218,7 +217,7 @@ class BertAdam(Optimizer):
logger
.
warning
(
"warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. "
logger
.
warning
(
"warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. "
"Please specify custom warmup and t_total in _LRSchedule object."
)
"Please specify custom warmup and t_total in _LRSchedule object."
)
defaults
=
dict
(
lr
=
lr
,
schedule
=
schedule
,
defaults
=
dict
(
lr
=
lr
,
schedule
=
schedule
,
b
1
=
b1
,
b2
=
b2
,
e
=
e
,
weight_decay
=
weight_decay
,
b
etas
=
betas
,
e
=
e
,
weight_decay
=
weight_decay
,
max_grad_norm
=
max_grad_norm
)
max_grad_norm
=
max_grad_norm
)
super
(
BertAdam
,
self
).
__init__
(
params
,
defaults
)
super
(
BertAdam
,
self
).
__init__
(
params
,
defaults
)
...
@@ -264,7 +263,7 @@ class BertAdam(Optimizer):
...
@@ -264,7 +263,7 @@ class BertAdam(Optimizer):
state
[
'next_v'
]
=
torch
.
zeros_like
(
p
.
data
)
state
[
'next_v'
]
=
torch
.
zeros_like
(
p
.
data
)
next_m
,
next_v
=
state
[
'next_m'
],
state
[
'next_v'
]
next_m
,
next_v
=
state
[
'next_m'
],
state
[
'next_v'
]
beta1
,
beta2
=
group
[
'b
1'
],
group
[
'b2
'
]
beta1
,
beta2
=
group
[
'b
etas
'
]
# Add grad clipping
# Add grad clipping
if
group
[
'max_grad_norm'
]
>
0
:
if
group
[
'max_grad_norm'
]
>
0
:
...
...
pytorch_pretrained_bert/optimization_openai.py
View file @
c9885903
...
@@ -30,16 +30,16 @@ class OpenAIAdam(Optimizer):
...
@@ -30,16 +30,16 @@ class OpenAIAdam(Optimizer):
"""Implements Open AI version of Adam algorithm with weight decay fix.
"""Implements Open AI version of Adam algorithm with weight decay fix.
"""
"""
def
__init__
(
self
,
params
,
lr
=
required
,
schedule
=
'warmup_linear'
,
warmup
=-
1
,
t_total
=-
1
,
def
__init__
(
self
,
params
,
lr
=
required
,
schedule
=
'warmup_linear'
,
warmup
=-
1
,
t_total
=-
1
,
b
1
=
0.9
,
b2
=
0.999
,
e
=
1e-8
,
weight_decay
=
0
,
b
etas
=
(
0.9
,
0.999
)
,
e
=
1e-8
,
weight_decay
=
0
,
vector_l2
=
False
,
max_grad_norm
=-
1
,
**
kwargs
):
vector_l2
=
False
,
max_grad_norm
=-
1
,
**
kwargs
):
if
lr
is
not
required
and
lr
<
0.0
:
if
lr
is
not
required
and
lr
<
0.0
:
raise
ValueError
(
"Invalid learning rate: {} - should be >= 0.0"
.
format
(
lr
))
raise
ValueError
(
"Invalid learning rate: {} - should be >= 0.0"
.
format
(
lr
))
if
not
isinstance
(
schedule
,
_LRSchedule
)
and
schedule
not
in
SCHEDULES
:
if
not
isinstance
(
schedule
,
_LRSchedule
)
and
schedule
not
in
SCHEDULES
:
raise
ValueError
(
"Invalid schedule parameter: {}"
.
format
(
schedule
))
raise
ValueError
(
"Invalid schedule parameter: {}"
.
format
(
schedule
))
if
not
0.0
<=
b
1
<
1.0
:
if
not
0.0
<=
b
etas
[
0
]
<
1.0
:
raise
ValueError
(
"Invalid b
1
parameter: {} - should be in [0.0, 1.0["
.
format
(
b
1
))
raise
ValueError
(
"Invalid b
eta
parameter
at index 0
: {} - should be in [0.0, 1.0["
.
format
(
b
etas
[
0
]
))
if
not
0.0
<=
b
2
<
1.0
:
if
not
0.0
<=
b
etas
[
1
]
<
1.0
:
raise
ValueError
(
"Invalid b
2
parameter: {} - should be in [0.0, 1.0["
.
format
(
b
2
))
raise
ValueError
(
"Invalid b
eta
parameter
at index 1
: {} - should be in [0.0, 1.0["
.
format
(
b
etas
[
1
]
))
if
not
e
>=
0.0
:
if
not
e
>=
0.0
:
raise
ValueError
(
"Invalid epsilon value: {} - should be >= 0.0"
.
format
(
e
))
raise
ValueError
(
"Invalid epsilon value: {} - should be >= 0.0"
.
format
(
e
))
# initialize schedule object
# initialize schedule object
...
@@ -51,7 +51,7 @@ class OpenAIAdam(Optimizer):
...
@@ -51,7 +51,7 @@ class OpenAIAdam(Optimizer):
logger
.
warning
(
"warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. "
logger
.
warning
(
"warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. "
"Please specify custom warmup and t_total in _LRSchedule object."
)
"Please specify custom warmup and t_total in _LRSchedule object."
)
defaults
=
dict
(
lr
=
lr
,
schedule
=
schedule
,
defaults
=
dict
(
lr
=
lr
,
schedule
=
schedule
,
b
1
=
b1
,
b2
=
b2
,
e
=
e
,
weight_decay
=
weight_decay
,
vector_l2
=
vector_l2
,
b
etas
=
betas
,
e
=
e
,
weight_decay
=
weight_decay
,
vector_l2
=
vector_l2
,
max_grad_norm
=
max_grad_norm
)
max_grad_norm
=
max_grad_norm
)
super
(
OpenAIAdam
,
self
).
__init__
(
params
,
defaults
)
super
(
OpenAIAdam
,
self
).
__init__
(
params
,
defaults
)
...
@@ -97,7 +97,7 @@ class OpenAIAdam(Optimizer):
...
@@ -97,7 +97,7 @@ class OpenAIAdam(Optimizer):
state
[
'exp_avg_sq'
]
=
torch
.
zeros_like
(
p
.
data
)
state
[
'exp_avg_sq'
]
=
torch
.
zeros_like
(
p
.
data
)
exp_avg
,
exp_avg_sq
=
state
[
'exp_avg'
],
state
[
'exp_avg_sq'
]
exp_avg
,
exp_avg_sq
=
state
[
'exp_avg'
],
state
[
'exp_avg_sq'
]
beta1
,
beta2
=
group
[
'b
1'
],
group
[
'b2
'
]
beta1
,
beta2
=
group
[
'b
etas
'
]
state
[
'step'
]
+=
1
state
[
'step'
]
+=
1
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment