chenpangpang / transformers
Commit c9885903, authored Jun 25, 2019 by tonianelope
update betas to follow pytorch convention
Parent 98dc30b2
Showing 3 changed files with 16 additions and 18 deletions:
- README.md (+1, -2)
- pytorch_pretrained_bert/optimization.py (+8, -9)
- pytorch_pretrained_bert/optimization_openai.py (+7, -7)
README.md
@@ -1094,8 +1094,7 @@ The optimizer accepts the following arguments:
   Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object (see below).
   If `None` or `'none'`, learning rate is always kept constant.
   Default : `'warmup_linear'`
-- `b1` : Adams b1. Default : `0.9`
-- `b2` : Adams b2. Default : `0.999`
+- `betas` : Adams betas. Default : `0.9, 0.999`
 - `e` : Adams epsilon. Default : `1e-6`
 - `weight_decay:` Weight decay. Default : `0.01`
 - `max_grad_norm` : Maximum norm for the gradients (`-1` means no clipping). Default : `1.0`
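For callers, the change is that the two Adam coefficients are now passed as a single tuple, matching `torch.optim.Adam`. Below is a minimal construction sketch of the documented arguments; the import path is assumed from the file path `pytorch_pretrained_bert/optimization.py`, the `torch.nn.Linear` model is a stand-in, and the hyperparameter values are illustrative rather than taken from the repository.

```python
# Hedged sketch: constructing BertAdam with the PyTorch-style `betas` tuple.
# Model and hyperparameter values are illustrative placeholders.
import torch
from pytorch_pretrained_bert.optimization import BertAdam

model = torch.nn.Linear(768, 2)  # stand-in for a BERT-based model head

optimizer = BertAdam(
    model.parameters(),
    lr=5e-5,
    warmup=0.1,
    t_total=1000,
    schedule='warmup_linear',
    betas=(0.9, 0.999),   # previously passed as b1=0.9, b2=0.999
    e=1e-6,
    weight_decay=0.01,
    max_grad_norm=1.0,
)
```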
pytorch_pretrained_bert/optimization.py
@@ -191,22 +191,21 @@ class BertAdam(Optimizer):
         Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object (see below).
         If `None` or `'none'`, learning rate is always kept constant.
         Default : `'warmup_linear'`
-        b1: Adams b1. Default: 0.9
-        b2: Adams b2. Default: 0.999
+        betas: Adams betas. Default: (0.9, 0.999)
         e: Adams epsilon. Default: 1e-6
         weight_decay: Weight decay. Default: 0.01
         max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
     """
     def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear',
-                 b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, max_grad_norm=1.0, **kwargs):
+                 betas=(0.9, 0.999), e=1e-6, weight_decay=0.01, max_grad_norm=1.0, **kwargs):
         if lr is not required and lr < 0.0:
             raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
         if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES:
             raise ValueError("Invalid schedule parameter: {}".format(schedule))
-        if not 0.0 <= b1 < 1.0:
-            raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
-        if not 0.0 <= b2 < 1.0:
-            raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
+        if not 0.0 <= betas[0] < 1.0:
+            raise ValueError("Invalid beta parameter at index 0: {} - should be in [0.0, 1.0[".format(betas[0]))
+        if not 0.0 <= betas[1] < 1.0:
+            raise ValueError("Invalid beta parameter at index 1: {} - should be in [0.0, 1.0[".format(betas[1]))
         if not e >= 0.0:
             raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
         # initialize schedule object
@@ -218,7 +217,7 @@ class BertAdam(Optimizer):
             logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. "
                            "Please specify custom warmup and t_total in _LRSchedule object.")
         defaults = dict(lr=lr, schedule=schedule,
-                        b1=b1, b2=b2, e=e, weight_decay=weight_decay,
+                        betas=betas, e=e, weight_decay=weight_decay,
                         max_grad_norm=max_grad_norm)
         super(BertAdam, self).__init__(params, defaults)
@@ -264,7 +263,7 @@ class BertAdam(Optimizer):
                     state['next_v'] = torch.zeros_like(p.data)
                 next_m, next_v = state['next_m'], state['next_v']
-                beta1, beta2 = group['b1'], group['b2']
+                beta1, beta2 = group['betas']
                 # Add grad clipping
                 if group['max_grad_norm'] > 0:
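As the third hunk shows, the tuple is stored per parameter group under a single `betas` key and unpacked in `step()`, mirroring how `torch.optim.Adam` keeps its coefficients. The following small check is not part of the commit; it is an illustrative sketch with placeholder parameters and values.

```python
# Sketch: the betas tuple lives in each param group, as with torch.optim.Adam.
import torch
from pytorch_pretrained_bert.optimization import BertAdam

params = [torch.nn.Parameter(torch.zeros(3))]
opt = BertAdam(params, lr=1e-3, betas=(0.9, 0.999))

beta1, beta2 = opt.param_groups[0]['betas']   # same unpacking as inside step()
print(beta1, beta2)                           # -> 0.9 0.999

# The stock PyTorch optimizer uses the same convention:
ref = torch.optim.Adam(params, lr=1e-3, betas=(0.9, 0.999))
print(ref.param_groups[0]['betas'])           # -> (0.9, 0.999)
```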
pytorch_pretrained_bert/optimization_openai.py
@@ -30,16 +30,16 @@ class OpenAIAdam(Optimizer):
     """Implements Open AI version of Adam algorithm with weight decay fix.
     """
     def __init__(self, params, lr=required, schedule='warmup_linear', warmup=-1, t_total=-1,
-                 b1=0.9, b2=0.999, e=1e-8, weight_decay=0,
+                 betas=(0.9, 0.999), e=1e-8, weight_decay=0,
                  vector_l2=False, max_grad_norm=-1, **kwargs):
         if lr is not required and lr < 0.0:
             raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
         if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES:
             raise ValueError("Invalid schedule parameter: {}".format(schedule))
-        if not 0.0 <= b1 < 1.0:
-            raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
-        if not 0.0 <= b2 < 1.0:
-            raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
+        if not 0.0 <= betas[0] < 1.0:
+            raise ValueError("Invalid beta parameter at index 0: {} - should be in [0.0, 1.0[".format(betas[0]))
+        if not 0.0 <= betas[1] < 1.0:
+            raise ValueError("Invalid beta parameter at index 1: {} - should be in [0.0, 1.0[".format(betas[1]))
         if not e >= 0.0:
             raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
         # initialize schedule object
@@ -51,7 +51,7 @@ class OpenAIAdam(Optimizer):
             logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. "
                            "Please specify custom warmup and t_total in _LRSchedule object.")
         defaults = dict(lr=lr, schedule=schedule,
-                        b1=b1, b2=b2, e=e, weight_decay=weight_decay, vector_l2=vector_l2,
+                        betas=betas, e=e, weight_decay=weight_decay, vector_l2=vector_l2,
                         max_grad_norm=max_grad_norm)
         super(OpenAIAdam, self).__init__(params, defaults)
@@ -97,7 +97,7 @@ class OpenAIAdam(Optimizer):
                     state['exp_avg_sq'] = torch.zeros_like(p.data)
                 exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
-                beta1, beta2 = group['b1'], group['b2']
+                beta1, beta2 = group['betas']
                 state['step'] += 1
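`OpenAIAdam` follows the same convention after this commit. Below is a hedged construction sketch: the keyword names come from the diff above, the import path is assumed from the file path, and the stand-in model and hyperparameter values are illustrative only.

```python
# Hedged sketch: constructing OpenAIAdam with the PyTorch-style `betas` tuple.
# Model and hyperparameter values are illustrative placeholders.
import torch
from pytorch_pretrained_bert.optimization_openai import OpenAIAdam

model = torch.nn.Linear(768, 2)  # placeholder model

optimizer = OpenAIAdam(
    model.parameters(),
    lr=6.25e-5,
    schedule='warmup_linear',
    warmup=0.002,
    t_total=1000,
    betas=(0.9, 0.999),   # previously passed as b1=0.9, b2=0.999
    e=1e-8,
    weight_decay=0.01,
    vector_l2=False,
    max_grad_norm=-1,
)
```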