ModelZoo / ResNet50_tensorflow

Commit 0fdbf1bd, authored May 07, 2021 by Yuexin Wu, committed by A. Unique TensorFlower on May 07, 2021

Add step offset support for PowerAndLinearDecay.

PiperOrigin-RevId: 372675330
Parent: d1c17371
Showing 4 changed files with 129 additions and 28 deletions (+129, -28):
official/modeling/optimization/configs/learning_rate_config.py   +22  -8
official/modeling/optimization/lr_schedule.py                     +29  -19
official/modeling/optimization/lr_schedule_test.py                +74  -0
official/modeling/optimization/optimizer_factory_test.py          +4   -1
official/modeling/optimization/configs/learning_rate_config.py
@@ -149,21 +149,35 @@ class DirectPowerLrConfig(base_config.Config):
 class PowerAndLinearDecayLrConfig(base_config.Config):
   """Configuration for DirectPower learning rate decay.
 
-  This class configures a schedule that follows lr * (step)^power for the
-  first total_decay_steps * (1 - linear_decay_fraction) steps, and follows
-  lr * (step)^power * (total_decay_steps - step) / (total_decay_steps *
-  linear_decay_fraction) for the rest of the steps.
+  The schedule has the following behavior.
+  Let offset_step = step - offset.
+  1) offset_step < 0, the actual learning rate equals initial_learning_rate.
+  2) offset_step <= total_decay_steps * (1 - linear_decay_fraction), the
+    actual learning rate equals lr * offset_step^power.
+  3) total_decay_steps * (1 - linear_decay_fraction) < offset_step <
+    total_decay_steps, the actual learning rate equals lr * offset_step^power *
+    (total_decay_steps - offset_step) / (total_decay_steps *
+    linear_decay_fraction).
+  4) offset_step > total_decay_steps, the actual learning rate equals zero.
 
   Attributes:
-    name: The name of the learning rate schedule. Defaults to DirectPowerDecay.
+    name: The name of the learning rate schedule. Defaults to
+      PowerAndLinearDecay.
     initial_learning_rate: A float. The initial learning rate. Defaults to None.
     total_decay_steps: An int. The total number of steps for power + linear
       decay. Defaults to None.
-    power: A float. Defaults to -0.5, for sqrt decay.
+    power: A float. The order of the polynomial. Defaults to -0.5, for sqrt
+      decay.
     linear_decay_fraction: A float. In the last `linear_decay_fraction` steps,
       the learning rate will be multiplied by a linear decay. Defaults to 0.1.
+    offset: An int. The offset applied to steps. Defaults to 0.
   """
   name: str = 'PowerAndLinearDecay'
   initial_learning_rate: Optional[float] = None
   total_decay_steps: Optional[int] = None
   power: float = -0.5
   linear_decay_fraction: float = 0.1
+  offset: int = 0
 
 
 @dataclasses.dataclass
@@ -174,8 +188,8 @@ class PowerDecayWithOffsetLrConfig(base_config.Config):
   Otherwise, learning rate equals to lr * (step - offset)^power.
 
   Attributes:
     name: The name of the learning rate schedule. Defaults to
       PowerDecayWithOffset.
     initial_learning_rate: A float. The initial learning rate. Defaults to None.
     power: A float. Defaults to -0.5, for sqrt decay.
     offset: An integer. Power decay happens after `offset` steps.
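
Note (added for illustration, not part of the diff): the four cases in the new docstring translate directly into a small piece of pure Python. The function below is only a sketch of the documented behavior; the argument names mirror the config fields, and the max(offset_step, 1) guard mirrors the `step_non_zero` safeguard in lr_schedule.py.

# Sketch of the documented PowerAndLinearDecay behavior (not the library code).
def power_and_linear_decay(step, lr, power, total_decay_steps,
                           linear_decay_fraction, offset):
  offset_step = step - offset
  if offset_step < 0:                    # case 1: before the offset
    return lr
  linear_start = total_decay_steps * (1 - linear_decay_fraction)
  if offset_step <= linear_start:        # case 2: pure power decay
    return lr * max(offset_step, 1) ** power
  if offset_step < total_decay_steps:    # case 3: power decay times linear ramp-down
    return (lr * max(offset_step, 1) ** power *
            (total_decay_steps - offset_step) /
            (total_decay_steps * linear_decay_fraction))
  return 0.0                             # case 4: past total_decay_steps

# Matches the 'offset' test case added in lr_schedule_test.py below:
# offset=90, total_decay_steps=100, power=-1, linear_decay_fraction=0.5.
print(power_and_linear_decay(150, 1.0, -1.0, 100, 0.5, 90))  # 1/60 * 0.8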
official/modeling/optimization/lr_schedule.py
@@ -22,9 +22,11 @@ import tensorflow as tf
 class LinearWarmup(tf.keras.optimizers.schedules.LearningRateSchedule):
   """Linear warmup schedule."""
 
-  def __init__(self, after_warmup_lr_sched: Union[
-      tf.keras.optimizers.schedules.LearningRateSchedule, float],
-               warmup_steps: int, warmup_learning_rate: float,
+  def __init__(self,
+               after_warmup_lr_sched: Union[
+                   tf.keras.optimizers.schedules.LearningRateSchedule, float],
+               warmup_steps: int,
+               warmup_learning_rate: float,
                name: Optional[str] = None):
     """Add linear warmup schedule to a learning rate schedule.

@@ -38,8 +40,8 @@ class LinearWarmup(tf.keras.optimizers.schedules.LearningRateSchedule):
     steps.
 
     Args:
       after_warmup_lr_sched: tf.keras.optimizers.schedules
         .LearningRateSchedule or a constant.
       warmup_steps: Number of the warmup steps.
       warmup_learning_rate: Initial learning rate for the warmup.
       name: Optional, name of warmup schedule.

@@ -53,8 +55,7 @@ class LinearWarmup(tf.keras.optimizers.schedules.LearningRateSchedule):
                   tf.keras.optimizers.schedules.LearningRateSchedule):
       self._final_warmup_lr = after_warmup_lr_sched(warmup_steps)
     else:
       self._final_warmup_lr = tf.cast(
           after_warmup_lr_sched, dtype=tf.float32)
 
   def __call__(self, step: int):

@@ -92,8 +93,7 @@ class LinearWarmup(tf.keras.optimizers.schedules.LearningRateSchedule):
 class PolynomialWarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
-  """Applies polynomial warmup schedule on a given learning rate decay schedule.
-  """
+  """Applies polynomial warmup schedule on a given learning rate decay schedule."""
 
   def __init__(self,
                after_warmup_lr_sched: Union[

@@ -172,7 +172,7 @@ class DirectPowerDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
     Args:
       initial_learning_rate: The initial learning rate.
       power: The order of the polynomial.
-      name: Optional, name of warmup schedule.
+      name: Optional, name of learning rate schedule.
     """
     super().__init__()
     self._initial_learning_rate = initial_learning_rate

@@ -200,10 +200,16 @@ class DirectPowerDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
 class PowerAndLinearDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
   """Learning rate schedule with multiplied by linear decay at the end.
 
-  follows lr * (step)^power for the first total_decay_steps *
-  (1 - linear_decay_fraction) steps, and follows lr * (step)^power *
-  (total_decay_steps - step) / (total_decay_steps * linear_decay_fraction)
-  for the rest of the steps.
+  The schedule has the following behavior.
+  Let offset_step = step - offset.
+  1) offset_step < 0, the actual learning rate equals initial_learning_rate.
+  2) offset_step <= total_decay_steps * (1 - linear_decay_fraction), the
+    actual learning rate equals lr * offset_step^power.
+  3) total_decay_steps * (1 - linear_decay_fraction) < offset_step <
+    total_decay_steps, the actual learning rate equals lr * offset_step^power *
+    (total_decay_steps - offset_step) / (total_decay_steps *
+    linear_decay_fraction).
+  4) offset_step > total_decay_steps, the actual learning rate equals zero.
   """
 
   def __init__(self,

@@ -211,6 +217,7 @@ class PowerAndLinearDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
                total_decay_steps: int,
                power: float = 1.0,
                linear_decay_fraction: float = 0.1,
+               offset: int = 0,
                name: str = "PowerAndLinearDecay"):
     """Initialize configuration of the learning rate schedule.

@@ -218,20 +225,22 @@ class PowerAndLinearDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
       initial_learning_rate: The initial learning rate.
       total_decay_steps: The total number of steps for power + linear decay.
       power: The order of the polynomial.
-      linear_decay_fraction: In the last `linear_decay_fraction` steps,
-        the learning rate will be multiplied by a linear decay.
-      name: Optional, name of warmup schedule.
+      linear_decay_fraction: In the last `linear_decay_fraction` steps, the
+        learning rate will be multiplied by a linear decay.
+      offset: The offset applied to steps.
+      name: Optional, name of learning rate schedule.
     """
     super().__init__()
     self._initial_learning_rate = initial_learning_rate
     self._total_decay_steps = total_decay_steps
     self._power = power
     self._linear_decay_fraction = linear_decay_fraction
+    self._offset = offset
     self._name = name
 
   def __call__(self, step):
     with tf.name_scope(self._name or "PowerAndLinearDecay"):
-      step = tf.cast(step, tf.float32)
+      step = tf.cast(step - self._offset, tf.float32)
       learning_rate = self._initial_learning_rate
       # A zero `step` may cause Inf. So make `step` positive.
       step_non_zero = tf.math.maximum(step, 1.0)

@@ -250,6 +259,7 @@ class PowerAndLinearDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
         "total_decay_steps": self._total_decay_steps,
         "power": self._power,
         "linear_decay_fraction": self._linear_decay_fraction,
+        "offset": self._offset,
         "name": self._name,
     }

@@ -274,7 +284,7 @@ class PowerDecayWithOffset(tf.keras.optimizers.schedules.LearningRateSchedule):
       power: The order of the polynomial.
       offset: The offset when computing the power decay.
       pre_offset_learning_rate: The maximum learning rate we'll use.
-      name: Optional, name of warmup schedule.
+      name: Optional, name of learning rate schedule.
     """
     super().__init__()
     self._initial_learning_rate = initial_learning_rate
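
For context (a usage sketch, not code from this commit): with the new `offset` argument, the schedule holds the initial learning rate until `offset` steps have passed and then decays exactly as before. Construction and evaluation follow the constructor shown above and the calls used in the new test below; the parameter values mirror the 'offset' test case.

import tensorflow as tf

from official.modeling.optimization import lr_schedule

# Hold the initial rate for the first 90 steps, then apply power + linear decay.
schedule = lr_schedule.PowerAndLinearDecay(
    initial_learning_rate=1.0,
    total_decay_steps=100,
    power=-1.0,
    linear_decay_fraction=0.5,
    offset=90)

for step in [0, 90, 130, 150, 200]:
  print(step, float(schedule(step)))
# Expected (per lr_schedule_test.py below): 1.0, 1.0, 1/40, 1/60 * 0.8, 0.0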
official/modeling/optimization/lr_schedule_test.py (new file)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for lr_schedule."""
from absl.testing import parameterized
import tensorflow as tf

from official.modeling.optimization import lr_schedule


class PowerAndLinearDecayTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.named_parameters(
      dict(
          testcase_name='power_only',
          init_lr=1.0,
          power=-1.0,
          linear_decay_fraction=0.0,
          total_decay_steps=100,
          offset=0,
          expected=[[0, 1.0], [1, 1.0], [40, 1. / 40.], [60, 1. / 60],
                    [100, 1. / 100]]),
      dict(
          testcase_name='linear_only',
          init_lr=1.0,
          power=0.0,
          linear_decay_fraction=1.0,
          total_decay_steps=100,
          offset=0,
          expected=[[0, 1.0], [1, 0.99], [40, 0.6], [60, 0.4], [100, 0.0]]),
      dict(
          testcase_name='general',
          init_lr=1.0,
          power=-1.0,
          linear_decay_fraction=0.5,
          total_decay_steps=100,
          offset=0,
          expected=[[0, 1.0], [1, 1.0], [40, 1. / 40.],
                    [60, 1. / 60. * 0.8], [100, 0.0]]),
      dict(
          testcase_name='offset',
          init_lr=1.0,
          power=-1.0,
          linear_decay_fraction=0.5,
          total_decay_steps=100,
          offset=90,
          expected=[[0, 1.0], [90, 1.0], [91, 1.0], [130, 1. / 40.],
                    [150, 1. / 60. * 0.8], [190, 0.0], [200, 0.0]]),
  )
  def test_power_linear_lr_schedule(self, init_lr, power,
                                    linear_decay_fraction, total_decay_steps,
                                    offset, expected):
    lr = lr_schedule.PowerAndLinearDecay(
        initial_learning_rate=init_lr,
        power=power,
        linear_decay_fraction=linear_decay_fraction,
        total_decay_steps=total_decay_steps,
        offset=offset)
    for step, value in expected:
      self.assertAlmostEqual(lr(step).numpy(), value)


if __name__ == '__main__':
  tf.test.main()
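
As a final, hypothetical usage note (not part of this commit): a PowerAndLinearDecay instance is an ordinary tf.keras LearningRateSchedule, so it can be handed to a Keras optimizer directly; the optimizer choice and parameter values below are illustrative only.

import tensorflow as tf

from official.modeling.optimization import lr_schedule

schedule = lr_schedule.PowerAndLinearDecay(
    initial_learning_rate=0.1,
    total_decay_steps=10000,
    power=-0.5,
    linear_decay_fraction=0.1,
    offset=500)

# Keras optimizers accept a LearningRateSchedule in place of a float.
optimizer = tf.keras.optimizers.SGD(learning_rate=schedule)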
official/modeling/optimization/optimizer_factory_test.py
@@ -107,6 +107,9 @@ class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
     optimizer_factory.OptimizerFactory(
         optimization_config.OptimizationConfig(params))
 
+  # TODO(b/187559334) refactor lr_schedule tests into `lr_schedule_test.py`.
   def test_stepwise_lr_schedule(self):
     params = {
         'optimizer': {

@@ -352,6 +355,7 @@ class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
                 'power': -1.0,
                 'linear_decay_fraction': 0.5,
                 'total_decay_steps': 100,
+                'offset': 0,
             }
         }
     }

@@ -390,6 +394,5 @@ class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
     for step, value in expected_lr_step_values:
       self.assertAlmostEqual(lr(step).numpy(), value)
 
 
 if __name__ == '__main__':
   tf.test.main()