Commit a32ffa95, authored Feb 03, 2023 by qianyj
update TensorFlow2x test method
parent e286da17
Changes: 268 files in this commit; the 20 files listed below account for 3,297 additions and 0 deletions (+3297 −0).
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/configs/__init__.py (+14, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/configs/learning_rate_config.py (+288, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/configs/optimization_config.py (+118, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/configs/optimization_config_test.py (+59, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/configs/optimizer_config.py (+268, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/ema_optimizer.py (+255, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/lars_optimizer.py (+186, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/lr_schedule.py (+496, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/lr_schedule_test.py (+109, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/optimizer_factory.py (+208, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/optimizer_factory_test.py (+443, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/slide_optimizer.py (+20, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/performance.py (+55, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/tf_utils.py (+201, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/nlp/README.md (+59, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/nlp/__init__.py (+14, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/nlp/albert/README.md (+335, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/nlp/albert/__init__.py (+14, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/nlp/albert/configs.py (+50, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/nlp/albert/run_classifier.py (+105, −0)
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/configs/__init__.py (new file, 0 → 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/configs/learning_rate_config.py (new file, 0 → 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dataclasses for learning rate schedule config."""
from
typing
import
List
,
Optional
import
dataclasses
from
official.modeling.hyperparams
import
base_config
@
dataclasses
.
dataclass
class
ConstantLrConfig
(
base_config
.
Config
):
"""Configuration for constant learning rate.
This class is a containers for the constant learning rate decay configs.
Attributes:
name: The name of the learning rate schedule. Defaults to Constant.
learning_rate: A float. The learning rate. Defaults to 0.1.
"""
name
:
str
=
'Constant'
learning_rate
:
float
=
0.1
@
dataclasses
.
dataclass
class
StepwiseLrConfig
(
base_config
.
Config
):
"""Configuration for stepwise learning rate decay.
This class is a container for the piecewise constant learning rate scheduling
configs. It will configure an instance of PiecewiseConstantDecay keras
learning rate schedule.
An example (from keras docs): use a learning rate that's 1.0 for the first
100001 steps, 0.5 for the next 10000 steps, and 0.1 for any additional steps.
```python
boundaries: [100000, 110000]
values: [1.0, 0.5, 0.1]
Attributes:
name: The name of the learning rate schedule. Defaults to PiecewiseConstant.
boundaries: A list of ints of strictly increasing entries. Defaults to None.
values: A list of floats that specifies the values for the intervals defined
by `boundaries`. It should have one more element than `boundaries`.
The learning rate is computed as follows: [0, boundaries[0]] ->
values[0] [boundaries[0], boundaries[1]] -> values[1]
[boundaries[n-1], boundaries[n]] -> values[n] [boundaries[n],
end] -> values[n+1] Defaults to None.
offset: An int. The offset applied to steps. Defaults to 0.
"""
name
:
str
=
'PiecewiseConstantDecay'
boundaries
:
Optional
[
List
[
int
]]
=
None
values
:
Optional
[
List
[
float
]]
=
None
offset
:
int
=
0
@
dataclasses
.
dataclass
class
ExponentialLrConfig
(
base_config
.
Config
):
"""Configuration for exponential learning rate decay.
This class is a containers for the exponential learning rate decay configs.
Attributes:
name: The name of the learning rate schedule. Defaults to ExponentialDecay.
initial_learning_rate: A float. The initial learning rate. Defaults to None.
decay_steps: A positive integer that is used for decay computation. Defaults
to None.
decay_rate: A float. Defaults to None.
staircase: A boolean, if true, learning rate is decreased at discreate
intervals. Defaults to False.
offset: An int. The offset applied to steps. Defaults to 0.
"""
name
:
str
=
'ExponentialDecay'
initial_learning_rate
:
Optional
[
float
]
=
None
decay_steps
:
Optional
[
int
]
=
None
decay_rate
:
Optional
[
float
]
=
None
staircase
:
Optional
[
bool
]
=
None
offset
:
int
=
0
@
dataclasses
.
dataclass
class
PolynomialLrConfig
(
base_config
.
Config
):
"""Configuration for polynomial learning rate decay.
This class is a containers for the polynomial learning rate decay configs.
Attributes:
name: The name of the learning rate schedule. Defaults to PolynomialDecay.
initial_learning_rate: A float. The initial learning rate. Defaults to None.
decay_steps: A positive integer that is used for decay computation. Defaults
to None.
end_learning_rate: A float. The minimal end learning rate.
power: A float. The power of the polynomial. Defaults to linear, 1.0.
cycle: A boolean, whether or not it should cycle beyond decay_steps.
Defaults to False.
offset: An int. The offset applied to steps. Defaults to 0.
"""
name
:
str
=
'PolynomialDecay'
initial_learning_rate
:
Optional
[
float
]
=
None
decay_steps
:
Optional
[
int
]
=
None
end_learning_rate
:
float
=
0.0001
power
:
float
=
1.0
cycle
:
bool
=
False
offset
:
int
=
0
@
dataclasses
.
dataclass
class
CosineLrConfig
(
base_config
.
Config
):
"""Configuration for Cosine learning rate decay.
This class is a containers for the cosine learning rate decay configs,
tf.keras.experimental.CosineDecay.
Attributes:
name: The name of the learning rate schedule. Defaults to CosineDecay.
initial_learning_rate: A float. The initial learning rate. Defaults to None.
decay_steps: A positive integer that is used for decay computation. Defaults
to None.
alpha: A float. Minimum learning rate value as a fraction of
initial_learning_rate.
offset: An int. The offset applied to steps. Defaults to 0.
"""
name
:
str
=
'CosineDecay'
initial_learning_rate
:
Optional
[
float
]
=
None
decay_steps
:
Optional
[
int
]
=
None
alpha
:
float
=
0.0
offset
:
int
=
0
@
dataclasses
.
dataclass
class
DirectPowerLrConfig
(
base_config
.
Config
):
"""Configuration for DirectPower learning rate decay.
This class configures a schedule following follows lr * (step)^power.
Attributes:
name: The name of the learning rate schedule. Defaults to DirectPowerDecay.
initial_learning_rate: A float. The initial learning rate. Defaults to None.
power: A float. Defaults to -0.5, for sqrt decay.
"""
name
:
str
=
'DirectPowerDecay'
initial_learning_rate
:
Optional
[
float
]
=
None
power
:
float
=
-
0.5
@
dataclasses
.
dataclass
class
PowerAndLinearDecayLrConfig
(
base_config
.
Config
):
"""Configuration for DirectPower learning rate decay.
The schedule has the following behavoir.
Let offset_step = step - offset.
1) offset_step < 0, the actual learning rate equals initial_learning_rate.
2) offset_step <= total_decay_steps * (1 - linear_decay_fraction), the
actual learning rate equals lr * offset_step^power.
3) total_decay_steps * (1 - linear_decay_fraction) <= offset_step <
total_decay_steps, the actual learning rate equals lr * offset_step^power *
(total_decay_steps - offset_step) / (total_decay_steps *
linear_decay_fraction).
4) offset_step >= total_decay_steps, the actual learning rate equals zero.
Attributes:
name: The name of the learning rate schedule. Defaults to
PowerAndLinearDecay.
initial_learning_rate: A float. The initial learning rate. Defaults to None.
total_decay_steps: An int. The total number of steps for power + linear
decay. Defaults to None.
power: A float. The order of the polynomial. Defaults to -0.5, for sqrt
decay.
linear_decay_fraction: A float. In the last `linear_decay_fraction` steps,
the learning rate will be multiplied by a linear decay. Defaults to 0.1.
offset: An int. The offset applied to steps. Defaults to 0.
"""
name
:
str
=
'PowerAndLinearDecay'
initial_learning_rate
:
Optional
[
float
]
=
None
total_decay_steps
:
Optional
[
int
]
=
None
power
:
float
=
-
0.5
linear_decay_fraction
:
float
=
0.1
offset
:
int
=
0
@
dataclasses
.
dataclass
class
PowerDecayWithOffsetLrConfig
(
base_config
.
Config
):
"""Configuration for power learning rate decay with step offset.
Learning rate equals to `pre_offset_learning_rate` if `step` < `offset`.
Otherwise, learning rate equals to lr * (step - offset)^power.
Attributes:
name: The name of the learning rate schedule. Defaults to
PowerDecayWithOffset.
initial_learning_rate: A float. The initial learning rate. Defaults to None.
power: A float. Defaults to -0.5, for sqrt decay.
offset: An integer. Power decay happens after `offset` steps.
pre_offset_learning_rate: A float. The constant learning rate before
`offset` steps.
"""
name
:
str
=
'PowerDecayWithOffset'
initial_learning_rate
:
Optional
[
float
]
=
None
power
:
float
=
-
0.5
offset
:
int
=
0
pre_offset_learning_rate
:
float
=
1.0e6
@
dataclasses
.
dataclass
class
StepCosineLrConfig
(
base_config
.
Config
):
"""Configuration for stepwise learning rate decay.
This class is a container for the piecewise cosine learning rate scheduling
configs. It will configure an instance of StepConsineDecayWithOffset keras
learning rate schedule.
```python
boundaries: [100000, 110000]
values: [1.0, 0.5]
lr_decayed_fn = (
lr_schedule.StepConsineDecayWithOffset(
boundaries,
values))
```
from 0 to 100000 step, it will cosine decay from 1.0 to 0.5
from 100000 to 110000 step, it cosine decay from 0.5 to 0.0
Attributes:
name: The name of the learning rate schedule. Defaults to PiecewiseConstant.
boundaries: A list of ints of strictly increasing entries. Defaults to None.
values: A list of floats that specifies the values for the intervals defined
by `boundaries`. It should have one more element than `boundaries`.
The learning rate is computed as follows:
[0, boundaries[0]] -> cosine from values[0] to values[1]
[boundaries[0], boundaries[1]] -> values[1] to values[2]
...
[boundaries[n-1], boundaries[n]] -> values[n] to values[n+1]
[boundaries[n], end] -> values[n+1] to 0.
offset: An int. The offset applied to steps. Defaults to 0.
"""
name
:
str
=
'StepConsineDecayWithOffset'
boundaries
:
Optional
[
List
[
int
]]
=
None
values
:
Optional
[
List
[
float
]]
=
None
offset
:
int
=
0
@
dataclasses
.
dataclass
class
LinearWarmupConfig
(
base_config
.
Config
):
"""Configuration for linear warmup schedule config.
This class is a container for the linear warmup schedule configs.
Warmup_learning_rate is the initial learning rate, the final learning rate of
the warmup period is the learning_rate of the optimizer in use. The learning
rate at each step linearly increased according to the following formula:
warmup_learning_rate = warmup_learning_rate +
step / warmup_steps * (final_learning_rate - warmup_learning_rate).
Using warmup overrides the learning rate schedule by the number of warmup
steps.
Attributes:
name: The name of warmup schedule. Defaults to linear.
warmup_learning_rate: Initial learning rate for the warmup. Defaults to 0.
warmup_steps: Warmup steps. Defaults to None.
"""
name
:
str
=
'linear'
warmup_learning_rate
:
float
=
0
warmup_steps
:
Optional
[
int
]
=
None
@
dataclasses
.
dataclass
class
PolynomialWarmupConfig
(
base_config
.
Config
):
"""Configuration for linear warmup schedule config.
This class is a container for the polynomial warmup schedule configs.
Attributes:
name: The name of warmup schedule. Defaults to Polynomial.
power: Polynomial power. Defaults to 1.
warmup_steps: Warmup steps. Defaults to None.
"""
name
:
str
=
'polynomial'
power
:
float
=
1
warmup_steps
:
Optional
[
int
]
=
None
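
StepwiseLrConfig above only stores the boundaries/values pair; per its docstring it is ultimately realized as Keras's PiecewiseConstantDecay (wrapped with the step offset in lr_schedule.py further down). A minimal sketch of the equivalent raw Keras schedule, with illustrative numbers only:

```python
import tensorflow as tf

# 1.0 for steps [0, 100000], 0.5 for (100000, 110000], 0.1 afterwards --
# the interval-to-value mapping described in the StepwiseLrConfig docstring.
schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
    boundaries=[100000, 110000], values=[1.0, 0.5, 0.1])

for step in (0, 100000, 105000, 200000):
  print(step, float(schedule(step)))  # -> 1.0, 1.0, 0.5, 0.1
```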
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/configs/optimization_config.py (new file, 0 → 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dataclasses for optimization configs.
This file define the dataclass for optimization configs (OptimizationConfig).
It also has two helper functions get_optimizer_config, and get_lr_config from
an OptimizationConfig class.
"""
from
typing
import
Optional
import
dataclasses
from
official.modeling.hyperparams
import
base_config
from
official.modeling.hyperparams
import
oneof
from
official.modeling.optimization.configs
import
learning_rate_config
as
lr_cfg
from
official.modeling.optimization.configs
import
optimizer_config
as
opt_cfg
@
dataclasses
.
dataclass
class
OptimizerConfig
(
oneof
.
OneOfConfig
):
"""Configuration for optimizer.
Attributes:
type: 'str', type of optimizer to be used, on the of fields below.
sgd: sgd optimizer config.
adam: adam optimizer config.
adamw: adam with weight decay.
lamb: lamb optimizer.
rmsprop: rmsprop optimizer.
lars: lars optimizer.
adagrad: adagrad optimizer.
slide: slide optimizer.
"""
type
:
Optional
[
str
]
=
None
sgd
:
opt_cfg
.
SGDConfig
=
opt_cfg
.
SGDConfig
()
adam
:
opt_cfg
.
AdamConfig
=
opt_cfg
.
AdamConfig
()
adamw
:
opt_cfg
.
AdamWeightDecayConfig
=
opt_cfg
.
AdamWeightDecayConfig
()
lamb
:
opt_cfg
.
LAMBConfig
=
opt_cfg
.
LAMBConfig
()
rmsprop
:
opt_cfg
.
RMSPropConfig
=
opt_cfg
.
RMSPropConfig
()
lars
:
opt_cfg
.
LARSConfig
=
opt_cfg
.
LARSConfig
()
adagrad
:
opt_cfg
.
AdagradConfig
=
opt_cfg
.
AdagradConfig
()
slide
:
opt_cfg
.
SLIDEConfig
=
opt_cfg
.
SLIDEConfig
()
adafactor
:
opt_cfg
.
AdafactorConfig
=
opt_cfg
.
AdafactorConfig
()
@
dataclasses
.
dataclass
class
LrConfig
(
oneof
.
OneOfConfig
):
"""Configuration for lr schedule.
Attributes:
type: 'str', type of lr schedule to be used, one of the fields below.
constant: constant learning rate config.
stepwise: stepwise learning rate config.
exponential: exponential learning rate config.
polynomial: polynomial learning rate config.
cosine: cosine learning rate config.
power: step^power learning rate config.
power_linear: learning rate config of step^power followed by
step^power*linear.
power_with_offset: power decay with a step offset.
step_cosine_with_offset: Step cosine with a step offset.
"""
type
:
Optional
[
str
]
=
None
constant
:
lr_cfg
.
ConstantLrConfig
=
lr_cfg
.
ConstantLrConfig
()
stepwise
:
lr_cfg
.
StepwiseLrConfig
=
lr_cfg
.
StepwiseLrConfig
()
exponential
:
lr_cfg
.
ExponentialLrConfig
=
lr_cfg
.
ExponentialLrConfig
()
polynomial
:
lr_cfg
.
PolynomialLrConfig
=
lr_cfg
.
PolynomialLrConfig
()
cosine
:
lr_cfg
.
CosineLrConfig
=
lr_cfg
.
CosineLrConfig
()
power
:
lr_cfg
.
DirectPowerLrConfig
=
lr_cfg
.
DirectPowerLrConfig
()
power_linear
:
lr_cfg
.
PowerAndLinearDecayLrConfig
=
(
lr_cfg
.
PowerAndLinearDecayLrConfig
())
power_with_offset
:
lr_cfg
.
PowerDecayWithOffsetLrConfig
=
(
lr_cfg
.
PowerDecayWithOffsetLrConfig
())
step_cosine_with_offset
:
lr_cfg
.
StepCosineLrConfig
=
(
lr_cfg
.
StepCosineLrConfig
())
@
dataclasses
.
dataclass
class
WarmupConfig
(
oneof
.
OneOfConfig
):
"""Configuration for lr schedule.
Attributes:
type: 'str', type of warmup schedule to be used, one of the fields below.
linear: linear warmup config.
polynomial: polynomial warmup config.
"""
type
:
Optional
[
str
]
=
None
linear
:
lr_cfg
.
LinearWarmupConfig
=
lr_cfg
.
LinearWarmupConfig
()
polynomial
:
lr_cfg
.
PolynomialWarmupConfig
=
lr_cfg
.
PolynomialWarmupConfig
()
@
dataclasses
.
dataclass
class
OptimizationConfig
(
base_config
.
Config
):
"""Configuration for optimizer and learning rate schedule.
Attributes:
optimizer: optimizer oneof config.
ema: optional exponential moving average optimizer config, if specified, ema
optimizer will be used.
learning_rate: learning rate oneof config.
warmup: warmup oneof config.
"""
optimizer
:
OptimizerConfig
=
OptimizerConfig
()
ema
:
Optional
[
opt_cfg
.
EMAConfig
]
=
None
learning_rate
:
LrConfig
=
LrConfig
()
warmup
:
WarmupConfig
=
WarmupConfig
()
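
The oneof configs above are exercised in the test file that follows; for orientation, the same nested-dict construction can also carry concrete hyperparameters. A hedged sketch (the field names come from the dataclasses above; the numeric values are arbitrary examples, not recommendations):

```python
from official.modeling.optimization.configs import optimization_config

# Select each branch of the oneof via its `type` field, then override fields
# of the selected sub-config.
config = optimization_config.OptimizationConfig({
    'optimizer': {'type': 'sgd', 'sgd': {'momentum': 0.9}},
    'learning_rate': {'type': 'stepwise',
                      'stepwise': {'boundaries': [30000, 60000],
                                   'values': [0.1, 0.01, 0.001]}},
    'warmup': {'type': 'linear', 'linear': {'warmup_steps': 500}},
})
print(config.optimizer.get())      # SGDConfig with momentum=0.9
print(config.learning_rate.get())  # StepwiseLrConfig with the given boundaries
```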
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/configs/optimization_config_test.py (new file, 0 → 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for optimization_config.py."""
import
tensorflow
as
tf
from
official.modeling.optimization.configs
import
learning_rate_config
as
lr_cfg
from
official.modeling.optimization.configs
import
optimization_config
from
official.modeling.optimization.configs
import
optimizer_config
as
opt_cfg
class
OptimizerConfigTest
(
tf
.
test
.
TestCase
):
def
test_no_optimizer
(
self
):
optimizer
=
optimization_config
.
OptimizationConfig
({}).
optimizer
.
get
()
self
.
assertIsNone
(
optimizer
)
def
test_no_lr_schedule
(
self
):
lr
=
optimization_config
.
OptimizationConfig
({}).
learning_rate
.
get
()
self
.
assertIsNone
(
lr
)
def
test_no_warmup_schedule
(
self
):
warmup
=
optimization_config
.
OptimizationConfig
({}).
warmup
.
get
()
self
.
assertIsNone
(
warmup
)
def
test_config
(
self
):
opt_config
=
optimization_config
.
OptimizationConfig
({
'optimizer'
:
{
'type'
:
'sgd'
,
'sgd'
:
{}
# default config
},
'learning_rate'
:
{
'type'
:
'polynomial'
,
'polynomial'
:
{}
},
'warmup'
:
{
'type'
:
'linear'
}
})
self
.
assertEqual
(
opt_config
.
optimizer
.
get
(),
opt_cfg
.
SGDConfig
())
self
.
assertEqual
(
opt_config
.
learning_rate
.
get
(),
lr_cfg
.
PolynomialLrConfig
())
self
.
assertEqual
(
opt_config
.
warmup
.
get
(),
lr_cfg
.
LinearWarmupConfig
())
if
__name__
==
'__main__'
:
tf
.
test
.
main
()
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/configs/optimizer_config.py (new file, 0 → 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dataclasses for optimizer configs."""
from
typing
import
List
,
Optional
import
dataclasses
from
official.modeling.hyperparams
import
base_config
@
dataclasses
.
dataclass
class
BaseOptimizerConfig
(
base_config
.
Config
):
"""Base optimizer config.
Attributes:
clipnorm: float >= 0 or None. If not None, Gradients will be clipped when
their L2 norm exceeds this value.
clipvalue: float >= 0 or None. If not None, Gradients will be clipped when
their absolute value exceeds this value.
global_clipnorm: float >= 0 or None. If not None, gradient of all weights is
clipped so that their global norm is no higher than this value
"""
clipnorm
:
Optional
[
float
]
=
None
clipvalue
:
Optional
[
float
]
=
None
global_clipnorm
:
Optional
[
float
]
=
None
@
dataclasses
.
dataclass
class
SGDConfig
(
BaseOptimizerConfig
):
"""Configuration for SGD optimizer.
The attributes for this class matches the arguments of tf.keras.optimizer.SGD.
Attributes:
name: name of the optimizer.
decay: decay rate for SGD optimizer.
nesterov: nesterov for SGD optimizer.
momentum: momentum for SGD optimizer.
"""
name
:
str
=
"SGD"
decay
:
float
=
0.0
nesterov
:
bool
=
False
momentum
:
float
=
0.0
@
dataclasses
.
dataclass
class
RMSPropConfig
(
BaseOptimizerConfig
):
"""Configuration for RMSProp optimizer.
The attributes for this class matches the arguments of
tf.keras.optimizers.RMSprop.
Attributes:
name: name of the optimizer.
rho: discounting factor for RMSprop optimizer.
momentum: momentum for RMSprop optimizer.
epsilon: epsilon value for RMSprop optimizer, help with numerical stability.
centered: Whether to normalize gradients or not.
"""
name
:
str
=
"RMSprop"
rho
:
float
=
0.9
momentum
:
float
=
0.0
epsilon
:
float
=
1e-7
centered
:
bool
=
False
@
dataclasses
.
dataclass
class
AdagradConfig
(
BaseOptimizerConfig
):
"""Configuration for Adagrad optimizer.
The attributes of this class match the arguments of
tf.keras.optimizer.Adagrad.
Attributes:
name: name of the optimizer.
initial_accumulator_value: A floating point value. Starting value for the
accumulators, must be non-negative.
epsilon: A small floating point value to avoid zero denominator.
"""
name
:
str
=
"Adagrad"
initial_accumulator_value
:
float
=
0.1
epsilon
:
float
=
1e-07
@
dataclasses
.
dataclass
class
AdamConfig
(
BaseOptimizerConfig
):
"""Configuration for Adam optimizer.
The attributes for this class matches the arguments of
tf.keras.optimizer.Adam.
Attributes:
name: name of the optimizer.
beta_1: decay rate for 1st order moments.
beta_2: decay rate for 2st order moments.
epsilon: epsilon value used for numerical stability in Adam optimizer.
amsgrad: boolean. Whether to apply AMSGrad variant of this algorithm from
the paper "On the Convergence of Adam and beyond".
"""
name
:
str
=
"Adam"
beta_1
:
float
=
0.9
beta_2
:
float
=
0.999
epsilon
:
float
=
1e-07
amsgrad
:
bool
=
False
@
dataclasses
.
dataclass
class
AdamWeightDecayConfig
(
BaseOptimizerConfig
):
"""Configuration for Adam optimizer with weight decay.
Attributes:
name: name of the optimizer.
beta_1: decay rate for 1st order moments.
beta_2: decay rate for 2st order moments.
epsilon: epsilon value used for numerical stability in the optimizer.
amsgrad: boolean. Whether to apply AMSGrad variant of this algorithm from
the paper "On the Convergence of Adam and beyond".
weight_decay_rate: float. Weight decay rate. Default to 0.
include_in_weight_decay: list[str], or None. List of weight names to include
in weight decay.
exclude_from_weight_decay: list[str], or None. List of weight names to not
include in weight decay.
gradient_clip_norm: A positive float. Clips the gradients to this maximum
L2-norm. Default to 1.0.
"""
name
:
str
=
"AdamWeightDecay"
beta_1
:
float
=
0.9
beta_2
:
float
=
0.999
epsilon
:
float
=
1e-07
amsgrad
:
bool
=
False
weight_decay_rate
:
float
=
0.0
include_in_weight_decay
:
Optional
[
List
[
str
]]
=
None
exclude_from_weight_decay
:
Optional
[
List
[
str
]]
=
None
gradient_clip_norm
:
float
=
1.0
@
dataclasses
.
dataclass
class
LAMBConfig
(
BaseOptimizerConfig
):
"""Configuration for LAMB optimizer.
The attributes for this class matches the arguments of
tensorflow_addons.optimizers.LAMB.
Attributes:
name: name of the optimizer.
beta_1: decay rate for 1st order moments.
beta_2: decay rate for 2st order moments.
epsilon: epsilon value used for numerical stability in LAMB optimizer.
weight_decay_rate: float. Weight decay rate. Default to 0.
exclude_from_weight_decay: List of regex patterns of variables excluded from
weight decay. Variables whose name contain a substring matching the
pattern will be excluded.
exclude_from_layer_adaptation: List of regex patterns of variables excluded
from layer adaptation. Variables whose name contain a substring matching
the pattern will be excluded.
"""
name
:
str
=
"LAMB"
beta_1
:
float
=
0.9
beta_2
:
float
=
0.999
epsilon
:
float
=
1e-6
weight_decay_rate
:
float
=
0.0
exclude_from_weight_decay
:
Optional
[
List
[
str
]]
=
None
exclude_from_layer_adaptation
:
Optional
[
List
[
str
]]
=
None
@
dataclasses
.
dataclass
class
EMAConfig
(
BaseOptimizerConfig
):
"""Exponential moving average optimizer config.
Attributes:
name: 'str', name of the optimizer.
trainable_weights_only: 'bool', if True, only model trainable weights will
be updated. Otherwise, all model weights will be updated. This mainly
affects batch normalization parameters.
average_decay: 'float', average decay value.
start_step: 'int', start step to apply moving average.
dynamic_decay: 'bool', whether to apply dynamic decay or not.
"""
name
:
str
=
"ExponentialMovingAverage"
trainable_weights_only
:
bool
=
True
average_decay
:
float
=
0.99
start_step
:
int
=
0
dynamic_decay
:
bool
=
True
@
dataclasses
.
dataclass
class
LARSConfig
(
BaseOptimizerConfig
):
"""Layer-wise adaptive rate scaling config.
Attributes:
name: 'str', name of the optimizer.
momentum: `float` hyperparameter >= 0 that accelerates gradient descent in
the relevant direction and dampens oscillations. Defaults to 0.9.
eeta: `float` LARS coefficient as used in the paper. Default set to LARS
coefficient from the paper. (eeta / weight_decay) determines the highest
scaling factor in LARS..
weight_decay_rate: `float` for weight decay.
nesterov: 'boolean' for whether to use nesterov momentum.
classic_momentum: `boolean` for whether to use classic (or popular)
momentum. The learning rate is applied during momentum update in classic
momentum, but after momentum for popular momentum.
exclude_from_weight_decay: A list of `string` for variable screening, if any
of the string appears in a variable's name, the variable will be excluded
for computing weight decay. For example, one could specify the list like
['batch_normalization', 'bias'] to exclude BN and bias from weight decay.
exclude_from_layer_adaptation: Similar to exclude_from_weight_decay, but for
layer adaptation. If it is None, it will be defaulted the same as
exclude_from_weight_decay.
"""
name
:
str
=
"LARS"
momentum
:
float
=
0.9
eeta
:
float
=
0.001
weight_decay_rate
:
float
=
0.0
nesterov
:
bool
=
False
classic_momentum
:
bool
=
True
exclude_from_weight_decay
:
Optional
[
List
[
str
]]
=
None
exclude_from_layer_adaptation
:
Optional
[
List
[
str
]]
=
None
@
dataclasses
.
dataclass
class
SLIDEConfig
(
BaseOptimizerConfig
):
"""Configuration for SLIDE optimizer.
Details coming soon.
"""
name
:
str
=
"SLIDE"
beta_1
:
float
=
0.9
beta_2
:
float
=
0.999
epsilon
:
float
=
1e-6
weight_decay_rate
:
float
=
0.0
weight_decay_type
:
str
=
"inner"
exclude_from_weight_decay
:
Optional
[
List
[
str
]]
=
None
exclude_from_layer_adaptation
:
Optional
[
List
[
str
]]
=
None
include_in_sparse_layer_adaptation
:
Optional
[
List
[
str
]]
=
None
sparse_layer_learning_rate
:
float
=
0.1
do_gradient_rescaling
:
bool
=
True
norm_type
:
str
=
"layer"
ratio_clip_norm
:
float
=
1e5
@
dataclasses
.
dataclass
class
AdafactorConfig
(
BaseOptimizerConfig
):
"""Configuration for Adafactor optimizer.
The attributes for this class matches the arguments of the Adafactor
implementation.
"""
name
:
str
=
"Adafactor"
factored
:
bool
=
True
multiply_by_parameter_scale
:
bool
=
True
beta1
:
Optional
[
float
]
=
None
decay_rate
:
float
=
0.8
step_offset
:
int
=
0
clipping_threshold
:
float
=
1.0
min_dim_size_to_factor
:
int
=
128
epsilon1
:
float
=
1e-30
epsilon2
:
float
=
1e-3
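
Since these optimizer configs are plain dataclasses layered on base_config.Config, individual fields can be overridden at construction time. A small sketch, assuming keyword construction behaves like any other dataclass here (the values are illustrative, not recommendations):

```python
from official.modeling.optimization.configs import optimizer_config as opt_cfg

# Override only the fields that differ from the defaults defined above.
adamw = opt_cfg.AdamWeightDecayConfig(
    weight_decay_rate=0.01,
    exclude_from_weight_decay=['LayerNorm', 'bias'])

print(adamw.beta_1, adamw.weight_decay_rate)  # 0.9 (default), 0.01 (override)
```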
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/ema_optimizer.py (new file, 0 → 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Exponential moving average optimizer."""
from
typing
import
List
,
Optional
,
Text
import
tensorflow
as
tf
# pylint: disable=protected-access
class
ExponentialMovingAverage
(
tf
.
keras
.
optimizers
.
Optimizer
):
"""Optimizer that computes an exponential moving average of the variables.
Empirically it has been found that using the moving average of the trained
parameters of a deep network is better than using its trained parameters
directly. This optimizer allows you to compute this moving average and swap
the variables at save time so that any code outside of the training loop
will use by default the average values instead of the original ones.
Example of usage for training:
```python
opt = tf.keras.optimizers.SGD(learning_rate)
opt = ExponentialMovingAverage(opt)
opt.shadow_copy(model)
```
At test time, swap the shadow variables to evaluate on the averaged weights:
```python
opt.swap_weights()
# Test eval the model here
opt.swap_weights()
```
"""
def
__init__
(
self
,
optimizer
:
tf
.
keras
.
optimizers
.
Optimizer
,
trainable_weights_only
:
bool
=
True
,
average_decay
:
float
=
0.99
,
start_step
:
int
=
0
,
dynamic_decay
:
bool
=
True
,
name
:
Text
=
'ExponentialMovingAverage'
,
**
kwargs
):
"""Construct a new ExponentialMovingAverage optimizer.
Args:
optimizer: `tf.keras.optimizers.Optimizer` that will be
used to compute and apply gradients.
trainable_weights_only: 'bool', if True, only model trainable weights will
be updated. Otherwise, all model weights will be updated. This mainly
affects batch normalization parameters.
average_decay: float. Decay to use to maintain the moving averages
of trained variables.
start_step: int. What step to start the moving average.
dynamic_decay: bool. Whether to change the decay based on the number
of optimizer updates. Decay will start at 0.1 and gradually increase
up to `average_decay` after each optimizer update. This behavior is
similar to `tf.train.ExponentialMovingAverage` in TF 1.x.
name: Optional name for the operations created when applying
gradients. Defaults to "moving_average".
**kwargs: keyword arguments. Allowed to be {`clipnorm`,
`clipvalue`, `lr`, `decay`}.
"""
super
().
__init__
(
name
,
**
kwargs
)
self
.
_average_decay
=
average_decay
self
.
_trainable_weights_only
=
trainable_weights_only
self
.
_start_step
=
tf
.
constant
(
start_step
,
tf
.
float32
)
self
.
_dynamic_decay
=
dynamic_decay
self
.
_optimizer
=
optimizer
self
.
_track_trackable
(
self
.
_optimizer
,
'base_optimizer'
)
self
.
_average_weights
=
None
self
.
_model_weights
=
None
def
shadow_copy
(
self
,
model
:
tf
.
keras
.
Model
):
"""Creates shadow variables for the given model weights."""
if
self
.
_trainable_weights_only
:
self
.
_model_weights
=
model
.
trainable_variables
else
:
self
.
_model_weights
=
model
.
variables
for
var
in
self
.
_model_weights
:
self
.
add_slot
(
var
,
'average'
,
initializer
=
'zeros'
)
self
.
_average_weights
=
[
self
.
get_slot
(
var
,
'average'
)
for
var
in
self
.
_model_weights
]
@
property
def
has_shadow_copy
(
self
):
"""Whether this optimizer has created shadow variables."""
return
self
.
_model_weights
is
not
None
and
self
.
_average_weights
is
not
None
def
_create_slots
(
self
,
var_list
):
self
.
_optimizer
.
_create_slots
(
var_list
=
var_list
)
# pylint: disable=protected-access
def
apply_gradients
(
self
,
grads_and_vars
,
name
:
Optional
[
Text
]
=
None
):
result
=
self
.
_optimizer
.
apply_gradients
(
grads_and_vars
,
name
)
self
.
update_average
(
self
.
iterations
)
return
result
@
tf
.
function
def
update_average
(
self
,
step
:
tf
.
Tensor
):
step
=
tf
.
cast
(
step
,
tf
.
float32
)
if
step
<
self
.
_start_step
:
decay
=
tf
.
constant
(
0.
,
tf
.
float32
)
elif
self
.
_dynamic_decay
:
decay
=
step
-
self
.
_start_step
decay
=
tf
.
minimum
(
self
.
_average_decay
,
(
1.
+
decay
)
/
(
10.
+
decay
))
else
:
decay
=
self
.
_average_decay
def
_apply_moving
(
v_moving
,
v_normal
):
diff
=
v_moving
-
v_normal
v_moving
.
assign_sub
(
tf
.
cast
(
1.
-
decay
,
v_moving
.
dtype
)
*
diff
)
return
v_moving
def
_update
(
strategy
,
v_moving_and_v_normal
):
for
v_moving
,
v_normal
in
v_moving_and_v_normal
:
strategy
.
extended
.
update
(
v_moving
,
_apply_moving
,
args
=
(
v_normal
,))
ctx
=
tf
.
distribute
.
get_replica_context
()
return
ctx
.
merge_call
(
_update
,
args
=
(
zip
(
self
.
_average_weights
,
self
.
_model_weights
),))
def
swap_weights
(
self
):
"""Swap the average and moving weights.
This is a convenience method to allow one to evaluate the averaged weights
at test time. Loads the weights stored in `self._average` into the model,
keeping a copy of the original model weights. Swapping twice will return
the original weights.
"""
if
tf
.
distribute
.
in_cross_replica_context
():
strategy
=
tf
.
distribute
.
get_strategy
()
strategy
.
run
(
self
.
_swap_weights
,
args
=
())
else
:
raise
ValueError
(
'Swapping weights must occur under a '
'tf.distribute.Strategy'
)
@
tf
.
function
def
_swap_weights
(
self
):
def
fn_0
(
a
,
b
):
a
.
assign_add
(
b
)
return
a
def
fn_1
(
b
,
a
):
b
.
assign
(
a
-
b
)
return
b
def
fn_2
(
a
,
b
):
a
.
assign_sub
(
b
)
return
a
def
swap
(
strategy
,
a_and_b
):
"""Swap `a` and `b` and mirror to all devices."""
for
a
,
b
in
a_and_b
:
strategy
.
extended
.
update
(
a
,
fn_0
,
args
=
(
b
,))
# a = a + b
strategy
.
extended
.
update
(
b
,
fn_1
,
args
=
(
a
,))
# b = a - b
strategy
.
extended
.
update
(
a
,
fn_2
,
args
=
(
b
,))
# a = a - b
ctx
=
tf
.
distribute
.
get_replica_context
()
return
ctx
.
merge_call
(
swap
,
args
=
(
zip
(
self
.
_average_weights
,
self
.
_model_weights
),))
def
assign_average_vars
(
self
,
var_list
:
List
[
tf
.
Variable
]):
"""Assign variables in var_list with their respective averages.
Args:
var_list: List of model variables to be assigned to their average.
Returns:
assign_op: The op corresponding to the assignment operation of
variables to their average.
"""
assign_op
=
tf
.
group
([
var
.
assign
(
self
.
get_slot
(
var
,
'average'
))
for
var
in
var_list
if
var
.
trainable
])
return
assign_op
def
_create_hypers
(
self
):
self
.
_optimizer
.
_create_hypers
()
# pylint: disable=protected-access
def
_prepare
(
self
,
var_list
):
return
self
.
_optimizer
.
_prepare
(
var_list
=
var_list
)
# pylint: disable=protected-access
@
property
def
iterations
(
self
):
return
self
.
_optimizer
.
iterations
@
iterations
.
setter
def
iterations
(
self
,
variable
):
self
.
_optimizer
.
iterations
=
variable
@
property
def
weights
(
self
):
# return self._weights + self._optimizer.weights
return
self
.
_optimizer
.
weights
def
variables
(
self
):
return
self
.
_weights
+
[
self
.
iterations
]
@
property
def
lr
(
self
):
return
self
.
_optimizer
.
_get_hyper
(
'learning_rate'
)
@
lr
.
setter
def
lr
(
self
,
lr
):
self
.
_optimizer
.
_set_hyper
(
'learning_rate'
,
lr
)
@
property
def
learning_rate
(
self
):
return
self
.
_optimizer
.
_get_hyper
(
'learning_rate'
)
@
learning_rate
.
setter
def
learning_rate
(
self
,
learning_rate
):
# pylint: disable=redefined-outer-name
self
.
_optimizer
.
_set_hyper
(
'learning_rate'
,
learning_rate
)
def
_resource_apply_dense
(
self
,
grad
,
var
):
return
self
.
_optimizer
.
_resource_apply_dense
(
grad
,
var
)
def
_resource_apply_sparse
(
self
,
grad
,
var
,
indices
):
return
self
.
_optimizer
.
_resource_apply_sparse
(
grad
,
var
,
indices
)
def
_resource_apply_sparse_duplicate_indices
(
self
,
grad
,
var
,
indices
):
return
self
.
_optimizer
.
_resource_apply_sparse_duplicate_indices
(
grad
,
var
,
indices
)
def
get_config
(
self
):
config
=
{
'optimizer'
:
tf
.
keras
.
optimizers
.
serialize
(
self
.
_optimizer
),
'average_decay'
:
self
.
_average_decay
,
'start_step'
:
self
.
_start_step
,
'dynamic_decay'
:
self
.
_dynamic_decay
,
}
base_config
=
super
(
ExponentialMovingAverage
,
self
).
get_config
()
return
dict
(
list
(
base_config
.
items
())
+
list
(
config
.
items
()))
@
classmethod
def
from_config
(
cls
,
config
,
custom_objects
=
None
):
optimizer
=
tf
.
keras
.
optimizers
.
deserialize
(
config
.
pop
(
'optimizer'
),
custom_objects
=
custom_objects
,
)
return
cls
(
optimizer
,
**
config
)
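
Pulling the pieces of this class together, here is a hedged sketch of the workflow described in the class docstring: wrap a base optimizer, create the shadow slots, train as usual, then copy the averages into the model for export. The model and training step are placeholders, and a TF 2.x release where this file's tf.keras.optimizers.Optimizer base is the V2 optimizer is assumed:

```python
import tensorflow as tf

from official.modeling.optimization.ema_optimizer import ExponentialMovingAverage

# Placeholder model; only the optimizer wiring matters in this sketch.
model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])

base_opt = tf.keras.optimizers.SGD(learning_rate=0.1)
ema_opt = ExponentialMovingAverage(base_opt, average_decay=0.99)
ema_opt.shadow_copy(model)  # creates the 'average' slot per trainable weight

# ... training loop calling ema_opt.apply_gradients(...) goes here ...

# Before export/eval, load the averaged values into the model variables
# (or use swap_weights() under a tf.distribute.Strategy as shown above).
ema_opt.assign_average_vars(model.trainable_variables)
```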
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/lars_optimizer.py (new file, 0 → 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Layer-wise adaptive rate scaling optimizer."""
import
re
from
typing
import
Text
,
List
,
Optional
import
tensorflow
as
tf
# pylint: disable=protected-access
class
LARS
(
tf
.
keras
.
optimizers
.
Optimizer
):
"""Layer-wise Adaptive Rate Scaling for large batch training.
Introduced by "Large Batch Training of Convolutional Networks" by Y. You,
I. Gitman, and B. Ginsburg. (https://arxiv.org/abs/1708.03888)
"""
def
__init__
(
self
,
learning_rate
:
float
=
0.01
,
momentum
:
float
=
0.9
,
weight_decay_rate
:
float
=
0.0
,
eeta
:
float
=
0.001
,
nesterov
:
bool
=
False
,
classic_momentum
:
bool
=
True
,
exclude_from_weight_decay
:
Optional
[
List
[
Text
]]
=
None
,
exclude_from_layer_adaptation
:
Optional
[
List
[
Text
]]
=
None
,
name
:
Text
=
"LARS"
,
**
kwargs
):
"""Constructs a LARSOptimizer.
Args:
learning_rate: `float` for learning rate. Defaults to 0.01.
momentum: `float` hyperparameter >= 0 that accelerates gradient descent
in the relevant direction and dampens oscillations. Defaults to 0.9.
weight_decay_rate: `float` for weight decay.
eeta: `float` LARS coefficient as used in the paper. Default set to LARS
coefficient from the paper. (eeta / weight_decay) determines the
highest scaling factor in LARS..
nesterov: 'boolean' for whether to use nesterov momentum.
classic_momentum: `boolean` for whether to use classic (or popular)
momentum. The learning rate is applied during momentum update in
classic momentum, but after momentum for popular momentum.
exclude_from_weight_decay: A list of `string` for variable screening, if
any of the string appears in a variable's name, the variable will be
excluded for computing weight decay. For example, one could specify
the list like ['batch_normalization', 'bias'] to exclude BN and bias
from weight decay.
exclude_from_layer_adaptation: Similar to exclude_from_weight_decay, but
for layer adaptation. If it is None, it will be defaulted the same as
exclude_from_weight_decay.
name: `Text` as optional name for the operations created when applying
gradients. Defaults to "LARS".
**kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
`decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
gradients by value, `decay` is included for backward compatibility to
allow time inverse decay of learning rate. `lr` is included for
backward compatibility, recommended to use `learning_rate` instead.
"""
super
(
LARS
,
self
).
__init__
(
name
,
**
kwargs
)
self
.
_set_hyper
(
"learning_rate"
,
learning_rate
)
self
.
_set_hyper
(
"decay"
,
self
.
_initial_decay
)
self
.
momentum
=
momentum
self
.
weight_decay_rate
=
weight_decay_rate
self
.
eeta
=
eeta
self
.
nesterov
=
nesterov
self
.
classic_momentum
=
classic_momentum
self
.
exclude_from_weight_decay
=
exclude_from_weight_decay
# exclude_from_layer_adaptation is set to exclude_from_weight_decay if the
# arg is None.
if
exclude_from_layer_adaptation
:
self
.
exclude_from_layer_adaptation
=
exclude_from_layer_adaptation
else
:
self
.
exclude_from_layer_adaptation
=
exclude_from_weight_decay
def
_create_slots
(
self
,
var_list
):
for
v
in
var_list
:
self
.
add_slot
(
v
,
"momentum"
)
def
_resource_apply_dense
(
self
,
grad
,
param
,
apply_state
=
None
):
if
grad
is
None
or
param
is
None
:
return
tf
.
no_op
()
var_device
,
var_dtype
=
param
.
device
,
param
.
dtype
.
base_dtype
coefficients
=
((
apply_state
or
{}).
get
((
var_device
,
var_dtype
))
or
self
.
_fallback_apply_state
(
var_device
,
var_dtype
))
learning_rate
=
coefficients
[
"lr_t"
]
param_name
=
param
.
name
v
=
self
.
get_slot
(
param
,
"momentum"
)
if
self
.
_use_weight_decay
(
param_name
):
grad
+=
self
.
weight_decay_rate
*
param
if
self
.
classic_momentum
:
trust_ratio
=
1.0
if
self
.
_do_layer_adaptation
(
param_name
):
w_norm
=
tf
.
norm
(
param
,
ord
=
2
)
g_norm
=
tf
.
norm
(
grad
,
ord
=
2
)
trust_ratio
=
tf
.
where
(
tf
.
greater
(
w_norm
,
0
),
tf
.
where
(
tf
.
greater
(
g_norm
,
0
),
(
self
.
eeta
*
w_norm
/
g_norm
),
1.0
),
1.0
)
scaled_lr
=
learning_rate
*
trust_ratio
next_v
=
tf
.
multiply
(
self
.
momentum
,
v
)
+
scaled_lr
*
grad
if
self
.
nesterov
:
update
=
tf
.
multiply
(
self
.
momentum
,
next_v
)
+
scaled_lr
*
grad
else
:
update
=
next_v
next_param
=
param
-
update
else
:
next_v
=
tf
.
multiply
(
self
.
momentum
,
v
)
+
grad
if
self
.
nesterov
:
update
=
tf
.
multiply
(
self
.
momentum
,
next_v
)
+
grad
else
:
update
=
next_v
trust_ratio
=
1.0
if
self
.
_do_layer_adaptation
(
param_name
):
w_norm
=
tf
.
norm
(
param
,
ord
=
2
)
v_norm
=
tf
.
norm
(
update
,
ord
=
2
)
trust_ratio
=
tf
.
where
(
tf
.
greater
(
w_norm
,
0
),
tf
.
where
(
tf
.
greater
(
v_norm
,
0
),
(
self
.
eeta
*
w_norm
/
v_norm
),
1.0
),
1.0
)
scaled_lr
=
trust_ratio
*
learning_rate
next_param
=
param
-
scaled_lr
*
update
return
tf
.
group
(
*
[
param
.
assign
(
next_param
,
use_locking
=
False
),
v
.
assign
(
next_v
,
use_locking
=
False
)
])
def
_resource_apply_sparse
(
self
,
grad
,
handle
,
indices
,
apply_state
):
raise
NotImplementedError
(
"Applying sparse gradients is not implemented."
)
def
_use_weight_decay
(
self
,
param_name
):
"""Whether to use L2 weight decay for `param_name`."""
if
not
self
.
weight_decay_rate
:
return
False
if
self
.
exclude_from_weight_decay
:
for
r
in
self
.
exclude_from_weight_decay
:
if
re
.
search
(
r
,
param_name
)
is
not
None
:
return
False
return
True
def
_do_layer_adaptation
(
self
,
param_name
):
"""Whether to do layer-wise learning rate adaptation for `param_name`."""
if
self
.
exclude_from_layer_adaptation
:
for
r
in
self
.
exclude_from_layer_adaptation
:
if
re
.
search
(
r
,
param_name
)
is
not
None
:
return
False
return
True
def
get_config
(
self
):
config
=
super
(
LARS
,
self
).
get_config
()
config
.
update
({
"learning_rate"
:
self
.
_serialize_hyperparameter
(
"learning_rate"
),
"decay"
:
self
.
_serialize_hyperparameter
(
"decay"
),
"momentum"
:
self
.
momentum
,
"classic_momentum"
:
self
.
classic_momentum
,
"weight_decay_rate"
:
self
.
weight_decay_rate
,
"eeta"
:
self
.
eeta
,
"nesterov"
:
self
.
nesterov
,
})
return
config
@
classmethod
def
from_config
(
cls
,
config
,
custom_objects
=
None
):
return
cls
(
**
config
)
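
A hedged sketch of driving this LARS implementation directly on a toy variable, assuming a TF 2.x release where tf.keras.optimizers.Optimizer is the V2 base class this file subclasses; the hyperparameters and the variable name are illustrative:

```python
import tensorflow as tf

from official.modeling.optimization.lars_optimizer import LARS

opt = LARS(learning_rate=0.1, momentum=0.9, weight_decay_rate=1e-4,
           exclude_from_weight_decay=['batch_normalization', 'bias'])

w = tf.Variable([[1.0, -2.0]], name='dense_kernel')
with tf.GradientTape() as tape:
  loss = tf.reduce_sum(tf.square(w))
grads = tape.gradient(loss, [w])

# One momentum step whose learning rate is scaled by the layer-wise trust
# ratio eeta * ||w|| / ||g|| computed in _resource_apply_dense above.
opt.apply_gradients(zip(grads, [w]))
print(w.numpy())
```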
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/lr_schedule.py (new file, 0 → 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Learning rate schedule classes."""
import
math
from
typing
import
Mapping
,
Any
,
Union
,
Optional
import
tensorflow
as
tf
def
_make_offset_wrapper
(
new_class_name
:
str
,
base_lr_class
):
"""Generates a offset wrapper of learning rate schedule.
It will returns a subclass of the the `base_lr_class`, the subclass takes an
`offset` argument in the constructor. When the new class instance is called,
the behavior is:
new_class_object(step) = base_lr_class_object(step - offset)
Example:
CosineDecayWithOffset = _make_offset_wrapper(
'CosineDecayWithOffset', tf.keras.experimental.CosineDecay)
# Use the lr:
lr = CosineDecayWithOffset(offset=100, initial_learning_rate=0.1,
decay_steps=1000)
lr(101) # equals to tf.keras.experimental.CosineDecay(...)(101-100)
Args:
new_class_name: the name of the new class.
base_lr_class: the base learning rate schedule class. Should be subclass of
tf.keras.optimizers.schedules.LearningRateSchedule
Returns:
A new class (subclass of the base_lr_class) that can take an offset.
"""
assert
issubclass
(
base_lr_class
,
tf
.
keras
.
optimizers
.
schedules
.
LearningRateSchedule
),
(
"base_lr_class should be subclass of keras "
f
"LearningRateSchedule, got
{
base_lr_class
}
"
)
# pylint: disable=protected-access,pointless-statement
def
offset_learning_rate_init
(
self
,
offset
=
0
,
**
kwargs
):
"""Construct learning rate schedule object.
When this object is called, its behavior is
self.__call__(step) == base_lr_class.__call__(step - offset)
Args:
self: this object.
offset: The offset when computing the learning rate schedule.
**kwargs: Pass through to base learning rate class constructor.
"""
base_lr_class
.
__init__
(
self
,
**
kwargs
)
self
.
_offset
=
offset
def
offset_learning_rate_call
(
self
,
step
):
step
=
tf
.
cast
(
step
-
self
.
_offset
,
tf
.
float32
)
return
base_lr_class
.
__call__
(
self
,
step
)
# pylint: enable=protected-access,pointless-statement
return
type
(
new_class_name
,
(
base_lr_class
,),
{
"base_lr_class"
:
base_lr_class
,
"__init__"
:
offset_learning_rate_init
,
"__call__"
:
offset_learning_rate_call
})
PiecewiseConstantDecayWithOffset
=
_make_offset_wrapper
(
"PiecewiseConstantDecayWithOffset"
,
tf
.
keras
.
optimizers
.
schedules
.
PiecewiseConstantDecay
)
PolynomialDecayWithOffset
=
_make_offset_wrapper
(
"PolynomialDecayWithOffset"
,
tf
.
keras
.
optimizers
.
schedules
.
PolynomialDecay
)
ExponentialDecayWithOffset
=
_make_offset_wrapper
(
"ExponentialDecayWithOffset"
,
tf
.
keras
.
optimizers
.
schedules
.
ExponentialDecay
)
CosineDecayWithOffset
=
_make_offset_wrapper
(
"CosineDecayWithOffset"
,
tf
.
keras
.
experimental
.
CosineDecay
)
class
LinearWarmup
(
tf
.
keras
.
optimizers
.
schedules
.
LearningRateSchedule
):
"""Linear warmup schedule."""
def
__init__
(
self
,
after_warmup_lr_sched
:
Union
[
tf
.
keras
.
optimizers
.
schedules
.
LearningRateSchedule
,
float
],
warmup_steps
:
int
,
warmup_learning_rate
:
float
,
name
:
Optional
[
str
]
=
None
):
"""Add linear warmup schedule to a learning rate schedule.
warmup_lr is the initial learning rate, the final learning rate of the
init_warmup period is the initial learning rate of lr_schedule in use.
The learning rate at each step linearly increased according to the following
formula:
learning_rate = warmup_lr + step / warmup_steps
* (final_warmup_lr - warmup_lr).
Using warmup overrides the learning rate schedule by the number of warmup
steps.
Args:
after_warmup_lr_sched: tf.keras.optimizers.schedules .LearningRateSchedule
or a constant.
warmup_steps: Number of the warmup steps.
warmup_learning_rate: Initial learning rate for the warmup.
name: Optional, name of warmup schedule.
"""
super
().
__init__
()
self
.
_name
=
name
self
.
_after_warmup_lr_sched
=
after_warmup_lr_sched
self
.
_warmup_steps
=
warmup_steps
self
.
_init_warmup_lr
=
warmup_learning_rate
if
isinstance
(
after_warmup_lr_sched
,
tf
.
keras
.
optimizers
.
schedules
.
LearningRateSchedule
):
self
.
_final_warmup_lr
=
after_warmup_lr_sched
(
warmup_steps
)
else
:
self
.
_final_warmup_lr
=
tf
.
cast
(
after_warmup_lr_sched
,
dtype
=
tf
.
float32
)
def
__call__
(
self
,
step
:
int
):
global_step
=
tf
.
cast
(
step
,
dtype
=
tf
.
float32
)
linear_warmup_lr
=
(
self
.
_init_warmup_lr
+
global_step
/
self
.
_warmup_steps
*
(
self
.
_final_warmup_lr
-
self
.
_init_warmup_lr
))
if
isinstance
(
self
.
_after_warmup_lr_sched
,
tf
.
keras
.
optimizers
.
schedules
.
LearningRateSchedule
):
after_warmup_lr
=
self
.
_after_warmup_lr_sched
(
step
)
else
:
after_warmup_lr
=
tf
.
cast
(
self
.
_after_warmup_lr_sched
,
dtype
=
tf
.
float32
)
lr
=
tf
.
cond
(
global_step
<
self
.
_warmup_steps
,
lambda
:
linear_warmup_lr
,
lambda
:
after_warmup_lr
)
return
lr
def
get_config
(
self
)
->
Mapping
[
str
,
Any
]:
if
isinstance
(
self
.
_after_warmup_lr_sched
,
tf
.
keras
.
optimizers
.
schedules
.
LearningRateSchedule
):
config
=
{
"after_warmup_lr_sched"
:
self
.
_after_warmup_lr_sched
.
get_config
()}
# pytype: disable=attribute-error
else
:
config
=
{
"after_warmup_lr_sched"
:
self
.
_after_warmup_lr_sched
}
# pytype: disable=attribute-error
config
.
update
({
"warmup_steps"
:
self
.
_warmup_steps
,
"warmup_learning_rate"
:
self
.
_init_warmup_lr
,
"name"
:
self
.
_name
})
return
config
class
PolynomialWarmUp
(
tf
.
keras
.
optimizers
.
schedules
.
LearningRateSchedule
):
"""Applies polynomial warmup schedule on a given learning rate decay schedule."""
def
__init__
(
self
,
after_warmup_lr_sched
:
Union
[
tf
.
keras
.
optimizers
.
schedules
.
LearningRateSchedule
,
float
],
warmup_steps
:
int
,
power
:
float
=
1.0
,
name
:
str
=
"PolynomialWarmup"
):
super
().
__init__
()
if
isinstance
(
after_warmup_lr_sched
,
tf
.
keras
.
optimizers
.
schedules
.
LearningRateSchedule
):
self
.
_initial_learning_rate = after_warmup_lr_sched(warmup_steps)
    else:
      self._initial_learning_rate = tf.cast(
          after_warmup_lr_sched, dtype=tf.float32)

    self._warmup_steps = warmup_steps
    self._power = power
    self._after_warmup_lr_sched = after_warmup_lr_sched
    self._name = name

  def __call__(self, step):
    with tf.name_scope(self._name or "PolynomialWarmUp") as name:
      # Implements polynomial warmup. i.e., if global_step < warmup_steps, the
      # learning rate will be `global_step/num_warmup_steps * init_lr`.
      global_step_float = tf.cast(step, tf.float32)
      warmup_steps_float = tf.cast(self._warmup_steps, tf.float32)

      if self._warmup_steps <= 0:
        warmup_percent_done = 1.0
      else:
        # A zero `step` may cause Inf. So make `step` positive.
        step_non_zero = tf.math.maximum(global_step_float, 1.0)
        warmup_percent_done = step_non_zero / warmup_steps_float

      warmup_learning_rate = (
          self._initial_learning_rate *
          tf.math.pow(warmup_percent_done, self._power))

      if isinstance(self._after_warmup_lr_sched,
                    tf.keras.optimizers.schedules.LearningRateSchedule):
        after_warmup_lr = self._after_warmup_lr_sched(step)
      else:
        after_warmup_lr = tf.cast(self._after_warmup_lr_sched, dtype=tf.float32)

      return tf.cond(
          global_step_float < warmup_steps_float,
          lambda: warmup_learning_rate,
          lambda: after_warmup_lr,
          name=name)

  def get_config(self) -> Mapping[str, Any]:
    if isinstance(self._after_warmup_lr_sched,
                  tf.keras.optimizers.schedules.LearningRateSchedule):
      config = {
          "after_warmup_lr_sched": self._after_warmup_lr_sched.get_config()
      }  # pytype: disable=attribute-error
    else:
      config = {
          "after_warmup_lr_sched": self._after_warmup_lr_sched
      }  # pytype: disable=attribute-error

    config.update({
        "warmup_steps": self._warmup_steps,
        "power": self._power,
        "name": self._name
    })
    return config


class DirectPowerDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Learning rate schedule that follows lr * (step)^power."""

  def __init__(self,
               initial_learning_rate: float,
               power: float = 1.0,
               name: str = "DirectPowerDecay"):
    """Initializes the configuration of the learning rate schedule.

    Args:
      initial_learning_rate: The initial learning rate.
      power: The order of the polynomial.
      name: Optional, name of learning rate schedule.
    """
    super().__init__()
    self._initial_learning_rate = initial_learning_rate
    self._power = power
    self._name = name

  def __call__(self, step):
    with tf.name_scope(self._name or "DirectPowerDecay"):
      step = tf.cast(step, tf.float32)
      learning_rate = self._initial_learning_rate
      # A zero `step` may cause Inf. So make `step` positive.
      step_non_zero = tf.math.maximum(step, 1.0)
      learning_rate *= tf.math.pow(step_non_zero, self._power)
      return learning_rate

  def get_config(self):
    """Gets the configuration of the learning rate schedule."""
    return {
        "initial_learning_rate": self._initial_learning_rate,
        "power": self._power,
        "name": self._name,
    }


class PowerAndLinearDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Power decay multiplied by a linear decay at the end.

  The schedule has the following behavior. Let offset_step = step - offset.

  1) offset_step < 0: the actual learning rate equals initial_learning_rate.
  2) offset_step <= total_decay_steps * (1 - linear_decay_fraction): the
     actual learning rate equals lr * offset_step^power.
  3) total_decay_steps * (1 - linear_decay_fraction) <= offset_step <
     total_decay_steps: the actual learning rate equals lr * offset_step^power *
     (total_decay_steps - offset_step) / (total_decay_steps *
     linear_decay_fraction).
  4) offset_step >= total_decay_steps: the actual learning rate equals zero.
  """

  def __init__(self,
               initial_learning_rate: float,
               total_decay_steps: int,
               power: float = 1.0,
               linear_decay_fraction: float = 0.1,
               offset: int = 0,
               name: str = "PowerAndLinearDecay"):
    """Initializes the configuration of the learning rate schedule.

    Args:
      initial_learning_rate: The initial learning rate.
      total_decay_steps: The total number of steps for power + linear decay.
      power: The order of the polynomial.
      linear_decay_fraction: In the last `linear_decay_fraction` steps, the
        learning rate will be multiplied by a linear decay.
      offset: The offset applied to steps.
      name: Optional, name of learning rate schedule.
    """
    super().__init__()
    self._initial_learning_rate = initial_learning_rate
    self._total_decay_steps = total_decay_steps
    self._power = power
    self._linear_decay_fraction = linear_decay_fraction
    self._offset = offset
    self._name = name

  def __call__(self, step):
    with tf.name_scope(self._name or "PowerAndLinearDecay"):
      step = tf.cast(step - self._offset, tf.float32)
      learning_rate = self._initial_learning_rate
      # A zero `step` may cause Inf. So make `step` positive.
      step_non_zero = tf.math.maximum(step, 1.0)
      learning_rate *= tf.math.pow(step_non_zero, self._power)
      if self._total_decay_steps * self._linear_decay_fraction > 0:
        learning_rate *= tf.minimum(
            1.0, (self._total_decay_steps - step) /
            (self._total_decay_steps * self._linear_decay_fraction))
        learning_rate = tf.maximum(0.0, learning_rate)
      return learning_rate

  def get_config(self):
    """Gets the configuration of the learning rate schedule."""
    return {
        "initial_learning_rate": self._initial_learning_rate,
        "total_decay_steps": self._total_decay_steps,
        "power": self._power,
        "linear_decay_fraction": self._linear_decay_fraction,
        "offset": self._offset,
        "name": self._name,
    }


class PowerDecayWithOffset(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Power learning rate decay with offset.

  The learning rate equals `pre_offset_learning_rate` if `step` < `offset`.
  Otherwise, the learning rate equals lr * (step - offset)^power.
  """

  def __init__(self,
               initial_learning_rate: float,
               power: float = 1.0,
               offset: int = 0,
               pre_offset_learning_rate: float = 1.0e6,
               name: str = "PowerDecayWithOffset"):
    """Initializes the configuration of the learning rate schedule.

    Args:
      initial_learning_rate: The initial learning rate.
      power: The order of the polynomial.
      offset: The offset when computing the power decay.
      pre_offset_learning_rate: The maximum learning rate we'll use.
      name: Optional, name of learning rate schedule.
    """
    super().__init__()
    self._initial_learning_rate = initial_learning_rate
    self._power = power
    self._offset = offset
    self._pre_offset_lr = pre_offset_learning_rate
    self._name = name

  def __call__(self, step):
    with tf.name_scope(self._name or "PowerDecayWithOffset"):
      step = tf.cast(step, tf.float32)
      lr_after_offset = tf.math.pow(
          tf.math.maximum(step - self._offset, 1.0), self._power) * (
              self._initial_learning_rate)

      sign = tf.cast(step > self._offset, tf.float32)
      lr_combined = (1.0 - sign) * self._pre_offset_lr + sign * lr_after_offset
      # Power may give infinitely large LR. So cap it with pre_offset_lr.
      return tf.math.minimum(lr_combined, self._pre_offset_lr)

  def get_config(self):
    """Gets the configuration of the learning rate schedule."""
    return {
        "initial_learning_rate": self._initial_learning_rate,
        "power": self._power,
        "offset": self._offset,
        "pre_offset_learning_rate": self._pre_offset_lr,
        "name": self._name,
    }


class StepConsineDecayWithOffset(
    tf.keras.optimizers.schedules.LearningRateSchedule):
  """Stepwise cosine learning rate decay with offset.

  The learning rate is equivalent to one or more cosine decay(s), each starting
  and ending at an interval boundary.

  Example:

  ```python
  boundaries = [100000, 110000]
  values = [1.0, 0.5]
  lr_decayed_fn = (
      lr_schedule.StepConsineDecayWithOffset(
          boundaries,
          values))
  ```

  From step 0 to step 100000, the learning rate cosine decays from 1.0 to 0.5.
  From step 100000 to step 110000, it cosine decays from 0.5 to 0.0.
  """

  def __init__(self,
               boundaries,
               values,
               offset: int = 0,
               name: str = "StepConsineDecayWithOffset"):
    """Initializes the configuration of the learning rate schedule.

    Args:
      boundaries: A list of `Tensor`s or `int`s with strictly increasing
        entries, and with all elements having the same type as the optimizer
        step.
      values: A list of `Tensor`s or `float`s that specifies the values for the
        intervals defined by `boundaries`. It should have one more element than
        `boundaries`, and all elements should have the same type.
      offset: The offset when computing the power decay.
      name: Optional, name of learning rate schedule.
    """
    super().__init__()
    self.values = values
    self.boundaries = boundaries
    self.offset = offset
    self.name = name

    if len(self.values) < 1:
      raise ValueError(f"Expect non empty {self.values}")
    if len(self.boundaries) != len(self.values):
      raise ValueError(
          "Boundaries length must equal learning rate levels length "
          f"{len(self.boundaries)} != {len(self.values)}")
    self.total_steps = ([
        boundaries[i + 1] - boundaries[i] for i in range(len(boundaries) - 1)
    ] + [0])

  def __call__(self, global_step):
    with tf.name_scope(self.name or "StepConsineDecayWithOffset"):
      global_step = tf.cast(global_step - self.offset, tf.float32)
      lr_levels = self.values
      lr_steps = self.boundaries
      level_total_steps = self.total_steps
      num_levels = len(lr_levels)

      init_lr = lr_levels[0]
      next_init_lr = lr_levels[1] if num_levels > 1 else 0.
      init_total_steps = level_total_steps[0]

      cosine_learning_rate = ((init_lr - next_init_lr) * (tf.cos(
          tf.constant(math.pi) * (global_step) /
          (init_total_steps)) + 1.0) / 2.0 + next_init_lr)
      learning_rate = cosine_learning_rate

      tf.compat.v1.logging.info("DEBUG lr %r next lr %r", learning_rate,
                                cosine_learning_rate)
      tf.compat.v1.logging.info("DEBUG lr %r next lr %r inittotalstep %r",
                                init_lr, next_init_lr, init_total_steps)
      for i in range(1, num_levels):
        next_init_lr = lr_levels[i]
        next_start_step = lr_steps[i]
        next_total_steps = level_total_steps[i]
        next_next_init_lr = lr_levels[i + 1] if num_levels > i + 1 else 0.
        tf.compat.v1.logging.info(
            "DEBUG step %r nilr %r nss %r nts %r nnilr %r", global_step,
            next_init_lr, next_start_step, next_total_steps, next_next_init_lr)

        next_cosine_learning_rate = ((next_init_lr - next_next_init_lr) *
                                     (tf.cos(
                                         tf.constant(math.pi) *
                                         (global_step - next_start_step) /
                                         (next_total_steps)) + 1.0) / 2.0 +
                                     next_next_init_lr)
        learning_rate = tf.where(global_step >= next_start_step,
                                 next_cosine_learning_rate, learning_rate)
        tf.compat.v1.logging.info("DEBUG lr %r next lr %r", learning_rate,
                                  next_cosine_learning_rate)

      return learning_rate

  def get_config(self):
    return {
        "boundaries": self.boundaries,
        "values": self.values,
        "offset": self.offset,
        "name": self.name
    }
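A minimal eager-mode sketch of how the schedules above can be queried (illustrative only, not part of lr_schedule.py; it uses only the constructors and defaults shown above):

```python
from official.modeling.optimization import lr_schedule

# lr(step) = 1.0 * max(step, 1) ** -0.5
power_lr = lr_schedule.DirectPowerDecay(initial_learning_rate=1.0, power=-0.5)

# Held at 3.0 for the first 10 steps, then decays as 1.0 * (step - 10) ** -1.
offset_lr = lr_schedule.PowerDecayWithOffset(
    initial_learning_rate=1.0,
    power=-1.0,
    offset=10,
    pre_offset_learning_rate=3.0)

for step in (1, 10, 100):
  print(step, float(power_lr(step)), float(offset_lr(step)))
```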
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/lr_schedule_test.py
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for lr_schedule."""
from absl.testing import parameterized
import tensorflow as tf

from official.modeling.optimization import lr_schedule


class PowerAndLinearDecayTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.named_parameters(
      dict(
          testcase_name='power_only',
          init_lr=1.0,
          power=-1.0,
          linear_decay_fraction=0.0,
          total_decay_steps=100,
          offset=0,
          expected=[[0, 1.0], [1, 1.0], [40, 1. / 40.], [60, 1. / 60],
                    [100, 1. / 100]]),
      dict(
          testcase_name='linear_only',
          init_lr=1.0,
          power=0.0,
          linear_decay_fraction=1.0,
          total_decay_steps=100,
          offset=0,
          expected=[[0, 1.0], [1, 0.99], [40, 0.6], [60, 0.4], [100, 0.0]]),
      dict(
          testcase_name='general',
          init_lr=1.0,
          power=-1.0,
          linear_decay_fraction=0.5,
          total_decay_steps=100,
          offset=0,
          expected=[[0, 1.0], [1, 1.0], [40, 1. / 40.],
                    [60, 1. / 60. * 0.8], [100, 0.0]]),
      dict(
          testcase_name='offset',
          init_lr=1.0,
          power=-1.0,
          linear_decay_fraction=0.5,
          total_decay_steps=100,
          offset=90,
          expected=[[0, 1.0], [90, 1.0], [91, 1.0], [130, 1. / 40.],
                    [150, 1. / 60. * 0.8], [190, 0.0], [200, 0.0]]),
  )
  def test_power_linear_lr_schedule(self, init_lr, power,
                                    linear_decay_fraction, total_decay_steps,
                                    offset, expected):
    lr = lr_schedule.PowerAndLinearDecay(
        initial_learning_rate=init_lr,
        power=power,
        linear_decay_fraction=linear_decay_fraction,
        total_decay_steps=total_decay_steps,
        offset=offset)
    for step, value in expected:
      self.assertAlmostEqual(lr(step).numpy(), value)


class OffsetLearningRateTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.parameters(
      dict(class_name=lr_schedule.PiecewiseConstantDecayWithOffset),
      dict(class_name=lr_schedule.PolynomialDecayWithOffset),
      dict(class_name=lr_schedule.ExponentialDecayWithOffset),
      dict(class_name=lr_schedule.CosineDecayWithOffset),
  )
  def test_generated_docstring(self, class_name):
    self.assertNotEmpty(class_name.__init__.__doc__)

  @parameterized.parameters(
      dict(
          class_name=lr_schedule.PiecewiseConstantDecayWithOffset,
          kwarg=dict(boundaries=[50, 80], values=[1.0, 0.5, 0.1])),
      dict(
          class_name=lr_schedule.PolynomialDecayWithOffset,
          kwarg=dict(initial_learning_rate=1.0, decay_steps=100)),
      dict(
          class_name=lr_schedule.ExponentialDecayWithOffset,
          kwarg=dict(
              initial_learning_rate=1.0, decay_steps=100, decay_rate=0.5)),
      dict(
          class_name=lr_schedule.CosineDecayWithOffset,
          kwarg=dict(initial_learning_rate=1.0, decay_steps=100)),
  )
  def test_offset(self, class_name, kwarg):
    offset = 10
    offset_lr = class_name(offset=offset, **kwarg)
    base_lr = class_name.base_lr_class(**kwarg)
    self.assertIsInstance(offset_lr, class_name)
    for step in range(10, 101, 10):
      self.assertEqual(offset_lr(step), base_lr(step - offset))


if __name__ == '__main__':
  tf.test.main()
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/optimizer_factory.py
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimizer factory class."""
from typing import Callable, Optional, Union, List, Tuple

import gin
import tensorflow as tf
import tensorflow_addons.optimizers as tfa_optimizers

from official.modeling.optimization import slide_optimizer
from official.modeling.optimization import adafactor_optimizer
from official.modeling.optimization import ema_optimizer
from official.modeling.optimization import lars_optimizer
from official.modeling.optimization import lr_schedule
from official.modeling.optimization.configs import optimization_config as opt_cfg
from official.nlp import optimization as nlp_optimization

OPTIMIZERS_CLS = {
    'sgd': tf.keras.optimizers.SGD,
    'adam': tf.keras.optimizers.Adam,
    'adamw': nlp_optimization.AdamWeightDecay,
    'lamb': tfa_optimizers.LAMB,
    'rmsprop': tf.keras.optimizers.RMSprop,
    'lars': lars_optimizer.LARS,
    'adagrad': tf.keras.optimizers.Adagrad,
    'slide': slide_optimizer.SLIDE,
    'adafactor': adafactor_optimizer.Adafactor,
}

LR_CLS = {
    'stepwise': lr_schedule.PiecewiseConstantDecayWithOffset,
    'polynomial': lr_schedule.PolynomialDecayWithOffset,
    'exponential': lr_schedule.ExponentialDecayWithOffset,
    'cosine': lr_schedule.CosineDecayWithOffset,
    'power': lr_schedule.DirectPowerDecay,
    'power_linear': lr_schedule.PowerAndLinearDecay,
    'power_with_offset': lr_schedule.PowerDecayWithOffset,
    'step_cosine_with_offset': lr_schedule.StepConsineDecayWithOffset,
}

WARMUP_CLS = {
    'linear': lr_schedule.LinearWarmup,
    'polynomial': lr_schedule.PolynomialWarmUp
}


def register_optimizer_cls(key: str,
                           optimizer_config_cls: tf.keras.optimizers.Optimizer):
  """Registers a customized optimizer class.

  The user will still need to subclass data classes in
  configs.optimization_config to be used with OptimizerFactory.

  Args:
    key: A string to which the optimizer_config_cls is registered.
    optimizer_config_cls: A class which inherits tf.keras.optimizers.Optimizer.
  """
  if key in OPTIMIZERS_CLS:
    raise ValueError('%s already registered in OPTIMIZER_CLS.' % key)
  OPTIMIZERS_CLS[key] = optimizer_config_cls


class OptimizerFactory:
  """Optimizer factory class.

  This class builds a learning rate and an optimizer based on an optimization
  config. To use this class, you need to do the following:
  (1) Define an optimization config; this includes the optimizer and the
      learning rate schedule.
  (2) Initialize the class using the optimization config.
  (3) Build the learning rate.
  (4) Build the optimizer.

  This is a typical example of using this class:

  params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {'momentum': 0.9}
        },
        'learning_rate': {
            'type': 'stepwise',
            'stepwise': {'boundaries': [10000, 20000],
                         'values': [0.1, 0.01, 0.001]}
        },
        'warmup': {
            'type': 'linear',
            'linear': {'warmup_steps': 500, 'warmup_learning_rate': 0.01}
        }
    }
  opt_config = OptimizationConfig(params)
  opt_factory = OptimizerFactory(opt_config)
  lr = opt_factory.build_learning_rate()
  optimizer = opt_factory.build_optimizer(lr)
  """

  def __init__(self, config: opt_cfg.OptimizationConfig):
    """Initializes OptimizerFactory.

    Args:
      config: An OptimizationConfig instance containing the optimization
        config.
    """
    self._config = config
    self._optimizer_config = config.optimizer.get()
    self._optimizer_type = config.optimizer.type

    self._use_ema = config.ema is not None
    self._ema_config = config.ema

    if self._optimizer_config is None:
      raise ValueError('Optimizer type must be specified')

    self._lr_config = config.learning_rate.get()
    self._lr_type = config.learning_rate.type

    if self._lr_type is None:
      raise ValueError('Learning rate type must be specified')

    self._warmup_config = config.warmup.get()
    self._warmup_type = config.warmup.type

  def build_learning_rate(self):
    """Builds the learning rate.

    Builds the learning rate from the config. The learning rate schedule is
    built according to the learning rate config. If the learning rate type is
    constant, lr_config.learning_rate is returned.

    Returns:
      A tf.keras.optimizers.schedules.LearningRateSchedule instance. If the
      learning rate type is constant, lr_config.learning_rate is returned.
    """
    if self._lr_type == 'constant':
      lr = self._lr_config.learning_rate
    else:
      lr = LR_CLS[self._lr_type](**self._lr_config.as_dict())

    if self._warmup_config:
      lr = WARMUP_CLS[self._warmup_type](lr, **self._warmup_config.as_dict())

    return lr

  @gin.configurable
  def build_optimizer(
      self,
      lr: Union[tf.keras.optimizers.schedules.LearningRateSchedule, float],
      gradient_transformers: Optional[List[Callable[
          [List[Tuple[tf.Tensor, tf.Tensor]]],
          List[Tuple[tf.Tensor, tf.Tensor]]]]] = None,
      postprocessor: Optional[Callable[[tf.keras.optimizers.Optimizer],
                                       tf.keras.optimizers.Optimizer]] = None):
    """Builds the optimizer.

    Builds the optimizer from the config. It takes the learning rate as input,
    and builds the optimizer according to the optimizer config. Typically, the
    learning rate built using self.build_learning_rate() is passed as an
    argument to this method.

    Args:
      lr: A floating point value, or a
        tf.keras.optimizers.schedules.LearningRateSchedule instance.
      gradient_transformers: Optional list of functions to use to transform
        gradients before applying updates to Variables. The functions are
        applied after gradient_aggregator. The functions should accept and
        return a list of (gradient, variable) tuples. clipvalue, clipnorm, and
        global_clipnorm should not be set when gradient_transformers is passed.
      postprocessor: An optional function for postprocessing the optimizer. It
        takes an optimizer and returns an optimizer.

    Returns:
      A tf.keras.optimizers.Optimizer instance.
    """
    optimizer_dict = self._optimizer_config.as_dict()
    ## Delete clipnorm, clipvalue, global_clipnorm if None.
    if optimizer_dict['clipnorm'] is None:
      del optimizer_dict['clipnorm']
    if optimizer_dict['clipvalue'] is None:
      del optimizer_dict['clipvalue']
    if optimizer_dict['global_clipnorm'] is None:
      del optimizer_dict['global_clipnorm']

    optimizer_dict['learning_rate'] = lr
    if gradient_transformers is not None:
      optimizer_dict['gradient_transformers'] = gradient_transformers

    optimizer = OPTIMIZERS_CLS[self._optimizer_type](**optimizer_dict)

    if self._use_ema:
      optimizer = ema_optimizer.ExponentialMovingAverage(
          optimizer, **self._ema_config.as_dict())
    if postprocessor:
      optimizer = postprocessor(optimizer)
    assert isinstance(optimizer, tf.keras.optimizers.Optimizer), (
        'OptimizerFactory.build_optimizer returning a non-optimizer object: '
        '{}'.format(optimizer))

    return optimizer
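As a rough illustration of the `gradient_transformers` hook documented in `build_optimizer` above (a sketch only; it assumes the SGD and constant learning-rate config keys behave exactly as in the factory docstring and tests):

```python
from official.modeling.optimization import optimizer_factory
from official.modeling.optimization.configs import optimization_config

params = {
    'optimizer': {'type': 'sgd', 'sgd': {'momentum': 0.9}},
    'learning_rate': {'type': 'constant', 'constant': {'learning_rate': 0.1}},
}
opt_factory = optimizer_factory.OptimizerFactory(
    optimization_config.OptimizationConfig(params))
lr = opt_factory.build_learning_rate()


def halve_gradients(grads_and_vars):
  # Scales every gradient by 0.5 before the optimizer applies the update.
  return [(g * 0.5, v) for g, v in grads_and_vars]


optimizer = opt_factory.build_optimizer(
    lr, gradient_transformers=[halve_gradients])
```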
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/optimizer_factory_test.py
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for optimizer_factory.py."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf

from official.modeling.optimization import optimizer_factory
from official.modeling.optimization.configs import optimization_config


class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.parameters(('sgd'), ('rmsprop'), ('adam'), ('adamw'),
                            ('lamb'), ('lars'), ('adagrad'))
  def test_optimizers(self, optimizer_type):
    params = {
        'optimizer': {
            'type': optimizer_type
        },
        'learning_rate': {
            'type': 'constant',
            'constant': {
                'learning_rate': 0.1
            }
        }
    }
    optimizer_cls = optimizer_factory.OPTIMIZERS_CLS[optimizer_type]
    expected_optimizer_config = optimizer_cls().get_config()
    expected_optimizer_config['learning_rate'] = 0.1

    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()
    optimizer = opt_factory.build_optimizer(lr, postprocessor=lambda x: x)

    self.assertIsInstance(optimizer, optimizer_cls)
    self.assertEqual(expected_optimizer_config, optimizer.get_config())

  @parameterized.parameters((None, None), (1.0, None), (None, 1.0))
  def test_gradient_clipping(self, clipnorm, clipvalue):
    params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {
                'clipnorm': clipnorm,
                'clipvalue': clipvalue
            }
        },
        'learning_rate': {
            'type': 'constant',
            'constant': {
                'learning_rate': 1.0
            }
        }
    }

    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()
    optimizer = opt_factory.build_optimizer(lr)

    var0 = tf.Variable([1.0, 2.0])
    var1 = tf.Variable([3.0, 4.0])

    grads0 = tf.constant([0.1, 0.1])
    grads1 = tf.constant([2.0, 3.0])

    grads_and_vars = list(zip([grads0, grads1], [var0, var1]))
    optimizer.apply_gradients(grads_and_vars)

    self.assertAllClose(np.array([0.9, 1.9]), var0.numpy())
    if clipvalue is not None:
      self.assertAllClose(np.array([2.0, 3.0]), var1.numpy())
    elif clipnorm is not None:
      self.assertAllClose(np.array([2.4452999, 3.1679497]), var1.numpy())
    else:
      self.assertAllClose(np.array([1.0, 1.0]), var1.numpy())

  def test_missing_types(self):
    params = {'optimizer': {'type': 'sgd', 'sgd': {'momentum': 0.9}}}
    with self.assertRaises(ValueError):
      optimizer_factory.OptimizerFactory(
          optimization_config.OptimizationConfig(params))
    params = {
        'learning_rate': {
            'type': 'stepwise',
            'stepwise': {
                'boundaries': [10000, 20000],
                'values': [0.1, 0.01, 0.001]
            }
        }
    }
    with self.assertRaises(ValueError):
      optimizer_factory.OptimizerFactory(
          optimization_config.OptimizationConfig(params))

  # TODO(b/187559334) refactor lr_schedule tests into `lr_schedule_test.py`.
  def test_stepwise_lr_schedule(self):
    params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {
                'momentum': 0.9
            }
        },
        'learning_rate': {
            'type': 'stepwise',
            'stepwise': {
                'boundaries': [10000, 20000],
                'values': [0.1, 0.01, 0.001]
            }
        }
    }
    expected_lr_step_values = [[0, 0.1], [5000, 0.1], [10000, 0.1],
                               [10001, 0.01], [20000, 0.01], [20001, 0.001]]
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()

    for step, value in expected_lr_step_values:
      self.assertAlmostEqual(lr(step).numpy(), value)

  def test_stepwise_lr_with_warmup_schedule(self):
    params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {
                'momentum': 0.9
            }
        },
        'learning_rate': {
            'type': 'stepwise',
            'stepwise': {
                'boundaries': [10000, 20000],
                'values': [0.1, 0.01, 0.001]
            }
        },
        'warmup': {
            'type': 'linear',
            'linear': {
                'warmup_steps': 500,
                'warmup_learning_rate': 0.01
            }
        }
    }
    expected_lr_step_values = [[0, 0.01], [250, 0.055], [500, 0.1],
                               [5500, 0.1], [10000, 0.1], [10001, 0.01],
                               [20000, 0.01], [20001, 0.001]]
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()

    for step, value in expected_lr_step_values:
      self.assertAlmostEqual(lr(step).numpy(), value)

  def test_exponential_lr_schedule(self):
    params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {
                'momentum': 0.9
            }
        },
        'learning_rate': {
            'type': 'exponential',
            'exponential': {
                'initial_learning_rate': 0.1,
                'decay_steps': 1000,
                'decay_rate': 0.96,
                'staircase': True
            }
        }
    }
    expected_lr_step_values = [
        [0, 0.1],
        [999, 0.1],
        [1000, 0.096],
        [1999, 0.096],
        [2000, 0.09216],
    ]
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()

    for step, value in expected_lr_step_values:
      self.assertAlmostEqual(lr(step).numpy(), value)

  def test_polynomial_lr_schedule(self):
    params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {
                'momentum': 0.9
            }
        },
        'learning_rate': {
            'type': 'polynomial',
            'polynomial': {
                'initial_learning_rate': 0.1,
                'decay_steps': 1000,
                'end_learning_rate': 0.001
            }
        }
    }

    expected_lr_step_values = [[0, 0.1], [500, 0.0505], [1000, 0.001]]
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()

    for step, value in expected_lr_step_values:
      self.assertAlmostEqual(lr(step).numpy(), value)

  def test_cosine_lr_schedule(self):
    params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {
                'momentum': 0.9
            }
        },
        'learning_rate': {
            'type': 'cosine',
            'cosine': {
                'initial_learning_rate': 0.1,
                'decay_steps': 1000
            }
        }
    }
    expected_lr_step_values = [[0, 0.1], [250, 0.08535534], [500, 0.04999999],
                               [750, 0.01464466], [1000, 0]]
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()

    for step, value in expected_lr_step_values:
      self.assertAlmostEqual(lr(step).numpy(), value)

  def test_constant_lr_with_warmup_schedule(self):
    params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {
                'momentum': 0.9
            }
        },
        'learning_rate': {
            'type': 'constant',
            'constant': {
                'learning_rate': 0.1
            }
        },
        'warmup': {
            'type': 'linear',
            'linear': {
                'warmup_steps': 500,
                'warmup_learning_rate': 0.01
            }
        }
    }

    expected_lr_step_values = [[0, 0.01], [250, 0.055], [500, 0.1],
                               [5000, 0.1], [10000, 0.1], [20000, 0.1]]
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()

    for step, value in expected_lr_step_values:
      self.assertAlmostEqual(lr(step).numpy(), value)

  def test_stepwise_lr_with_polynomial_warmup_schedule(self):
    params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {
                'momentum': 0.9
            }
        },
        'learning_rate': {
            'type': 'stepwise',
            'stepwise': {
                'boundaries': [10000, 20000],
                'values': [0.1, 0.01, 0.001]
            }
        },
        'warmup': {
            'type': 'polynomial',
            'polynomial': {
                'warmup_steps': 500,
                'power': 2.
            }
        }
    }
    expected_lr_step_values = [[0, 0.0], [250, 0.025], [500, 0.1], [5500, 0.1],
                               [10000, 0.1], [10001, 0.01], [20000, 0.01],
                               [20001, 0.001]]
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()

    for step, value in expected_lr_step_values:
      self.assertAlmostEqual(lr(step).numpy(), value, places=6)

  def test_power_lr_schedule(self):
    params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {
                'momentum': 0.9
            }
        },
        'learning_rate': {
            'type': 'power',
            'power': {
                'initial_learning_rate': 1.0,
                'power': -1.0
            }
        }
    }
    expected_lr_step_values = [[0, 1.0], [1, 1.0], [250, 1. / 250.]]
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()

    for step, value in expected_lr_step_values:
      self.assertAlmostEqual(lr(step).numpy(), value)

  def test_power_linear_lr_schedule(self):
    params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {
                'momentum': 0.9
            }
        },
        'learning_rate': {
            'type': 'power_linear',
            'power_linear': {
                'initial_learning_rate': 1.0,
                'power': -1.0,
                'linear_decay_fraction': 0.5,
                'total_decay_steps': 100,
                'offset': 0,
            }
        }
    }
    expected_lr_step_values = [[0, 1.0], [1, 1.0], [40, 1. / 40.],
                               [60, 1. / 60. * 0.8]]
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()

    for step, value in expected_lr_step_values:
      self.assertAlmostEqual(lr(step).numpy(), value)

  def test_power_with_offset_lr_schedule(self):
    params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {
                'momentum': 0.9
            }
        },
        'learning_rate': {
            'type': 'power_with_offset',
            'power_with_offset': {
                'initial_learning_rate': 1.0,
                'power': -1.0,
                'offset': 10,
                'pre_offset_learning_rate': 3.0,
            }
        }
    }
    expected_lr_step_values = [[1, 3.0], [10, 3.0], [20, 1. / 10.]]
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()

    for step, value in expected_lr_step_values:
      self.assertAlmostEqual(lr(step).numpy(), value)

  def test_step_cosine_lr_schedule_with_warmup(self):
    params = {
        'optimizer': {
            'type': 'sgd',
            'sgd': {
                'momentum': 0.9
            }
        },
        'learning_rate': {
            'type': 'step_cosine_with_offset',
            'step_cosine_with_offset': {
                'values': (0.0001, 0.00005),
                'boundaries': (0, 500000),
                'offset': 10000,
            }
        },
        'warmup': {
            'type': 'linear',
            'linear': {
                'warmup_steps': 10000,
                'warmup_learning_rate': 0.0
            }
        }
    }
    expected_lr_step_values = [[0, 0.0], [5000, 1e-4 / 2.0], [10000, 1e-4],
                               [20000, 9.994863e-05], [499999, 5e-05]]
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    lr = opt_factory.build_learning_rate()

    for step, value in expected_lr_step_values:
      self.assertAlmostEqual(lr(step).numpy(), value)


class OptimizerFactoryRegistryTest(tf.test.TestCase):

  def test_registry(self):

    class MyClass():
      pass

    optimizer_factory.register_optimizer_cls('test', MyClass)
    self.assertIn('test', optimizer_factory.OPTIMIZERS_CLS)
    with self.assertRaisesRegex(ValueError, 'test already registered.*'):
      optimizer_factory.register_optimizer_cls('test', MyClass)


if __name__ == '__main__':
  tf.test.main()
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/optimization/slide_optimizer.py
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""SLIDE optimizer.
A new optimizer that will be open sourced soon.
"""
SLIDE = "Unimplemented"
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/performance.py
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Functions and classes related to training performance."""
import tensorflow as tf


def configure_optimizer(optimizer,
                        use_float16=False,
                        use_graph_rewrite=False,
                        loss_scale=None):
  """Configures optimizer object with performance options."""
  if use_float16:
    if loss_scale in (None, 'dynamic'):
      optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)
    else:
      # loss_scale is a number. We interpret that as a fixed loss scale.
      optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
          optimizer, dynamic=False, initial_scale=loss_scale)
  if use_graph_rewrite:
    # Note: the model dtype must be 'float32', which will ensure
    # tf.keras.mixed_precision and enable_mixed_precision_graph_rewrite do not
    # double up.
    optimizer = (
        tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite(
            optimizer))
  return optimizer


def set_mixed_precision_policy(dtype, loss_scale=None):
  """Sets the global `tf.keras.mixed_precision.Policy`."""
  # TODO(b/191894773): Remove loss_scale argument
  assert loss_scale is None, (
      'The loss_scale argument must be None. The argument exists for '
      'historical reasons and will be removed soon.')
  if dtype == tf.float16:
    tf.keras.mixed_precision.set_global_policy('mixed_float16')
  elif dtype == tf.bfloat16:
    tf.keras.mixed_precision.set_global_policy('mixed_bfloat16')
  elif dtype == tf.float32:
    tf.keras.mixed_precision.set_global_policy('float32')
  else:
    raise ValueError('Unexpected dtype: %s' % dtype)
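A short usage sketch of the two helpers above (illustrative only; it assumes the module is importable as `official.modeling.performance`, its path in this tree):

```python
import tensorflow as tf

from official.modeling import performance

# Run the model under mixed float16 and wrap the optimizer so the loss is
# scaled dynamically, which avoids float16 underflow in the gradients.
performance.set_mixed_precision_policy(tf.float16)
optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
optimizer = performance.configure_optimizer(optimizer, use_float16=True)
```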
TensorFlow2x/ComputeVision/Classification/models-master/official/modeling/tf_utils.py
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Common TF utilities."""
import six
import tensorflow as tf

from tensorflow.python.util import deprecation
from official.modeling import activations


@deprecation.deprecated(
    None,
    "tf.keras.layers.Layer supports multiple positional args and kwargs as "
    "input tensors. pack/unpack inputs to override __call__ is no longer "
    "needed.")
def pack_inputs(inputs):
  """Packs a list of `inputs` tensors to a tuple.

  Args:
    inputs: a list of tensors.

  Returns:
    a tuple of tensors. If any input is None, it is replaced with a special
    constant tensor.
  """
  inputs = tf.nest.flatten(inputs)
  outputs = []
  for x in inputs:
    if x is None:
      outputs.append(tf.constant(0, shape=[], dtype=tf.int32))
    else:
      outputs.append(x)
  return tuple(outputs)


@deprecation.deprecated(
    None,
    "tf.keras.layers.Layer supports multiple positional args and kwargs as "
    "input tensors. pack/unpack inputs to override __call__ is no longer "
    "needed.")
def unpack_inputs(inputs):
  """Unpacks a tuple of `inputs` tensors to a tuple.

  Args:
    inputs: a list of tensors.

  Returns:
    a tuple of tensors. If any input is a special constant tensor, it is
    replaced with None.
  """
  inputs = tf.nest.flatten(inputs)
  outputs = []
  for x in inputs:
    if is_special_none_tensor(x):
      outputs.append(None)
    else:
      outputs.append(x)
  x = tuple(outputs)

  # To trick the very pointless 'unbalanced-tuple-unpacking' pylint check
  # from triggering.
  if len(x) == 1:
    return x[0]
  return tuple(outputs)


def is_special_none_tensor(tensor):
  """Checks if a tensor is a special None Tensor."""
  return tensor.shape.ndims == 0 and tensor.dtype == tf.int32


def get_activation(identifier, use_keras_layer=False):
  """Maps an identifier to a Python function, e.g., "relu" => `tf.nn.relu`.

  It checks the string first, and if it is one of the customized activations
  not in TF, the corresponding activation will be returned. For non-customized
  activation names and callable identifiers, it always falls back to
  tf.keras.activations.get. It prefers Keras layers when use_keras_layer=True;
  currently this only supports 'relu', 'linear', 'identity', 'swish'.

  Args:
    identifier: String name of the activation function or callable.
    use_keras_layer: If True, use a Keras layer if the identifier is
      allow-listed.

  Returns:
    A Python function corresponding to the activation function or a Keras
    activation layer when use_keras_layer=True.
  """
  if isinstance(identifier, six.string_types):
    identifier = str(identifier).lower()
    if use_keras_layer:
      keras_layer_allowlist = {
          "relu": "relu",
          "linear": "linear",
          "identity": "linear",
          "swish": "swish",
          "sigmoid": "sigmoid",
          "relu6": tf.nn.relu6,
      }
      if identifier in keras_layer_allowlist:
        return tf.keras.layers.Activation(keras_layer_allowlist[identifier])
    name_to_fn = {
        "gelu": activations.gelu,
        "simple_swish": activations.simple_swish,
        "hard_swish": activations.hard_swish,
        "relu6": activations.relu6,
        "hard_sigmoid": activations.hard_sigmoid,
        "identity": activations.identity,
    }
    if identifier in name_to_fn:
      return tf.keras.activations.get(name_to_fn[identifier])
  return tf.keras.activations.get(identifier)


def get_shape_list(tensor, expected_rank=None, name=None):
  """Returns a list of the shape of tensor, preferring static dimensions.

  Args:
    tensor: A tf.Tensor object to find the shape of.
    expected_rank: (optional) int. The expected rank of `tensor`. If this is
      specified and the `tensor` has a different rank, an exception will be
      thrown.
    name: Optional name of the tensor for the error message.

  Returns:
    A list of dimensions of the shape of tensor. All static dimensions will
    be returned as python integers, and dynamic dimensions will be returned
    as tf.Tensor scalars.
  """
  if expected_rank is not None:
    assert_rank(tensor, expected_rank, name)

  shape = tensor.shape.as_list()

  non_static_indexes = []
  for (index, dim) in enumerate(shape):
    if dim is None:
      non_static_indexes.append(index)

  if not non_static_indexes:
    return shape

  dyn_shape = tf.shape(tensor)
  for index in non_static_indexes:
    shape[index] = dyn_shape[index]
  return shape


def assert_rank(tensor, expected_rank, name=None):
  """Raises an exception if the tensor rank is not of the expected rank.

  Args:
    tensor: A tf.Tensor to check the rank of.
    expected_rank: Python integer or list of integers, expected rank.
    name: Optional name of the tensor for the error message.

  Raises:
    ValueError: If the expected shape doesn't match the actual shape.
  """
  expected_rank_dict = {}
  if isinstance(expected_rank, six.integer_types):
    expected_rank_dict[expected_rank] = True
  else:
    for x in expected_rank:
      expected_rank_dict[x] = True

  actual_rank = tensor.shape.ndims
  if actual_rank not in expected_rank_dict:
    raise ValueError(
        "For the tensor `%s`, the actual tensor rank `%d` (shape = %s) is not "
        "equal to the expected tensor rank `%s`" %
        (name, actual_rank, str(tensor.shape), str(expected_rank)))


def safe_mean(losses):
  """Computes a safe mean of the losses.

  Args:
    losses: `Tensor` whose elements contain individual loss measurements.

  Returns:
    A scalar representing the mean of `losses`. If `num_present` is zero,
    then zero is returned.
  """
  total = tf.reduce_sum(losses)
  num_elements = tf.cast(tf.size(losses), dtype=losses.dtype)
  return tf.math.divide_no_nan(total, num_elements)
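A small sketch of the shape and loss helpers defined above (illustrative only, not part of tf_utils.py):

```python
import tensorflow as tf

from official.modeling import tf_utils

# Static dimensions come back as plain Python ints.
x = tf.ones([2, 128, 768])
print(tf_utils.get_shape_list(x, expected_rank=3))  # [2, 128, 768]

# safe_mean avoids NaN on an empty loss tensor by returning zero.
print(float(tf_utils.safe_mean(tf.constant([1.0, 2.0, 3.0]))))       # 2.0
print(float(tf_utils.safe_mean(tf.constant([], dtype=tf.float32))))  # 0.0
```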
TensorFlow2x/ComputeVision/Classification/models-master/official/nlp/README.md
# TensorFlow NLP Modelling Toolkit

This codebase provides a Natural Language Processing modeling toolkit written in
[TF2](https://www.tensorflow.org/guide/effective_tf2). It allows researchers and
developers to reproduce state-of-the-art model results and train custom models
to experiment with new research ideas.

## Features

* Reusable and modularized modeling building blocks
* Reproducible state-of-the-art results
* Easy to customize and extend
* End-to-end training
* Distributed training on both GPUs and TPUs

## Major components

### Libraries

We provide a modeling library to allow users to train custom models for new
research ideas. Detailed instructions can be found in the READMEs in each folder.

* [modeling/](modeling): modeling library that provides building blocks
  (e.g., Layers, Networks, and Models) that can be assembled into
  transformer-based architectures.
* [data/](data): binaries and utils for input preprocessing, tokenization,
  etc.

### State-of-the-Art models and examples

We provide SoTA model implementations, pre-trained models, training and
evaluation examples, and command lines. Detailed instructions can be found in
the READMEs for specific papers.

1. [BERT](bert): [BERT: Pre-training of Deep Bidirectional Transformers for
   Language Understanding](https://arxiv.org/abs/1810.04805) by Devlin et al.,
   2018
2. [ALBERT](albert): [A Lite BERT for Self-supervised Learning of Language
   Representations](https://arxiv.org/abs/1909.11942) by Lan et al., 2019
3. [XLNet](xlnet): [XLNet: Generalized Autoregressive Pretraining for Language
   Understanding](https://arxiv.org/abs/1906.08237) by Yang et al., 2019
4. [Transformer for translation](transformer): [Attention Is All You
   Need](https://arxiv.org/abs/1706.03762) by Vaswani et al., 2017

### Common Training Driver

We provide a single common driver [train.py](train.py) to train the above SoTA
models on popular tasks. Please see [docs/train.md](docs/train.md) for more
details.

### Pre-trained models with checkpoints and TF-Hub

We provide a large collection of baselines and checkpoints for NLP pre-trained
models. Please see [docs/pretrained_models.md](docs/pretrained_models.md) for
more details.
TensorFlow2x/ComputeVision/Classification/models-master/official/nlp/__init__.py
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
TensorFlow2x/ComputeVision/Classification/models-master/official/nlp/albert/README.md
# ALBERT (ALBERT: A Lite BERT for Self-supervised Learning of Language Representations)

**WARNING**: We are on the way to deprecating this directory.
We will add documentation in `nlp/docs` on using the new code in `nlp/modeling`.

The academic paper which describes ALBERT in detail and provides full results on
a number of tasks can be found here: https://arxiv.org/abs/1909.11942.

This repository contains a TensorFlow 2.x implementation of ALBERT.

## Contents

*   [Contents](#contents)
*   [Pre-trained Models](#pre-trained-models)
*   [Restoring from Checkpoints](#restoring-from-checkpoints)
*   [Set Up](#set-up)
*   [Process Datasets](#process-datasets)
*   [Fine-tuning with BERT](#fine-tuning-with-bert)
*   [Cloud GPUs and TPUs](#cloud-gpus-and-tpus)
*   [Sentence and Sentence-pair Classification Tasks](#sentence-and-sentence-pair-classification-tasks)
*   [SQuAD 1.1](#squad-1.1)

## Pre-trained Models

We released both checkpoints and tf.hub modules as the pretrained models for
fine-tuning. They are TF 2.x compatible and are converted from the ALBERT v2
checkpoints released in the TF 1.x official ALBERT repository
[google-research/albert](https://github.com/google-research/albert)
in order to stay consistent with the ALBERT paper.

Our currently released checkpoints are exactly the same as in the TF 1.x
official ALBERT repository.

### Access to Pretrained Checkpoints

Pretrained checkpoints can be found at the following links:

**Note: We implemented ALBERT using Keras functional-style networks in
[nlp/modeling](../modeling).
ALBERT V2 models compatible with TF 2.x checkpoints are:**

*   **[`ALBERT V2 Base`](https://storage.googleapis.com/cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base.tar.gz)**:
    12-layer, 768-hidden, 12-heads, 12M parameters
*   **[`ALBERT V2 Large`](https://storage.googleapis.com/cloud-tpu-checkpoints/albert/checkpoints/albert_v2_large.tar.gz)**:
    24-layer, 1024-hidden, 16-heads, 18M parameters
*   **[`ALBERT V2 XLarge`](https://storage.googleapis.com/cloud-tpu-checkpoints/albert/checkpoints/albert_v2_xlarge.tar.gz)**:
    24-layer, 2048-hidden, 32-heads, 60M parameters
*   **[`ALBERT V2 XXLarge`](https://storage.googleapis.com/cloud-tpu-checkpoints/albert/checkpoints/albert_v2_xxlarge.tar.gz)**:
    12-layer, 4096-hidden, 64-heads, 235M parameters

We recommend hosting checkpoints on Google Cloud Storage buckets when you use
Cloud GPU/TPU.

### Restoring from Checkpoints

`tf.train.Checkpoint` is used to manage model checkpoints in TF 2. To restore
weights from the provided pre-trained checkpoints, you can use the following
code:

```python
init_checkpoint = 'the pretrained model checkpoint path.'
model = tf.keras.Model()  # Bert pre-trained model as feature extractor.
checkpoint = tf.train.Checkpoint(model=model)
checkpoint.restore(init_checkpoint)
```

Checkpoints featuring native serialized Keras models
(i.e. model.load()/load_weights()) will be available soon.

### Access to Pretrained hub modules.

Pretrained tf.hub modules in TF 2.x SavedModel format can be found at the
following links:

*   **[`ALBERT V2 Base`](https://tfhub.dev/tensorflow/albert_en_base/1)**:
    12-layer, 768-hidden, 12-heads, 12M parameters
*   **[`ALBERT V2 Large`](https://tfhub.dev/tensorflow/albert_en_large/1)**:
    24-layer, 1024-hidden, 16-heads, 18M parameters
*   **[`ALBERT V2 XLarge`](https://tfhub.dev/tensorflow/albert_en_xlarge/1)**:
    24-layer, 2048-hidden, 32-heads, 60M parameters
*   **[`ALBERT V2 XXLarge`](https://tfhub.dev/tensorflow/albert_en_xxlarge/1)**:
    12-layer, 4096-hidden, 64-heads, 235M parameters

## Set Up

```shell
export PYTHONPATH="$PYTHONPATH:/path/to/models"
```

Install `tf-nightly` to get the latest updates:

```shell
pip install tf-nightly-gpu
```

With a TPU, GPU support is not necessary. First, you need to create a
`tf-nightly` TPU with the
[ctpu tool](https://github.com/tensorflow/tpu/tree/master/tools/ctpu):

```shell
ctpu up -name <instance name> --tf-version="nightly"
```

Second, you need to install TF 2 `tf-nightly` on your VM:

```shell
pip install tf-nightly
```

Warning: More detailed TPU-specific set-up instructions and a tutorial should
come along with the official TF 2.x release for TPU. Note that this repo is not
officially supported by the Google Cloud TPU team until TF 2.1 is released.

## Process Datasets

### Pre-training

Pre-training ALBERT using TF 2.x will come soon.
For now, please use the
[ALBERT research repo](https://github.com/google-research/ALBERT)
to pretrain the model and convert the checkpoint to TF 2.x compatible ones using
[tf2_albert_encoder_checkpoint_converter.py](tf2_albert_encoder_checkpoint_converter.py).

### Fine-tuning

To prepare the fine-tuning data for final model training, use the
[`../data/create_finetuning_data.py`](../data/create_finetuning_data.py)
script.

Note that, unlike BERT models which use a word piece tokenizer, ALBERT models
employ a sentence piece tokenizer. So the FLAG tokenizer_impl has to be set to
'sentence_piece'.

The resulting datasets in `tf_record` format and the training meta data should
later be passed to the training or evaluation scripts. The task-specific
arguments are described in the following sections:

*   GLUE

Users can download the [GLUE data](https://gluebenchmark.com/tasks) by running
[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
and unpack it to some directory `$GLUE_DIR`.

```shell
export GLUE_DIR=~/glue
export ALBERT_DIR=gs://cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base

export TASK_NAME=MNLI
export OUTPUT_DIR=gs://some_bucket/datasets
python ../data/create_finetuning_data.py \
 --input_data_dir=${GLUE_DIR}/${TASK_NAME}/ \
 --sp_model_file=${ALBERT_DIR}/30k-clean.model \
 --train_data_output_path=${OUTPUT_DIR}/${TASK_NAME}_train.tf_record \
 --eval_data_output_path=${OUTPUT_DIR}/${TASK_NAME}_eval.tf_record \
 --meta_data_file_path=${OUTPUT_DIR}/${TASK_NAME}_meta_data \
 --fine_tuning_task_type=classification --max_seq_length=128 \
 --classification_task_name=${TASK_NAME} \
 --tokenization=SentencePiece
```

*   SQUAD

The [SQuAD website](https://rajpurkar.github.io/SQuAD-explorer/) contains
detailed information about the SQuAD datasets and evaluation.

The necessary files can be found here:

*   [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
*   [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
*   [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
*   [train-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json)
*   [dev-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json)
*   [evaluate-v2.0.py](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)

```shell
export SQUAD_DIR=~/squad
export SQUAD_VERSION=v1.1
export ALBERT_DIR=gs://cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base
export OUTPUT_DIR=gs://some_bucket/datasets

python ../data/create_finetuning_data.py \
 --squad_data_file=${SQUAD_DIR}/train-${SQUAD_VERSION}.json \
 --sp_model_file=${ALBERT_DIR}/30k-clean.model \
 --train_data_output_path=${OUTPUT_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
 --meta_data_file_path=${OUTPUT_DIR}/squad_${SQUAD_VERSION}_meta_data \
 --fine_tuning_task_type=squad --max_seq_length=384 \
 --tokenization=SentencePiece
```

## Fine-tuning with ALBERT

### Cloud GPUs and TPUs

*   Cloud Storage

The unzipped pre-trained model files can also be found in the Google Cloud
Storage folder `gs://cloud-tpu-checkpoints/albert/checkpoints`. For example:

```shell
export ALBERT_DIR=gs://cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base
export MODEL_DIR=gs://some_bucket/my_output_dir
```

Currently, users are able to access `tf-nightly` TPUs, and the following TPU
script should run with `tf-nightly`.

*   GPU -> TPU

Just add the following flags to `run_classifier.py` or `run_squad.py`:

```shell
  --distribution_strategy=tpu
  --tpu=grpc://${TPU_IP_ADDRESS}:8470
```

### Sentence and Sentence-pair Classification Tasks

This example code fine-tunes `albert_v2_base` on the Microsoft Research
Paraphrase Corpus (MRPC) corpus, which only contains 3,600 examples and can
fine-tune in a few minutes on most GPUs.

We use `albert_v2_base` as an example throughout the workflow.

```shell
export ALBERT_DIR=gs://cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base
export MODEL_DIR=gs://some_bucket/my_output_dir
export GLUE_DIR=gs://some_bucket/datasets
export TASK=MRPC

python run_classifier.py \
  --mode='train_and_eval' \
  --input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
  --train_data_path=${GLUE_DIR}/${TASK}_train.tf_record \
  --eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
  --bert_config_file=${ALBERT_DIR}/albert_config.json \
  --init_checkpoint=${ALBERT_DIR}/bert_model.ckpt \
  --train_batch_size=4 \
  --eval_batch_size=4 \
  --steps_per_loop=1 \
  --learning_rate=2e-5 \
  --num_train_epochs=3 \
  --model_dir=${MODEL_DIR} \
  --distribution_strategy=mirrored
```

Alternatively, instead of specifying `init_checkpoint`, you can specify
`hub_module_url` to employ a pretrained BERT hub module, e.g.,
` --hub_module_url=https://tfhub.dev/tensorflow/albert_en_base/1`.

To use a TPU, you only need to switch the distribution strategy type to `tpu`
with TPU information and use remote storage for model checkpoints.

```shell
export ALBERT_DIR=gs://cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base
export TPU_IP_ADDRESS='???'
export MODEL_DIR=gs://some_bucket/my_output_dir
export GLUE_DIR=gs://some_bucket/datasets

python run_classifier.py \
  --mode='train_and_eval' \
  --input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
  --train_data_path=${GLUE_DIR}/${TASK}_train.tf_record \
  --eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
  --bert_config_file=$ALBERT_DIR/albert_config.json \
  --init_checkpoint=$ALBERT_DIR/bert_model.ckpt \
  --train_batch_size=32 \
  --eval_batch_size=32 \
  --learning_rate=2e-5 \
  --num_train_epochs=3 \
  --model_dir=${MODEL_DIR} \
  --distribution_strategy=tpu \
  --tpu=grpc://${TPU_IP_ADDRESS}:8470
```

### SQuAD 1.1

The Stanford Question Answering Dataset (SQuAD) is a popular question answering
benchmark dataset. See more on the
[SQuAD website](https://rajpurkar.github.io/SQuAD-explorer/).

We use `albert_v2_base` as an example throughout the workflow.

```shell
export ALBERT_DIR=gs://cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base
export SQUAD_DIR=gs://some_bucket/datasets
export MODEL_DIR=gs://some_bucket/my_output_dir
export SQUAD_VERSION=v1.1

python run_squad.py \
  --input_meta_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_meta_data \
  --train_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
  --predict_file=${SQUAD_DIR}/dev-v1.1.json \
  --sp_model_file=${ALBERT_DIR}/30k-clean.model \
  --bert_config_file=$ALBERT_DIR/albert_config.json \
  --init_checkpoint=$ALBERT_DIR/bert_model.ckpt \
  --train_batch_size=4 \
  --predict_batch_size=4 \
  --learning_rate=8e-5 \
  --num_train_epochs=2 \
  --model_dir=${MODEL_DIR} \
  --distribution_strategy=mirrored
```

Similarly, you can replace the `init_checkpoint` FLAG with `hub_module_url` to
specify a hub module path.

To use a TPU, you need to switch the distribution strategy type to `tpu` with
TPU information.

```shell
export ALBERT_DIR=gs://cloud-tpu-checkpoints/albert/checkpoints/albert_v2_base
export TPU_IP_ADDRESS='???'
export MODEL_DIR=gs://some_bucket/my_output_dir
export SQUAD_DIR=gs://some_bucket/datasets
export SQUAD_VERSION=v1.1

python run_squad.py \
  --input_meta_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_meta_data \
  --train_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
  --predict_file=${SQUAD_DIR}/dev-v1.1.json \
  --sp_model_file=${ALBERT_DIR}/30k-clean.model \
  --bert_config_file=$ALBERT_DIR/albert_config.json \
  --init_checkpoint=$ALBERT_DIR/bert_model.ckpt \
  --train_batch_size=32 \
  --learning_rate=8e-5 \
  --num_train_epochs=2 \
  --model_dir=${MODEL_DIR} \
  --distribution_strategy=tpu \
  --tpu=grpc://${TPU_IP_ADDRESS}:8470
```

The dev set predictions will be saved into a file called predictions.json in
the model_dir:

```shell
python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ./squad/predictions.json
```
TensorFlow2x/ComputeVision/Classification/models-master/official/nlp/albert/__init__.py
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
TensorFlow2x/ComputeVision/Classification/models-master/official/nlp/albert/configs.py
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The ALBERT configurations."""
import
six
from
official.nlp.bert
import
configs
class
AlbertConfig
(
configs
.
BertConfig
):
"""Configuration for `ALBERT`."""
def
__init__
(
self
,
num_hidden_groups
=
1
,
inner_group_num
=
1
,
**
kwargs
):
"""Constructs AlbertConfig.
Args:
num_hidden_groups: Number of group for the hidden layers, parameters in
the same group are shared. Note that this value and also the following
'inner_group_num' has to be 1 for now, because all released ALBERT
models set them to 1. We may support arbitary valid values in future.
inner_group_num: Number of inner repetition of attention and ffn.
**kwargs: The remaining arguments are the same as above 'BertConfig'.
"""
super
(
AlbertConfig
,
self
).
__init__
(
**
kwargs
)
# TODO(chendouble): 'inner_group_num' and 'num_hidden_groups' are always 1
# in the released ALBERT. Support other values in AlbertEncoder if needed.
if
inner_group_num
!=
1
or
num_hidden_groups
!=
1
:
raise
ValueError
(
"We only support 'inner_group_num' and "
"'num_hidden_groups' as 1."
)
@
classmethod
def
from_dict
(
cls
,
json_object
):
"""Constructs a `AlbertConfig` from a Python dictionary of parameters."""
config
=
AlbertConfig
(
vocab_size
=
None
)
for
(
key
,
value
)
in
six
.
iteritems
(
json_object
):
config
.
__dict__
[
key
]
=
value
return
config
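For reference, a minimal usage sketch of this class is shown below. The field
values and the config file path are illustrative placeholders, not values from
a released ALBERT config; `from_json_file` is inherited from `BertConfig` and
is the same entry point used by `run_classifier.py` further down.

```python
# Minimal sketch: build an AlbertConfig from a parameter dictionary and from a
# JSON file. Values and path below are illustrative only.
from official.nlp.albert import configs as albert_configs

# from_dict copies every key of the dictionary onto the config object.
config = albert_configs.AlbertConfig.from_dict({
    'vocab_size': 30000,
    'hidden_size': 768,
    'num_attention_heads': 12,
    'num_hidden_layers': 12,
})
print(config.hidden_size)  # 768

# from_json_file reads the same fields from disk, e.g. the albert_config.json
# shipped with a pretrained checkpoint (hypothetical local path).
config = albert_configs.AlbertConfig.from_json_file('/path/to/albert_config.json')
```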
TensorFlow2x/ComputeVision/Classification/models-master/official/nlp/albert/run_classifier.py
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ALBERT classification finetuning runner in tf2.x."""
import
json
import
os
# Import libraries
from
absl
import
app
from
absl
import
flags
from
absl
import
logging
import
tensorflow
as
tf
from
official.common
import
distribute_utils
from
official.nlp.albert
import
configs
as
albert_configs
from
official.nlp.bert
import
bert_models
from
official.nlp.bert
import
run_classifier
as
run_classifier_bert
FLAGS
=
flags
.
FLAGS
def
predict
(
strategy
,
albert_config
,
input_meta_data
,
predict_input_fn
):
"""Function outputs both the ground truth predictions as .tsv files."""
with
strategy
.
scope
():
classifier_model
=
bert_models
.
classifier_model
(
albert_config
,
input_meta_data
[
'num_labels'
])[
0
]
checkpoint
=
tf
.
train
.
Checkpoint
(
model
=
classifier_model
)
latest_checkpoint_file
=
(
FLAGS
.
predict_checkpoint_path
or
tf
.
train
.
latest_checkpoint
(
FLAGS
.
model_dir
))
assert
latest_checkpoint_file
logging
.
info
(
'Checkpoint file %s found and restoring from '
'checkpoint'
,
latest_checkpoint_file
)
checkpoint
.
restore
(
latest_checkpoint_file
).
assert_existing_objects_matched
()
preds
,
ground_truth
=
run_classifier_bert
.
get_predictions_and_labels
(
strategy
,
classifier_model
,
predict_input_fn
,
return_probs
=
True
)
output_predict_file
=
os
.
path
.
join
(
FLAGS
.
model_dir
,
'test_results.tsv'
)
with
tf
.
io
.
gfile
.
GFile
(
output_predict_file
,
'w'
)
as
writer
:
logging
.
info
(
'***** Predict results *****'
)
for
probabilities
in
preds
:
output_line
=
'
\t
'
.
join
(
str
(
class_probability
)
for
class_probability
in
probabilities
)
+
'
\n
'
writer
.
write
(
output_line
)
ground_truth_labels_file
=
os
.
path
.
join
(
FLAGS
.
model_dir
,
'output_labels.tsv'
)
with
tf
.
io
.
gfile
.
GFile
(
ground_truth_labels_file
,
'w'
)
as
writer
:
logging
.
info
(
'***** Ground truth results *****'
)
for
label
in
ground_truth
:
output_line
=
'
\t
'
.
join
(
str
(
label
))
+
'
\n
'
writer
.
write
(
output_line
)
return
def
main
(
_
):
with
tf
.
io
.
gfile
.
GFile
(
FLAGS
.
input_meta_data_path
,
'rb'
)
as
reader
:
input_meta_data
=
json
.
loads
(
reader
.
read
().
decode
(
'utf-8'
))
if
not
FLAGS
.
model_dir
:
FLAGS
.
model_dir
=
'/tmp/bert20/'
strategy
=
distribute_utils
.
get_distribution_strategy
(
distribution_strategy
=
FLAGS
.
distribution_strategy
,
num_gpus
=
FLAGS
.
num_gpus
,
tpu_address
=
FLAGS
.
tpu
)
max_seq_length
=
input_meta_data
[
'max_seq_length'
]
train_input_fn
=
run_classifier_bert
.
get_dataset_fn
(
FLAGS
.
train_data_path
,
max_seq_length
,
FLAGS
.
train_batch_size
,
is_training
=
True
)
eval_input_fn
=
run_classifier_bert
.
get_dataset_fn
(
FLAGS
.
eval_data_path
,
max_seq_length
,
FLAGS
.
eval_batch_size
,
is_training
=
False
)
albert_config
=
albert_configs
.
AlbertConfig
.
from_json_file
(
FLAGS
.
bert_config_file
)
if
FLAGS
.
mode
==
'train_and_eval'
:
run_classifier_bert
.
run_bert
(
strategy
,
input_meta_data
,
albert_config
,
train_input_fn
,
eval_input_fn
)
elif
FLAGS
.
mode
==
'predict'
:
predict
(
strategy
,
albert_config
,
input_meta_data
,
eval_input_fn
)
else
:
raise
ValueError
(
'Unsupported mode is specified: %s'
%
FLAGS
.
mode
)
return
if
__name__
==
'__main__'
:
flags
.
mark_flag_as_required
(
'bert_config_file'
)
flags
.
mark_flag_as_required
(
'input_meta_data_path'
)
flags
.
mark_flag_as_required
(
'model_dir'
)
app
.
run
(
main
)