Commit 3d61d6b3 authored by qianyj's avatar qianyj
Browse files

initial files for ResNet50

parent d3a70caf
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Multitask Evaluator implementation.
The evaluator implements the Orbit `AbstractEvaluator` interface.
"""
from typing import Dict, List, Optional, Union
import gin
import orbit
import tensorflow as tf
from official.core import base_task
from official.core import train_utils
from official.modeling.multitask import base_model
@gin.configurable
class MultiTaskEvaluator(orbit.AbstractEvaluator):
  """Implements the common trainer shared for TensorFlow models."""

  def __init__(
      self,
      eval_tasks: List[base_task.Task],
      model: Union[tf.keras.Model, base_model.MultiTaskBaseModel],
      global_step: Optional[tf.Variable] = None,
      eval_steps: Optional[Dict[str, int]] = None,
      checkpoint_exporter: Optional[train_utils.BestCheckpointExporter] = None):
    """Initialize common trainer for TensorFlow models.

    Args:
      eval_tasks: A list of tasks to evaluate.
      model: tf.keras.Model instance.
      global_step: the global step variable.
      eval_steps: a dictionary of steps to run eval keyed by task names.
      checkpoint_exporter: an object that has the `maybe_export_checkpoint`
        interface.
    """
    # Gets the current distribution strategy. If not inside any strategy scope,
    # it gets a single-replica no-op strategy.
    self._strategy = tf.distribute.get_strategy()
    self._tasks = eval_tasks
    self._model = model
    self._global_step = global_step or orbit.utils.create_global_step()
    self._checkpoint_exporter = checkpoint_exporter
    # Include any extra trackables the model exposes in the checkpoint.
    if hasattr(self.model, "checkpoint_items"):
      checkpoint_items = self.model.checkpoint_items
    else:
      checkpoint_items = {}
    self._checkpoint = tf.train.Checkpoint(
        model=self.model,
        global_step=self.global_step,
        **checkpoint_items)

    # Losses and metrics are created lazily by the corresponding properties.
    self._validation_losses = None
    self._validation_metrics = None

    # Builds per-task datasets.
    self.eval_datasets = {}
    self.eval_steps = eval_steps or {}
    for task in self.tasks:
      self.eval_datasets[task.name] = orbit.utils.make_distributed_dataset(
          self.strategy, task.build_inputs, task.task_config.validation_data)

    # Builds per-task validation loops.
    def get_function(task_name, task):

      task_metrics = self.validation_metrics[task_name]
      task_loss = self.validation_losses[task_name]
      # A multi-task model evaluates each task on its own sub-model; a plain
      # Keras model is shared by every task.
      if isinstance(self.model, base_model.MultiTaskBaseModel):
        model = self.model.sub_tasks[task_name]
      else:
        model = self.model

      def step_fn(inputs):
        logs = task.validation_step(inputs, model=model, metrics=task_metrics)
        task_loss.update_state(logs[task.loss])
        return logs

      @tf.function
      def eval_step_fn(iterator):
        distributed_outputs = self.strategy.run(step_fn, args=(next(iterator),))
        # Unwrap per-replica values into local results so they can be
        # aggregated by `task.aggregate_logs`.
        return tf.nest.map_structure(self.strategy.experimental_local_results,
                                     distributed_outputs)

      return orbit.utils.create_loop_fn(eval_step_fn)

    self.task_fns = {
        task.name: get_function(task.name, task) for task in self.tasks
    }

  @property
  def strategy(self):
    # Distribution strategy captured at construction time.
    return self._strategy

  @property
  def tasks(self):
    # The list of tasks under evaluation.
    return self._tasks

  @property
  def model(self):
    # The model (or MultiTaskBaseModel) being evaluated.
    return self._model

  @property
  def global_step(self):
    # Global step variable tracked in the checkpoint.
    return self._global_step

  @property
  def validation_losses(self):
    """Accesses the validation loss metric object."""
    if self._validation_losses is None:
      # Builds the per-task metrics and losses.
      self._validation_losses = {}
      for task in self.tasks:
        self._validation_losses[task.name] = tf.keras.metrics.Mean(
            "validation_loss", dtype=tf.float32)
    return self._validation_losses

  @property
  def validation_metrics(self):
    """Accesses all validation metric metric objects."""
    if self._validation_metrics is None:
      # Builds the per-task metrics and losses.
      self._validation_metrics = {}
      for task in self.tasks:
        self._validation_metrics[task.name] = task.build_metrics(training=False)
    return self._validation_metrics

  @property
  def checkpoint(self):
    """Accesses the training checkpoint."""
    return self._checkpoint

  def evaluate(self, num_steps: tf.Tensor):
    """Performs evaluation for each `EvalTask`."""
    # Reset all loss and metric states so each call evaluates from scratch.
    for metric in self.validation_losses.values():
      metric.reset_states()
    for metrics in self.validation_metrics.values():
      for metric in metrics:
        metric.reset_states()
    results = {}
    eval_iters = tf.nest.map_structure(iter, self.eval_datasets)

    for task in self.tasks:
      outputs = None
      name = task.name
      eval_iter = eval_iters[name]
      # A per-task step count configured at construction overrides the
      # caller-provided `num_steps`.
      task_eval_steps = self.eval_steps.get(name, None) or num_steps
      outputs = self.task_fns[name](
          eval_iter,
          task_eval_steps,
          state=outputs,
          reduce_fn=task.aggregate_logs)
      task_metrics = self.validation_metrics[name]
      task_loss = self.validation_losses[name]
      logs = {}
      for metric in task_metrics + [task_loss]:
        logs[metric.name] = metric.result()
      # Aggregated step outputs (if any) are reduced into final metrics.
      if outputs:
        metrics = task.reduce_aggregated_logs(
            outputs, global_step=self.global_step)
        logs.update(metrics)
      results[name] = logs

    if self._checkpoint_exporter:
      self._checkpoint_exporter.maybe_export_checkpoint(
          self.checkpoint, results, self.global_step.numpy())
    return results
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for multitask.evaluator."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.core import base_task
from official.core import config_definitions as cfg
from official.modeling.multitask import evaluator
def all_strategy_combinations():
  """Returns eager-mode test combinations over the supported strategies."""
  strategies = [
      strategy_combinations.default_strategy,
      strategy_combinations.cloud_tpu_strategy,
      strategy_combinations.one_device_strategy_gpu,
  ]
  return combinations.combine(distribution=strategies, mode="eager")
class MockModel(tf.keras.Model):
  """Toy model whose added loss depends on whether a "y" feature exists."""

  def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.dense = tf.keras.layers.Dense(1)

  def call(self, inputs):
    print(inputs, type(inputs))
    # Zero loss when "y" is present, unit loss otherwise — lets tests tell
    # the two mock tasks apart by their validation loss.
    loss_value = (
        tf.zeros((1,), dtype=tf.float32)
        if "y" in inputs else tf.ones((1,), dtype=tf.float32))
    self.add_loss(loss_value)
    return self.dense(inputs["x"])
class MockTask(base_task.Task):
  """Mock task object for testing."""

  def build_metrics(self, training: bool = True):
    del training
    return [tf.keras.metrics.Accuracy(name="acc")]

  def build_inputs(self, params):

    def generate_data(_):
      x = tf.zeros(shape=(2,), dtype=tf.float32)
      label = tf.zeros([1], dtype=tf.int32)
      # The "bar" task gets an extra "y" feature so MockModel produces a
      # different (zero) loss for it than for other tasks.
      if self.name == "bar":
        return dict(x=x, y=x), label
      else:
        return dict(x=x), label

    dataset = tf.data.Dataset.range(1)
    dataset = dataset.repeat()
    dataset = dataset.map(
        generate_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    return dataset.prefetch(buffer_size=1).batch(2, drop_remainder=True)

  def validation_step(self, inputs, model: tf.keras.Model, metrics=None):
    logs = super().validation_step(inputs, model, metrics)
    # Constant per-step output so tests can count executed eval steps.
    logs["counter"] = tf.ones((1,), dtype=tf.float32)
    return logs

  def aggregate_logs(self, state, step_outputs):
    # Accumulates step outputs; each value is a list of per-replica tensors
    # which are stacked into one numpy array per step.
    if state is None:
      state = {}
    for key, value in step_outputs.items():
      if key not in state:
        state[key] = []
      state[key].append(
          np.concatenate([np.expand_dims(v.numpy(), axis=0) for v in value]))
    return state

  def reduce_aggregated_logs(self, aggregated_logs, global_step=None):
    # Collapses each accumulated list of arrays into a single scalar sum.
    for k, v in aggregated_logs.items():
      aggregated_logs[k] = np.sum(np.stack(v, axis=0))
    return aggregated_logs
class EvaluatorTest(tf.test.TestCase, parameterized.TestCase):
  """Tests for MultiTaskEvaluator across distribution strategies."""

  @combinations.generate(all_strategy_combinations())
  def test_multitask_evaluator(self, distribution):
    with distribution.scope():
      tasks = [
          MockTask(params=cfg.TaskConfig(), name="bar"),
          MockTask(params=cfg.TaskConfig(), name="foo")
      ]
      model = MockModel()
      test_evaluator = evaluator.MultiTaskEvaluator(
          eval_tasks=tasks, model=model)
    results = test_evaluator.evaluate(tf.convert_to_tensor(1, dtype=tf.int32))
    self.assertContainsSubset(["validation_loss", "acc"], results["bar"].keys())
    self.assertContainsSubset(["validation_loss", "acc"], results["foo"].keys())
    # MockModel adds a zero loss when the "y" feature exists ("bar" task) and
    # a unit loss otherwise ("foo" task).
    self.assertEqual(results["bar"]["validation_loss"], 0.0)
    self.assertEqual(results["foo"]["validation_loss"], 1.0)

  @combinations.generate(all_strategy_combinations())
  def test_multitask_evaluator_numpy_metrics(self, distribution):
    with distribution.scope():
      tasks = [
          MockTask(params=cfg.TaskConfig(), name="bar"),
          MockTask(params=cfg.TaskConfig(), name="foo")
      ]
      model = MockModel()
      test_evaluator = evaluator.MultiTaskEvaluator(
          eval_tasks=tasks, model=model)
    results = test_evaluator.evaluate(tf.convert_to_tensor(5, dtype=tf.int32))
    # MockTask logs a per-replica "counter" of ones each step, so its reduced
    # sum equals steps * replicas.
    self.assertEqual(results["bar"]["counter"],
                     5. * distribution.num_replicas_in_sync)
    self.assertEqual(results["foo"]["counter"],
                     5. * distribution.num_replicas_in_sync)
# Test runner entry point.
if __name__ == "__main__":
  tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Multitask trainer that interleaves each task's train step."""
from typing import Union
import gin
import orbit
import tensorflow as tf
from official.modeling.multitask import base_model
from official.modeling.multitask import base_trainer
from official.modeling.multitask import multitask
from official.modeling.multitask import task_sampler as sampler
@gin.configurable
class MultiTaskInterleavingTrainer(base_trainer.MultiTaskBaseTrainer):
  """MultiTask trainer that interleaves task update."""

  def __init__(self,
               multi_task: multitask.MultiTask,
               multi_task_model: Union[tf.keras.Model,
                                       base_model.MultiTaskBaseModel],
               optimizer: tf.optimizers.Optimizer,
               task_sampler: sampler.TaskSampler,
               trainer_options=None):
    """Initializes the interleaving trainer.

    Args:
      multi_task: a MultiTask instance managing the tasks to train.
      multi_task_model: a tf.keras.Model or MultiTaskBaseModel instance.
      optimizer: the optimizer shared by all tasks.
      task_sampler: a TaskSampler that decides which task trains each step.
      trainer_options: options forwarded to the base trainer.
    """
    super().__init__(
        multi_task=multi_task,
        multi_task_model=multi_task_model,
        optimizer=optimizer,
        trainer_options=trainer_options)
    self._task_sampler = task_sampler

    # Build per task train step.
    def _get_task_step(task_name, task):

      def step_fn(inputs):
        # A multi-task model trains the per-task sub-model; a plain Keras
        # model is shared by all tasks.
        if isinstance(self.multi_task_model, base_model.MultiTaskBaseModel):
          task_model = self.multi_task_model.sub_tasks[task_name]
        else:
          task_model = self.multi_task_model
        task_logs = task.train_step(
            inputs,
            model=task_model,
            optimizer=self.optimizer,
            metrics=self.training_metrics[task_name])
        self.training_losses[task_name].update_state(task_logs[task.loss])

      return step_fn

    self._task_train_step_map = {
        name: _get_task_step(name, task)
        for name, task in self.multi_task.tasks.items()
    }

    # TODO(haozhangthu): Add taskwise step counter to train_loop_end for logging
    # on TensorBoard.
    self._task_step_counters = {
        name: orbit.utils.create_global_step() for name in self.multi_task.tasks
    }

  def task_step_counter(self, name):
    # Per-task counter of how many steps that task has been sampled for.
    return self._task_step_counters[name]

  def train_step(self, iterator_map):
    # Sample one task to train according to a multinomial distribution
    # NOTE: seeding with the global step makes the sampling deterministic
    # given the step value.
    rn = tf.random.stateless_uniform(shape=[], seed=(0, self.global_step))
    cumulative_sample_distribution = self._task_sampler.task_cumulative_distribution(
        self.global_step)
    # Prepend a [0.0] for indexing convenience.
    cumulative_sample_distribution = tf.concat(
        [tf.constant([0.0], dtype=tf.float32), cumulative_sample_distribution],
        axis=0)

    # Run exactly the one task whose half-open interval [begin, end)
    # contains the sampled value.
    for idx, (name, _) in enumerate(self.multi_task.tasks.items()):
      begin = cumulative_sample_distribution[idx]
      end = cumulative_sample_distribution[idx + 1]
      if rn >= begin and rn < end:
        self._strategy.run(
            self._task_train_step_map[name], args=(next(iterator_map[name]),))
        self.global_step.assign_add(1)
        self.task_step_counter(name).assign_add(1)

  def train_loop_end(self):
    """Record loss and metric values per task."""
    result = super().train_loop_end()
    # Interleaving training does not have a good semantic for `total_loss`. In
    # fact, it is always zero. To avoid confusion, we filter the `total_loss`
    # from the result logs.
    if 'total_loss' in result:
      result.pop('total_loss')
    return result
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for multitask.interleaving_trainer."""
from absl.testing import parameterized
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.modeling.multitask import configs
from official.modeling.multitask import interleaving_trainer
from official.modeling.multitask import multitask
from official.modeling.multitask import task_sampler
from official.modeling.multitask import test_utils
def all_strategy_combinations():
  """Returns eager-mode test combinations over the supported strategies."""
  strategies = [
      strategy_combinations.default_strategy,
      strategy_combinations.cloud_tpu_strategy,
      strategy_combinations.one_device_strategy_gpu,
  ]
  return combinations.combine(distribution=strategies, mode="eager")
class InterleavingTrainerTest(tf.test.TestCase, parameterized.TestCase):
  """Tests for MultiTaskInterleavingTrainer across strategies."""

  @combinations.generate(all_strategy_combinations())
  def test_multitask_interleaving_trainer(self, distribution):
    with distribution.scope():
      tasks = [
          test_utils.MockFooTask(params=test_utils.FooConfig(), name="foo"),
          test_utils.MockBarTask(params=test_utils.BarConfig(), name="bar")
      ]
      test_multitask = multitask.MultiTask(tasks=tasks)
      test_optimizer = tf.keras.optimizers.SGD(0.1)
      model = test_utils.MockMultiTaskModel()
      sampler = task_sampler.UniformTaskSampler(
          task_weights=test_multitask.task_weights)
      test_trainer = interleaving_trainer.MultiTaskInterleavingTrainer(
          multi_task=test_multitask,
          multi_task_model=model,
          optimizer=test_optimizer,
          task_sampler=sampler)
      results = test_trainer.train(tf.convert_to_tensor(5, dtype=tf.int32))
      self.assertContainsSubset(["training_loss", "bar_acc"],
                                results["bar"].keys())
      self.assertContainsSubset(["training_loss", "foo_acc"],
                                results["foo"].keys())
      # The interleaving trainer filters the meaningless `total_loss` out of
      # its logs.
      self.assertNotIn("total_loss", results)

  @combinations.generate(all_strategy_combinations())
  def test_trainer_with_configs(self, distribution):
    config = configs.MultiTaskConfig(
        task_routines=(configs.TaskRoutine(
            task_name="foo",
            task_config=test_utils.FooConfig(),
            task_weight=3.0),
                       configs.TaskRoutine(
                           task_name="bar",
                           task_config=test_utils.BarConfig(),
                           task_weight=1.0)))
    with distribution.scope():
      test_multitask = multitask.MultiTask.from_config(config)
    test_optimizer = tf.keras.optimizers.SGD(0.1)
    model = test_utils.MockMultiTaskModel()
    num_step = 1000
    sampler = task_sampler.AnnealingTaskSampler(
        task_weights=test_multitask.task_weights,
        steps_per_epoch=num_step/5,
        total_steps=num_step)
    test_trainer = interleaving_trainer.MultiTaskInterleavingTrainer(
        multi_task=test_multitask,
        multi_task_model=model,
        optimizer=test_optimizer,
        task_sampler=sampler)
    results = test_trainer.train(tf.convert_to_tensor(num_step, dtype=tf.int32))
    self.assertContainsSubset(["training_loss", "bar_acc"],
                              results["bar"].keys())
    self.assertContainsSubset(["training_loss", "foo_acc"],
                              results["foo"].keys())
    self.assertEqual(test_trainer.global_step.numpy(), num_step)
    # Exactly one task is sampled per step, so the per-task step counters
    # must sum to the total number of steps.
    bar_sampled_step = test_trainer.task_step_counter("bar").numpy()
    foo_sampled_step = test_trainer.task_step_counter("foo").numpy()
    self.assertEqual(bar_sampled_step + foo_sampled_step, num_step)
# Test runner entry point.
if __name__ == "__main__":
  tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Experimental MultiTask base class for multi-task training/evaluation."""
import abc
from typing import Dict, List, Optional, Text, Union
import tensorflow as tf
from official.core import base_task
from official.core import config_definitions
from official.core import task_factory
from official.modeling import optimization
from official.modeling.multitask import base_model
from official.modeling.multitask import configs
OptimizationConfig = optimization.OptimizationConfig
RuntimeConfig = config_definitions.RuntimeConfig
class MultiTask(tf.Module, metaclass=abc.ABCMeta):
  """A multi-task class to manage multiple tasks."""

  def __init__(self,
               tasks: Union[Dict[Text, base_task.Task], List[base_task.Task]],
               task_weights: Optional[Dict[str, Union[float, int]]] = None,
               task_eval_steps: Optional[Dict[str, int]] = None,
               name: Optional[str] = None):
    """MultiTask initialization.

    Args:
      tasks: a list or a flat dict of Task.
      task_weights: a dict of (task, task weight), task weight can be applied
        directly during loss summation in a joint backward step, or it can be
        used to sample task among interleaved backward step.
      task_eval_steps: a dict of (task, eval steps).
      name: the instance name of a MultiTask object.
    """
    super().__init__(name=name)
    # Normalize `tasks` into a name-keyed dict, rejecting duplicate names.
    if isinstance(tasks, list):
      self._tasks = {}
      for task in tasks:
        if task.name in self._tasks:
          raise ValueError("Duplicated tasks found, task.name is %s" %
                           task.name)
        self._tasks[task.name] = task
    elif isinstance(tasks, dict):
      self._tasks = tasks
    else:
      raise ValueError("The tasks argument has an invalid type: %s" %
                       type(tasks))
    self.task_eval_steps = task_eval_steps or {}
    self._task_weights = task_weights or {}
    # Tasks without an explicit weight default to 1.0.
    self._task_weights = dict([
        (name, self._task_weights.get(name, 1.0)) for name in self.tasks
    ])

  @classmethod
  def from_config(cls, config: configs.MultiTaskConfig, logging_dir=None):
    """Builds a MultiTask from the task routines in a MultiTaskConfig."""
    tasks = {}
    task_eval_steps = {}
    task_weights = {}
    for task_routine in config.task_routines:
      # Routine-level task_name wins over the task config's own name.
      task_name = task_routine.task_name or task_routine.task_config.name
      tasks[task_name] = task_factory.get_task(
          task_routine.task_config, logging_dir=logging_dir, name=task_name)
      task_eval_steps[task_name] = task_routine.eval_steps
      task_weights[task_name] = task_routine.task_weight
    return cls(
        tasks, task_eval_steps=task_eval_steps, task_weights=task_weights)

  @property
  def tasks(self):
    # Flat dict of task name -> Task.
    return self._tasks

  def task_weight(self, task_name):
    """Returns the configured weight of a single task."""
    return self._task_weights[task_name]

  @property
  def task_weights(self):
    # Dict of task name -> weight, with 1.0 filled in for missing entries.
    return self._task_weights

  @classmethod
  def create_optimizer(cls,
                       optimizer_config: OptimizationConfig,
                       runtime_config: Optional[RuntimeConfig] = None):
    """Creates an optimizer by delegating to base_task.Task."""
    return base_task.Task.create_optimizer(
        optimizer_config=optimizer_config, runtime_config=runtime_config)

  def joint_train_step(self, task_inputs,
                       multi_task_model: base_model.MultiTaskBaseModel,
                       optimizer: tf.keras.optimizers.Optimizer, task_metrics,
                       **kwargs):
    """The joint train step.

    Args:
      task_inputs: a dictionary of task names and per-task features.
      multi_task_model: a MultiTaskBaseModel instance.
      optimizer: a tf.optimizers.Optimizer.
      task_metrics: a dictionary of task names and per-task metrics.
      **kwargs: other arguments to pass through.

    Returns:
      A dictionary of losses, including per-task losses and their weighted sum.
    """
    losses = {}
    with tf.GradientTape() as tape:
      total_loss = 0.0
      for name, model in multi_task_model.sub_tasks.items():
        inputs = task_inputs[name]
        # Iterator outputs are either a (features, labels) tuple or a dict
        # used as both features and labels.
        if isinstance(inputs, tuple) and len(inputs) == 2:
          features, labels = inputs
        elif isinstance(inputs, dict):
          features, labels = inputs, inputs
        else:
          raise ValueError("The iterator output is neither a tuple nor a "
                           "dictionary. It is not implemented to support "
                           "such outputs.")
        outputs = model(features, training=True)
        task_loss = self.tasks[name].build_losses(labels, outputs)
        task_weight = self.task_weight(name)
        total_loss += task_weight * task_loss
        losses[name] = task_loss
        self.tasks[name].process_metrics(task_metrics[name], labels, outputs,
                                         **kwargs)

      # Scales loss as the default gradients allreduce performs sum inside
      # the optimizer.
      scaled_loss = total_loss / tf.distribute.get_strategy(
      ).num_replicas_in_sync
    tvars = multi_task_model.trainable_variables
    grads = tape.gradient(scaled_loss, tvars)
    optimizer.apply_gradients(list(zip(grads, tvars)))
    losses["total_loss"] = total_loss
    return losses
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utils to sample tasks for interleaved optimization."""
import abc
from typing import Union, Dict, Text
import tensorflow as tf
from official.modeling.multitask import configs
class TaskSampler(tf.Module, metaclass=abc.ABCMeta):
  """An abstract class defining task sampling API for interleaving trainer."""

  def __init__(self, task_weights: Dict[Text, Union[float, int]]):
    self._task_weights = task_weights

  @property
  def task_weights(self):
    # Dict of task name -> raw (unnormalized) task weight.
    return self._task_weights

  @abc.abstractmethod
  def task_cumulative_distribution(self, global_step: tf.Tensor) -> tf.Tensor:
    """Compute cumulative distribution to sample tasks.

    It calculates the cumulative distribution of the multinomial task
    distribution with respect to which to be sampled against.

    Args:
      global_step: A tensor indicating current progress of training.

    Returns:
      A float tensor with shape (#(task), 1) that represents the cumulative
      sampling distribution.
    """
    pass
class UniformTaskSampler(TaskSampler):
  """Sample all tasks uniformly."""

  def __init__(self, task_weights: Dict[Text, Union[float, int]]):
    super().__init__(task_weights=task_weights)
    # Each of the N tasks gets probability 1/N; precompute the cumulative
    # distribution once since it never changes.
    num_tasks = len(self._task_weights)
    uniform_probs = tf.constant([1.0 / num_tasks] * num_tasks,
                                dtype=tf.float32)
    self._uniform_cumulative = tf.math.cumsum(uniform_probs)

  def task_cumulative_distribution(self, global_step: tf.Tensor) -> tf.Tensor:
    """Returns the fixed uniform cumulative distribution; step is ignored."""
    del global_step
    return self._uniform_cumulative
class ProportionalTaskSampler(TaskSampler):
  """Sample tasks proportional to task weights."""

  def __init__(self,
               task_weights: Dict[Text, Union[float, int]],
               alpha: float = 1.0):
    super().__init__(task_weights=task_weights)
    self._alpha = tf.cast(alpha, dtype=tf.float32)
    # Raise each weight to the power alpha, normalize, and precompute the
    # cumulative distribution once since it never changes.
    ordered_weights = tf.constant(
        list(self._task_weights.values()), dtype=tf.float32)
    scaled_sizes = tf.math.pow(ordered_weights, self._alpha)
    normalized = scaled_sizes / tf.reduce_sum(scaled_sizes)
    self._proportional_cumulative = tf.math.cumsum(normalized)

  def task_cumulative_distribution(self, global_step: tf.Tensor) -> tf.Tensor:
    """Returns the fixed proportional cumulative distribution."""
    del global_step
    return self._proportional_cumulative
class AnnealingTaskSampler(TaskSampler):
  """Sample tasks according to task weights as well as training progress.

  See http://proceedings.mlr.press/v97/stickland19a/stickland19a.pdf
  """

  def __init__(self,
               task_weights: Dict[Text, Union[float, int]],
               steps_per_epoch: int,
               total_steps: int):
    super().__init__(task_weights=task_weights)
    self._steps_per_epoch = tf.cast(steps_per_epoch, dtype=tf.float32)
    self._total_epochs = tf.cast(
        total_steps / self._steps_per_epoch, dtype=tf.float32)

  def task_cumulative_distribution(self, global_step: tf.Tensor) -> tf.Tensor:
    """Returns an epoch-dependent cumulative sampling distribution."""
    # The annealing exponent decays linearly with the current epoch.
    current_epoch = tf.math.floor(
        tf.cast(global_step, dtype=tf.float32) / self._steps_per_epoch)
    exponent = 1.0 - 0.8 * (current_epoch - 1) / (
        self._total_epochs - 1 + 1e-10)
    ordered_weights = tf.constant(
        list(self._task_weights.values()), dtype=tf.float32)
    scaled_sizes = tf.math.pow(ordered_weights,
                               tf.cast(exponent, dtype=tf.float32))
    return tf.math.cumsum(scaled_sizes / tf.reduce_sum(scaled_sizes))
def get_task_sampler(config: configs.TaskSamplingConfig,
                     task_weights: Dict[Text, float]) -> TaskSampler:
  """Utils to create task sampler with configuration and task weights."""
  oneof_config = config.get()
  sampler_type = config.type
  if sampler_type == 'uniform':
    return UniformTaskSampler(task_weights=task_weights)
  if sampler_type == 'proportional':
    return ProportionalTaskSampler(
        task_weights=task_weights, alpha=oneof_config.alpha)
  if sampler_type == 'annealing':
    return AnnealingTaskSampler(
        task_weights=task_weights,
        steps_per_epoch=oneof_config.steps_per_epoch,
        total_steps=oneof_config.total_steps)
  raise RuntimeError('Task sampler type not supported')
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for multitask.task_sampler."""
import tensorflow as tf
from official.modeling.multitask import configs
from official.modeling.multitask import task_sampler as sampler
class TaskSamplerTest(tf.test.TestCase):
  """Tests for the uniform, proportional and annealing task samplers."""

  def setUp(self):
    super(TaskSamplerTest, self).setUp()
    self._task_weights = {'A': 1.0, 'B': 2.0, 'C': 3.0}

  def test_uniform_sample_distribution(self):
    uniform_sampler = sampler.get_task_sampler(
        configs.TaskSamplingConfig(type='uniform'), self._task_weights)
    # Uniform sampling ignores both the task weights and the step.
    for step in range(5):
      cumulative_distribution = uniform_sampler.task_cumulative_distribution(
          tf.constant(step, dtype=tf.int64))
      self.assertAllClose([0.333333, 0.666666, 1.0],
                          cumulative_distribution.numpy())

  def test_proportional_sample_distribution(self):
    prop_sampler = sampler.get_task_sampler(
        configs.TaskSamplingConfig(
            type='proportional',
            proportional=configs.ProportionalSampleConfig(alpha=2.0)),
        self._task_weights)
    # CumulativeOf(Normalize([1.0^2, 2.0^2, 3.0^2]))
    for step in range(5):
      cumulative_distribution = prop_sampler.task_cumulative_distribution(
          tf.constant(step, dtype=tf.int64))
      self.assertAllClose([0.07142857, 0.35714286, 1.0],
                          cumulative_distribution.numpy())

  def test_annealing_sample_distribution(self):
    num_epoch = 3
    step_per_epoch = 6
    annel_sampler = sampler.get_task_sampler(
        configs.TaskSamplingConfig(
            type='annealing',
            annealing=configs.AnnealingSampleConfig(
                steps_per_epoch=step_per_epoch,
                total_steps=step_per_epoch * num_epoch)), self._task_weights)
    global_step = tf.Variable(
        0, dtype=tf.int64, name='global_step', trainable=False)
    # One expected cumulative distribution per epoch; the middle epoch equals
    # the plain proportional (alpha=1) distribution.
    expected_cumulative_epochs = [[0.12056106, 0.4387236, 1.0],
                                  [0.16666667, 0.5, 1.0],
                                  [0.22477472, 0.5654695, 1.0]]
    for epoch in range(num_epoch):
      for _ in range(step_per_epoch):
        cumulative_distribution = annel_sampler.task_cumulative_distribution(
            tf.constant(global_step, dtype=tf.int64))
        global_step.assign_add(1)
        self.assertAllClose(expected_cumulative_epochs[epoch],
                            cumulative_distribution.numpy())
# Test runner entry point.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing utils for mock models and tasks."""
from typing import Dict, Text
import tensorflow as tf
from official.core import base_task
from official.core import config_definitions as cfg
from official.core import task_factory
from official.modeling.multitask import base_model
class MockFooModel(tf.keras.Model):
  """A mock model can consume 'foo' and 'bar' inputs."""

  def __init__(self, shared_layer, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self._share_layer = shared_layer
    self._foo_specific_layer = tf.keras.layers.Dense(1)

  def call(self, inputs):
    self.add_loss(tf.zeros((1,), dtype=tf.float32))
    # Prefer the "foo" feature; fall back to "bar" when it is absent.
    feature_key = "foo" if "foo" in inputs else "bar"
    shared_out = self._share_layer(inputs[feature_key])
    return self._foo_specific_layer(shared_out)
class MockBarModel(tf.keras.Model):
  """A mock model that consumes only the 'bar' input."""

  def __init__(self, shared_layer, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self._share_layer = shared_layer
    self._bar_specific_layer = tf.keras.layers.Dense(1)

  def call(self, inputs):
    self.add_loss(tf.zeros((2,), dtype=tf.float32))
    shared_out = self._share_layer(inputs["bar"])
    return self._bar_specific_layer(shared_out)
class MockMultiTaskModel(base_model.MultiTaskBaseModel):
  """Bundles the foo and bar mock models around one shared dense layer."""

  def __init__(self, *args, **kwargs):
    # The shared layer must exist before super().__init__ runs, because the
    # base constructor calls _instantiate_sub_tasks which reads it.
    self._shared_dense = tf.keras.layers.Dense(1)
    super().__init__(*args, **kwargs)

  def _instantiate_sub_tasks(self) -> Dict[Text, tf.keras.Model]:
    sub_tasks = {}
    sub_tasks["foo"] = MockFooModel(self._shared_dense)
    sub_tasks["bar"] = MockBarModel(self._shared_dense)
    return sub_tasks
def mock_data(feature_name):
  """Mock dataset function."""

  def _generate_data(_):
    # A constant (features, label) pair keyed by the requested feature name.
    features = {feature_name: tf.zeros(shape=(2,), dtype=tf.float32)}
    label = tf.zeros([1], dtype=tf.int32)
    return features, label

  return (
      tf.data.Dataset.range(1)
      .repeat()
      .map(_generate_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
      .prefetch(buffer_size=1)
      .batch(2, drop_remainder=True))
class FooConfig(cfg.TaskConfig):
  # Marker config used by task_factory to route to MockFooTask.
  pass
class BarConfig(cfg.TaskConfig):
  # Marker config used by task_factory to route to MockBarTask.
  pass
@task_factory.register_task_cls(FooConfig)
class MockFooTask(base_task.Task):
  """Mock foo task object for testing."""

  def build_metrics(self, training: bool = True):
    del training
    return [tf.keras.metrics.Accuracy(name="foo_acc")]

  def build_inputs(self, params):
    return mock_data("foo")

  def build_model(self) -> tf.keras.Model:
    return MockFooModel(shared_layer=tf.keras.layers.Dense(1))

  def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor:
    # Mean-squared error plus any auxiliary losses the model registered.
    mse = tf.keras.losses.mean_squared_error(labels, model_outputs)
    total = mse + tf.add_n(aux_losses) if aux_losses else mse
    return tf.reduce_mean(total)
@task_factory.register_task_cls(BarConfig)
class MockBarTask(base_task.Task):
  """Mock bar task object for testing."""

  def build_metrics(self, training: bool = True):
    del training
    return [tf.keras.metrics.Accuracy(name="bar_acc")]

  def build_inputs(self, params):
    return mock_data("bar")

  def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor:
    # Mean-squared error plus any auxiliary losses the model registered.
    mse = tf.keras.losses.mean_squared_error(labels, model_outputs)
    total = mse + tf.add_n(aux_losses) if aux_losses else mse
    return tf.reduce_mean(total)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Multitask training driver library."""
# pytype: disable=attribute-error
import os
from typing import Any, List, Optional, Tuple
from absl import logging
import orbit
import tensorflow as tf
from official.core import base_task
from official.core import base_trainer as core_lib
from official.core import train_utils
from official.modeling.multitask import base_model
from official.modeling.multitask import base_trainer
from official.modeling.multitask import configs
from official.modeling.multitask import evaluator as evaluator_lib
from official.modeling.multitask import interleaving_trainer
from official.modeling.multitask import multitask
from official.modeling.multitask import task_sampler
# Maps the `params.trainer.trainer_type` config string to the multi-task
# trainer implementation used by `run_experiment`.
TRAINERS = {
    'interleaving': interleaving_trainer.MultiTaskInterleavingTrainer,
    'joint': base_trainer.MultiTaskBaseTrainer
}
def run_experiment(
    *,
    distribution_strategy: tf.distribute.Strategy,
    task: multitask.MultiTask,
    model: base_model.MultiTaskBaseModel,
    mode: str,
    params: configs.MultiTaskExperimentConfig,
    model_dir: str,
    trainer: Optional[base_trainer.MultiTaskBaseTrainer] = None
) -> base_model.MultiTaskBaseModel:
  """Runs train/eval configured by the experiment params.

  Args:
    distribution_strategy: A distribution strategy under which all model
      building and execution happens.
    task: A MultiTask instance describing the tasks to train/evaluate.
    model: A MultiTaskBaseModel instance.
    mode: A 'str', specifying the mode. Can be 'train', 'eval', 'train_and_eval'
      or 'continuous_eval'.
    params: MultiTaskExperimentConfig instance.
    model_dir: A 'str', a path to store model checkpoints and summaries.
    trainer: (optional) A multi-task trainer to use. If none is provided, a
      default one will be created based on `params`.

  Returns:
    model: `base_model.MultiTaskBaseModel` instance.
  """
  is_training = 'train' in mode
  is_eval = 'eval' in mode
  with distribution_strategy.scope():
    optimizer = task.create_optimizer(params.trainer.optimizer_config,
                                      params.runtime)
    kwargs = dict(multi_task=task, multi_task_model=model, optimizer=optimizer)
    if params.trainer.trainer_type == 'interleaving':
      # The interleaving trainer additionally needs a sampler deciding which
      # task to step at each iteration.
      sampler = task_sampler.get_task_sampler(params.trainer.task_sampler,
                                              task.task_weights)
      kwargs.update(dict(task_sampler=sampler))
    # Eval-only runs keep `trainer` as None and rely on the evaluator for the
    # checkpoint and global step below.
    if trainer is None:
      trainer = TRAINERS[params.trainer.trainer_type](
          **kwargs) if is_training else None
    if is_eval:
      eval_steps = task.task_eval_steps
      evaluator = evaluator_lib.MultiTaskEvaluator(
          eval_tasks=task.tasks.values(),
          model=model,
          eval_steps=eval_steps,
          # Share the trainer's step counter so eval summaries align with
          # training progress; standalone eval creates its own counter.
          global_step=trainer.global_step if is_training else None,
          checkpoint_exporter=train_utils.maybe_create_best_ckpt_exporter(
              params, model_dir))
    else:
      evaluator = None
  if trainer:
    checkpoint = trainer.checkpoint
    global_step = trainer.global_step
  else:
    checkpoint = evaluator.checkpoint
    global_step = evaluator.global_step
  # TODO(hongkuny,haozhangthu): Revisit initialization method.
  checkpoint_manager = tf.train.CheckpointManager(
      checkpoint,
      directory=model_dir,
      max_to_keep=params.trainer.max_to_keep,
      step_counter=global_step,
      checkpoint_interval=params.trainer.checkpoint_interval,
      init_fn=model.initialize)
  controller = orbit.Controller(
      strategy=distribution_strategy,
      trainer=trainer,
      evaluator=evaluator,
      global_step=global_step,
      steps_per_loop=params.trainer.steps_per_loop,
      checkpoint_manager=checkpoint_manager,
      summary_dir=os.path.join(model_dir, 'train'),
      eval_summary_dir=os.path.join(model_dir, 'validation'),
      summary_interval=params.trainer.summary_interval)
  logging.info('Starts to execute mode: %s', mode)
  with distribution_strategy.scope():
    if mode == 'train':
      controller.train(steps=params.trainer.train_steps)
    elif mode == 'train_and_eval':
      controller.train_and_evaluate(
          train_steps=params.trainer.train_steps,
          eval_steps=params.trainer.validation_steps,
          eval_interval=params.trainer.validation_interval)
    elif mode == 'eval':
      controller.evaluate(steps=params.trainer.validation_steps)
    elif mode == 'continuous_eval':

      def timeout_fn():
        # Stop continuous eval once checkpoints for the final training step
        # have been evaluated.
        if evaluator.global_step.numpy() >= params.trainer.train_steps:
          return True
        return False

      controller.evaluate_continuously(
          steps=params.trainer.validation_steps,
          timeout=params.trainer.continuous_eval_timeout,
          timeout_fn=timeout_fn)
    else:
      raise NotImplementedError('The mode is not implemented: %s' % mode)
  return model
def run_experiment_with_multitask_eval(
    *,
    distribution_strategy: tf.distribute.Strategy,
    train_task: base_task.Task,
    eval_tasks: List[base_task.Task],
    mode: str,
    params: configs.MultiEvalExperimentConfig,
    model_dir: str,
    run_post_eval: bool = False,
    save_summary: bool = True,
    trainer: Optional[core_lib.Trainer] = None) -> Tuple[Any, Any]:
  """Runs single-task training with multi-task evaluation.

  Args:
    distribution_strategy: A distribution strategy under which all model
      building and execution happens.
    train_task: A base_task.Task instance.
    eval_tasks: A list of evaluation tasks.
    mode: A 'str', specifying the mode. Can be 'train', 'eval', 'train_and_eval'
      or 'continuous_eval'.
    params: MultiEvalExperimentConfig instance.
    model_dir: A 'str', a path to store model checkpoints and summaries.
    run_post_eval: Whether to run post eval once after training, metrics logs
      are returned.
    save_summary: Whether to save train and validation summary.
    trainer: the core_lib.Trainer instance. It should be created within the
      strategy.scope(). If not provided, an instance will be created by default
      if `mode` contains 'train'.

  Returns:
    A tuple of (model, eval metrics dict): the metrics are only populated when
    `run_post_eval` is True, otherwise an empty dict is returned.
  """
  is_training = 'train' in mode
  is_eval = 'eval' in mode
  with distribution_strategy.scope():
    if is_training:
      trainer = trainer or core_lib.Trainer(
          config=params,
          task=train_task,
          model=train_task.build_model(),
          optimizer=train_task.create_optimizer(params.trainer.optimizer_config,
                                                params.runtime),
          train=True,
          evaluate=False)
    else:
      trainer = None
    # Reuse the trainer's model when training; otherwise build a fresh one
    # just for evaluation.
    model = trainer.model if trainer else train_task.build_model()
    if is_eval:
      # Per-task eval step budget, keyed by task name from the config.
      eval_steps = dict([(task_routine.task_config.name,
                          task_routine.eval_steps)
                         for task_routine in params.eval_tasks])
      evaluator = evaluator_lib.MultiTaskEvaluator(
          eval_tasks=eval_tasks,
          model=model,
          # Share the trainer's step counter so eval summaries align with
          # training progress; standalone eval creates its own counter.
          global_step=trainer.global_step if is_training else None,
          eval_steps=eval_steps,
          checkpoint_exporter=train_utils.maybe_create_best_ckpt_exporter(
              params, model_dir))
    else:
      evaluator = None
  if trainer:
    checkpoint = trainer.checkpoint
    global_step = trainer.global_step
  else:
    checkpoint = evaluator.checkpoint
    global_step = evaluator.global_step
  checkpoint_manager = tf.train.CheckpointManager(
      checkpoint,
      directory=model_dir,
      max_to_keep=params.trainer.max_to_keep,
      step_counter=global_step,
      checkpoint_interval=params.trainer.checkpoint_interval,
      init_fn=trainer.initialize if trainer else None)
  controller = orbit.Controller(
      strategy=distribution_strategy,
      trainer=trainer,
      evaluator=evaluator,
      global_step=global_step,
      steps_per_loop=params.trainer.steps_per_loop,
      checkpoint_manager=checkpoint_manager,
      summary_dir=os.path.join(model_dir, 'train') if save_summary else None,
      eval_summary_dir=os.path.join(model_dir, 'validation') if
      (save_summary) else None,
      summary_interval=params.trainer.summary_interval if
      (save_summary) else None)
  logging.info('Starts to execute mode: %s', mode)
  with distribution_strategy.scope():
    if mode == 'train':
      controller.train(steps=params.trainer.train_steps)
    elif mode == 'train_and_eval':
      controller.train_and_evaluate(
          train_steps=params.trainer.train_steps,
          eval_steps=params.trainer.validation_steps,
          eval_interval=params.trainer.validation_interval)
    elif mode == 'eval':
      controller.evaluate(steps=params.trainer.validation_steps)
    elif mode == 'continuous_eval':

      def timeout_fn():
        # Stop continuous eval once checkpoints for the final training step
        # have been evaluated.
        if evaluator.global_step.numpy() >= params.trainer.train_steps:
          return True
        return False

      controller.evaluate_continuously(
          steps=params.trainer.validation_steps,
          timeout=params.trainer.continuous_eval_timeout,
          timeout_fn=timeout_fn)
    else:
      raise NotImplementedError('The mode is not implemented: %s' % mode)
  if run_post_eval:
    return model, evaluator.evaluate(
        tf.convert_to_tensor(params.trainer.validation_steps))  # pytype: disable=bad-return-type  # typed-keras
  else:
    return model, {}  # pytype: disable=bad-return-type  # typed-keras
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for multitask.train_lib."""
from absl.testing import parameterized
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.core import task_factory
from official.modeling.hyperparams import params_dict
from official.modeling.multitask import configs
from official.modeling.multitask import multitask
from official.modeling.multitask import test_utils
from official.modeling.multitask import train_lib
class TrainLibTest(tf.test.TestCase, parameterized.TestCase):
  """End-to-end smoke tests for the multitask train_lib drivers."""

  def setUp(self):
    super().setUp()
    # Minimal trainer config shared by all test cases; merged (non-strict)
    # into the experiment configs built in each test.
    self._test_config = {
        'trainer': {
            'checkpoint_interval': 10,
            'steps_per_loop': 10,
            'summary_interval': 10,
            'train_steps': 10,
            'validation_steps': 5,
            'validation_interval': 10,
            'continuous_eval_timeout': 1,
            'optimizer_config': {
                'optimizer': {
                    'type': 'sgd',
                },
                'learning_rate': {
                    'type': 'constant'
                }
            }
        },
    }

  @combinations.generate(
      combinations.combine(
          distribution_strategy=[
              strategy_combinations.default_strategy,
              strategy_combinations.cloud_tpu_strategy,
              strategy_combinations.one_device_strategy_gpu,
          ],
          mode='eager',
          flag_mode=['train', 'eval', 'train_and_eval']))
  def test_end_to_end(self, distribution_strategy, flag_mode):
    """Runs run_experiment across strategies and modes without crashing."""
    model_dir = self.get_temp_dir()
    experiment_config = configs.MultiTaskExperimentConfig(
        task=configs.MultiTaskConfig(
            task_routines=(
                configs.TaskRoutine(
                    task_name='foo', task_config=test_utils.FooConfig()),
                configs.TaskRoutine(
                    task_name='bar', task_config=test_utils.BarConfig()))))
    experiment_config = params_dict.override_params_dict(
        experiment_config, self._test_config, is_strict=False)
    # Model/task construction must happen under the strategy scope.
    with distribution_strategy.scope():
      test_multitask = multitask.MultiTask.from_config(experiment_config.task)
      model = test_utils.MockMultiTaskModel()
    train_lib.run_experiment(
        distribution_strategy=distribution_strategy,
        task=test_multitask,
        model=model,
        mode=flag_mode,
        params=experiment_config,
        model_dir=model_dir)

  @combinations.generate(
      combinations.combine(
          distribution_strategy=[
              strategy_combinations.default_strategy,
              strategy_combinations.cloud_tpu_strategy,
              strategy_combinations.one_device_strategy_gpu,
          ],
          mode='eager',
          flag_mode=['train', 'eval', 'train_and_eval']))
  def test_end_to_end_multi_eval(self, distribution_strategy, flag_mode):
    """Runs single-task training with multi-task eval end to end."""
    model_dir = self.get_temp_dir()
    experiment_config = configs.MultiEvalExperimentConfig(
        task=test_utils.FooConfig(),
        eval_tasks=(configs.TaskRoutine(
            task_name='foo', task_config=test_utils.FooConfig(), eval_steps=2),
                    configs.TaskRoutine(
                        task_name='bar',
                        task_config=test_utils.BarConfig(),
                        eval_steps=3)))
    experiment_config = params_dict.override_params_dict(
        experiment_config, self._test_config, is_strict=False)
    # Task construction must happen under the strategy scope.
    with distribution_strategy.scope():
      train_task = task_factory.get_task(experiment_config.task)
      eval_tasks = [
          task_factory.get_task(config.task_config, name=config.task_name)
          for config in experiment_config.eval_tasks
      ]
    train_lib.run_experiment_with_multitask_eval(
        distribution_strategy=distribution_strategy,
        train_task=train_task,
        eval_tasks=eval_tasks,
        mode=flag_mode,
        params=experiment_config,
        model_dir=model_dir)
# Run the test suite when executed as a script.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimization package definition."""
# pylint: disable=wildcard-import
from official.modeling.optimization.configs.learning_rate_config import *
from official.modeling.optimization.configs.optimization_config import *
from official.modeling.optimization.configs.optimizer_config import *
from official.modeling.optimization.ema_optimizer import ExponentialMovingAverage
from official.modeling.optimization.lr_schedule import *
from official.modeling.optimization.optimizer_factory import OptimizerFactory
from official.modeling.optimization.optimizer_factory import register_optimizer_cls
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Adafactor optimizer.
A new optimizer that will be open sourced soon.
"""
# pylint: disable=invalid-name, represents an unimplemented class definition.
Adafactor = "Unimplemented"
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dataclasses for learning rate schedule config."""
from typing import List, Optional
import dataclasses
from official.modeling.hyperparams import base_config
@dataclasses.dataclass
class ConstantLrConfig(base_config.Config):
  """Configuration for constant learning rate.

  This class is a container for the constant learning rate decay configs.

  Attributes:
    name: The name of the learning rate schedule. Defaults to Constant.
    learning_rate: A float. The learning rate. Defaults to 0.1.
  """
  name: str = 'Constant'
  learning_rate: float = 0.1
@dataclasses.dataclass
class StepwiseLrConfig(base_config.Config):
  """Configuration for stepwise learning rate decay.

  This class is a container for the piecewise constant learning rate scheduling
  configs. It will configure an instance of PiecewiseConstantDecay keras
  learning rate schedule.

  An example (from keras docs): use a learning rate that's 1.0 for the first
  100001 steps, 0.5 for the next 10000 steps, and 0.1 for any additional steps.

  ```python
  boundaries: [100000, 110000]
  values: [1.0, 0.5, 0.1]
  ```

  Attributes:
    name: The name of the learning rate schedule. Defaults to PiecewiseConstant.
    boundaries: A list of ints of strictly increasing entries. Defaults to None.
    values: A list of floats that specifies the values for the intervals defined
      by `boundaries`. It should have one more element than `boundaries`.
      The learning rate is computed as follows: [0, boundaries[0]] ->
      values[0] [boundaries[0], boundaries[1]] -> values[1]
      [boundaries[n-1], boundaries[n]] -> values[n] [boundaries[n],
      end] -> values[n+1] Defaults to None.
    offset: An int. The offset applied to steps. Defaults to 0.
  """
  name: str = 'PiecewiseConstantDecay'
  boundaries: Optional[List[int]] = None
  values: Optional[List[float]] = None
  offset: int = 0
@dataclasses.dataclass
class ExponentialLrConfig(base_config.Config):
  """Configuration for exponential learning rate decay.

  This class is a container for the exponential learning rate decay configs.

  Attributes:
    name: The name of the learning rate schedule. Defaults to ExponentialDecay.
    initial_learning_rate: A float. The initial learning rate. Defaults to None.
    decay_steps: A positive integer that is used for decay computation. Defaults
      to None.
    decay_rate: A float. Defaults to None.
    staircase: A boolean, if true, learning rate is decreased at discrete
      intervals. Defaults to None.
    offset: An int. The offset applied to steps. Defaults to 0.
  """
  name: str = 'ExponentialDecay'
  initial_learning_rate: Optional[float] = None
  decay_steps: Optional[int] = None
  decay_rate: Optional[float] = None
  staircase: Optional[bool] = None
  offset: int = 0
@dataclasses.dataclass
class PolynomialLrConfig(base_config.Config):
  """Configuration for polynomial learning rate decay.

  This class is a container for the polynomial learning rate decay configs.

  Attributes:
    name: The name of the learning rate schedule. Defaults to PolynomialDecay.
    initial_learning_rate: A float. The initial learning rate. Defaults to None.
    decay_steps: A positive integer that is used for decay computation. Defaults
      to None.
    end_learning_rate: A float. The minimal end learning rate.
    power: A float. The power of the polynomial. Defaults to linear, 1.0.
    cycle: A boolean, whether or not it should cycle beyond decay_steps.
      Defaults to False.
    offset: An int. The offset applied to steps. Defaults to 0.
  """
  name: str = 'PolynomialDecay'
  initial_learning_rate: Optional[float] = None
  decay_steps: Optional[int] = None
  end_learning_rate: float = 0.0001
  power: float = 1.0
  cycle: bool = False
  offset: int = 0
@dataclasses.dataclass
class CosineLrConfig(base_config.Config):
  """Configuration for Cosine learning rate decay.

  This class is a container for the cosine learning rate decay configs,
  tf.keras.experimental.CosineDecay.

  Attributes:
    name: The name of the learning rate schedule. Defaults to CosineDecay.
    initial_learning_rate: A float. The initial learning rate. Defaults to None.
    decay_steps: A positive integer that is used for decay computation. Defaults
      to None.
    alpha: A float. Minimum learning rate value as a fraction of
      initial_learning_rate.
    offset: An int. The offset applied to steps. Defaults to 0.
  """
  name: str = 'CosineDecay'
  initial_learning_rate: Optional[float] = None
  decay_steps: Optional[int] = None
  alpha: float = 0.0
  offset: int = 0
@dataclasses.dataclass
class DirectPowerLrConfig(base_config.Config):
  """Configuration for DirectPower learning rate decay.

  This class configures a schedule that follows lr * (step)^power.

  Attributes:
    name: The name of the learning rate schedule. Defaults to DirectPowerDecay.
    initial_learning_rate: A float. The initial learning rate. Defaults to None.
    power: A float. Defaults to -0.5, for sqrt decay.
  """
  name: str = 'DirectPowerDecay'
  initial_learning_rate: Optional[float] = None
  power: float = -0.5
@dataclasses.dataclass
class PowerAndLinearDecayLrConfig(base_config.Config):
  """Configuration for power-then-linear learning rate decay.

  The schedule has the following behavior.
  Let offset_step = step - offset.

  1) offset_step < 0, the actual learning rate equals initial_learning_rate.
  2) offset_step <= total_decay_steps * (1 - linear_decay_fraction), the
     actual learning rate equals lr * offset_step^power.
  3) total_decay_steps * (1 - linear_decay_fraction) <= offset_step <
     total_decay_steps, the actual learning rate equals lr * offset_step^power *
     (total_decay_steps - offset_step) / (total_decay_steps *
     linear_decay_fraction).
  4) offset_step >= total_decay_steps, the actual learning rate equals zero.

  Attributes:
    name: The name of the learning rate schedule. Defaults to
      PowerAndLinearDecay.
    initial_learning_rate: A float. The initial learning rate. Defaults to None.
    total_decay_steps: An int. The total number of steps for power + linear
      decay. Defaults to None.
    power: A float. The order of the polynomial. Defaults to -0.5, for sqrt
      decay.
    linear_decay_fraction: A float. In the last `linear_decay_fraction` steps,
      the learning rate will be multiplied by a linear decay. Defaults to 0.1.
    offset: An int. The offset applied to steps. Defaults to 0.
  """
  name: str = 'PowerAndLinearDecay'
  initial_learning_rate: Optional[float] = None
  total_decay_steps: Optional[int] = None
  power: float = -0.5
  linear_decay_fraction: float = 0.1
  offset: int = 0
@dataclasses.dataclass
class PowerDecayWithOffsetLrConfig(base_config.Config):
  """Configuration for power learning rate decay with step offset.

  Learning rate equals to `pre_offset_learning_rate` if `step` < `offset`.
  Otherwise, learning rate equals to lr * (step - offset)^power.

  Attributes:
    name: The name of the learning rate schedule. Defaults to
      PowerDecayWithOffset.
    initial_learning_rate: A float. The initial learning rate. Defaults to None.
    power: A float. Defaults to -0.5, for sqrt decay.
    offset: An integer. Power decay happens after `offset` steps.
    pre_offset_learning_rate: A float. The constant learning rate before
      `offset` steps.
  """
  name: str = 'PowerDecayWithOffset'
  initial_learning_rate: Optional[float] = None
  power: float = -0.5
  offset: int = 0
  pre_offset_learning_rate: float = 1.0e6
@dataclasses.dataclass
class StepCosineLrConfig(base_config.Config):
  """Configuration for stepwise cosine learning rate decay.

  This class is a container for the piecewise cosine learning rate scheduling
  configs. It will configure an instance of the StepConsineDecayWithOffset
  keras learning rate schedule (note: the implementation class name carries
  the "Consine" spelling).

  ```python
  boundaries: [100000, 110000]
  values: [1.0, 0.5]
  lr_decayed_fn = (
      lr_schedule.StepConsineDecayWithOffset(
          boundaries,
          values))
  ```

  From step 0 to 100000, it will cosine decay from 1.0 to 0.5.
  From step 100000 to 110000, it will cosine decay from 0.5 to 0.0.

  Attributes:
    name: The name of the learning rate schedule. Defaults to
      StepConsineDecayWithOffset.
    boundaries: A list of ints of strictly increasing entries. Defaults to None.
    values: A list of floats that specifies the values for the intervals defined
      by `boundaries`. It should have one more element than `boundaries`.
      The learning rate is computed as follows:
        [0, boundaries[0]] -> cosine from values[0] to values[1]
        [boundaries[0], boundaries[1]] -> values[1] to values[2]
        ...
        [boundaries[n-1], boundaries[n]] -> values[n] to values[n+1]
        [boundaries[n], end] -> values[n+1] to 0.
    offset: An int. The offset applied to steps. Defaults to 0.
  """
  name: str = 'StepConsineDecayWithOffset'
  boundaries: Optional[List[int]] = None
  values: Optional[List[float]] = None
  offset: int = 0
@dataclasses.dataclass
class LinearWarmupConfig(base_config.Config):
  """Configuration for linear warmup schedule config.

  This class is a container for the linear warmup schedule configs.
  Warmup_learning_rate is the initial learning rate, the final learning rate of
  the warmup period is the learning_rate of the optimizer in use. The learning
  rate at each step linearly increased according to the following formula:
    warmup_learning_rate = warmup_learning_rate +
      step / warmup_steps * (final_learning_rate - warmup_learning_rate).

  Using warmup overrides the learning rate schedule by the number of warmup
  steps.

  Attributes:
    name: The name of warmup schedule. Defaults to linear.
    warmup_learning_rate: Initial learning rate for the warmup. Defaults to 0.
    warmup_steps: Warmup steps. Defaults to None.
  """
  name: str = 'linear'
  warmup_learning_rate: float = 0
  warmup_steps: Optional[int] = None
@dataclasses.dataclass
class PolynomialWarmupConfig(base_config.Config):
  """Configuration for polynomial warmup schedule config.

  This class is a container for the polynomial warmup schedule configs.

  Attributes:
    name: The name of warmup schedule. Defaults to polynomial.
    power: Polynomial power. Defaults to 1.
    warmup_steps: Warmup steps. Defaults to None.
  """
  name: str = 'polynomial'
  power: float = 1
  warmup_steps: Optional[int] = None
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dataclasses for optimization configs.
This file define the dataclass for optimization configs (OptimizationConfig).
It also has two helper functions get_optimizer_config, and get_lr_config from
an OptimizationConfig class.
"""
from typing import Optional
import dataclasses
from official.modeling.hyperparams import base_config
from official.modeling.hyperparams import oneof
from official.modeling.optimization.configs import learning_rate_config as lr_cfg
from official.modeling.optimization.configs import optimizer_config as opt_cfg
@dataclasses.dataclass
class OptimizerConfig(oneof.OneOfConfig):
  """Configuration for optimizer.

  Attributes:
    type: 'str', type of optimizer to be used, one of the fields below.
    sgd: sgd optimizer config.
    adam: adam optimizer config.
    adamw: adam with weight decay.
    lamb: lamb optimizer.
    rmsprop: rmsprop optimizer.
    lars: lars optimizer.
    adagrad: adagrad optimizer.
    slide: slide optimizer.
    adafactor: adafactor optimizer.
  """
  type: Optional[str] = None
  sgd: opt_cfg.SGDConfig = opt_cfg.SGDConfig()
  adam: opt_cfg.AdamConfig = opt_cfg.AdamConfig()
  adamw: opt_cfg.AdamWeightDecayConfig = opt_cfg.AdamWeightDecayConfig()
  lamb: opt_cfg.LAMBConfig = opt_cfg.LAMBConfig()
  rmsprop: opt_cfg.RMSPropConfig = opt_cfg.RMSPropConfig()
  lars: opt_cfg.LARSConfig = opt_cfg.LARSConfig()
  adagrad: opt_cfg.AdagradConfig = opt_cfg.AdagradConfig()
  slide: opt_cfg.SLIDEConfig = opt_cfg.SLIDEConfig()
  adafactor: opt_cfg.AdafactorConfig = opt_cfg.AdafactorConfig()
@dataclasses.dataclass
class LrConfig(oneof.OneOfConfig):
  """Configuration for lr schedule.

  Attributes:
    type: 'str', type of lr schedule to be used, one of the fields below.
    constant: constant learning rate config.
    stepwise: stepwise learning rate config.
    exponential: exponential learning rate config.
    polynomial: polynomial learning rate config.
    cosine: cosine learning rate config.
    power: step^power learning rate config.
    power_linear: learning rate config of step^power followed by
      step^power*linear.
    power_with_offset: power decay with a step offset.
    step_cosine_with_offset: step cosine with a step offset.
  """
  type: Optional[str] = None
  constant: lr_cfg.ConstantLrConfig = lr_cfg.ConstantLrConfig()
  stepwise: lr_cfg.StepwiseLrConfig = lr_cfg.StepwiseLrConfig()
  exponential: lr_cfg.ExponentialLrConfig = lr_cfg.ExponentialLrConfig()
  polynomial: lr_cfg.PolynomialLrConfig = lr_cfg.PolynomialLrConfig()
  cosine: lr_cfg.CosineLrConfig = lr_cfg.CosineLrConfig()
  power: lr_cfg.DirectPowerLrConfig = lr_cfg.DirectPowerLrConfig()
  power_linear: lr_cfg.PowerAndLinearDecayLrConfig = (
      lr_cfg.PowerAndLinearDecayLrConfig())
  power_with_offset: lr_cfg.PowerDecayWithOffsetLrConfig = (
      lr_cfg.PowerDecayWithOffsetLrConfig())
  step_cosine_with_offset: lr_cfg.StepCosineLrConfig = (
      lr_cfg.StepCosineLrConfig())
@dataclasses.dataclass
class WarmupConfig(oneof.OneOfConfig):
  """Configuration for warmup schedule.

  Attributes:
    type: 'str', type of warmup schedule to be used, one of the fields below.
    linear: linear warmup config.
    polynomial: polynomial warmup config.
  """
  type: Optional[str] = None
  linear: lr_cfg.LinearWarmupConfig = lr_cfg.LinearWarmupConfig()
  polynomial: lr_cfg.PolynomialWarmupConfig = lr_cfg.PolynomialWarmupConfig()
@dataclasses.dataclass
class OptimizationConfig(base_config.Config):
  """Configuration for optimizer and learning rate schedule.

  Attributes:
    optimizer: optimizer oneof config.
    ema: optional exponential moving average optimizer config. If specified,
      the ema optimizer will be used.
    learning_rate: learning rate oneof config.
    warmup: warmup oneof config.
  """
  optimizer: OptimizerConfig = OptimizerConfig()
  ema: Optional[opt_cfg.EMAConfig] = None
  learning_rate: LrConfig = LrConfig()
  warmup: WarmupConfig = WarmupConfig()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for optimization_config.py."""
import tensorflow as tf
from official.modeling.optimization.configs import learning_rate_config as lr_cfg
from official.modeling.optimization.configs import optimization_config
from official.modeling.optimization.configs import optimizer_config as opt_cfg
class OptimizerConfigTest(tf.test.TestCase):
  """Tests for the OptimizationConfig oneof sub-configs."""

  def test_no_optimizer(self):
    # An empty config leaves the oneof `type` unset, so get() returns None.
    optimizer = optimization_config.OptimizationConfig({}).optimizer.get()
    self.assertIsNone(optimizer)

  def test_no_lr_schedule(self):
    lr = optimization_config.OptimizationConfig({}).learning_rate.get()
    self.assertIsNone(lr)

  def test_no_warmup_schedule(self):
    warmup = optimization_config.OptimizationConfig({}).warmup.get()
    self.assertIsNone(warmup)

  def test_config(self):
    # Selecting a `type` makes get() return the matching default sub-config.
    opt_config = optimization_config.OptimizationConfig({
        'optimizer': {
            'type': 'sgd',
            'sgd': {}  # default config
        },
        'learning_rate': {
            'type': 'polynomial',
            'polynomial': {}
        },
        'warmup': {
            'type': 'linear'
        }
    })
    self.assertEqual(opt_config.optimizer.get(), opt_cfg.SGDConfig())
    self.assertEqual(opt_config.learning_rate.get(),
                     lr_cfg.PolynomialLrConfig())
    self.assertEqual(opt_config.warmup.get(), lr_cfg.LinearWarmupConfig())
# Run the test suite when executed as a script.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dataclasses for optimizer configs."""
from typing import List, Optional
import dataclasses
from official.modeling.hyperparams import base_config
@dataclasses.dataclass
class BaseOptimizerConfig(base_config.Config):
  """Base optimizer config.

  Holds the gradient-clipping options shared by every optimizer config in
  this module.

  Attributes:
    clipnorm: float >= 0 or None. If not None, gradients will be clipped when
      their L2 norm exceeds this value.
    clipvalue: float >= 0 or None. If not None, gradients will be clipped when
      their absolute value exceeds this value.
    global_clipnorm: float >= 0 or None. If not None, gradient of all weights
      is clipped so that their global norm is no higher than this value.
  """
  # All three default to None, i.e. no gradient clipping is applied.
  clipnorm: Optional[float] = None
  clipvalue: Optional[float] = None
  global_clipnorm: Optional[float] = None
@dataclasses.dataclass
class SGDConfig(BaseOptimizerConfig):
  """Configuration for SGD optimizer.

  The attributes of this class match the arguments of tf.keras.optimizer.SGD.

  Attributes:
    name: name of the optimizer.
    decay: decay rate for SGD optimizer.
    nesterov: whether to apply Nesterov momentum.
    momentum: momentum for SGD optimizer.
  """
  name: str = "SGD"
  decay: float = 0.0
  nesterov: bool = False
  momentum: float = 0.0
@dataclasses.dataclass
class RMSPropConfig(BaseOptimizerConfig):
  """Configuration for RMSProp optimizer.

  The attributes of this class match the arguments of
  tf.keras.optimizers.RMSprop.

  Attributes:
    name: name of the optimizer.
    rho: discounting factor for RMSprop optimizer.
    momentum: momentum for RMSprop optimizer.
    epsilon: epsilon value for RMSprop optimizer, helps with numerical
      stability.
    centered: whether to normalize gradients by an estimate of their variance
      (centered RMSprop) or by the uncentered second moment.
  """
  name: str = "RMSprop"
  rho: float = 0.9
  momentum: float = 0.0
  epsilon: float = 1e-7
  centered: bool = False
@dataclasses.dataclass
class AdagradConfig(BaseOptimizerConfig):
  """Configuration for Adagrad optimizer.

  The attributes of this class match the arguments of
  tf.keras.optimizer.Adagrad.

  Attributes:
    name: name of the optimizer.
    initial_accumulator_value: A floating point value. Starting value for the
      accumulators, must be non-negative.
    epsilon: A small floating point value to avoid zero denominator.
  """
  name: str = "Adagrad"
  initial_accumulator_value: float = 0.1
  epsilon: float = 1e-07
@dataclasses.dataclass
class AdamConfig(BaseOptimizerConfig):
  """Configuration for Adam optimizer.

  The attributes of this class match the arguments of
  tf.keras.optimizer.Adam.

  Attributes:
    name: name of the optimizer.
    beta_1: decay rate for 1st order moments.
    beta_2: decay rate for 2nd order moments.
    epsilon: epsilon value used for numerical stability in Adam optimizer.
    amsgrad: boolean. Whether to apply AMSGrad variant of this algorithm from
      the paper "On the Convergence of Adam and beyond".
  """
  name: str = "Adam"
  beta_1: float = 0.9
  beta_2: float = 0.999
  epsilon: float = 1e-07
  amsgrad: bool = False
@dataclasses.dataclass
class AdamWeightDecayConfig(BaseOptimizerConfig):
  """Configuration for Adam optimizer with weight decay.

  Attributes:
    name: name of the optimizer.
    beta_1: decay rate for 1st order moments.
    beta_2: decay rate for 2nd order moments.
    epsilon: epsilon value used for numerical stability in the optimizer.
    amsgrad: boolean. Whether to apply AMSGrad variant of this algorithm from
      the paper "On the Convergence of Adam and beyond".
    weight_decay_rate: float. Weight decay rate. Default to 0.
    include_in_weight_decay: list[str], or None. List of weight names to
      include in weight decay.
    exclude_from_weight_decay: list[str], or None. List of weight names to not
      include in weight decay.
    gradient_clip_norm: A positive float. Clips the gradients to this maximum
      L2-norm. Default to 1.0.
  """
  name: str = "AdamWeightDecay"
  beta_1: float = 0.9
  beta_2: float = 0.999
  epsilon: float = 1e-07
  amsgrad: bool = False
  weight_decay_rate: float = 0.0
  include_in_weight_decay: Optional[List[str]] = None
  exclude_from_weight_decay: Optional[List[str]] = None
  gradient_clip_norm: float = 1.0
@dataclasses.dataclass
class LAMBConfig(BaseOptimizerConfig):
  """Configuration for LAMB optimizer.

  The attributes of this class match the arguments of
  tensorflow_addons.optimizers.LAMB.

  Attributes:
    name: name of the optimizer.
    beta_1: decay rate for 1st order moments.
    beta_2: decay rate for 2nd order moments.
    epsilon: epsilon value used for numerical stability in LAMB optimizer.
    weight_decay_rate: float. Weight decay rate. Default to 0.
    exclude_from_weight_decay: List of regex patterns of variables excluded
      from weight decay. Variables whose name contain a substring matching the
      pattern will be excluded.
    exclude_from_layer_adaptation: List of regex patterns of variables
      excluded from layer adaptation. Variables whose name contain a substring
      matching the pattern will be excluded.
  """
  name: str = "LAMB"
  beta_1: float = 0.9
  beta_2: float = 0.999
  epsilon: float = 1e-6
  weight_decay_rate: float = 0.0
  exclude_from_weight_decay: Optional[List[str]] = None
  exclude_from_layer_adaptation: Optional[List[str]] = None
@dataclasses.dataclass
class EMAConfig(BaseOptimizerConfig):
  """Exponential moving average optimizer config.

  Mirrors the constructor arguments of the `ExponentialMovingAverage`
  optimizer wrapper defined elsewhere in this package.

  Attributes:
    name: 'str', name of the optimizer.
    trainable_weights_only: 'bool', if True, only model trainable weights will
      be updated. Otherwise, all model weights will be updated. This mainly
      affects batch normalization parameters.
    average_decay: 'float', average decay value.
    start_step: 'int', start step to apply moving average.
    dynamic_decay: 'bool', whether to apply dynamic decay or not.
  """
  name: str = "ExponentialMovingAverage"
  trainable_weights_only: bool = True
  average_decay: float = 0.99
  start_step: int = 0
  dynamic_decay: bool = True
@dataclasses.dataclass
class LARSConfig(BaseOptimizerConfig):
  """Layer-wise adaptive rate scaling config.

  Mirrors the constructor arguments of the `LARS` optimizer defined elsewhere
  in this package.

  Attributes:
    name: 'str', name of the optimizer.
    momentum: `float` hyperparameter >= 0 that accelerates gradient descent in
      the relevant direction and dampens oscillations. Defaults to 0.9.
    eeta: `float` LARS coefficient as used in the paper. Default set to LARS
      coefficient from the paper. (eeta / weight_decay) determines the highest
      scaling factor in LARS.
    weight_decay_rate: `float` for weight decay.
    nesterov: 'boolean' for whether to use nesterov momentum.
    classic_momentum: `boolean` for whether to use classic (or popular)
      momentum. The learning rate is applied during momentum update in classic
      momentum, but after momentum for popular momentum.
    exclude_from_weight_decay: A list of `string` for variable screening, if
      any of the string appears in a variable's name, the variable will be
      excluded for computing weight decay. For example, one could specify the
      list like ['batch_normalization', 'bias'] to exclude BN and bias from
      weight decay.
    exclude_from_layer_adaptation: Similar to exclude_from_weight_decay, but
      for layer adaptation. If it is None, it will be defaulted the same as
      exclude_from_weight_decay.
  """
  name: str = "LARS"
  momentum: float = 0.9
  eeta: float = 0.001
  weight_decay_rate: float = 0.0
  nesterov: bool = False
  classic_momentum: bool = True
  exclude_from_weight_decay: Optional[List[str]] = None
  exclude_from_layer_adaptation: Optional[List[str]] = None
@dataclasses.dataclass
class SLIDEConfig(BaseOptimizerConfig):
  """Configuration for SLIDE optimizer.

  Details coming soon.

  NOTE(review): field semantics are not documented here; the beta/epsilon/
  weight-decay fields appear to mirror the LAMB-style hyperparameters above —
  confirm against the SLIDE optimizer implementation before relying on them.
  """
  name: str = "SLIDE"
  beta_1: float = 0.9
  beta_2: float = 0.999
  epsilon: float = 1e-6
  weight_decay_rate: float = 0.0
  weight_decay_type: str = "inner"
  exclude_from_weight_decay: Optional[List[str]] = None
  exclude_from_layer_adaptation: Optional[List[str]] = None
  include_in_sparse_layer_adaptation: Optional[List[str]] = None
  sparse_layer_learning_rate: float = 0.1
  do_gradient_rescaling: bool = True
  norm_type: str = "layer"
  ratio_clip_norm: float = 1e5
@dataclasses.dataclass
class AdafactorConfig(BaseOptimizerConfig):
  """Configuration for Adafactor optimizer.

  The attributes of this class match the arguments of the Adafactor
  implementation. NOTE(review): the backing implementation is not visible
  here; verify field meanings against it.
  """
  name: str = "Adafactor"
  factored: bool = True
  multiply_by_parameter_scale: bool = True
  beta1: Optional[float] = None
  decay_rate: float = 0.8
  step_offset: int = 0
  clipping_threshold: float = 1.0
  min_dim_size_to_factor: int = 128
  epsilon1: float = 1e-30
  epsilon2: float = 1e-3
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Exponential moving average optimizer."""
from typing import List, Optional, Text
import tensorflow as tf
# pylint: disable=protected-access
class ExponentialMovingAverage(tf.keras.optimizers.Optimizer):
  """Optimizer that computes an exponential moving average of the variables.

  Empirically it has been found that using the moving average of the trained
  parameters of a deep network is better than using its trained parameters
  directly. This optimizer allows you to compute this moving average and swap
  the variables at save time so that any code outside of the training loop
  will use by default the average values instead of the original ones.

  Example of usage for training:
  ```python
  opt = tf.keras.optimizers.SGD(learning_rate)
  opt = ExponentialMovingAverage(opt)
  opt.shadow_copy(model)
  ```

  At test time, swap the shadow variables to evaluate on the averaged weights:
  ```python
  opt.swap_weights()
  # Test eval the model here
  opt.swap_weights()
  ```
  """

  def __init__(self,
               optimizer: tf.keras.optimizers.Optimizer,
               trainable_weights_only: bool = True,
               average_decay: float = 0.99,
               start_step: int = 0,
               dynamic_decay: bool = True,
               name: Text = 'ExponentialMovingAverage',
               **kwargs):
    """Construct a new ExponentialMovingAverage optimizer.

    Args:
      optimizer: `tf.keras.optimizers.Optimizer` that will be
        used to compute and apply gradients.
      trainable_weights_only: 'bool', if True, only model trainable weights
        will be updated. Otherwise, all model weights will be updated. This
        mainly affects batch normalization parameters.
      average_decay: float. Decay to use to maintain the moving averages
        of trained variables.
      start_step: int. What step to start the moving average.
      dynamic_decay: bool. Whether to change the decay based on the number
        of optimizer updates. Decay will start at 0.1 and gradually increase
        up to `average_decay` after each optimizer update. This behavior is
        similar to `tf.train.ExponentialMovingAverage` in TF 1.x.
      name: Optional name for the operations created when applying
        gradients. Defaults to "moving_average".
      **kwargs: keyword arguments. Allowed to be {`clipnorm`,
        `clipvalue`, `lr`, `decay`}.
    """
    super().__init__(name, **kwargs)
    self._average_decay = average_decay
    self._trainable_weights_only = trainable_weights_only
    # Stored as a float32 constant so it can be compared against the cast
    # step inside the tf.function `update_average`.
    # NOTE(review): get_config() serializes this tensor as-is — confirm that
    # round-tripping through from_config() is exercised anywhere.
    self._start_step = tf.constant(start_step, tf.float32)
    self._dynamic_decay = dynamic_decay
    self._optimizer = optimizer
    # Track the wrapped optimizer so its state is checkpointed with ours.
    self._track_trackable(self._optimizer, 'base_optimizer')
    # Populated by shadow_copy(); None until then (see has_shadow_copy).
    self._average_weights = None
    self._model_weights = None

  def shadow_copy(self, model: tf.keras.Model):
    """Creates shadow variables for the given model weights."""
    if self._trainable_weights_only:
      self._model_weights = model.trainable_variables
    else:
      self._model_weights = model.variables
    # One zero-initialized 'average' slot per tracked model weight.
    for var in self._model_weights:
      self.add_slot(var, 'average', initializer='zeros')
    self._average_weights = [
        self.get_slot(var, 'average') for var in self._model_weights
    ]

  @property
  def has_shadow_copy(self):
    """Whether this optimizer has created shadow variables."""
    return self._model_weights is not None and self._average_weights is not None

  def _create_slots(self, var_list):
    # Delegate slot creation to the wrapped optimizer.
    self._optimizer._create_slots(var_list=var_list)  # pylint: disable=protected-access

  def apply_gradients(self, grads_and_vars, name: Optional[Text] = None):
    """Applies gradients via the wrapped optimizer, then updates averages."""
    result = self._optimizer.apply_gradients(grads_and_vars, name)
    self.update_average(self.iterations)
    return result

  @tf.function
  def update_average(self, step: tf.Tensor):
    """Folds current model weights into the moving averages for this step."""
    step = tf.cast(step, tf.float32)
    if step < self._start_step:
      # Before start_step: decay 0 means the average tracks the raw weights.
      decay = tf.constant(0., tf.float32)
    elif self._dynamic_decay:
      # Ramp decay from ~0.1 up toward `average_decay` as training proceeds.
      decay = step - self._start_step
      decay = tf.minimum(self._average_decay, (1. + decay) / (10. + decay))
    else:
      decay = self._average_decay

    def _apply_moving(v_moving, v_normal):
      # v_moving <- decay * v_moving + (1 - decay) * v_normal, expressed as
      # an in-place subtraction of the scaled difference.
      diff = v_moving - v_normal
      v_moving.assign_sub(tf.cast(1. - decay, v_moving.dtype) * diff)
      return v_moving

    def _update(strategy, v_moving_and_v_normal):
      for v_moving, v_normal in v_moving_and_v_normal:
        strategy.extended.update(v_moving, _apply_moving, args=(v_normal,))

    # Update the averages once per step across replicas via merge_call.
    ctx = tf.distribute.get_replica_context()
    return ctx.merge_call(_update, args=(zip(self._average_weights,
                                             self._model_weights),))

  def swap_weights(self):
    """Swap the average and moving weights.

    This is a convenience method to allow one to evaluate the averaged weights
    at test time. Loads the weights stored in `self._average` into the model,
    keeping a copy of the original model weights. Swapping twice will return
    the original weights.
    """
    if tf.distribute.in_cross_replica_context():
      strategy = tf.distribute.get_strategy()
      strategy.run(self._swap_weights, args=())
    else:
      raise ValueError('Swapping weights must occur under a '
                       'tf.distribute.Strategy')

  @tf.function
  def _swap_weights(self):
    # Classic three-step in-place swap (a, b) -> (b, a) without a temporary.
    def fn_0(a, b):
      a.assign_add(b)
      return a
    def fn_1(b, a):
      b.assign(a - b)
      return b
    def fn_2(a, b):
      a.assign_sub(b)
      return a
    def swap(strategy, a_and_b):
      """Swap `a` and `b` and mirror to all devices."""
      for a, b in a_and_b:
        strategy.extended.update(a, fn_0, args=(b,))  # a = a + b
        strategy.extended.update(b, fn_1, args=(a,))  # b = a - b
        strategy.extended.update(a, fn_2, args=(b,))  # a = a - b
    ctx = tf.distribute.get_replica_context()
    return ctx.merge_call(
        swap, args=(zip(self._average_weights, self._model_weights),))

  def assign_average_vars(self, var_list: List[tf.Variable]):
    """Assign variables in var_list with their respective averages.

    Args:
      var_list: List of model variables to be assigned to their average.

    Returns:
      assign_op: The op corresponding to the assignment operation of
        variables to their average.
    """
    # Only trainable variables are assigned; non-trainable ones are skipped.
    assign_op = tf.group([
        var.assign(self.get_slot(var, 'average')) for var in var_list
        if var.trainable
    ])
    return assign_op

  def _create_hypers(self):
    # Delegate hyperparameter creation to the wrapped optimizer.
    self._optimizer._create_hypers()  # pylint: disable=protected-access

  def _prepare(self, var_list):
    return self._optimizer._prepare(var_list=var_list)  # pylint: disable=protected-access

  @property
  def iterations(self):
    # The wrapped optimizer owns the step counter.
    return self._optimizer.iterations

  @iterations.setter
  def iterations(self, variable):
    self._optimizer.iterations = variable

  @property
  def weights(self):
    # return self._weights + self._optimizer.weights
    return self._optimizer.weights

  def variables(self):
    # NOTE(review): uses base-class `self._weights` (slot variables) plus the
    # wrapped optimizer's iteration counter — unlike `weights` above, which
    # delegates entirely; confirm this asymmetry is intentional.
    return self._weights + [self.iterations]

  @property
  def lr(self):
    return self._optimizer._get_hyper('learning_rate')

  @lr.setter
  def lr(self, lr):
    self._optimizer._set_hyper('learning_rate', lr)

  @property
  def learning_rate(self):
    return self._optimizer._get_hyper('learning_rate')

  @learning_rate.setter
  def learning_rate(self, learning_rate):  # pylint: disable=redefined-outer-name
    self._optimizer._set_hyper('learning_rate', learning_rate)

  def _resource_apply_dense(self, grad, var):
    # Gradient application is fully delegated; this wrapper only maintains
    # the moving averages in apply_gradients/update_average.
    return self._optimizer._resource_apply_dense(grad, var)

  def _resource_apply_sparse(self, grad, var, indices):
    return self._optimizer._resource_apply_sparse(grad, var, indices)

  def _resource_apply_sparse_duplicate_indices(self, grad, var, indices):
    return self._optimizer._resource_apply_sparse_duplicate_indices(
        grad, var, indices)

  def get_config(self):
    """Serializes the wrapper plus the wrapped optimizer's config."""
    config = {
        'optimizer': tf.keras.optimizers.serialize(self._optimizer),
        'average_decay': self._average_decay,
        'start_step': self._start_step,
        'dynamic_decay': self._dynamic_decay,
    }
    base_config = super(ExponentialMovingAverage, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  @classmethod
  def from_config(cls, config, custom_objects=None):
    """Rebuilds the wrapper, deserializing the inner optimizer first."""
    optimizer = tf.keras.optimizers.deserialize(
        config.pop('optimizer'),
        custom_objects=custom_objects,
    )
    return cls(optimizer, **config)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Layer-wise adaptive rate scaling optimizer."""
import re
from typing import Text, List, Optional
import tensorflow as tf
# pylint: disable=protected-access
class LARS(tf.keras.optimizers.Optimizer):
  """Layer-wise Adaptive Rate Scaling for large batch training.

  Introduced by "Large Batch Training of Convolutional Networks" by Y. You,
  I. Gitman, and B. Ginsburg. (https://arxiv.org/abs/1708.03888)
  """

  def __init__(self,
               learning_rate: float = 0.01,
               momentum: float = 0.9,
               weight_decay_rate: float = 0.0,
               eeta: float = 0.001,
               nesterov: bool = False,
               classic_momentum: bool = True,
               exclude_from_weight_decay: Optional[List[Text]] = None,
               exclude_from_layer_adaptation: Optional[List[Text]] = None,
               name: Text = "LARS",
               **kwargs):
    """Constructs a LARSOptimizer.

    Args:
      learning_rate: `float` for learning rate. Defaults to 0.01.
      momentum: `float` hyperparameter >= 0 that accelerates gradient descent
        in the relevant direction and dampens oscillations. Defaults to 0.9.
      weight_decay_rate: `float` for weight decay.
      eeta: `float` LARS coefficient as used in the paper. Default set to LARS
        coefficient from the paper. (eeta / weight_decay) determines the
        highest scaling factor in LARS.
      nesterov: 'boolean' for whether to use nesterov momentum.
      classic_momentum: `boolean` for whether to use classic (or popular)
        momentum. The learning rate is applied during momentum update in
        classic momentum, but after momentum for popular momentum.
      exclude_from_weight_decay: A list of `string` for variable screening, if
        any of the string appears in a variable's name, the variable will be
        excluded for computing weight decay. For example, one could specify
        the list like ['batch_normalization', 'bias'] to exclude BN and bias
        from weight decay.
      exclude_from_layer_adaptation: Similar to exclude_from_weight_decay, but
        for layer adaptation. If it is None, it will be defaulted the same as
        exclude_from_weight_decay.
      name: `Text` as optional name for the operations created when applying
        gradients. Defaults to "LARS".
      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`,
        `lr`, `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is
        clip gradients by value, `decay` is included for backward
        compatibility to allow time inverse decay of learning rate. `lr` is
        included for backward compatibility, recommended to use
        `learning_rate` instead.
    """
    super(LARS, self).__init__(name, **kwargs)
    self._set_hyper("learning_rate", learning_rate)
    self._set_hyper("decay", self._initial_decay)
    self.momentum = momentum
    self.weight_decay_rate = weight_decay_rate
    self.eeta = eeta
    self.nesterov = nesterov
    self.classic_momentum = classic_momentum
    self.exclude_from_weight_decay = exclude_from_weight_decay
    # exclude_from_layer_adaptation is set to exclude_from_weight_decay if the
    # arg is None.
    if exclude_from_layer_adaptation:
      self.exclude_from_layer_adaptation = exclude_from_layer_adaptation
    else:
      self.exclude_from_layer_adaptation = exclude_from_weight_decay

  def _create_slots(self, var_list):
    # One "momentum" accumulator per variable.
    for v in var_list:
      self.add_slot(v, "momentum")

  def _resource_apply_dense(self, grad, param, apply_state=None):
    """Applies one LARS update step to a single dense variable."""
    if grad is None or param is None:
      return tf.no_op()

    var_device, var_dtype = param.device, param.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype)) or
                    self._fallback_apply_state(var_device, var_dtype))
    learning_rate = coefficients["lr_t"]

    param_name = param.name

    v = self.get_slot(param, "momentum")

    # L2 weight decay is folded into the gradient before the momentum update.
    if self._use_weight_decay(param_name):
      grad += self.weight_decay_rate * param

    if self.classic_momentum:
      # Classic momentum: trust ratio scales the LR *before* the momentum
      # accumulator update.
      trust_ratio = 1.0
      if self._do_layer_adaptation(param_name):
        w_norm = tf.norm(param, ord=2)
        g_norm = tf.norm(grad, ord=2)
        # trust_ratio = eeta * ||w|| / ||g||, guarded to 1.0 when either
        # norm is zero.
        trust_ratio = tf.where(
            tf.greater(w_norm, 0),
            tf.where(tf.greater(g_norm, 0), (self.eeta * w_norm / g_norm), 1.0),
            1.0)
      scaled_lr = learning_rate * trust_ratio

      next_v = tf.multiply(self.momentum, v) + scaled_lr * grad
      if self.nesterov:
        update = tf.multiply(self.momentum, next_v) + scaled_lr * grad
      else:
        update = next_v
      next_param = param - update
    else:
      # Popular momentum: trust ratio scales the LR *after* the momentum
      # accumulator update, using the norm of the update itself.
      next_v = tf.multiply(self.momentum, v) + grad
      if self.nesterov:
        update = tf.multiply(self.momentum, next_v) + grad
      else:
        update = next_v

      trust_ratio = 1.0
      if self._do_layer_adaptation(param_name):
        w_norm = tf.norm(param, ord=2)
        v_norm = tf.norm(update, ord=2)
        trust_ratio = tf.where(
            tf.greater(w_norm, 0),
            tf.where(tf.greater(v_norm, 0), (self.eeta * w_norm / v_norm), 1.0),
            1.0)
      scaled_lr = trust_ratio * learning_rate
      next_param = param - scaled_lr * update

    # Write back the new parameter value and momentum accumulator.
    return tf.group(*[
        param.assign(next_param, use_locking=False),
        v.assign(next_v, use_locking=False)
    ])

  def _resource_apply_sparse(self, grad, handle, indices, apply_state):
    raise NotImplementedError("Applying sparse gradients is not implemented.")

  def _use_weight_decay(self, param_name):
    """Whether to use L2 weight decay for `param_name`."""
    if not self.weight_decay_rate:
      return False
    if self.exclude_from_weight_decay:
      for r in self.exclude_from_weight_decay:
        if re.search(r, param_name) is not None:
          return False
    return True

  def _do_layer_adaptation(self, param_name):
    """Whether to do layer-wise learning rate adaptation for `param_name`."""
    if self.exclude_from_layer_adaptation:
      for r in self.exclude_from_layer_adaptation:
        if re.search(r, param_name) is not None:
          return False
    return True

  def get_config(self):
    """Serializes the optimizer hyperparameters.

    NOTE(review): exclude_from_weight_decay/exclude_from_layer_adaptation are
    not serialized here, so they are lost on a get_config/from_config
    round trip — confirm whether that is intentional.
    """
    config = super(LARS, self).get_config()
    config.update({
        "learning_rate": self._serialize_hyperparameter("learning_rate"),
        "decay": self._serialize_hyperparameter("decay"),
        "momentum": self.momentum,
        "classic_momentum": self.classic_momentum,
        "weight_decay_rate": self.weight_decay_rate,
        "eeta": self.eeta,
        "nesterov": self.nesterov,
    })
    return config

  @classmethod
  def from_config(cls, config, custom_objects=None):
    return cls(**config)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Learning rate schedule classes."""
import math
from typing import Mapping, Any, Union, Optional
import tensorflow as tf
def _make_offset_wrapper(new_class_name: str, base_lr_class):
  """Generates an offset wrapper of a learning rate schedule.

  It returns a subclass of the `base_lr_class`; the subclass takes an
  `offset` argument in the constructor. When the new class instance is called,
  the behavior is:
    new_class_object(step) = base_lr_class_object(step - offset)

  Example:
  CosineDecayWithOffset = _make_offset_wrapper(
      'CosineDecayWithOffset', tf.keras.experimental.CosineDecay)
  # Use the lr:
  lr = CosineDecayWithOffset(offset=100, initial_learning_rate=0.1,
                             decay_steps=1000)
  lr(101)  # equals to tf.keras.experimental.CosineDecay(...)(101-100)

  Args:
    new_class_name: the name of the new class.
    base_lr_class: the base learning rate schedule class. Should be subclass
      of tf.keras.optimizers.schedules.LearningRateSchedule.

  Returns:
    A new class (subclass of the base_lr_class) that can take an offset.
  """
  assert issubclass(base_lr_class,
                    tf.keras.optimizers.schedules.LearningRateSchedule), (
                        "base_lr_class should be subclass of keras "
                        f"LearningRateSchedule, got {base_lr_class}")

  # pylint: disable=protected-access,pointless-statement
  def offset_learning_rate_init(self, offset=0, **kwargs):
    """Construct learning rate schedule object.

    When this object is called, its behavior is
      self.__call__(step) == base_lr_class.__call__(step - offset)

    Args:
      self: this object.
      offset: The offset when computing the learning rate schedule.
      **kwargs: Pass through to base learning rate class constructor.
    """
    # Explicit base-class __init__ call because these functions become
    # methods of the dynamically created subclass below.
    base_lr_class.__init__(self, **kwargs)
    self._offset = offset

  def offset_learning_rate_call(self, step):
    # Shift the step before delegating to the base schedule.
    step = tf.cast(step - self._offset, tf.float32)
    return base_lr_class.__call__(self, step)

  # pylint: enable=protected-access,pointless-statement
  # Synthesize the subclass dynamically so one helper serves every base
  # schedule class.
  return type(
      new_class_name, (base_lr_class,), {
          "base_lr_class": base_lr_class,
          "__init__": offset_learning_rate_init,
          "__call__": offset_learning_rate_call
      })
# Offset-aware variants of the standard Keras schedules, generated once at
# import time.
PiecewiseConstantDecayWithOffset = _make_offset_wrapper(
    "PiecewiseConstantDecayWithOffset",
    tf.keras.optimizers.schedules.PiecewiseConstantDecay)
PolynomialDecayWithOffset = _make_offset_wrapper(
    "PolynomialDecayWithOffset", tf.keras.optimizers.schedules.PolynomialDecay)
ExponentialDecayWithOffset = _make_offset_wrapper(
    "ExponentialDecayWithOffset",
    tf.keras.optimizers.schedules.ExponentialDecay)
CosineDecayWithOffset = _make_offset_wrapper("CosineDecayWithOffset",
                                             tf.keras.experimental.CosineDecay)
class LinearWarmup(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Linear warmup schedule."""

  def __init__(self,
               after_warmup_lr_sched: Union[
                   tf.keras.optimizers.schedules.LearningRateSchedule, float],
               warmup_steps: int,
               warmup_learning_rate: float,
               name: Optional[str] = None):
    """Add linear warmup schedule to a learning rate schedule.

    warmup_lr is the initial learning rate, the final learning rate of the
    init_warmup period is the initial learning rate of lr_schedule in use.
    The learning rate at each step linearly increased according to the
    following formula:
      learning_rate = warmup_lr + step / warmup_steps
                      * (final_warmup_lr - warmup_lr).
    Using warmup overrides the learning rate schedule by the number of warmup
    steps.

    Args:
      after_warmup_lr_sched: tf.keras.optimizers.schedules
        .LearningRateSchedule or a constant.
      warmup_steps: Number of the warmup steps.
      warmup_learning_rate: Initial learning rate for the warmup.
      name: Optional, name of warmup schedule.
    """
    super().__init__()
    self._name = name
    self._after_warmup_lr_sched = after_warmup_lr_sched
    self._warmup_steps = warmup_steps
    self._init_warmup_lr = warmup_learning_rate
    # The ramp ends at whatever value the wrapped schedule (or constant)
    # takes at `warmup_steps`, so the hand-off is continuous.
    if isinstance(after_warmup_lr_sched,
                  tf.keras.optimizers.schedules.LearningRateSchedule):
      self._final_warmup_lr = after_warmup_lr_sched(warmup_steps)
    else:
      self._final_warmup_lr = tf.cast(after_warmup_lr_sched, dtype=tf.float32)

  def __call__(self, step: int):
    """Returns the learning rate for `step`."""
    global_step = tf.cast(step, dtype=tf.float32)

    # Linearly interpolate between the initial and final warmup rates.
    warmup_span = self._final_warmup_lr - self._init_warmup_lr
    linear_warmup_lr = (
        self._init_warmup_lr + global_step / self._warmup_steps * warmup_span)

    # Value the wrapped schedule (or constant) would produce at this step.
    if isinstance(self._after_warmup_lr_sched,
                  tf.keras.optimizers.schedules.LearningRateSchedule):
      after_warmup_lr = self._after_warmup_lr_sched(step)
    else:
      after_warmup_lr = tf.cast(self._after_warmup_lr_sched, dtype=tf.float32)

    # Graph-safe branch between warmup and post-warmup regimes.
    return tf.cond(global_step < self._warmup_steps,
                   lambda: linear_warmup_lr,
                   lambda: after_warmup_lr)

  def get_config(self) -> Mapping[str, Any]:
    """Serializes the schedule configuration."""
    sched = self._after_warmup_lr_sched
    if isinstance(sched, tf.keras.optimizers.schedules.LearningRateSchedule):
      serialized_sched = sched.get_config()  # pytype: disable=attribute-error
    else:
      serialized_sched = sched  # pytype: disable=attribute-error
    return {
        "after_warmup_lr_sched": serialized_sched,
        "warmup_steps": self._warmup_steps,
        "warmup_learning_rate": self._init_warmup_lr,
        "name": self._name,
    }
class PolynomialWarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Applies polynomial warmup schedule on a given learning rate decay schedule."""

  def __init__(self,
               after_warmup_lr_sched: Union[
                   tf.keras.optimizers.schedules.LearningRateSchedule, float],
               warmup_steps: int,
               power: float = 1.0,
               name: str = "PolynomialWarmup"):
    """Initializes the polynomial warmup wrapper.

    Args:
      after_warmup_lr_sched: schedule (or constant) used after warmup; its
        value at `warmup_steps` is also the warmup's target rate.
      warmup_steps: number of warmup steps.
      power: exponent of the warmup polynomial (1.0 means linear warmup).
      name: optional name scope for the ops created in `__call__`.
    """
    super().__init__()
    # The warmup ramps up to the value the post-warmup schedule takes at
    # `warmup_steps`, so the transition is continuous.
    if isinstance(after_warmup_lr_sched,
                  tf.keras.optimizers.schedules.LearningRateSchedule):
      self._initial_learning_rate = after_warmup_lr_sched(warmup_steps)
    else:
      self._initial_learning_rate = tf.cast(
          after_warmup_lr_sched, dtype=tf.float32)

    self._warmup_steps = warmup_steps
    self._power = power
    self._after_warmup_lr_sched = after_warmup_lr_sched
    self._name = name

  def __call__(self, step):
    """Returns the learning rate for `step`."""
    with tf.name_scope(self._name or "PolynomialWarmUp") as name:
      # Implements polynomial warmup. i.e., if global_step < warmup_steps, the
      # learning rate will be `global_step/num_warmup_steps * init_lr`.
      global_step_float = tf.cast(step, tf.float32)
      warmup_steps_float = tf.cast(self._warmup_steps, tf.float32)

      if self._warmup_steps <= 0:
        warmup_percent_done = 1.0
      else:
        # A zero `step` may cause Inf. So make `step` positive.
        step_non_zero = tf.math.maximum(global_step_float, 1.0)
        warmup_percent_done = step_non_zero / warmup_steps_float

      warmup_learning_rate = (
          self._initial_learning_rate *
          tf.math.pow(warmup_percent_done, self._power))

      if isinstance(self._after_warmup_lr_sched,
                    tf.keras.optimizers.schedules.LearningRateSchedule):
        after_warmup_lr = self._after_warmup_lr_sched(step)
      else:
        after_warmup_lr = tf.cast(self._after_warmup_lr_sched, dtype=tf.float32)

      # Graph-safe branch between warmup and post-warmup regimes.
      return tf.cond(
          global_step_float < warmup_steps_float,
          lambda: warmup_learning_rate,
          lambda: after_warmup_lr,
          name=name)

  def get_config(self) -> Mapping[str, Any]:
    """Serializes the schedule configuration."""
    if isinstance(self._after_warmup_lr_sched,
                  tf.keras.optimizers.schedules.LearningRateSchedule):
      config = {
          "after_warmup_lr_sched": self._after_warmup_lr_sched.get_config()}  # pytype: disable=attribute-error
    else:
      config = {"after_warmup_lr_sched": self._after_warmup_lr_sched}  # pytype: disable=attribute-error

    config.update({
        "warmup_steps": self._warmup_steps,
        "power": self._power,
        "name": self._name
    })
    return config
class DirectPowerDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Learning rate schedule follows lr * (step)^power."""

  def __init__(self,
               initial_learning_rate: float,
               power: float = 1.0,
               name: str = "DirectPowerDecay"):
    """Initialize configuration of the learning rate schedule.

    Args:
      initial_learning_rate: The initial learning rate.
      power: The order of the polynomial.
      name: Optional, name of learning rate schedule.
    """
    super().__init__()
    self._initial_learning_rate = initial_learning_rate
    self._power = power
    self._name = name

  def __call__(self, step):
    """Returns the learning rate for `step` as lr * step^power."""
    with tf.name_scope(self._name or "DirectPowerDecay"):
      step_float = tf.cast(step, tf.float32)
      # Clamp the step to >= 1: a zero step raised to a negative power
      # would otherwise yield Inf.
      step_clamped = tf.math.maximum(step_float, 1.0)
      scale = tf.math.pow(step_clamped, self._power)
      return self._initial_learning_rate * scale

  def get_config(self):
    """Get the configuration of the learning rate schedule."""
    return {
        "name": self._name,
        "initial_learning_rate": self._initial_learning_rate,
        "power": self._power,
    }
class PowerAndLinearDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Learning rate schedule with multiplied by linear decay at the end.

  The schedule has the following behavior.
  Let offset_step = step - offset.
  1) offset_step < 0, the actual learning rate equals initial_learning_rate.
  2) offset_step <= total_decay_steps * (1 - linear_decay_fraction), the
     actual learning rate equals lr * offset_step^power.
  3) total_decay_steps * (1 - linear_decay_fraction) <= offset_step <
     total_decay_steps, the actual learning rate equals lr * offset_step^power
     * (total_decay_steps - offset_step) / (total_decay_steps *
     linear_decay_fraction).
  4) offset_step >= total_decay_steps, the actual learning rate equals zero.
  """

  def __init__(self,
               initial_learning_rate: float,
               total_decay_steps: int,
               power: float = 1.0,
               linear_decay_fraction: float = 0.1,
               offset: int = 0,
               name: str = "PowerAndLinearDecay"):
    """Initialize configuration of the learning rate schedule.

    Args:
      initial_learning_rate: The initial learning rate.
      total_decay_steps: The total number of steps for power + linear decay.
      power: The order of the polynomial.
      linear_decay_fraction: In the last `linear_decay_fraction` steps, the
        learning rate will be multiplied by a linear decay.
      offset: The offset applied to steps.
      name: Optional, name of learning rate schedule.
    """
    super().__init__()
    self._initial_learning_rate = initial_learning_rate
    self._total_decay_steps = total_decay_steps
    self._power = power
    self._linear_decay_fraction = linear_decay_fraction
    self._offset = offset
    self._name = name

  def __call__(self, step):
    """Returns the learning rate for `step` per the class docstring."""
    with tf.name_scope(self._name or "PowerAndLinearDecay"):
      step = tf.cast(step - self._offset, tf.float32)
      learning_rate = self._initial_learning_rate
      # A zero `step` may cause Inf. So make `step` positive.
      step_non_zero = tf.math.maximum(step, 1.0)
      # Power-decay component: lr * step^power.
      learning_rate *= tf.math.pow(step_non_zero, self._power)
      # Linear tail: ramp the rate down to zero over the final
      # `linear_decay_fraction` of `total_decay_steps` (skipped when the
      # product is zero, i.e. no linear phase configured).
      if self._total_decay_steps * self._linear_decay_fraction > 0:
        learning_rate *= tf.minimum(
            1.0, (self._total_decay_steps - step) /
            (self._total_decay_steps * self._linear_decay_fraction))
        # Clamp at zero once past total_decay_steps (and for negative
        # pre-offset steps the linear factor could also go above 1 — the
        # tf.minimum above caps that).
        learning_rate = tf.maximum(0.0, learning_rate)
      return learning_rate

  def get_config(self):
    """Get the configuration of the learning rate schedule."""
    return {
        "initial_learning_rate": self._initial_learning_rate,
        "total_decay_steps": self._total_decay_steps,
        "power": self._power,
        "linear_decay_fraction": self._linear_decay_fraction,
        "offset": self._offset,
        "name": self._name,
    }
class PowerDecayWithOffset(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Power learning rate decay with offset.

  Learning rate equals `pre_offset_learning_rate` while `step` <= `offset`.
  Afterwards it equals lr * (step - offset)**power, capped at
  `pre_offset_learning_rate`.
  """

  def __init__(self,
               initial_learning_rate: float,
               power: float = 1.0,
               offset: int = 0,
               pre_offset_learning_rate: float = 1.0e6,
               name: str = "PowerDecayWithOffset"):
    """Constructs the offset power-decay schedule.

    Args:
      initial_learning_rate: The initial learning rate.
      power: The order of the polynomial.
      offset: The offset when computing the power decay.
      pre_offset_learning_rate: The maximum learning rate we'll use.
      name: Optional, name of learning rate schedule.
    """
    super().__init__()
    self._initial_learning_rate = initial_learning_rate
    self._power = power
    self._offset = offset
    self._pre_offset_lr = pre_offset_learning_rate
    self._name = name

  def __call__(self, step):
    """Returns the learning rate for the given `step`."""
    with tf.name_scope(self._name or "PowerDecayWithOffset"):
      float_step = tf.cast(step, tf.float32)
      # Clamp at 1 so (step - offset)**power cannot hit 0**negative = Inf.
      decayed_lr = self._initial_learning_rate * tf.math.pow(
          tf.math.maximum(float_step - self._offset, 1.0), self._power)
      # Select pre-offset vs. decayed rate without a data-dependent branch.
      past_offset = tf.cast(float_step > self._offset, tf.float32)
      blended_lr = ((1.0 - past_offset) * self._pre_offset_lr +
                    past_offset * decayed_lr)
      # The power term may be arbitrarily large; cap it at pre_offset_lr.
      return tf.math.minimum(blended_lr, self._pre_offset_lr)

  def get_config(self):
    """Returns the serializable configuration of the schedule."""
    return dict(
        initial_learning_rate=self._initial_learning_rate,
        power=self._power,
        offset=self._offset,
        pre_offset_learning_rate=self._pre_offset_lr,
        name=self._name)
class StepConsineDecayWithOffset(
    tf.keras.optimizers.schedules.LearningRateSchedule):
  """Stepwise cosine learning rate decay with offset.

  The learning rate is equivalent to one or more cosine decay(s), each
  starting and ending at the given interval boundaries.

  (The "Consine" spelling in the class name is kept as-is; renaming would
  break callers that serialize/deserialize by class name.)

  Example:
  ```python
  boundaries: [100000, 110000]
  values: [1.0, 0.5]
  lr_decayed_fn = (
      lr_schedule.StepConsineDecayWithOffset(
          boundaries,
          values))
  ```
  From step 0 to 100000, it will cosine decay from 1.0 to 0.5;
  from step 100000 to 110000, it will cosine decay from 0.5 to 0.0.
  """

  def __init__(self,
               boundaries,
               values,
               offset: int = 0,
               name: str = "StepConsineDecayWithOffset"):
    """Initialize configuration of the learning rate schedule.

    Args:
      boundaries: A list of `Tensor`s or `int`s with strictly
        increasing entries, and with all elements having the same type as the
        optimizer step.
      values: A list of `Tensor`s or `float`s that specifies the
        starting learning rate for the interval beginning at the
        corresponding boundary. It must have the same length as `boundaries`.
      offset: The offset subtracted from the step before evaluating the
        schedule.
      name: Optional, name of learning rate schedule.

    Raises:
      ValueError: If `values` is empty, or if `boundaries` and `values`
        have different lengths.
    """
    super().__init__()
    self.values = values
    self.boundaries = boundaries
    self.offset = offset
    self.name = name
    if len(self.values) < 1:
      raise ValueError(f"Expect non empty {self.values}")
    # NOTE(review): this raises when the lengths are UNEQUAL; the message
    # text reads backwards ("is equal to") but the check itself is correct.
    if len(self.boundaries) != len(self.values):
      raise ValueError(
          "Boundaries length is equal to learning rate levels length"
          f"{len(self.boundaries)} != {len(self.values)}")
    # total_steps[i] is the length of segment i, taken as the gap to the
    # next boundary. The final segment gets length 0.
    # NOTE(review): a 0-length last segment makes the division by
    # `next_total_steps` in __call__ non-finite once past the last
    # boundary — confirm intended usage with callers.
    self.total_steps = (
        [boundaries[i + 1] - boundaries[i] for i in range(len(boundaries) - 1)
        ] + [0])

  def __call__(self, global_step):
    """Returns the learning rate tensor for `global_step`."""
    with tf.name_scope(self.name or "StepConsineDecayWithOffset"):
      # Shift by the offset so all segment math is offset-relative.
      global_step = tf.cast(global_step - self.offset, tf.float32)
      lr_levels = self.values
      lr_steps = self.boundaries
      level_total_steps = self.total_steps
      num_levels = len(lr_levels)
      # First segment: cosine from values[0] down to values[1] (or to 0.0
      # when there is only a single level).
      init_lr = lr_levels[0]
      next_init_lr = lr_levels[1] if num_levels > 1 else 0.
      init_total_steps = level_total_steps[0]
      # NOTE(review): the first segment's period is
      # boundaries[1] - boundaries[0], not boundaries[0]; verify this
      # matches the documented example before relying on it.
      cosine_learning_rate = ((init_lr - next_init_lr) * (tf.cos(
          tf.constant(math.pi) * (global_step) /
          (init_total_steps)) + 1.0) / 2.0 + next_init_lr)
      learning_rate = cosine_learning_rate
      tf.compat.v1.logging.info("DEBUG lr %r next lr %r", learning_rate,
                                cosine_learning_rate)
      tf.compat.v1.logging.info("DEBUG lr %r next lr %r inittotalstep %r",
                                init_lr, next_init_lr, init_total_steps)
      # Later segments: each overrides the running learning_rate for steps
      # at or past its start boundary via tf.where.
      for i in range(1, num_levels):
        next_init_lr = lr_levels[i]
        next_start_step = lr_steps[i]
        next_total_steps = level_total_steps[i]
        next_next_init_lr = lr_levels[i + 1] if num_levels > i + 1 else 0.
        tf.compat.v1.logging.info(
            "DEBUG step %r nilr %r nss %r nts %r nnilr %r", global_step,
            next_init_lr, next_start_step, next_total_steps, next_next_init_lr)
        next_cosine_learning_rate = ((next_init_lr - next_next_init_lr) *
                                     (tf.cos(
                                         tf.constant(math.pi) *
                                         (global_step - next_start_step) /
                                         (next_total_steps)) + 1.0) / 2.0 +
                                     next_next_init_lr)
        learning_rate = tf.where(global_step >= next_start_step,
                                 next_cosine_learning_rate, learning_rate)
        tf.compat.v1.logging.info("DEBUG lr %r next lr %r", learning_rate,
                                  next_cosine_learning_rate)
      return learning_rate

  def get_config(self):
    """Returns the serializable configuration of the schedule."""
    return {
        "boundaries": self.boundaries,
        "values": self.values,
        "offset": self.offset,
        "name": self.name
    }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment