Commit 32e4ca51 authored by qianyj

Update code to v2.11.0

parents 9485aa1d 71060f67
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@ import dataclasses
 from official.core import config_definitions as cfg
 from official.modeling import hyperparams
+from official.modeling.privacy import configs as dp_configs
 @dataclasses.dataclass
@@ -35,6 +36,8 @@ class MultiTaskConfig(hyperparams.Config):
   init_checkpoint: str = ""
   model: hyperparams.Config = None
   task_routines: Tuple[TaskRoutine, ...] = ()
+  differential_privacy_config: Optional[
+      dp_configs.DifferentialPrivacyConfig] = None
 @dataclasses.dataclass
......
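The new `differential_privacy_config` field defaults to `None`, so existing configs are unaffected and differential privacy is strictly opt-in. A minimal sketch of how the field might be populated; the `clipping_norm` and `noise_multiplier` attribute names are assumptions about `DifferentialPrivacyConfig` in `official/modeling/privacy/configs.py`, which this diff does not show:

```python
from official.modeling.multitask import configs
from official.modeling.privacy import configs as dp_configs

# Illustrative only: field names on DifferentialPrivacyConfig are assumed.
task_config = configs.MultiTaskConfig(
    differential_privacy_config=dp_configs.DifferentialPrivacyConfig(
        clipping_norm=1.0,      # per-example gradient clipping norm
        noise_multiplier=1.1,   # Gaussian noise scale relative to the clip
    ))
```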
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -31,7 +31,9 @@ class MultiTaskInterleavingTrainer(base_trainer.MultiTaskBaseTrainer):
                multi_task: multitask.MultiTask,
                multi_task_model: Union[tf.keras.Model,
                                        base_model.MultiTaskBaseModel],
-               optimizer: tf.optimizers.Optimizer,
+               optimizer: Union[tf.optimizers.Optimizer,
+                                tf.keras.optimizers.experimental.Optimizer,
+                                tf.keras.optimizers.legacy.Optimizer],
                task_sampler: sampler.TaskSampler,
                trainer_options=None):
     super().__init__(
@@ -69,6 +71,13 @@ class MultiTaskInterleavingTrainer(base_trainer.MultiTaskBaseTrainer):
         name: orbit.utils.create_global_step() for name in self.multi_task.tasks
     }
+    # If the new Keras optimizer is used, require that all model variables be
+    # created before training, and let the optimizer create its slot variables
+    # all together.
+    if isinstance(optimizer, tf.keras.optimizers.experimental.Optimizer):
+      multi_task_model.build()
+      optimizer.build(multi_task_model.trainable_variables)
   def task_step_counter(self, name):
     return self._task_step_counters[name]
......
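The `isinstance` check above exists because the rewritten Keras optimizer creates all of its slot variables eagerly in `optimizer.build()`, rather than lazily on the first `apply_gradients` call as the legacy optimizer did. A standalone sketch of the same pattern with a plain Keras model (the model and shapes here are illustrative, not part of this diff):

```python
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(4, input_shape=(8,))])
optimizer = tf.keras.optimizers.experimental.SGD(momentum=0.9)

# The model variables must exist before build(); the optimizer then
# allocates its slot variables (here, momentum buffers) in one shot.
model.build(input_shape=(None, 8))
optimizer.build(model.trainable_variables)
```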
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -23,9 +23,11 @@ from official.core import task_factory
 from official.modeling import optimization
 from official.modeling.multitask import base_model
 from official.modeling.multitask import configs
+from official.modeling.privacy import configs as dp_configs
 OptimizationConfig = optimization.OptimizationConfig
 RuntimeConfig = config_definitions.RuntimeConfig
+DifferentialPrivacyConfig = dp_configs.DifferentialPrivacyConfig
 class MultiTask(tf.Module, metaclass=abc.ABCMeta):
@@ -93,9 +95,11 @@ class MultiTask(tf.Module, metaclass=abc.ABCMeta):
   @classmethod
   def create_optimizer(cls,
                        optimizer_config: OptimizationConfig,
-                       runtime_config: Optional[RuntimeConfig] = None):
+                       runtime_config: Optional[RuntimeConfig] = None,
+                       dp_config: Optional[DifferentialPrivacyConfig] = None):
     return base_task.Task.create_optimizer(
-        optimizer_config=optimizer_config, runtime_config=runtime_config)
+        optimizer_config=optimizer_config, runtime_config=runtime_config,
+        dp_config=dp_config)
   def joint_train_step(self, task_inputs,
                        multi_task_model: base_model.MultiTaskBaseModel,
@@ -134,10 +138,10 @@ class MultiTask(tf.Module, metaclass=abc.ABCMeta):
       self.tasks[name].process_metrics(task_metrics[name], labels, outputs,
                                        **kwargs)
     # Scales loss as the default gradients allreduce performs sum inside
     # the optimizer.
     scaled_loss = total_loss / tf.distribute.get_strategy(
     ).num_replicas_in_sync
     tvars = multi_task_model.trainable_variables
     grads = tape.gradient(scaled_loss, tvars)
     optimizer.apply_gradients(list(zip(grads, tvars)))
......
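The scaling in `joint_train_step` is the standard correction for synchronous data parallelism: the cross-replica allreduce sums gradients, so each replica divides its loss by `num_replicas_in_sync` to make the aggregate equal the global mean. A single-model sketch of the same step (names are illustrative):

```python
import tensorflow as tf

strategy = tf.distribute.get_strategy()  # one replica outside a device setup

def train_step(model, optimizer, features, labels):
  with tf.GradientTape() as tape:
    loss = tf.reduce_mean(tf.square(model(features) - labels))
    # Allreduce sums per-replica gradients, so dividing the loss here
    # recovers the global mean after aggregation.
    scaled_loss = loss / strategy.num_replicas_in_sync
  grads = tape.gradient(scaled_loss, model.trainable_variables)
  optimizer.apply_gradients(zip(grads, model.trainable_variables))
```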
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -28,6 +28,8 @@ class MockFooModel(tf.keras.Model):
     super().__init__(*args, **kwargs)
     self._share_layer = shared_layer
     self._foo_specific_layer = tf.keras.layers.Dense(1)
+    self.inputs = {"foo": tf.keras.Input(shape=(2,), dtype=tf.float32),
+                   "bar": tf.keras.Input(shape=(2,), dtype=tf.float32)}
   def call(self, inputs):
     self.add_loss(tf.zeros((1,), dtype=tf.float32))
@@ -39,11 +41,13 @@ class MockFooModel(tf.keras.Model):
 class MockBarModel(tf.keras.Model):
+  """A mock model that can only consume 'bar' inputs."""
   def __init__(self, shared_layer, *args, **kwargs):
     super().__init__(*args, **kwargs)
     self._share_layer = shared_layer
     self._bar_specific_layer = tf.keras.layers.Dense(1)
+    self.inputs = {"bar": tf.keras.Input(shape=(2,), dtype=tf.float32)}
   def call(self, inputs):
     self.add_loss(tf.zeros((2,), dtype=tf.float32))
......
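The mock models gain a `self.inputs` mapping of symbolic `tf.keras.Input` tensors so that the interleaving trainer's `multi_task_model.build()` call (see the trainer hunk above) can trace each subtask model and create its variables before `optimizer.build()` runs. A self-contained sketch of that idea; the class below is illustrative, not part of the diff:

```python
import tensorflow as tf

class TinyBarModel(tf.keras.Model):
  """Illustrative stand-in for a mock task model."""

  def __init__(self):
    super().__init__()
    self._dense = tf.keras.layers.Dense(1)
    # Symbolic inputs describe the expected feature spec up front.
    self.inputs = {"bar": tf.keras.Input(shape=(2,), dtype=tf.float32)}

  def call(self, inputs):
    return self._dense(inputs["bar"])

model = TinyBarModel()
model(model.inputs)  # one symbolic call creates the kernel and bias
assert len(model.trainable_variables) == 2
```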
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
 """Multitask training driver library."""
 # pytype: disable=attribute-error
 import os
-from typing import Any, List, Optional, Tuple
+from typing import Any, List, Mapping, Optional, Tuple, Union
 from absl import logging
 import orbit
 import tensorflow as tf
@@ -44,8 +44,12 @@ def run_experiment(
     mode: str,
     params: configs.MultiTaskExperimentConfig,
     model_dir: str,
-    trainer: base_trainer.MultiTaskBaseTrainer = None
-) -> base_model.MultiTaskBaseModel:
+    run_post_eval: bool = False,
+    trainer: base_trainer.MultiTaskBaseTrainer = None,
+    best_ckpt_exporter_creator: Optional[Any] = train_utils
+    .maybe_create_best_ckpt_exporter
+) -> Union[base_model.MultiTaskBaseModel, Tuple[base_model.MultiTaskBaseModel,
+                                                Mapping[Any, Any]]]:
   """Runs train/eval configured by the experiment params.
   Args:
@@ -56,8 +60,11 @@ def run_experiment(
       or 'continuous_eval'.
     params: ExperimentConfig instance.
     model_dir: A 'str', a path to store model checkpoints and summaries.
+    run_post_eval: Whether to run one evaluation pass after training; if True,
+      the metrics logs are returned along with the model.
     trainer: (optional) A multi-task trainer to use. If none is provided, a
       default one will be created based on `params`.
+    best_ckpt_exporter_creator: A functor for creating the best-checkpoint exporter.
   Returns:
     model: `base_model.MultiTaskBaseModel` instance.
@@ -66,8 +73,7 @@ def run_experiment(
   is_training = 'train' in mode
   is_eval = 'eval' in mode
   with distribution_strategy.scope():
-    optimizer = task.create_optimizer(params.trainer.optimizer_config,
-                                      params.runtime)
+    optimizer = train_utils.create_optimizer(task, params)
   kwargs = dict(multi_task=task, multi_task_model=model, optimizer=optimizer)
   if params.trainer.trainer_type == 'interleaving':
     sampler = task_sampler.get_task_sampler(params.trainer.task_sampler,
@@ -83,8 +89,7 @@ def run_experiment(
         model=model,
         eval_steps=eval_steps,
         global_step=trainer.global_step if is_training else None,
-        checkpoint_exporter=train_utils.maybe_create_best_ckpt_exporter(
-            params, model_dir))
+        checkpoint_exporter=best_ckpt_exporter_creator(params, model_dir))
   else:
     evaluator = None
@@ -95,7 +100,6 @@ def run_experiment(
     checkpoint = evaluator.checkpoint
     global_step = evaluator.global_step
-  # TODO(hongkuny,haozhangthu): Revisit initialization method.
   checkpoint_manager = tf.train.CheckpointManager(
       checkpoint,
       directory=model_dir,
@@ -140,7 +144,11 @@ def run_experiment(
   else:
     raise NotImplementedError('The mode is not implemented: %s' % mode)
-  return model
+  if run_post_eval:
+    return model, evaluator.evaluate(
+        tf.convert_to_tensor(params.trainer.validation_steps))  # pytype: disable=bad-return-type  # typed-keras
+  else:
+    return model
 def run_experiment_with_multitask_eval(
@@ -153,7 +161,10 @@ def run_experiment_with_multitask_eval(
     model_dir: str,
     run_post_eval: bool = False,
     save_summary: bool = True,
-    trainer: Optional[core_lib.Trainer] = None) -> Tuple[Any, Any]:
+    trainer: Optional[core_lib.Trainer] = None,
+    best_ckpt_exporter_creator: Optional[Any] = train_utils
+    .maybe_create_best_ckpt_exporter,
+) -> Tuple[Any, Any]:
   """Runs train/eval configured by the experiment params.
   Args:
@@ -170,6 +181,7 @@ def run_experiment_with_multitask_eval(
     trainer: the core_lib.Trainer instance. It should be created within the
       strategy.scope(). If not provided, an instance will be created by default
      if `mode` contains 'train'.
+    best_ckpt_exporter_creator: A functor for creating the best-checkpoint exporter.
   Returns:
     model: `tf.keras.Model` instance.
@@ -183,8 +195,7 @@ def run_experiment_with_multitask_eval(
         config=params,
         task=train_task,
         model=train_task.build_model(),
-        optimizer=train_task.create_optimizer(params.trainer.optimizer_config,
-                                              params.runtime),
+        optimizer=train_utils.create_optimizer(train_task, params),
         train=True,
         evaluate=False)
   else:
@@ -200,8 +211,7 @@ def run_experiment_with_multitask_eval(
         model=model,
         global_step=trainer.global_step if is_training else None,
         eval_steps=eval_steps,
-        checkpoint_exporter=train_utils.maybe_create_best_ckpt_exporter(
-            params, model_dir))
+        checkpoint_exporter=best_ckpt_exporter_creator(params, model_dir))
   else:
     evaluator = None
......
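With `run_post_eval=True`, `run_experiment` now returns a `(model, metrics)` tuple instead of the bare model. A hedged usage sketch; the leading arguments (`distribution_strategy`, `task`, `model`) are assumed from the unchanged head of the signature, which this diff does not show:

```python
# Illustrative call; argument names before `mode` are assumptions.
model, eval_logs = train_lib.run_experiment(
    distribution_strategy=strategy,
    task=test_multitask,
    model=model,
    mode='train_and_eval',
    params=experiment_config,
    model_dir=model_dir,
    run_post_eval=True)
print(eval_logs)  # mapping of metric name -> value from the final eval pass
```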
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -58,8 +58,9 @@ class TrainLibTest(tf.test.TestCase, parameterized.TestCase):
           strategy_combinations.one_device_strategy_gpu,
       ],
       mode='eager',
+      optimizer=['sgd_experimental', 'sgd'],
       flag_mode=['train', 'eval', 'train_and_eval']))
-  def test_end_to_end(self, distribution_strategy, flag_mode):
+  def test_end_to_end(self, distribution_strategy, optimizer, flag_mode):
     model_dir = self.get_temp_dir()
     experiment_config = configs.MultiTaskExperimentConfig(
         task=configs.MultiTaskConfig(
@@ -70,6 +71,7 @@ class TrainLibTest(tf.test.TestCase, parameterized.TestCase):
             task_name='bar', task_config=test_utils.BarConfig()))))
     experiment_config = params_dict.override_params_dict(
         experiment_config, self._test_config, is_strict=False)
+    experiment_config.trainer.optimizer_config.optimizer.type = optimizer
     with distribution_strategy.scope():
       test_multitask = multitask.MultiTask.from_config(experiment_config.task)
       model = test_utils.MockMultiTaskModel()
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -216,14 +216,14 @@ class StepCosineLrConfig(base_config.Config):
   """Configuration for stepwise learning rate decay.
   This class is a container for the piecewise cosine learning rate scheduling
-  configs. It will configure an instance of StepConsineDecayWithOffset keras
+  configs. It will configure an instance of StepCosineDecayWithOffset keras
   learning rate schedule.
   ```python
   boundaries: [100000, 110000]
   values: [1.0, 0.5]
   lr_decayed_fn = (
-      lr_schedule.StepConsineDecayWithOffset(
+      lr_schedule.StepCosineDecayWithOffset(
           boundaries,
           values))
   ```
@@ -243,7 +243,7 @@ class StepCosineLrConfig(base_config.Config):
     [boundaries[n], end] -> values[n+1] to 0.
   offset: An int. The offset applied to steps. Defaults to 0.
   """
-  name: str = 'StepConsineDecayWithOffset'
+  name: str = 'StepCosineDecayWithOffset'
   boundaries: Optional[List[int]] = None
   values: Optional[List[float]] = None
   offset: int = 0
......
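To make the boundary semantics concrete: with `boundaries=[100000, 110000]` and `values=[1.0, 0.5]`, the rate follows a cosine from 1.0 down to 0.5 over steps [0, 100000), then decays from 0.5 toward 0 over the remaining interval. A back-of-the-envelope check of plain cosine interpolation; the exact schedule implementation lives in `lr_schedule.py` and is not shown in this diff:

```python
import math

def cosine_interp(start, end, progress):
  # progress 0.0 -> start, progress 1.0 -> end, cosine-shaped in between.
  return end + 0.5 * (start - end) * (1.0 + math.cos(math.pi * progress))

print(cosine_interp(1.0, 0.5, 0.0))  # 1.0 at the start of the interval
print(cosine_interp(1.0, 0.5, 0.5))  # 0.75 halfway through
print(cosine_interp(1.0, 0.5, 1.0))  # 0.5 at the boundary
```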
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -45,8 +45,14 @@ class OptimizerConfig(oneof.OneOfConfig):
   """
   type: Optional[str] = None
   sgd: opt_cfg.SGDConfig = opt_cfg.SGDConfig()
+  sgd_experimental: opt_cfg.SGDExperimentalConfig = (
+      opt_cfg.SGDExperimentalConfig())
   adam: opt_cfg.AdamConfig = opt_cfg.AdamConfig()
+  adam_experimental: opt_cfg.AdamExperimentalConfig = (
+      opt_cfg.AdamExperimentalConfig())
   adamw: opt_cfg.AdamWeightDecayConfig = opt_cfg.AdamWeightDecayConfig()
+  adamw_experimental: opt_cfg.AdamWeightDecayExperimentalConfig = (
+      opt_cfg.AdamWeightDecayExperimentalConfig())
   lamb: opt_cfg.LAMBConfig = opt_cfg.LAMBConfig()
   rmsprop: opt_cfg.RMSPropConfig = opt_cfg.RMSPropConfig()
   lars: opt_cfg.LARSConfig = opt_cfg.LARSConfig()
......
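Because `OptimizerConfig` is a `oneof` config, the new variants are selected by name through the `type` field, exactly as the updated test above does. A sketch of a dict-initialized config; the `'constant'` learning-rate block is an assumption about the surrounding `OptimizationConfig`, not part of this hunk:

```python
from official.modeling import optimization

opt_config = optimization.OptimizationConfig({
    'optimizer': {
        'type': 'sgd_experimental',          # picks SGDExperimentalConfig
        'sgd_experimental': {'momentum': 0.9, 'nesterov': True},
    },
    'learning_rate': {
        'type': 'constant',
        'constant': {'learning_rate': 0.1},
    },
})
```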
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -54,6 +54,27 @@ class SGDConfig(BaseOptimizerConfig):
   momentum: float = 0.0
+# TODO(b/216129465): Merge this config with SGDConfig after the experimental
+# optimizer graduates.
+@dataclasses.dataclass
+class SGDExperimentalConfig(BaseOptimizerConfig):
+  """Configuration for SGD optimizer.
+
+  The attributes of this class match the arguments of
+  `tf.keras.optimizers.experimental.SGD`.
+
+  Attributes:
+    name: name of the optimizer.
+    nesterov: nesterov for SGD optimizer.
+    momentum: momentum for SGD optimizer.
+    jit_compile: if True, jit compile will be used.
+  """
+  name: str = "SGD"
+  nesterov: bool = False
+  momentum: float = 0.0
+  jit_compile: bool = False
 @dataclasses.dataclass
 class RMSPropConfig(BaseOptimizerConfig):
   """Configuration for RMSProp optimizer.
@@ -115,6 +136,30 @@ class AdamConfig(BaseOptimizerConfig):
   amsgrad: bool = False
+@dataclasses.dataclass
+class AdamExperimentalConfig(BaseOptimizerConfig):
+  """Configuration for experimental Adam optimizer.
+
+  The attributes of this class match the arguments of
+  `tf.keras.optimizers.experimental.Adam`.
+
+  Attributes:
+    name: name of the optimizer.
+    beta_1: decay rate for 1st order moments.
+    beta_2: decay rate for 2nd order moments.
+    epsilon: epsilon value used for numerical stability in Adam optimizer.
+    amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm
+      from the paper "On the Convergence of Adam and Beyond".
+    jit_compile: if True, jit compile will be used.
+  """
+  name: str = "Adam"
+  beta_1: float = 0.9
+  beta_2: float = 0.999
+  epsilon: float = 1e-07
+  amsgrad: bool = False
+  jit_compile: bool = False
 @dataclasses.dataclass
 class AdamWeightDecayConfig(BaseOptimizerConfig):
   """Configuration for Adam optimizer with weight decay.
@@ -145,6 +190,32 @@ class AdamWeightDecayConfig(BaseOptimizerConfig):
   gradient_clip_norm: float = 1.0
+@dataclasses.dataclass
+class AdamWeightDecayExperimentalConfig(BaseOptimizerConfig):
+  """Configuration for Adam optimizer with weight decay.
+
+  Attributes:
+    name: name of the optimizer.
+    beta_1: decay rate for 1st order moments.
+    beta_2: decay rate for 2nd order moments.
+    epsilon: epsilon value used for numerical stability in the optimizer.
+    amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm
+      from the paper "On the Convergence of Adam and Beyond".
+    weight_decay: float. Weight decay rate. Defaults to 0.
+    global_clipnorm: A positive float. Clips the gradients to this maximum
+      L2-norm. Defaults to 1.0.
+    jit_compile: if True, jit compile will be used.
+  """
+  name: str = "AdamWeightDecayExperimental"
+  beta_1: float = 0.9
+  beta_2: float = 0.999
+  epsilon: float = 1e-07
+  amsgrad: bool = False
+  weight_decay: float = 0.0
+  global_clipnorm: float = 1.0
+  jit_compile: bool = False
 @dataclasses.dataclass
 class LAMBConfig(BaseOptimizerConfig):
   """Configuration for LAMB optimizer.
@@ -266,3 +337,5 @@ class AdafactorConfig(BaseOptimizerConfig):
   min_dim_size_to_factor: int = 128
   epsilon1: float = 1e-30
   epsilon2: float = 1e-3
+  weight_decay: Optional[float] = None
+  include_in_weight_decay: Optional[str] = None
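For orientation, the fields of `AdamWeightDecayExperimentalConfig` line up one-to-one with the constructor of `tf.keras.optimizers.experimental.AdamW`. The factory that performs this mapping is outside this diff, so the wiring below is a sketch, not the project's actual factory code:

```python
import tensorflow as tf

# Assumed mapping from config fields to the new Keras AdamW constructor.
optimizer = tf.keras.optimizers.experimental.AdamW(
    learning_rate=1e-3,  # supplied by the LR schedule, not this config
    weight_decay=0.0,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-07,
    amsgrad=False,
    global_clipnorm=1.0,
    jit_compile=False)
```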
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -21,7 +21,7 @@ import tensorflow as tf
 # pylint: disable=protected-access
-class ExponentialMovingAverage(tf.keras.optimizers.Optimizer):
+class ExponentialMovingAverage(tf.keras.optimizers.legacy.Optimizer):
   """Optimizer that computes an exponential moving average of the variables.
   Empirically it has been found that using the moving average of the trained
......
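Pinning the base class to `tf.keras.optimizers.legacy.Optimizer` matters because, as of TF 2.11, the bare `tf.keras.optimizers.Optimizer` alias resolves to the rewritten optimizer with a different internal API; wrappers built on the old slot-variable hooks keep working only under the `legacy` namespace. A quick check of the aliasing (behavior as of TF 2.11):

```python
import tensorflow as tf

# Under TF 2.11 the default SGD is the new implementation...
print(issubclass(tf.keras.optimizers.SGD,
                 tf.keras.optimizers.legacy.Optimizer))   # False
# ...while the old implementation survives under the legacy namespace.
print(issubclass(tf.keras.optimizers.legacy.SGD,
                 tf.keras.optimizers.legacy.Optimizer))   # True
```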
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -22,7 +22,7 @@ import tensorflow as tf
 # pylint: disable=protected-access
-class LARS(tf.keras.optimizers.Optimizer):
+class LARS(tf.keras.optimizers.legacy.Optimizer):
   """Layer-wise Adaptive Rate Scaling for large batch training.
   Introduced by "Large Batch Training of Convolutional Networks" by Y. You,
......