# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data structures for all the configurations of MobileBERT-EdgeTPU training."""

import dataclasses
from typing import Optional

from official.modeling import optimization
from official.modeling.hyperparams import base_config
from official.nlp.configs import bert
from official.nlp.data import pretrain_dataloader

DatasetParams = pretrain_dataloader.BertPretrainDataConfig
PretrainerModelParams = bert.PretrainerConfig


@dataclasses.dataclass
class OrbitParams(base_config.Config):
  """Parameters that set up the Orbit training/evaluation pipeline.

  Attributes:
    mode: Orbit controller mode, can be 'train', 'train_and_evaluate', or
      'evaluate'.
    steps_per_loop: The number of steps to run in each inner loop of training.
    total_steps: The global step count to train up to.
    eval_steps: The number of steps to run during an evaluation. If -1, the
      controller evaluates over the entire evaluation dataset.
    eval_interval: The number of training steps to run between evaluations. If
      set, training will always stop every `eval_interval` steps, even if this
      results in a shorter inner loop than specified by the `steps_per_loop`
      setting. If None, evaluation will only be performed after training is
      complete.
  """
  mode: str = 'train'
  steps_per_loop: int = 1000
  total_steps: int = 1000000
  eval_steps: int = -1
  eval_interval: Optional[int] = None


@dataclasses.dataclass
class OptimizerParams(optimization.OptimizationConfig):
  """Optimizer parameters for MobileBERT-EdgeTPU."""
  optimizer: optimization.OptimizerConfig = optimization.OptimizerConfig(
      type='adamw',
      adamw=optimization.AdamWeightDecayConfig(
          weight_decay_rate=0.01,
          exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias']))
  learning_rate: optimization.LrConfig = optimization.LrConfig(
      type='polynomial',
      polynomial=optimization.PolynomialLrConfig(
          initial_learning_rate=1e-4,
          decay_steps=1000000,
          end_learning_rate=0.0))
  warmup: optimization.WarmupConfig = optimization.WarmupConfig(
      type='polynomial',
      polynomial=optimization.PolynomialWarmupConfig(warmup_steps=10000))
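
# The function below is an illustrative sketch, not part of the original config
# module: it shows one common way an `OptimizerParams` instance is turned into
# a concrete learning-rate schedule and Keras optimizer, assuming the standard
# `OptimizerFactory` API from `official.modeling.optimization`. The actual
# trainer wiring for MobileBERT-EdgeTPU may differ.
def build_optimizer_sketch(params: OptimizerParams):
  """Builds (learning_rate, optimizer) from `params` (illustrative only)."""
  opt_factory = optimization.OptimizerFactory(params)
  # Composes the polynomial decay schedule with the polynomial warmup config.
  learning_rate = opt_factory.build_learning_rate()
  optimizer = opt_factory.build_optimizer(learning_rate)
  return learning_rate, optimizer
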

@dataclasses.dataclass
class RuntimeParams(base_config.Config):
  """Parameters that set up the training runtime.

  TODO(longy): Can reuse the Runtime config in:
  official/core/config_definitions.py

  Attributes:
    distribution_strategy: The Keras distribution strategy to use.
    num_gpus: The number of GPUs to use for training.
    all_reduce_alg: The all-reduce algorithm to use for the distribution
      strategy.
    num_workers: The number of parallel workers.
    tpu_address: The BNS address of the TPU to use.
    use_gpu: Whether to use GPU.
    use_tpu: Whether to use TPU.
  """
  distribution_strategy: str = 'off'
  num_gpus: Optional[int] = 0
  all_reduce_alg: Optional[str] = None
  num_workers: int = 1
  tpu_address: str = ''
  use_gpu: Optional[bool] = None
  use_tpu: Optional[bool] = None


@dataclasses.dataclass
class LayerWiseDistillationParams(base_config.Config):
  """Defines the behavior of layer-wise distillation.

  Layer-wise distillation is an optional step in which knowledge is
  transferred layer by layer across all the transformer layers. The
  end-to-end distillation is performed after layer-wise distillation if the
  number of layer-wise distillation steps is not zero.
  """
  num_steps: int = 10000
  warmup_steps: int = 10000
  initial_learning_rate: float = 1.5e-3
  end_learning_rate: float = 1.5e-3
  decay_steps: int = 10000
  hidden_distill_factor: float = 100.0
  beta_distill_factor: float = 5000.0
  gamma_distill_factor: float = 5.0
  attention_distill_factor: float = 1.0


@dataclasses.dataclass
class EndToEndDistillationParams(base_config.Config):
  """Defines the behavior of the end-to-end pretrainer distillation."""
  num_steps: int = 580000
  warmup_steps: int = 20000
  initial_learning_rate: float = 1.5e-3
  end_learning_rate: float = 1.5e-7
  decay_steps: int = 580000
  distill_ground_truth_ratio: float = 0.5


@dataclasses.dataclass
class EdgeTPUBERTCustomParams(base_config.Config):
  """EdgeTPU-BERT custom params.

  Attributes:
    train_dataset: An instance of the DatasetParams.
    eval_dataset: An instance of the DatasetParams.
    teacher_model: An instance of the PretrainerModelParams. If None, the
      student model is trained independently without distillation.
    student_model: An instance of the PretrainerModelParams.
    teacher_model_init_checkpoint: Path to the teacher model init checkpoint.
    student_model_init_checkpoint: Path to the student model init checkpoint.
    layer_wise_distillation: Distillation config for the layer-wise step.
    end_to_end_distillation: Distillation config for the end-to-end step.
    optimizer: An instance of the OptimizerParams.
    runtime: An instance of the RuntimeParams.
    orbit_config: An instance of the OrbitParams.
  """
  train_dataset: DatasetParams = DatasetParams()
  eval_dataset: DatasetParams = DatasetParams()
  teacher_model: Optional[PretrainerModelParams] = PretrainerModelParams()
  student_model: PretrainerModelParams = PretrainerModelParams()
  teacher_model_init_checkpoint: str = ''
  student_model_init_checkpoint: str = ''
  layer_wise_distillation: LayerWiseDistillationParams = (
      LayerWiseDistillationParams())
  end_to_end_distillation: EndToEndDistillationParams = (
      EndToEndDistillationParams())
  optimizer: OptimizerParams = OptimizerParams()
  runtime: RuntimeParams = RuntimeParams()
  orbit_config: OrbitParams = OrbitParams()
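
# The function below is an illustrative sketch, not part of the original config
# module: it shows how an experiment config can be constructed and then
# customized with a nested dict via the `override()` method provided by
# `base_config.Config`. The override values here are hypothetical examples,
# not recommended training settings.
def example_custom_params() -> EdgeTPUBERTCustomParams:
  """Returns an `EdgeTPUBERTCustomParams` with a few fields overridden."""
  params = EdgeTPUBERTCustomParams()
  params.override(
      {
          'orbit_config': {
              'total_steps': 500000,
              'steps_per_loop': 500,
          },
          'runtime': {
              'distribution_strategy': 'tpu',
              'tpu_address': 'local',
          },
      },
      is_strict=True)
  return params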