Unverified Commit 7a45b513 authored by Vishnu Banna, committed by GitHub

Merge branch 'tensorflow:master' into exp_pr2

parents 54115e16 12bbefce
@@ -27,7 +27,7 @@ task:
       intermediate_size: 3072
       max_position_embeddings: 512
       num_attention_heads: 12
-      num_layers: 6
+      num_layers: 12
       type_vocab_size: 2
       vocab_size: 30522
   train_data:
@@ -39,6 +39,7 @@ task:
     seq_length: 512
     use_next_sentence_label: false
     use_position_id: false
+    cycle_length: 8
   validation_data:
     drop_remainder: true
     global_batch_size: 256
@@ -39,6 +39,7 @@ task:
     seq_length: 512
     use_next_sentence_label: false
     use_position_id: false
+    cycle_length: 8
   validation_data:
     drop_remainder: true
     global_batch_size: 256
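
The `cycle_length: 8` added in the two hunks above controls how many input
files the pretraining dataloader reads from in parallel. A minimal sketch of
the corresponding `tf.data` behavior is shown below; the file list is a
hypothetical placeholder, not the actual training data.

```python
import tensorflow as tf

# Sketch of what `cycle_length` controls in a typical tf.data input pipeline:
# the number of TFRecord shards that are interleaved (read from) concurrently.
file_names = [
    f'gs://my-bucket/pretrain/wikipedia.tfrecord-{i:05d}' for i in range(500)
]
dataset = (
    tf.data.Dataset.from_tensor_slices(file_names)
    .interleave(
        tf.data.TFRecordDataset,
        cycle_length=8,  # matches the value added in the config
        num_parallel_calls=tf.data.AUTOTUNE))
```
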
@@ -51,9 +51,7 @@ class TeamsPretrainerConfig(base_config.Config):
 @gin.configurable
-def get_encoder(bert_config,
-                embedding_network=None,
-                hidden_layers=layers.Transformer):
+def get_encoder(bert_config, embedding_network=None, hidden_layers=None):
   """Gets a 'EncoderScaffold' object.
   Args:
@@ -85,7 +83,9 @@ def get_encoder(bert_config,
           stddev=bert_config.initializer_range),
   )
   if embedding_network is None:
-    embedding_network = networks.PackedSequenceEmbedding(**embedding_cfg)
+    embedding_network = networks.PackedSequenceEmbedding
+  if hidden_layers is None:
+    hidden_layers = layers.Transformer
   kwargs = dict(
       embedding_cfg=embedding_cfg,
       embedding_cls=embedding_network,
# MobileBERT-EdgeTPU
<figure align="center">
  <img width="70%" src="https://storage.googleapis.com/tf_model_garden/models/edgetpu/images/readme-mobilebert.png">
<figcaption>Performance of MobileBERT-EdgeTPU models on the SQuAD v1.1 dataset.</figcaption>
</figure>

Note: For the MobileBERT baseline float model, NNAPI delegates part of the
compute ops to the CPU, which makes the latency much higher.

Note: The accuracy numbers for BERT_base and BERT_large are taken from the
[original training results](https://arxiv.org/abs/1810.04805). These models are
too large to feasibly run on device.

Deploying low-latency, high-quality transformer-based language models on device
is highly desirable and can benefit multiple applications such as automatic
speech recognition (ASR), translation, sentence auto-completion, and even some
vision tasks. By co-designing the neural networks with the Edge TPU hardware
accelerator in the Google Tensor SoC, we have built EdgeTPU-customized
MobileBERT models that reach datacenter model quality while beating the
baseline MobileBERT's latency.

We set up our model architecture search space based on
[MobileBERT](https://arxiv.org/abs/2004.02984) and leverage AutoML algorithms
to find models with up to 2x better hardware utilization. The higher
utilization lets us bring larger, more accurate models on chip while the models
still beat the baseline MobileBERT's latency. We built a customized
distillation training pipeline and performed an exhaustive hyperparameter
search (e.g. learning rate, dropout ratio) to achieve the best accuracy. As
shown in the figure above, the quantized MobileBERT-EdgeTPU models establish a
new Pareto frontier for the question-answering task and also exceed the
accuracy of the float BERT_base model, which is over 400 MB and too large to
run on edge devices.

We also observed that, unlike most vision models, accuracy drops significantly
for MobileBERT/MobileBERT-EdgeTPU with plain post-training quantization (PTQ)
or quantization-aware training (QAT). Proper model modifications, such as
clipping the mask value, are necessary to retain the accuracy of the quantized
model. Therefore, as an alternative to the quantized models, we also provide a
set of Edge TPU-friendly float models, which still produce a marginally better
roofline than the baseline quantized MobileBERT. Notably, the float
MobileBERT-EdgeTPU-M model yields accuracy close to that of BERT_large, which
has a 1.3 GB model size in float precision. Quantization thus becomes an
optional optimization rather than a prerequisite, which can unblock use cases
where quantization is infeasible or introduces a large accuracy drop, and can
potentially reduce time-to-market.
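
The mask-value clipping mentioned above can be illustrated with a small,
hypothetical sketch (names and values below are assumptions, not the project's
actual implementation): instead of adding a very large negative constant to the
padded attention logits, the additive mask is clamped to a modest finite value
so the quantizer does not have to cover an enormous dynamic range.

```python
import tensorflow as tf

# Hypothetical illustration of the "clip the mask value" idea. A typical float
# model adds -10000 to masked attention logits; for quantization, clipping the
# mask to a smaller magnitude keeps the logits in a quantization-friendly range.
_MASK_VALUE = -10000.0        # common float-model mask value
_CLIPPED_MASK_VALUE = -20.0   # assumed quantization-friendly value


def additive_attention_mask(input_mask: tf.Tensor,
                            quantization_friendly: bool = True) -> tf.Tensor:
  """Builds an additive mask from a [batch, seq_len] 0/1 input mask."""
  mask_value = _CLIPPED_MASK_VALUE if quantization_friendly else _MASK_VALUE
  # Broadcast to [batch, 1, seq_len] so it can be added to attention logits.
  return (1.0 - tf.cast(input_mask[:, tf.newaxis, :], tf.float32)) * mask_value
```
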
## Pre-trained Models
Model name | # Parameters | # Ops | MLM | Checkpoint | TFhub link
--------------------- | :----------: | :----: | :---: | :---: | :--------:
MobileBERT-EdgeTPU-M | 50.9M | 18.8e9 | 73.8% | WIP | WIP
MobileBERT-EdgeTPU-S | 38.3M | 14.0e9 | 72.8% | WIP | WIP
MobileBERT-EdgeTPU-XS | 27.1M | 9.4e9 | 71.2% | WIP | WIP
### Restoring from Checkpoints
To load the pre-trained MobileBERT checkpoint in your code, please follow the
example below or check the `serving/export_tflite_squad` module:
```python
import tensorflow as tf

# Note: `model_builder` is assumed to live alongside `params` in this project.
from official.nlp.projects.mobilebert_edgetpu import model_builder
from official.nlp.projects.mobilebert_edgetpu import params

bert_config_file = ...
model_checkpoint_path = ...

# Set up experiment params and load the configs from file/files.
experiment_params = params.EdgeTPUBERTCustomParams()

# Change the input mask type to tf.float32 to avoid an additional casting op.
experiment_params.student_model.encoder.mobilebert.input_mask_dtype = 'float32'

pretrainer_model = model_builder.build_bert_pretrainer(
    experiment_params.student_model,
    name='pretrainer',
    quantization_friendly=True)

checkpoint_dict = {'model': pretrainer_model}
checkpoint = tf.train.Checkpoint(**checkpoint_dict)
checkpoint.restore(model_checkpoint_path).assert_existing_objects_matched()
```
### Use TF-Hub models
TODO(longy): Update with instructions to use tf-hub models
@@ -12,8 +12,3 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Masked language model network."""
-from official.nlp.modeling import layers
-MaskedLM = layers.MaskedLM
@@ -12,5 +12,3 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Keras-NLP layers package definition."""
-from official.nlp.keras_nlp.encoders.bert_encoder import BertEncoder
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Datastructures for all the configurations for MobileBERT-EdgeTPU training."""
import dataclasses
from typing import Optional
from official.modeling import optimization
from official.modeling.hyperparams import base_config
from official.nlp.configs import bert
from official.nlp.data import pretrain_dataloader
DatasetParams = pretrain_dataloader.BertPretrainDataConfig
PretrainerModelParams = bert.PretrainerConfig


@dataclasses.dataclass
class OrbitParams(base_config.Config):
  """Parameters that set up the Orbit training/evaluation pipeline.

  Attributes:
    mode: Orbit controller mode, can be 'train', 'train_and_evaluate', or
      'evaluate'.
    steps_per_loop: The number of steps to run in each inner loop of training.
    total_steps: The global step count to train up to.
    eval_steps: The number of steps to run during an evaluation. If -1, this
      method will evaluate over the entire evaluation dataset.
    eval_interval: The number of training steps to run between evaluations. If
      set, training will always stop every `eval_interval` steps, even if this
      results in a shorter inner loop than specified by the `steps_per_loop`
      setting. If None, evaluation will only be performed after training is
      complete.
  """
  mode: str = 'train'
  steps_per_loop: int = 1000
  total_steps: int = 1000000
  eval_steps: int = -1
  eval_interval: Optional[int] = None
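
A quick, hypothetical example of overriding these Orbit settings (the values
are illustrative only): with `eval_interval` set, the controller pauses every
`eval_interval` training steps and runs an evaluation of `eval_steps` batches.

```python
# Minimal sketch, assuming OrbitParams as defined above; values are
# illustrative, not the project's defaults.
orbit_params = OrbitParams(
    mode='train_and_evaluate',
    steps_per_loop=1000,
    total_steps=825000,
    eval_steps=-1,        # evaluate over the full evaluation dataset
    eval_interval=1000)   # stop and evaluate every 1000 training steps
```
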


@dataclasses.dataclass
class OptimizerParams(optimization.OptimizationConfig):
  """Optimizer parameters for MobileBERT-EdgeTPU."""
  optimizer: optimization.OptimizerConfig = optimization.OptimizerConfig(
      type='adamw',
      adamw=optimization.AdamWeightDecayConfig(
          weight_decay_rate=0.01,
          exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias']))
  learning_rate: optimization.LrConfig = optimization.LrConfig(
      type='polynomial',
      polynomial=optimization.PolynomialLrConfig(
          initial_learning_rate=1e-4,
          decay_steps=1000000,
          end_learning_rate=0.0))
  warmup: optimization.WarmupConfig = optimization.WarmupConfig(
      type='polynomial',
      polynomial=optimization.PolynomialWarmupConfig(warmup_steps=10000))
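
When this config is consumed in the usual Model Garden way, the optimizer and
learning-rate schedule can be built from it with
`optimization.OptimizerFactory`; the sketch below assumes that factory API and
the `OptimizerParams` defaults defined above.

```python
from official.modeling import optimization

# Sketch: build a Keras optimizer from the OptimizerParams defaults above,
# assuming the standard Model Garden OptimizerFactory API.
opt_config = OptimizerParams()
opt_factory = optimization.OptimizerFactory(opt_config)
learning_rate = opt_factory.build_learning_rate()       # polynomial decay + warmup
optimizer = opt_factory.build_optimizer(learning_rate)  # AdamW with weight decay
```
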


@dataclasses.dataclass
class RuntimeParams(base_config.Config):
  """Parameters that set up the training runtime.

  TODO(longy): Can reuse the Runtime Config in:
  official/core/config_definitions.py

  Attributes:
    distribution_strategy: The Keras distribution strategy to use (e.g. 'off',
      'mirrored', or 'tpu').
    use_gpu: Whether to use GPU.
    use_tpu: Whether to use TPU.
    num_gpus: Number of GPUs to use for training.
    all_reduce_alg: Optional all-reduce algorithm for multi-GPU training.
    num_workers: Number of parallel workers.
    tpu_address: The bns address of the TPU to use.
  """
  distribution_strategy: str = 'off'
  num_gpus: Optional[int] = 0
  all_reduce_alg: Optional[str] = None
  num_workers: int = 1
  tpu_address: str = ''
  use_gpu: Optional[bool] = None
  use_tpu: Optional[bool] = None


@dataclasses.dataclass
class LayerWiseDistillationParams(base_config.Config):
  """Defines the behavior of layer-wise distillation.

  Layer-wise distillation is an optional step in which knowledge is transferred
  layer by layer across all the transformer layers. End-to-end distillation is
  performed after layer-wise distillation if the number of layer-wise
  distillation steps is not zero.
  """
  num_steps: int = 10000
  warmup_steps: int = 10000
  initial_learning_rate: float = 1.5e-3
  end_learning_rate: float = 1.5e-3
  decay_steps: int = 10000
  hidden_distill_factor: float = 100.0
  beta_distill_factor: float = 5000.0
  gamma_distill_factor: float = 5.0
  attention_distill_factor: float = 1.0


@dataclasses.dataclass
class EndToEndDistillationParams(base_config.Config):
  """Defines the behavior of end-to-end pretrainer distillation."""
  num_steps: int = 580000
  warmup_steps: int = 20000
  initial_learning_rate: float = 1.5e-3
  end_learning_rate: float = 1.5e-7
  decay_steps: int = 580000
  distill_ground_truth_ratio: float = 0.5
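
The `distill_ground_truth_ratio` blends the distillation target with the
ground-truth labels during end-to-end distillation. The sketch below is only an
illustration of what the ratio controls; the loss functions and tensor names
are assumptions, not the pipeline's actual code.

```python
import tensorflow as tf


def blended_mlm_loss(student_logits: tf.Tensor,
                     teacher_logits: tf.Tensor,
                     labels: tf.Tensor,
                     distill_ground_truth_ratio: float = 0.5) -> tf.Tensor:
  """Hypothetical mix of distillation and ground-truth losses for the MLM head."""
  # Soft-target distillation loss against the teacher's predicted distribution.
  distill_loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
          labels=tf.nn.softmax(teacher_logits), logits=student_logits))
  # Standard masked-LM loss against the ground-truth token ids.
  ground_truth_loss = tf.reduce_mean(
      tf.nn.sparse_softmax_cross_entropy_with_logits(
          labels=labels, logits=student_logits))
  ratio = distill_ground_truth_ratio
  return ratio * distill_loss + (1.0 - ratio) * ground_truth_loss
```
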


@dataclasses.dataclass
class EdgeTPUBERTCustomParams(base_config.Config):
  """EdgeTPU-BERT custom params.

  Attributes:
    train_dataset: An instance of the DatasetParams.
    eval_dataset: An instance of the DatasetParams.
    teacher_model: An instance of the PretrainerModelParams. If None, the
      student model is trained independently without distillation.
    student_model: An instance of the PretrainerModelParams.
    teacher_model_init_checkpoint: Path to the teacher model init checkpoint.
    student_model_init_checkpoint: Path to the student model init checkpoint.
    layer_wise_distillation: Distillation config for the layer-wise step.
    end_to_end_distillation: Distillation config for the end-to-end step.
    optimizer: An instance of the OptimizerParams.
    runtime: An instance of the RuntimeParams.
    orbit_config: An instance of the OrbitParams.
  """
  train_dataset: DatasetParams = DatasetParams()
  eval_dataset: DatasetParams = DatasetParams()
  teacher_model: Optional[PretrainerModelParams] = PretrainerModelParams()
  student_model: PretrainerModelParams = PretrainerModelParams()
  teacher_model_init_checkpoint: str = ''
  student_model_init_checkpoint: str = ''
  layer_wise_distillation: LayerWiseDistillationParams = (
      LayerWiseDistillationParams())
  end_to_end_distillation: EndToEndDistillationParams = (
      EndToEndDistillationParams())
  optimizer: OptimizerParams = OptimizerParams()
  runtime: RuntimeParams = RuntimeParams()
  orbit_config: OrbitParams = OrbitParams()
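
The distillation pretraining YAML files later in this change override these
defaults. A minimal sketch of applying such an override, assuming the standard
`override()` behavior of `base_config.Config` (the file name is hypothetical):

```python
import yaml

# Start from the defaults and apply a YAML experiment file on top.
experiment_params = EdgeTPUBERTCustomParams()
with open('mobilebert_edgetpu_m_pretrain.yaml') as f:  # hypothetical file name
  experiment_params.override(yaml.safe_load(f))
print(experiment_params.orbit_config.total_steps)
```
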
task:
# hub_module_url: 'gs://**/panzf/mobilebert/tfhub/'
init_checkpoint: 'gs://**/edgetpu_bert/edgetpu_bert_float_candidate_13_e2e_820k/exported_ckpt/'
model:
num_classes: 3
metric_type: 'accuracy'
train_data:
drop_remainder: true
global_batch_size: 32
input_path: gs://**/yo/bert/glue/tfrecords/MNLI/MNLI_matched_train.tf_record
is_training: true
seq_length: 128
label_type: 'int'
validation_data:
drop_remainder: false
global_batch_size: 32
input_path: gs://**/yo/bert/glue/tfrecords/MNLI/MNLI_matched_eval.tf_record
is_training: false
seq_length: 128
label_type: 'int'
trainer:
checkpoint_interval: 10000
optimizer_config:
learning_rate:
polynomial:
# 100% of train_steps.
decay_steps: 50000
end_learning_rate: 0.0
initial_learning_rate: 3.0e-05
power: 1.0
type: polynomial
optimizer:
type: adamw
warmup:
polynomial:
power: 1
# ~10% of train_steps.
warmup_steps: 5000
type: polynomial
steps_per_loop: 1000
summary_interval: 1000
# Training data size 392,702 examples, 8 epochs.
train_steps: 50000
validation_interval: 2000
# Eval data size = 9815 examples.
validation_steps: 307
best_checkpoint_export_subdir: 'best_ckpt'
best_checkpoint_eval_metric: 'cls_accuracy'
best_checkpoint_metric_comp: 'higher'
# MobileBERT model from https://arxiv.org/abs/2004.02984.
task:
model:
encoder:
type: mobilebert
mobilebert:
word_vocab_size: 30522
word_embed_size: 128
type_vocab_size: 2
max_sequence_length: 512
num_blocks: 24
hidden_size: 512
num_attention_heads: 4
intermediate_size: 512
hidden_activation: relu
hidden_dropout_prob: 0.0
attention_probs_dropout_prob: 0.1
intra_bottleneck_size: 128
initializer_range: 0.02
key_query_shared_bottleneck: true
num_feedforward_networks: 4
normalization_type: no_norm
classifier_activation: false
# MobileBERT-EdgeTPU model.
task:
model:
encoder:
type: mobilebert
mobilebert:
word_vocab_size: 30522
word_embed_size: 128
type_vocab_size: 2
max_sequence_length: 512
num_blocks: 12
hidden_size: 512
num_attention_heads: 4
intermediate_size: 1024
hidden_activation: relu
hidden_dropout_prob: 0.1
attention_probs_dropout_prob: 0.1
intra_bottleneck_size: 256
initializer_range: 0.02
key_query_shared_bottleneck: true
num_feedforward_networks: 6
normalization_type: no_norm
classifier_activation: false
# MobileBERT-EdgeTPU-S model.
task:
model:
encoder:
type: mobilebert
mobilebert:
word_vocab_size: 30522
word_embed_size: 128
type_vocab_size: 2
max_sequence_length: 512
num_blocks: 12
hidden_size: 512
num_attention_heads: 4
intermediate_size: 1024
hidden_activation: relu
hidden_dropout_prob: 0.1
attention_probs_dropout_prob: 0.1
intra_bottleneck_size: 256
initializer_range: 0.02
key_query_shared_bottleneck: true
num_feedforward_networks: 4
normalization_type: no_norm
classifier_activation: false
# MobileBERT-EdgeTPU-XS model.
task:
model:
encoder:
type: mobilebert
mobilebert:
word_vocab_size: 30522
word_embed_size: 128
type_vocab_size: 2
max_sequence_length: 512
num_blocks: 8
hidden_size: 512
num_attention_heads: 4
intermediate_size: 1024
hidden_activation: relu
hidden_dropout_prob: 0.1
attention_probs_dropout_prob: 0.1
intra_bottleneck_size: 256
initializer_range: 0.02
key_query_shared_bottleneck: true
num_feedforward_networks: 4
normalization_type: no_norm
classifier_activation: false
task:
# hub_module_url: 'gs://**/panzf/mobilebert/tfhub/'
max_answer_length: 30
n_best_size: 20
null_score_diff_threshold: 0.0
init_checkpoint: 'gs://**/edgetpu_bert/edgetpu_bert_float_candidate_13_e2e_820k/exported_ckpt/'
train_data:
drop_remainder: true
global_batch_size: 32
input_path: gs://**/tp/bert/squad_v1.1/train.tf_record
is_training: true
seq_length: 384
validation_data:
do_lower_case: true
doc_stride: 128
drop_remainder: false
global_batch_size: 48
input_path: gs://**/squad/dev-v1.1.json
is_training: false
query_length: 64
seq_length: 384
tokenization: WordPiece
version_2_with_negative: false
vocab_file: gs://**/panzf/ttl-30d/mobilebert/tf2_checkpoint/vocab.txt
trainer:
checkpoint_interval: 1000
max_to_keep: 5
optimizer_config:
learning_rate:
polynomial:
decay_steps: 19420
end_learning_rate: 0.0
initial_learning_rate: 8.0e-05
power: 1.0
type: polynomial
optimizer:
type: adamw
warmup:
polynomial:
power: 1
# 10% of total training steps
warmup_steps: 1942
type: polynomial
steps_per_loop: 1000
summary_interval: 1000
# 7 epochs for training
train_steps: 19420
validation_interval: 3000
validation_steps: 226
best_checkpoint_export_subdir: 'best_ckpt'
best_checkpoint_eval_metric: 'final_f1'
best_checkpoint_metric_comp: 'higher'
# Distillation pretraining for Mobilebert.
# The final MLM accuracy is around 70.8% for e2e only training and 71.4% for layer-wise + e2e.
layer_wise_distillation:
num_steps: 10000
warmup_steps: 0
initial_learning_rate: 1.5e-3
end_learning_rate: 1.5e-3
decay_steps: 10000
end_to_end_distillation:
num_steps: 585000
warmup_steps: 20000
initial_learning_rate: 1.5e-3
end_learning_rate: 1.5e-7
decay_steps: 585000
distill_ground_truth_ratio: 0.5
optimizer:
optimizer:
lamb:
beta_1: 0.9
beta_2: 0.999
clipnorm: 1.0
epsilon: 1.0e-06
exclude_from_layer_adaptation: null
exclude_from_weight_decay: ['LayerNorm', 'bias', 'norm']
global_clipnorm: null
name: LAMB
weight_decay_rate: 0.01
type: lamb
orbit_config:
eval_interval: 1000
eval_steps: -1
mode: train
steps_per_loop: 1000
total_steps: 825000
runtime:
distribution_strategy: 'tpu'
student_model:
cls_heads: [{'activation': 'tanh',
'cls_token_idx': 0,
'dropout_rate': 0.0,
'inner_dim': 512,
'name': 'next_sentence',
'num_classes': 2}]
encoder:
mobilebert:
attention_probs_dropout_prob: 0.1
classifier_activation: false
hidden_activation: relu
hidden_dropout_prob: 0.0
hidden_size: 512
initializer_range: 0.02
input_mask_dtype: int32
intermediate_size: 512
intra_bottleneck_size: 128
key_query_shared_bottleneck: true
max_sequence_length: 512
normalization_type: no_norm
num_attention_heads: 4
num_blocks: 24
num_feedforward_networks: 4
type_vocab_size: 2
use_bottleneck_attention: false
word_embed_size: 128
word_vocab_size: 30522
type: mobilebert
mlm_activation: relu
mlm_initializer_range: 0.02
teacher_model:
cls_heads: []
encoder:
mobilebert:
attention_probs_dropout_prob: 0.1
classifier_activation: false
hidden_activation: gelu
hidden_dropout_prob: 0.1
hidden_size: 512
initializer_range: 0.02
input_mask_dtype: int32
intermediate_size: 4096
intra_bottleneck_size: 1024
key_query_shared_bottleneck: false
max_sequence_length: 512
normalization_type: layer_norm
num_attention_heads: 4
num_blocks: 24
num_feedforward_networks: 1
type_vocab_size: 2
use_bottleneck_attention: false
word_embed_size: 128
word_vocab_size: 30522
type: mobilebert
mlm_activation: gelu
mlm_initializer_range: 0.02
teacher_model_init_checkpoint: gs://**/uncased_L-24_H-1024_B-512_A-4_teacher/tf2_checkpoint/bert_model.ckpt-1
student_model_init_checkpoint: ''
train_dataset:
block_length: 1
cache: false
cycle_length: null
deterministic: null
drop_remainder: true
enable_tf_data_service: false
global_batch_size: 2048
input_path: gs://**/seq_512_mask_20/wikipedia.tfrecord*,gs://**/seq_512_mask_20/books.tfrecord*
is_training: true
max_predictions_per_seq: 20
seq_length: 512
sharding: true
shuffle_buffer_size: 100
tf_data_service_address: null
tf_data_service_job_name: null
tfds_as_supervised: false
tfds_data_dir: ''
tfds_name: ''
tfds_skip_decoding_feature: ''
tfds_split: ''
use_next_sentence_label: true
use_position_id: false
use_v2_feature_names: false
eval_dataset:
block_length: 1
cache: false
cycle_length: null
deterministic: null
drop_remainder: true
enable_tf_data_service: false
global_batch_size: 2048
input_path: gs://**/seq_512_mask_20/wikipedia.tfrecord-00141-of-00500,gs://**/seq_512_mask_20/books.tfrecord-00141-of-00500
is_training: false
max_predictions_per_seq: 20
seq_length: 512
sharding: true
shuffle_buffer_size: 100
tf_data_service_address: null
tf_data_service_job_name: null
tfds_as_supervised: false
tfds_data_dir: ''
tfds_name: ''
tfds_skip_decoding_feature: ''
tfds_split: ''
use_next_sentence_label: true
use_position_id: false
use_v2_feature_names: false
layer_wise_distillation:
num_steps: 20000
warmup_steps: 0
initial_learning_rate: 1.5e-3
end_learning_rate: 1.5e-3
decay_steps: 20000
end_to_end_distillation:
num_steps: 585000
warmup_steps: 20000
initial_learning_rate: 1.5e-3
end_learning_rate: 1.5e-7
decay_steps: 585000
distill_ground_truth_ratio: 0.5
optimizer:
optimizer:
lamb:
beta_1: 0.9
beta_2: 0.999
clipnorm: 1.0
epsilon: 1.0e-06
exclude_from_layer_adaptation: null
exclude_from_weight_decay: ['LayerNorm', 'bias', 'norm']
global_clipnorm: null
name: LAMB
weight_decay_rate: 0.01
type: lamb
orbit_config:
eval_interval: 1000
eval_steps: -1
mode: train
steps_per_loop: 1000
total_steps: 825000
runtime:
distribution_strategy: 'tpu'
student_model:
cls_heads: [{'activation': 'tanh',
'cls_token_idx': 0,
'dropout_rate': 0.0,
'inner_dim': 512,
'name': 'next_sentence',
'num_classes': 2}]
encoder:
mobilebert:
attention_probs_dropout_prob: 0.1
classifier_activation: false
hidden_activation: relu
hidden_dropout_prob: 0.0
hidden_size: 512
initializer_range: 0.02
input_mask_dtype: int32
intermediate_size: 1024
intra_bottleneck_size: 256
key_query_shared_bottleneck: true
max_sequence_length: 512
normalization_type: no_norm
num_attention_heads: 4
num_blocks: 12
num_feedforward_networks: 6
type_vocab_size: 2
use_bottleneck_attention: false
word_embed_size: 128
word_vocab_size: 30522
type: mobilebert
mlm_activation: relu
mlm_initializer_range: 0.02
teacher_model:
cls_heads: []
encoder:
mobilebert:
attention_probs_dropout_prob: 0.1
classifier_activation: false
hidden_activation: gelu
hidden_dropout_prob: 0.1
hidden_size: 512
initializer_range: 0.02
input_mask_dtype: int32
intermediate_size: 4096
intra_bottleneck_size: 1024
key_query_shared_bottleneck: false
max_sequence_length: 512
normalization_type: layer_norm
num_attention_heads: 4
num_blocks: 24
num_feedforward_networks: 1
type_vocab_size: 2
use_bottleneck_attention: false
word_embed_size: 128
word_vocab_size: 30522
type: mobilebert
mlm_activation: gelu
mlm_initializer_range: 0.02
teacher_model_init_checkpoint: gs://**/uncased_L-24_H-1024_B-512_A-4_teacher/tf2_checkpoint/bert_model.ckpt-1
student_model_init_checkpoint: ''
train_dataset:
block_length: 1
cache: false
cycle_length: null
deterministic: null
drop_remainder: true
enable_tf_data_service: false
global_batch_size: 2048
input_path: gs://**/seq_512_mask_20/wikipedia.tfrecord*,gs://**/seq_512_mask_20/books.tfrecord*
is_training: true
max_predictions_per_seq: 20
seq_length: 512
sharding: true
shuffle_buffer_size: 100
tf_data_service_address: null
tf_data_service_job_name: null
tfds_as_supervised: false
tfds_data_dir: ''
tfds_name: ''
tfds_skip_decoding_feature: ''
tfds_split: ''
use_next_sentence_label: true
use_position_id: false
use_v2_feature_names: false
eval_dataset:
block_length: 1
cache: false
cycle_length: null
deterministic: null
drop_remainder: true
enable_tf_data_service: false
global_batch_size: 2048
input_path: gs://**/seq_512_mask_20/wikipedia.tfrecord-00141-of-00500,gs://**/seq_512_mask_20/books.tfrecord-00141-of-00500
is_training: false
max_predictions_per_seq: 20
seq_length: 512
sharding: true
shuffle_buffer_size: 100
tf_data_service_address: null
tf_data_service_job_name: null
tfds_as_supervised: false
tfds_data_dir: ''
tfds_name: ''
tfds_skip_decoding_feature: ''
tfds_split: ''
use_next_sentence_label: true
use_position_id: false
use_v2_feature_names: false
layer_wise_distillation:
num_steps: 20000
warmup_steps: 0
initial_learning_rate: 1.5e-3
end_learning_rate: 1.5e-3
decay_steps: 20000
end_to_end_distillation:
num_steps: 585000
warmup_steps: 20000
initial_learning_rate: 1.5e-3
end_learning_rate: 1.5e-7
decay_steps: 585000
distill_ground_truth_ratio: 0.5
optimizer:
optimizer:
lamb:
beta_1: 0.9
beta_2: 0.999
clipnorm: 1.0
epsilon: 1.0e-06
exclude_from_layer_adaptation: null
exclude_from_weight_decay: ['LayerNorm', 'bias', 'norm']
global_clipnorm: null
name: LAMB
weight_decay_rate: 0.01
type: lamb
orbit_config:
eval_interval: 1000
eval_steps: -1
mode: train
steps_per_loop: 1000
total_steps: 825000
runtime:
distribution_strategy: 'tpu'
student_model:
cls_heads: [{'activation': 'tanh',
'cls_token_idx': 0,
'dropout_rate': 0.0,
'inner_dim': 512,
'name': 'next_sentence',
'num_classes': 2}]
encoder:
mobilebert:
attention_probs_dropout_prob: 0.1
classifier_activation: false
hidden_activation: relu
hidden_dropout_prob: 0.0
hidden_size: 512
initializer_range: 0.02
input_mask_dtype: int32
intermediate_size: 1024
intra_bottleneck_size: 256
key_query_shared_bottleneck: true
max_sequence_length: 512
normalization_type: no_norm
num_attention_heads: 4
num_blocks: 12
num_feedforward_networks: 4
type_vocab_size: 2
use_bottleneck_attention: false
word_embed_size: 128
word_vocab_size: 30522
type: mobilebert
mlm_activation: relu
mlm_initializer_range: 0.02
teacher_model:
cls_heads: []
encoder:
mobilebert:
attention_probs_dropout_prob: 0.1
classifier_activation: false
hidden_activation: gelu
hidden_dropout_prob: 0.1
hidden_size: 512
initializer_range: 0.02
input_mask_dtype: int32
intermediate_size: 4096
intra_bottleneck_size: 1024
key_query_shared_bottleneck: false
max_sequence_length: 512
normalization_type: layer_norm
num_attention_heads: 4
num_blocks: 24
num_feedforward_networks: 1
type_vocab_size: 2
use_bottleneck_attention: false
word_embed_size: 128
word_vocab_size: 30522
type: mobilebert
mlm_activation: gelu
mlm_initializer_range: 0.02
teacher_model_init_checkpoint: gs://**/uncased_L-24_H-1024_B-512_A-4_teacher/tf2_checkpoint/bert_model.ckpt-1
student_model_init_checkpoint: ''
train_dataset:
block_length: 1
cache: false
cycle_length: null
deterministic: null
drop_remainder: true
enable_tf_data_service: false
global_batch_size: 2048
input_path: gs://**/seq_512_mask_20/wikipedia.tfrecord*,gs://**/seq_512_mask_20/books.tfrecord*
is_training: true
max_predictions_per_seq: 20
seq_length: 512
sharding: true
shuffle_buffer_size: 100
tf_data_service_address: null
tf_data_service_job_name: null
tfds_as_supervised: false
tfds_data_dir: ''
tfds_name: ''
tfds_skip_decoding_feature: ''
tfds_split: ''
use_next_sentence_label: true
use_position_id: false
use_v2_feature_names: false
eval_dataset:
block_length: 1
cache: false
cycle_length: null
deterministic: null
drop_remainder: true
enable_tf_data_service: false
global_batch_size: 2048
input_path: gs://**/seq_512_mask_20/wikipedia.tfrecord-00141-of-00500,gs://**/seq_512_mask_20/books.tfrecord-00141-of-00500
is_training: false
max_predictions_per_seq: 20
seq_length: 512
sharding: true
shuffle_buffer_size: 100
tf_data_service_address: null
tf_data_service_job_name: null
tfds_as_supervised: false
tfds_data_dir: ''
tfds_name: ''
tfds_skip_decoding_feature: ''
tfds_split: ''
use_next_sentence_label: true
use_position_id: false
use_v2_feature_names: false
layer_wise_distillation:
num_steps: 30000
warmup_steps: 0
initial_learning_rate: 1.5e-3
end_learning_rate: 1.5e-3
decay_steps: 30000
end_to_end_distillation:
num_steps: 585000
warmup_steps: 20000
initial_learning_rate: 1.5e-3
end_learning_rate: 1.5e-7
decay_steps: 585000
distill_ground_truth_ratio: 0.5
optimizer:
optimizer:
lamb:
beta_1: 0.9
beta_2: 0.999
clipnorm: 1.0
epsilon: 1.0e-06
exclude_from_layer_adaptation: null
exclude_from_weight_decay: ['LayerNorm', 'bias', 'norm']
global_clipnorm: null
name: LAMB
weight_decay_rate: 0.01
type: lamb
orbit_config:
eval_interval: 1000
eval_steps: -1
mode: train
steps_per_loop: 1000
total_steps: 825000
runtime:
distribution_strategy: 'tpu'
student_model:
cls_heads: [{'activation': 'tanh',
'cls_token_idx': 0,
'dropout_rate': 0.0,
'inner_dim': 512,
'name': 'next_sentence',
'num_classes': 2}]
encoder:
mobilebert:
attention_probs_dropout_prob: 0.1
classifier_activation: false
hidden_activation: relu
hidden_dropout_prob: 0.0
hidden_size: 512
initializer_range: 0.02
input_mask_dtype: int32
intermediate_size: 1024
intra_bottleneck_size: 256
key_query_shared_bottleneck: true
max_sequence_length: 512
normalization_type: no_norm
num_attention_heads: 4
num_blocks: 8
num_feedforward_networks: 4
type_vocab_size: 2
use_bottleneck_attention: false
word_embed_size: 128
word_vocab_size: 30522
type: mobilebert
mlm_activation: relu
mlm_initializer_range: 0.02
teacher_model:
cls_heads: []
encoder:
mobilebert:
attention_probs_dropout_prob: 0.1
classifier_activation: false
hidden_activation: gelu
hidden_dropout_prob: 0.1
hidden_size: 512
initializer_range: 0.02
input_mask_dtype: int32
intermediate_size: 4096
intra_bottleneck_size: 1024
key_query_shared_bottleneck: false
max_sequence_length: 512
normalization_type: layer_norm
num_attention_heads: 4
num_blocks: 24
num_feedforward_networks: 1
type_vocab_size: 2
use_bottleneck_attention: false
word_embed_size: 128
word_vocab_size: 30522
type: mobilebert
mlm_activation: gelu
mlm_initializer_range: 0.02
teacher_model_init_checkpoint: gs://**/uncased_L-24_H-1024_B-512_A-4_teacher/tf2_checkpoint/bert_model.ckpt-1
student_model_init_checkpoint: ''
train_dataset:
block_length: 1
cache: false
cycle_length: null
deterministic: null
drop_remainder: true
enable_tf_data_service: false
global_batch_size: 2048
input_path: gs://**/seq_512_mask_20/wikipedia.tfrecord*,gs://**/seq_512_mask_20/books.tfrecord*
is_training: true
max_predictions_per_seq: 20
seq_length: 512
sharding: true
shuffle_buffer_size: 100
tf_data_service_address: null
tf_data_service_job_name: null
tfds_as_supervised: false
tfds_data_dir: ''
tfds_name: ''
tfds_skip_decoding_feature: ''
tfds_split: ''
use_next_sentence_label: true
use_position_id: false
use_v2_feature_names: false
eval_dataset:
block_length: 1
cache: false
cycle_length: null
deterministic: null
drop_remainder: true
enable_tf_data_service: false
global_batch_size: 2048
input_path: gs://**/seq_512_mask_20/wikipedia.tfrecord-00141-of-00500,gs://**/seq_512_mask_20/books.tfrecord-00141-of-00500
is_training: false
max_predictions_per_seq: 20
seq_length: 512
sharding: true
shuffle_buffer_size: 100
tf_data_service_address: null
tf_data_service_job_name: null
tfds_as_supervised: false
tfds_data_dir: ''
tfds_name: ''
tfds_skip_decoding_feature: ''
tfds_split: ''
use_next_sentence_label: true
use_position_id: false
use_v2_feature_names: false
@@ -12,8 +12,3 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Keras-based one-hot embedding layer."""
-from official.nlp.modeling import layers
-OnDeviceEmbedding = layers.OnDeviceEmbedding