Add mobilebert xxs config.

After quantization, the model size is ~9.2MB (30k vocab size) For Squad task: the accuracy is : {"f1": 80.40850341559828, "exact_match": 71.33396404919584} PiperOrigin-RevId: 429223193

Add mobilebert xxs config.
After quantization, the model size is ~9.2MB (30k vocab size) For Squad task: the accuracy is : {"f1": 80.40850341559828, "exact_match": 71.33396404919584} PiperOrigin-RevId: 429223193
79a04511 · Renjie Liu · A. Unique TensorFlower · 6f441f78 · 79a04511 · 79a04511
Commit 79a04511 authored Feb 16, 2022 by Renjie Liu Committed by A. Unique TensorFlower Feb 16, 2022
2 changed files
--- a/official/projects/edgetpu/nlp/experiments/downstream_tasks/mobilebert_edgetpu_xxs.yaml
+++ b/official/projects/edgetpu/nlp/experiments/downstream_tasks/mobilebert_edgetpu_xxs.yaml
+# MobileBERT-EdgeTPU-XS model.
+task:
+  model:
+    encoder:
+      type: mobilebert
+      mobilebert:
+        word_vocab_size: 30522
+        word_embed_size: 128
+        type_vocab_size: 2
+        max_sequence_length: 512
+        num_blocks: 6
+        hidden_size: 512
+        num_attention_heads: 4
+        intermediate_size: 1024
+        hidden_activation: relu
+        hidden_dropout_prob: 0.1
+        attention_probs_dropout_prob: 0.1
+        intra_bottleneck_size: 128
+        initializer_range: 0.02
+        key_query_shared_bottleneck: true
+        num_feedforward_networks: 2
+        normalization_type: no_norm
+        classifier_activation: false
--- a/official/projects/edgetpu/nlp/experiments/mobilebert_edgetpu_xxs.yaml
+++ b/official/projects/edgetpu/nlp/experiments/mobilebert_edgetpu_xxs.yaml
+layer_wise_distillation:
+  num_steps: 30000
+  warmup_steps: 0
+  initial_learning_rate: 1.5e-3
+  end_learning_rate: 1.5e-3
+  decay_steps: 30000
+end_to_end_distillation:
+  num_steps: 585000
+  warmup_steps: 20000
+  initial_learning_rate: 1.5e-3
+  end_learning_rate: 1.5e-7
+  decay_steps: 585000
+  distill_ground_truth_ratio: 0.5
+optimizer:
+  optimizer:
+    lamb:
+      beta_1: 0.9
+      beta_2: 0.999
+      clipnorm: 1.0
+      epsilon: 1.0e-06
+      exclude_from_layer_adaptation: null
+      exclude_from_weight_decay: ['LayerNorm', 'bias', 'norm']
+      global_clipnorm: null
+      name: LAMB
+      weight_decay_rate: 0.01
+    type: lamb
+orbit_config:
+  eval_interval: 1000
+  eval_steps: -1
+  mode: train
+  steps_per_loop: 1000
+  total_steps: 825000
+runtime:
+  distribution_strategy: 'tpu'
+student_model:
+  cls_heads: [{'activation': 'tanh',
+               'cls_token_idx': 0,
+               'dropout_rate': 0.0,
+               'inner_dim': 512,
+               'name': 'next_sentence',
+               'num_classes': 2}]
+  encoder:
+    mobilebert:
+      attention_probs_dropout_prob: 0.1
+      classifier_activation: false
+      hidden_activation: relu
+      hidden_dropout_prob: 0.0
+      hidden_size: 512
+      initializer_range: 0.02
+      input_mask_dtype: int32
+      intermediate_size: 1024
+      intra_bottleneck_size: 128
+      key_query_shared_bottleneck: true
+      max_sequence_length: 512
+      normalization_type: no_norm
+      num_attention_heads: 4
+      num_blocks: 6
+      num_feedforward_networks: 2
+      type_vocab_size: 2
+      use_bottleneck_attention: false
+      word_embed_size: 128
+      word_vocab_size: 30522
+    type: mobilebert
+  mlm_activation: relu
+  mlm_initializer_range: 0.02
+teacher_model:
+  cls_heads: []
+  encoder:
+    mobilebert:
+      attention_probs_dropout_prob: 0.1
+      classifier_activation: false
+      hidden_activation: gelu
+      hidden_dropout_prob: 0.1
+      hidden_size: 512
+      initializer_range: 0.02
+      input_mask_dtype: int32
+      intermediate_size: 4096
+      intra_bottleneck_size: 1024
+      key_query_shared_bottleneck: false
+      max_sequence_length: 512
+      normalization_type: layer_norm
+      num_attention_heads: 4
+      num_blocks: 24
+      num_feedforward_networks: 1
+      type_vocab_size: 2
+      use_bottleneck_attention: false
+      word_embed_size: 128
+      word_vocab_size: 30522
+    type: mobilebert
+  mlm_activation: gelu
+  mlm_initializer_range: 0.02
+teacher_model_init_checkpoint: gs://**/uncased_L-24_H-1024_B-512_A-4_teacher/tf2_checkpoint/bert_model.ckpt-1
+student_model_init_checkpoint: ''
+train_datasest:
+  block_length: 1
+  cache: false
+  cycle_length: null
+  deterministic: null
+  drop_remainder: true
+  enable_tf_data_service: false
+  global_batch_size: 2048
+  input_path: gs://**/seq_512_mask_20/wikipedia.tfrecord*,gs://**/seq_512_mask_20/books.tfrecord*
+  is_training: true
+  max_predictions_per_seq: 20
+  seq_length: 512
+  sharding: true
+  shuffle_buffer_size: 100
+  tf_data_service_address: null
+  tf_data_service_job_name: null
+  tfds_as_supervised: false
+  tfds_data_dir: ''
+  tfds_name: ''
+  tfds_skip_decoding_feature: ''
+  tfds_split: ''
+  use_next_sentence_label: true
+  use_position_id: false
+  use_v2_feature_names: false
+eval_dataset:
+  block_length: 1
+  cache: false
+  cycle_length: null
+  deterministic: null
+  drop_remainder: true
+  enable_tf_data_service: false
+  global_batch_size: 2048
+  input_path: gs://**/seq_512_mask_20/wikipedia.tfrecord-00141-of-00500,gs://**/seq_512_mask_20/books.tfrecord-00141-of-00500
+  is_training: false
+  max_predictions_per_seq: 20
+  seq_length: 512
+  sharding: true
+  shuffle_buffer_size: 100
+  tf_data_service_address: null
+  tf_data_service_job_name: null
+  tfds_as_supervised: false
+  tfds_data_dir: ''
+  tfds_name: ''
+  tfds_skip_decoding_feature: ''
+  tfds_split: ''
+  use_next_sentence_label: true
+  use_position_id: false
+  use_v2_feature_names: false