"...roberta/git@developer.sourcefind.cn:OpenDAS/fairseq.git" did not exist on "6ce55e4b011275e43404034832b40648b1483ff6"
Commit 79a04511 authored by Renjie Liu's avatar Renjie Liu Committed by A. Unique TensorFlower
Browse files

Add mobilebert xxs config.

After quantization, the model size is ~9.2MB (30k vocab size)

For Squad task: the accuracy is : {"f1": 80.40850341559828, "exact_match": 71.33396404919584}

PiperOrigin-RevId: 429223193
parent 6f441f78
# MobileBERT-EdgeTPU-XS model.
task:
model:
encoder:
type: mobilebert
mobilebert:
word_vocab_size: 30522
word_embed_size: 128
type_vocab_size: 2
max_sequence_length: 512
num_blocks: 6
hidden_size: 512
num_attention_heads: 4
intermediate_size: 1024
hidden_activation: relu
hidden_dropout_prob: 0.1
attention_probs_dropout_prob: 0.1
intra_bottleneck_size: 128
initializer_range: 0.02
key_query_shared_bottleneck: true
num_feedforward_networks: 2
normalization_type: no_norm
classifier_activation: false
layer_wise_distillation:
num_steps: 30000
warmup_steps: 0
initial_learning_rate: 1.5e-3
end_learning_rate: 1.5e-3
decay_steps: 30000
end_to_end_distillation:
num_steps: 585000
warmup_steps: 20000
initial_learning_rate: 1.5e-3
end_learning_rate: 1.5e-7
decay_steps: 585000
distill_ground_truth_ratio: 0.5
optimizer:
optimizer:
lamb:
beta_1: 0.9
beta_2: 0.999
clipnorm: 1.0
epsilon: 1.0e-06
exclude_from_layer_adaptation: null
exclude_from_weight_decay: ['LayerNorm', 'bias', 'norm']
global_clipnorm: null
name: LAMB
weight_decay_rate: 0.01
type: lamb
orbit_config:
eval_interval: 1000
eval_steps: -1
mode: train
steps_per_loop: 1000
total_steps: 825000
runtime:
distribution_strategy: 'tpu'
student_model:
cls_heads: [{'activation': 'tanh',
'cls_token_idx': 0,
'dropout_rate': 0.0,
'inner_dim': 512,
'name': 'next_sentence',
'num_classes': 2}]
encoder:
mobilebert:
attention_probs_dropout_prob: 0.1
classifier_activation: false
hidden_activation: relu
hidden_dropout_prob: 0.0
hidden_size: 512
initializer_range: 0.02
input_mask_dtype: int32
intermediate_size: 1024
intra_bottleneck_size: 128
key_query_shared_bottleneck: true
max_sequence_length: 512
normalization_type: no_norm
num_attention_heads: 4
num_blocks: 6
num_feedforward_networks: 2
type_vocab_size: 2
use_bottleneck_attention: false
word_embed_size: 128
word_vocab_size: 30522
type: mobilebert
mlm_activation: relu
mlm_initializer_range: 0.02
teacher_model:
cls_heads: []
encoder:
mobilebert:
attention_probs_dropout_prob: 0.1
classifier_activation: false
hidden_activation: gelu
hidden_dropout_prob: 0.1
hidden_size: 512
initializer_range: 0.02
input_mask_dtype: int32
intermediate_size: 4096
intra_bottleneck_size: 1024
key_query_shared_bottleneck: false
max_sequence_length: 512
normalization_type: layer_norm
num_attention_heads: 4
num_blocks: 24
num_feedforward_networks: 1
type_vocab_size: 2
use_bottleneck_attention: false
word_embed_size: 128
word_vocab_size: 30522
type: mobilebert
mlm_activation: gelu
mlm_initializer_range: 0.02
teacher_model_init_checkpoint: gs://**/uncased_L-24_H-1024_B-512_A-4_teacher/tf2_checkpoint/bert_model.ckpt-1
student_model_init_checkpoint: ''
train_datasest:
block_length: 1
cache: false
cycle_length: null
deterministic: null
drop_remainder: true
enable_tf_data_service: false
global_batch_size: 2048
input_path: gs://**/seq_512_mask_20/wikipedia.tfrecord*,gs://**/seq_512_mask_20/books.tfrecord*
is_training: true
max_predictions_per_seq: 20
seq_length: 512
sharding: true
shuffle_buffer_size: 100
tf_data_service_address: null
tf_data_service_job_name: null
tfds_as_supervised: false
tfds_data_dir: ''
tfds_name: ''
tfds_skip_decoding_feature: ''
tfds_split: ''
use_next_sentence_label: true
use_position_id: false
use_v2_feature_names: false
eval_dataset:
block_length: 1
cache: false
cycle_length: null
deterministic: null
drop_remainder: true
enable_tf_data_service: false
global_batch_size: 2048
input_path: gs://**/seq_512_mask_20/wikipedia.tfrecord-00141-of-00500,gs://**/seq_512_mask_20/books.tfrecord-00141-of-00500
is_training: false
max_predictions_per_seq: 20
seq_length: 512
sharding: true
shuffle_buffer_size: 100
tf_data_service_address: null
tf_data_service_job_name: null
tfds_as_supervised: false
tfds_data_dir: ''
tfds_name: ''
tfds_skip_decoding_feature: ''
tfds_split: ''
use_next_sentence_label: true
use_position_id: false
use_v2_feature_names: false
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment