ModelZoo / ResNet50_tensorflow · Commits

Commit 7a45b513 (unverified), authored Oct 25, 2021 by Vishnu Banna, committed by GitHub on Oct 25, 2021

Merge branch 'tensorflow:master' into exp_pr2

Parents: 54115e16, 12bbefce
Changes: 111 changed files in this commit; this page shows 20 of them, with 1613 additions and 17 deletions (+1613 / -17).
Files shown on this page:

- official/nlp/projects/teams/experiments/base/wiki_books_pretrain.yaml (+2 / -1)
- official/nlp/projects/teams/experiments/small/wiki_books_pretrain.yaml (+1 / -0)
- official/nlp/projects/teams/teams.py (+4 / -4)
- official/projects/edgetpu/nlp/README.md (+86 / -0)
- official/projects/edgetpu/nlp/__init__.py (+0 / -5)
- official/projects/edgetpu/nlp/configs/__init__.py (+0 / -2)
- official/projects/edgetpu/nlp/configs/params.py (+159 / -0)
- official/projects/edgetpu/nlp/experiments/downstream_tasks/glue_mnli.yaml (+49 / -0)
- official/projects/edgetpu/nlp/experiments/downstream_tasks/mobilebert_baseline.yaml (+23 / -0)
- official/projects/edgetpu/nlp/experiments/downstream_tasks/mobilebert_edgetpu_m.yaml (+23 / -0)
- official/projects/edgetpu/nlp/experiments/downstream_tasks/mobilebert_edgetpu_s.yaml (+23 / -0)
- official/projects/edgetpu/nlp/experiments/downstream_tasks/mobilebert_edgetpu_xs.yaml (+23 / -0)
- official/projects/edgetpu/nlp/experiments/downstream_tasks/squad_v1.yaml (+52 / -0)
- official/projects/edgetpu/nlp/experiments/mobilebert_baseline.yaml (+143 / -0)
- official/projects/edgetpu/nlp/experiments/mobilebert_edgetpu_m.yaml (+141 / -0)
- official/projects/edgetpu/nlp/experiments/mobilebert_edgetpu_s.yaml (+141 / -0)
- official/projects/edgetpu/nlp/experiments/mobilebert_edgetpu_xs.yaml (+141 / -0)
- official/projects/edgetpu/nlp/mobilebert_edgetpu_trainer.py (+519 / -0)
- official/projects/edgetpu/nlp/mobilebert_edgetpu_trainer_test.py (+83 / -0)
- official/projects/edgetpu/nlp/modeling/__init__.py (+0 / -5)
official/nlp/projects/teams/experiments/base/wiki_books_pretrain.yaml

```diff
@@ -27,7 +27,7 @@ task:
     intermediate_size: 3072
     max_position_embeddings: 512
     num_attention_heads: 12
-    num_layers: 6
+    num_layers: 12
     type_vocab_size: 2
     vocab_size: 30522
   train_data:
@@ -39,6 +39,7 @@ task:
     seq_length: 512
     use_next_sentence_label: false
     use_position_id: false
+    cycle_length: 8
   validation_data:
     drop_remainder: true
     global_batch_size: 256
```
official/nlp/projects/teams/experiments/small/wiki_books_pretrain.yaml

```diff
@@ -39,6 +39,7 @@ task:
     seq_length: 512
     use_next_sentence_label: false
     use_position_id: false
+    cycle_length: 8
  validation_data:
    drop_remainder: true
    global_batch_size: 256
```
official/nlp/projects/teams/teams.py

```diff
@@ -51,9 +51,7 @@ class TeamsPretrainerConfig(base_config.Config):
 @gin.configurable
 def get_encoder(bert_config,
                 embedding_network=None,
-                hidden_layers=None):
+                hidden_layers=layers.Transformer):
   """Gets a 'EncoderScaffold' object.

   Args:
@@ -85,7 +83,9 @@ def get_encoder(bert_config,
           stddev=bert_config.initializer_range),
   )
   if embedding_network is None:
-    embedding_network = networks.PackedSequenceEmbedding(**embedding_cfg)
+    embedding_network = networks.PackedSequenceEmbedding
+  if hidden_layers is None:
+    hidden_layers = layers.Transformer
   kwargs = dict(
       embedding_cfg=embedding_cfg,
       embedding_cls=embedding_network,
```
official/projects/edgetpu/nlp/README.md (new file)
# MobileBERT-EdgeTPU

<figure align="center">
  <img width="70%" src="https://storage.googleapis.com/tf_model_garden/models/edgetpu/images/readme-mobilebert.png">
  <figcaption>Performance of MobileBERT-EdgeTPU models on the SQuAD v1.1 dataset.</figcaption>
</figure>
Note: For the MobileBERT baseline float model, NNAPI delegates part of the
compute ops to the CPU, which makes the latency much higher.

Note: The accuracy numbers for BERT_base and BERT_large are from the published
[training results](https://arxiv.org/abs/1810.04805). These models are too
large and are not feasible to run on device.
Deploying low-latency, high-quality transformer-based language models on device
is highly desirable and can potentially benefit multiple applications such as
automatic speech recognition (ASR), translation, sentence autocompletion, and
even some vision tasks. By co-designing the neural networks with the Edge TPU
hardware accelerator in the Google Tensor SoC, we have built EdgeTPU-customized
MobileBERT models that demonstrate datacenter-level model quality while
outperforming the baseline MobileBERT in latency.
We set up our model architecture search space based on
[MobileBERT](https://arxiv.org/abs/2004.02984) and leverage AutoML algorithms
to find models with up to 2x better hardware utilization. With higher
utilization, we are able to bring larger and more accurate models on chip,
while the models still outperform the baseline MobileBERT in latency. We built
a customized distillation training pipeline and performed an exhaustive
hyperparameter search (e.g., learning rate, dropout ratio) to achieve the best
accuracy. As shown in the figure above, the quantized MobileBERT-EdgeTPU models
establish a new pareto frontier for question answering tasks and also exceed
the accuracy of the float BERT_base model, which is 400+ MB and too large to
run on edge devices.
We also observed that, unlike most vision models, accuracy drops significantly
for MobileBERT/MobileBERT-EdgeTPU with plain post-training quantization (PTQ)
or quantization-aware training (QAT). Proper model modifications, such as
clipping the mask value, are necessary to retain the accuracy of a quantized
model. Therefore, as an alternative to the quantized models, we also provide a
set of Edge TPU friendly float models, which also produce a (marginally) better
roofline than the baseline MobileBERT quantized model. Notably, the float
MobileBERT-EdgeTPU-M model yields accuracy close even to BERT_large, which has
a 1.3 GB model size in float precision. Quantization thus becomes an optional
optimization rather than a prerequisite, which can greatly benefit or unblock
use cases where quantization is infeasible or introduces large accuracy
deterioration, and can potentially reduce time-to-market.
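For illustration, the sketch below shows one common form of the mask-value
clipping mentioned above. It is a generic example of the idea, not the exact
modification used in our models; the helper name, default values, and
signature are made up for this sketch.

```python
import tensorflow as tf


def clipped_additive_attention_mask(input_mask: tf.Tensor,
                                    mask_value: float = -10000.0,
                                    clip_to: float = -128.0) -> tf.Tensor:
  """Builds an additive attention mask and clips its magnitude.

  Args:
    input_mask: <int32>[batch, seq_len], 1 for real tokens and 0 for padding.
    mask_value: Conventional large negative fill value for masked positions.
    clip_to: Smaller-magnitude negative floor; masked positions still vanish
      after softmax, but the quantization range of the logits stays sane.
  """
  mask = tf.cast(input_mask[:, tf.newaxis, tf.newaxis, :], tf.float32)
  additive = (1.0 - mask) * mask_value   # 0 for real tokens, -1e4 for padding.
  return tf.maximum(additive, clip_to)   # Clip -1e4 up to, e.g., -128.
```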
## Pre-trained Models

Model name            | # Parameters | # Ops  | MLM   | Checkpoint | TFhub link
--------------------- | :----------: | :----: | :---: | :--------: | :--------:
MobileBERT-EdgeTPU-M  | 50.9M        | 18.8e9 | 73.8% | WIP        | WIP
MobileBERT-EdgeTPU-S  | 38.3M        | 14.0e9 | 72.8% | WIP        | WIP
MobileBERT-EdgeTPU-XS | 27.1M        | 9.4e9  | 71.2% | WIP        | WIP
### Restoring from Checkpoints

To load the pre-trained MobileBERT checkpoint in your code, please follow the
example below or check the `serving/export_tflite_squad` module:
```python
import tensorflow as tf

from official.nlp.projects.mobilebert_edgetpu import params
# Assumed import: model_builder (providing build_bert_pretrainer) lives in the
# project's modeling package alongside params.
from official.nlp.projects.mobilebert_edgetpu import model_builder

bert_config_file = ...
model_checkpoint_path = ...

# Set up experiment params and load the configs from file/files.
experiment_params = params.EdgeTPUBERTCustomParams()

# Change the input mask type to tf.float32 to avoid an additional casting op.
experiment_params.student_model.encoder.mobilebert.input_mask_dtype = 'float32'

pretrainer_model = model_builder.build_bert_pretrainer(
    experiment_params.student_model,
    name='pretrainer',
    quantization_friendly=True)

checkpoint_dict = {'model': pretrainer_model}
checkpoint = tf.train.Checkpoint(**checkpoint_dict)
checkpoint.restore(model_checkpoint_path).assert_existing_objects_matched()
```
### Use TF-Hub models
TODO(longy): Update with instructions to use tf-hub models
official/nlp/keras_nlp/layers/masked_lm.py → official/projects/edgetpu/nlp/__init__.py

```diff
@@ -12,8 +12,3 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Masked language model network."""
-
-from official.nlp.modeling import layers
-
-MaskedLM = layers.MaskedLM
```
official/nlp/keras_nlp/encoders/__init__.py → official/projects/edgetpu/nlp/configs/__init__.py

```diff
@@ -12,5 +12,3 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Keras-NLP layers package definition."""
-from official.nlp.keras_nlp.encoders.bert_encoder import BertEncoder
```
official/projects/edgetpu/nlp/configs/params.py (new file)

```python
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Datastructures for all the configurations for MobileBERT-EdgeTPU training."""
import dataclasses
from typing import Optional

from official.modeling import optimization
from official.modeling.hyperparams import base_config
from official.nlp.configs import bert
from official.nlp.data import pretrain_dataloader

DatasetParams = pretrain_dataloader.BertPretrainDataConfig
PretrainerModelParams = bert.PretrainerConfig


@dataclasses.dataclass
class OrbitParams(base_config.Config):
  """Parameters that set up the Orbit training/evaluation pipeline.

  Attributes:
    mode: Orbit controller mode, can be 'train', 'train_and_evaluate', or
      'evaluate'.
    steps_per_loop: The number of steps to run in each inner loop of training.
    total_steps: The global step count to train up to.
    eval_steps: The number of steps to run during an evaluation. If -1, this
      method will evaluate over the entire evaluation dataset.
    eval_interval: The number of training steps to run between evaluations. If
      set, training will always stop every `eval_interval` steps, even if this
      results in a shorter inner loop than specified by the `steps_per_loop`
      setting. If None, evaluation will only be performed after training is
      complete.
  """
  mode: str = 'train'
  steps_per_loop: int = 1000
  total_steps: int = 1000000
  eval_steps: int = -1
  eval_interval: Optional[int] = None


@dataclasses.dataclass
class OptimizerParams(optimization.OptimizationConfig):
  """Optimizer parameters for MobileBERT-EdgeTPU."""
  optimizer: optimization.OptimizerConfig = optimization.OptimizerConfig(
      type='adamw',
      adamw=optimization.AdamWeightDecayConfig(
          weight_decay_rate=0.01,
          exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias']))
  learning_rate: optimization.LrConfig = optimization.LrConfig(
      type='polynomial',
      polynomial=optimization.PolynomialLrConfig(
          initial_learning_rate=1e-4,
          decay_steps=1000000,
          end_learning_rate=0.0))
  warmup: optimization.WarmupConfig = optimization.WarmupConfig(
      type='polynomial',
      polynomial=optimization.PolynomialWarmupConfig(warmup_steps=10000))


@dataclasses.dataclass
class RuntimeParams(base_config.Config):
  """Parameters that set up the training runtime.

  TODO(longy): Can reuse the Runtime Config in:
  official/core/config_definitions.py

  Attributes:
    distribution_strategy: Keras distribution strategy
    use_gpu: Whether to use GPU
    use_tpu: Whether to use TPU
    num_gpus: Number of gpus to use for training
    num_workers: Number of parallel workers
    tpu_address: The bns address of the TPU to use.
  """
  distribution_strategy: str = 'off'
  num_gpus: Optional[int] = 0
  all_reduce_alg: Optional[str] = None
  num_workers: int = 1
  tpu_address: str = ''
  use_gpu: Optional[bool] = None
  use_tpu: Optional[bool] = None


@dataclasses.dataclass
class LayerWiseDistillationParams(base_config.Config):
  """Defines the behavior of layer-wise distillation.

  Layer-wise distillation is an optional step where the knowledge is
  transferred layer by layer for all the transformer layers. The end-to-end
  distillation is performed after layer-wise distillation if the layer-wise
  distillation step count is not zero.
  """
  num_steps: int = 10000
  warmup_steps: int = 10000
  initial_learning_rate: float = 1.5e-3
  end_learning_rate: float = 1.5e-3
  decay_steps: int = 10000
  hidden_distill_factor: float = 100.0
  beta_distill_factor: float = 5000.0
  gamma_distill_factor: float = 5.0
  attention_distill_factor: float = 1.0


@dataclasses.dataclass
class EndToEndDistillationParams(base_config.Config):
  """Defines the behavior of end2end pretrainer distillation."""
  num_steps: int = 580000
  warmup_steps: int = 20000
  initial_learning_rate: float = 1.5e-3
  end_learning_rate: float = 1.5e-7
  decay_steps: int = 580000
  distill_ground_truth_ratio: float = 0.5


@dataclasses.dataclass
class EdgeTPUBERTCustomParams(base_config.Config):
  """EdgeTPU-BERT custom params.

  Attributes:
    train_dataset: An instance of the DatasetParams.
    eval_dataset: An instance of the DatasetParams.
    teacher_model: An instance of the PretrainerModelParams. If None, then the
      student model is trained independently without distillation.
    student_model: An instance of the PretrainerModelParams.
    teacher_model_init_checkpoint: Path for the teacher model init checkpoint.
    student_model_init_checkpoint: Path for the student model init checkpoint.
    layer_wise_distillation: Distillation config for the layer-wise step.
    end_to_end_distillation: Distillation config for the end2end step.
    optimizer: An instance of the OptimizerParams.
    runtime: An instance of the RuntimeParams.
    learning_rate: An instance of the LearningRateParams.
    orbit_config: An instance of the OrbitParams.
    distill_ground_truth_ratio: A float number representing the ratio between
      distillation output and ground truth.
  """
  train_datasest: DatasetParams = DatasetParams()
  eval_dataset: DatasetParams = DatasetParams()
  teacher_model: Optional[PretrainerModelParams] = PretrainerModelParams()
  student_model: PretrainerModelParams = PretrainerModelParams()
  teacher_model_init_checkpoint: str = ''
  student_model_init_checkpoint: str = ''
  layer_wise_distillation: LayerWiseDistillationParams = (
      LayerWiseDistillationParams())
  end_to_end_distillation: EndToEndDistillationParams = (
      EndToEndDistillationParams())
  optimizer: OptimizerParams = OptimizerParams()
  runtime: RuntimeParams = RuntimeParams()
  orbit_config: OrbitParams = OrbitParams()
```
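For context, here is a minimal sketch of how these config dataclasses are
typically consumed, assuming the Model Garden `base_config.Config.override()`
API; the YAML path below is illustrative, not a guaranteed location.

```python
import yaml

from official.projects.edgetpu.nlp.configs import params

# Start from the defaults defined above.
experiment_params = params.EdgeTPUBERTCustomParams()

# Override fields from one of the experiment YAMLs added in this commit
# (path is illustrative; adjust to wherever the repo is checked out).
with open('official/projects/edgetpu/nlp/experiments/mobilebert_edgetpu_m.yaml') as f:
  experiment_params.override(yaml.safe_load(f), is_strict=False)

# Individual fields can also be overridden programmatically.
experiment_params.override({'runtime': {'distribution_strategy': 'tpu'}})

print(experiment_params.orbit_config.total_steps)  # 825000 after the override.
```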
official/projects/edgetpu/nlp/experiments/downstream_tasks/glue_mnli.yaml (new file)

```yaml
task:
  # hub_module_url: 'gs://**/panzf/mobilebert/tfhub/'
  init_checkpoint: 'gs://**/edgetpu_bert/edgetpu_bert_float_candidate_13_e2e_820k/exported_ckpt/'
  model:
    num_classes: 3
  metric_type: 'accuracy'
  train_data:
    drop_remainder: true
    global_batch_size: 32
    input_path: gs://**/yo/bert/glue/tfrecords/MNLI/MNLI_matched_train.tf_record
    is_training: true
    seq_length: 128
    label_type: 'int'
  validation_data:
    drop_remainder: false
    global_batch_size: 32
    input_path: gs://**/yo/bert/glue/tfrecords/MNLI/MNLI_matched_eval.tf_record
    is_training: false
    seq_length: 128
    label_type: 'int'
trainer:
  checkpoint_interval: 10000
  optimizer_config:
    learning_rate:
      polynomial:
        # 100% of train_steps.
        decay_steps: 50000
        end_learning_rate: 0.0
        initial_learning_rate: 3.0e-05
        power: 1.0
      type: polynomial
    optimizer:
      type: adamw
    warmup:
      polynomial:
        power: 1
        # ~10% of train_steps.
        warmup_steps: 5000
      type: polynomial
  steps_per_loop: 1000
  summary_interval: 1000
  # Training data size 392,702 examples, 8 epochs.
  train_steps: 50000
  validation_interval: 2000
  # Eval data size = 9815 examples.
  validation_steps: 307
  best_checkpoint_export_subdir: 'best_ckpt'
  best_checkpoint_eval_metric: 'cls_accuracy'
  best_checkpoint_metric_comp: 'higher'
```
official/projects/edgetpu/nlp/experiments/downstream_tasks/mobilebert_baseline.yaml (new file)

```yaml
# MobileBERT model from https://arxiv.org/abs/2004.02984.
task:
  model:
    encoder:
      type: mobilebert
      mobilebert:
        word_vocab_size: 30522
        word_embed_size: 128
        type_vocab_size: 2
        max_sequence_length: 512
        num_blocks: 24
        hidden_size: 512
        num_attention_heads: 4
        intermediate_size: 512
        hidden_activation: relu
        hidden_dropout_prob: 0.0
        attention_probs_dropout_prob: 0.1
        intra_bottleneck_size: 128
        initializer_range: 0.02
        key_query_shared_bottleneck: true
        num_feedforward_networks: 4
        normalization_type: no_norm
        classifier_activation: false
```
official/projects/edgetpu/nlp/experiments/downstream_tasks/mobilebert_edgetpu_m.yaml (new file)

```yaml
# MobileBERT-EdgeTPU model.
task:
  model:
    encoder:
      type: mobilebert
      mobilebert:
        word_vocab_size: 30522
        word_embed_size: 128
        type_vocab_size: 2
        max_sequence_length: 512
        num_blocks: 12
        hidden_size: 512
        num_attention_heads: 4
        intermediate_size: 1024
        hidden_activation: relu
        hidden_dropout_prob: 0.1
        attention_probs_dropout_prob: 0.1
        intra_bottleneck_size: 256
        initializer_range: 0.02
        key_query_shared_bottleneck: true
        num_feedforward_networks: 6
        normalization_type: no_norm
        classifier_activation: false
```
official/projects/edgetpu/nlp/experiments/downstream_tasks/mobilebert_edgetpu_s.yaml (new file)

```yaml
# MobileBERT-EdgeTPU-S model.
task:
  model:
    encoder:
      type: mobilebert
      mobilebert:
        word_vocab_size: 30522
        word_embed_size: 128
        type_vocab_size: 2
        max_sequence_length: 512
        num_blocks: 12
        hidden_size: 512
        num_attention_heads: 4
        intermediate_size: 1024
        hidden_activation: relu
        hidden_dropout_prob: 0.1
        attention_probs_dropout_prob: 0.1
        intra_bottleneck_size: 256
        initializer_range: 0.02
        key_query_shared_bottleneck: true
        num_feedforward_networks: 4
        normalization_type: no_norm
        classifier_activation: false
```
official/projects/edgetpu/nlp/experiments/downstream_tasks/mobilebert_edgetpu_xs.yaml (new file)

```yaml
# MobileBERT-EdgeTPU-XS model.
task:
  model:
    encoder:
      type: mobilebert
      mobilebert:
        word_vocab_size: 30522
        word_embed_size: 128
        type_vocab_size: 2
        max_sequence_length: 512
        num_blocks: 8
        hidden_size: 512
        num_attention_heads: 4
        intermediate_size: 1024
        hidden_activation: relu
        hidden_dropout_prob: 0.1
        attention_probs_dropout_prob: 0.1
        intra_bottleneck_size: 256
        initializer_range: 0.02
        key_query_shared_bottleneck: true
        num_feedforward_networks: 4
        normalization_type: no_norm
        classifier_activation: false
```
official/projects/edgetpu/nlp/experiments/downstream_tasks/squad_v1.yaml (new file)

```yaml
task:
  # hub_module_url: 'gs://**/panzf/mobilebert/tfhub/'
  max_answer_length: 30
  n_best_size: 20
  null_score_diff_threshold: 0.0
  init_checkpoint: 'gs://**/edgetpu_bert/edgetpu_bert_float_candidate_13_e2e_820k/exported_ckpt/'
  train_data:
    drop_remainder: true
    global_batch_size: 32
    input_path: gs://**/tp/bert/squad_v1.1/train.tf_record
    is_training: true
    seq_length: 384
  validation_data:
    do_lower_case: true
    doc_stride: 128
    drop_remainder: false
    global_batch_size: 48
    input_path: gs://**/squad/dev-v1.1.json
    is_training: false
    query_length: 64
    seq_length: 384
    tokenization: WordPiece
    version_2_with_negative: false
    vocab_file: gs://**/panzf/ttl-30d/mobilebert/tf2_checkpoint/vocab.txt
trainer:
  checkpoint_interval: 1000
  max_to_keep: 5
  optimizer_config:
    learning_rate:
      polynomial:
        decay_steps: 19420
        end_learning_rate: 0.0
        initial_learning_rate: 8.0e-05
        power: 1.0
      type: polynomial
    optimizer:
      type: adamw
    warmup:
      polynomial:
        power: 1
        # 10% of total training steps
        warmup_steps: 1942
      type: polynomial
  steps_per_loop: 1000
  summary_interval: 1000
  # 7 epochs for training
  train_steps: 19420
  validation_interval: 3000
  validation_steps: 226
  best_checkpoint_export_subdir: 'best_ckpt'
  best_checkpoint_eval_metric: 'final_f1'
  best_checkpoint_metric_comp: 'higher'
```
official/projects/edgetpu/nlp/experiments/mobilebert_baseline.yaml (new file)

```yaml
# Distillation pretraining for Mobilebert.
# The final MLM accuracy is around 70.8% for e2e only training and 71.4% for layer-wise + e2e.
layer_wise_distillation:
  num_steps: 10000
  warmup_steps: 0
  initial_learning_rate: 1.5e-3
  end_learning_rate: 1.5e-3
  decay_steps: 10000
end_to_end_distillation:
  num_steps: 585000
  warmup_steps: 20000
  initial_learning_rate: 1.5e-3
  end_learning_rate: 1.5e-7
  decay_steps: 585000
  distill_ground_truth_ratio: 0.5
optimizer:
  optimizer:
    lamb:
      beta_1: 0.9
      beta_2: 0.999
      clipnorm: 1.0
      epsilon: 1.0e-06
      exclude_from_layer_adaptation: null
      exclude_from_weight_decay: ['LayerNorm', 'bias', 'norm']
      global_clipnorm: null
      name: LAMB
      weight_decay_rate: 0.01
    type: lamb
orbit_config:
  eval_interval: 1000
  eval_steps: -1
  mode: train
  steps_per_loop: 1000
  total_steps: 825000
runtime:
  distribution_strategy: 'tpu'
student_model:
  cls_heads: [{'activation': 'tanh', 'cls_token_idx': 0, 'dropout_rate': 0.0,
               'inner_dim': 512, 'name': 'next_sentence', 'num_classes': 2}]
  encoder:
    mobilebert:
      attention_probs_dropout_prob: 0.1
      classifier_activation: false
      hidden_activation: relu
      hidden_dropout_prob: 0.0
      hidden_size: 512
      initializer_range: 0.02
      input_mask_dtype: int32
      intermediate_size: 512
      intra_bottleneck_size: 128
      key_query_shared_bottleneck: true
      max_sequence_length: 512
      normalization_type: no_norm
      num_attention_heads: 4
      num_blocks: 24
      num_feedforward_networks: 4
      type_vocab_size: 2
      use_bottleneck_attention: false
      word_embed_size: 128
      word_vocab_size: 30522
    type: mobilebert
  mlm_activation: relu
  mlm_initializer_range: 0.02
teacher_model:
  cls_heads: []
  encoder:
    mobilebert:
      attention_probs_dropout_prob: 0.1
      classifier_activation: false
      hidden_activation: gelu
      hidden_dropout_prob: 0.1
      hidden_size: 512
      initializer_range: 0.02
      input_mask_dtype: int32
      intermediate_size: 4096
      intra_bottleneck_size: 1024
      key_query_shared_bottleneck: false
      max_sequence_length: 512
      normalization_type: layer_norm
      num_attention_heads: 4
      num_blocks: 24
      num_feedforward_networks: 1
      type_vocab_size: 2
      use_bottleneck_attention: false
      word_embed_size: 128
      word_vocab_size: 30522
    type: mobilebert
  mlm_activation: gelu
  mlm_initializer_range: 0.02
teacher_model_init_checkpoint: gs://**/uncased_L-24_H-1024_B-512_A-4_teacher/tf2_checkpoint/bert_model.ckpt-1
student_model_init_checkpoint: ''
train_datasest:
  block_length: 1
  cache: false
  cycle_length: null
  deterministic: null
  drop_remainder: true
  enable_tf_data_service: false
  global_batch_size: 2048
  input_path: gs://**/seq_512_mask_20/wikipedia.tfrecord*,gs://**/seq_512_mask_20/books.tfrecord*
  is_training: true
  max_predictions_per_seq: 20
  seq_length: 512
  sharding: true
  shuffle_buffer_size: 100
  tf_data_service_address: null
  tf_data_service_job_name: null
  tfds_as_supervised: false
  tfds_data_dir: ''
  tfds_name: ''
  tfds_skip_decoding_feature: ''
  tfds_split: ''
  use_next_sentence_label: true
  use_position_id: false
  use_v2_feature_names: false
eval_dataset:
  block_length: 1
  cache: false
  cycle_length: null
  deterministic: null
  drop_remainder: true
  enable_tf_data_service: false
  global_batch_size: 2048
  input_path: gs://**/seq_512_mask_20/wikipedia.tfrecord-00141-of-00500,gs://**/seq_512_mask_20/books.tfrecord-00141-of-00500
  is_training: false
  max_predictions_per_seq: 20
  seq_length: 512
  sharding: true
  shuffle_buffer_size: 100
  tf_data_service_address: null
  tf_data_service_job_name: null
  tfds_as_supervised: false
  tfds_data_dir: ''
  tfds_name: ''
  tfds_skip_decoding_feature: ''
  tfds_split: ''
  use_next_sentence_label: true
  use_position_id: false
  use_v2_feature_names: false
```
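The trainer that consumes `distill_ground_truth_ratio`
(mobilebert_edgetpu_trainer.py) is collapsed in this view, so as a rough
sketch of what a ratio of 0.5 usually means in a distillation pipeline (an
assumption for illustration, not the trainer's verified code; the helper name
is made up):

```python
import tensorflow as tf


def mixed_pretraining_loss(distill_loss: tf.Tensor,
                           ground_truth_loss: tf.Tensor,
                           distill_ground_truth_ratio: float = 0.5) -> tf.Tensor:
  """Convex combination of the teacher-distillation and ground-truth MLM losses.

  With ratio=0.5, as in the configs above, both signals are weighted equally;
  ratio=1.0 would train purely against the teacher's outputs.
  """
  r = distill_ground_truth_ratio
  return r * distill_loss + (1.0 - r) * ground_truth_loss
```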
official/projects/edgetpu/nlp/experiments/mobilebert_edgetpu_m.yaml (new file)

```yaml
layer_wise_distillation:
  num_steps: 20000
  warmup_steps: 0
  initial_learning_rate: 1.5e-3
  end_learning_rate: 1.5e-3
  decay_steps: 20000
end_to_end_distillation:
  num_steps: 585000
  warmup_steps: 20000
  initial_learning_rate: 1.5e-3
  end_learning_rate: 1.5e-7
  decay_steps: 585000
  distill_ground_truth_ratio: 0.5
optimizer:
  optimizer:
    lamb:
      beta_1: 0.9
      beta_2: 0.999
      clipnorm: 1.0
      epsilon: 1.0e-06
      exclude_from_layer_adaptation: null
      exclude_from_weight_decay: ['LayerNorm', 'bias', 'norm']
      global_clipnorm: null
      name: LAMB
      weight_decay_rate: 0.01
    type: lamb
orbit_config:
  eval_interval: 1000
  eval_steps: -1
  mode: train
  steps_per_loop: 1000
  total_steps: 825000
runtime:
  distribution_strategy: 'tpu'
student_model:
  cls_heads: [{'activation': 'tanh', 'cls_token_idx': 0, 'dropout_rate': 0.0,
               'inner_dim': 512, 'name': 'next_sentence', 'num_classes': 2}]
  encoder:
    mobilebert:
      attention_probs_dropout_prob: 0.1
      classifier_activation: false
      hidden_activation: relu
      hidden_dropout_prob: 0.0
      hidden_size: 512
      initializer_range: 0.02
      input_mask_dtype: int32
      intermediate_size: 1024
      intra_bottleneck_size: 256
      key_query_shared_bottleneck: true
      max_sequence_length: 512
      normalization_type: no_norm
      num_attention_heads: 4
      num_blocks: 12
      num_feedforward_networks: 6
      type_vocab_size: 2
      use_bottleneck_attention: false
      word_embed_size: 128
      word_vocab_size: 30522
    type: mobilebert
  mlm_activation: relu
  mlm_initializer_range: 0.02
teacher_model:
  cls_heads: []
  encoder:
    mobilebert:
      attention_probs_dropout_prob: 0.1
      classifier_activation: false
      hidden_activation: gelu
      hidden_dropout_prob: 0.1
      hidden_size: 512
      initializer_range: 0.02
      input_mask_dtype: int32
      intermediate_size: 4096
      intra_bottleneck_size: 1024
      key_query_shared_bottleneck: false
      max_sequence_length: 512
      normalization_type: layer_norm
      num_attention_heads: 4
      num_blocks: 24
      num_feedforward_networks: 1
      type_vocab_size: 2
      use_bottleneck_attention: false
      word_embed_size: 128
      word_vocab_size: 30522
    type: mobilebert
  mlm_activation: gelu
  mlm_initializer_range: 0.02
teacher_model_init_checkpoint: gs://**/uncased_L-24_H-1024_B-512_A-4_teacher/tf2_checkpoint/bert_model.ckpt-1
student_model_init_checkpoint: ''
train_datasest:
  block_length: 1
  cache: false
  cycle_length: null
  deterministic: null
  drop_remainder: true
  enable_tf_data_service: false
  global_batch_size: 2048
  input_path: gs://**/seq_512_mask_20/wikipedia.tfrecord*,gs://**/seq_512_mask_20/books.tfrecord*
  is_training: true
  max_predictions_per_seq: 20
  seq_length: 512
  sharding: true
  shuffle_buffer_size: 100
  tf_data_service_address: null
  tf_data_service_job_name: null
  tfds_as_supervised: false
  tfds_data_dir: ''
  tfds_name: ''
  tfds_skip_decoding_feature: ''
  tfds_split: ''
  use_next_sentence_label: true
  use_position_id: false
  use_v2_feature_names: false
eval_dataset:
  block_length: 1
  cache: false
  cycle_length: null
  deterministic: null
  drop_remainder: true
  enable_tf_data_service: false
  global_batch_size: 2048
  input_path: gs://**/seq_512_mask_20/wikipedia.tfrecord-00141-of-00500,gs://**/seq_512_mask_20/books.tfrecord-00141-of-00500
  is_training: false
  max_predictions_per_seq: 20
  seq_length: 512
  sharding: true
  shuffle_buffer_size: 100
  tf_data_service_address: null
  tf_data_service_job_name: null
  tfds_as_supervised: false
  tfds_data_dir: ''
  tfds_name: ''
  tfds_skip_decoding_feature: ''
  tfds_split: ''
  use_next_sentence_label: true
  use_position_id: false
  use_v2_feature_names: false
```
official/projects/edgetpu/nlp/experiments/mobilebert_edgetpu_s.yaml (new file)

```yaml
layer_wise_distillation:
  num_steps: 20000
  warmup_steps: 0
  initial_learning_rate: 1.5e-3
  end_learning_rate: 1.5e-3
  decay_steps: 20000
end_to_end_distillation:
  num_steps: 585000
  warmup_steps: 20000
  initial_learning_rate: 1.5e-3
  end_learning_rate: 1.5e-7
  decay_steps: 585000
  distill_ground_truth_ratio: 0.5
optimizer:
  optimizer:
    lamb:
      beta_1: 0.9
      beta_2: 0.999
      clipnorm: 1.0
      epsilon: 1.0e-06
      exclude_from_layer_adaptation: null
      exclude_from_weight_decay: ['LayerNorm', 'bias', 'norm']
      global_clipnorm: null
      name: LAMB
      weight_decay_rate: 0.01
    type: lamb
orbit_config:
  eval_interval: 1000
  eval_steps: -1
  mode: train
  steps_per_loop: 1000
  total_steps: 825000
runtime:
  distribution_strategy: 'tpu'
student_model:
  cls_heads: [{'activation': 'tanh', 'cls_token_idx': 0, 'dropout_rate': 0.0,
               'inner_dim': 512, 'name': 'next_sentence', 'num_classes': 2}]
  encoder:
    mobilebert:
      attention_probs_dropout_prob: 0.1
      classifier_activation: false
      hidden_activation: relu
      hidden_dropout_prob: 0.0
      hidden_size: 512
      initializer_range: 0.02
      input_mask_dtype: int32
      intermediate_size: 1024
      intra_bottleneck_size: 256
      key_query_shared_bottleneck: true
      max_sequence_length: 512
      normalization_type: no_norm
      num_attention_heads: 4
      num_blocks: 12
      num_feedforward_networks: 4
      type_vocab_size: 2
      use_bottleneck_attention: false
      word_embed_size: 128
      word_vocab_size: 30522
    type: mobilebert
  mlm_activation: relu
  mlm_initializer_range: 0.02
teacher_model:
  cls_heads: []
  encoder:
    mobilebert:
      attention_probs_dropout_prob: 0.1
      classifier_activation: false
      hidden_activation: gelu
      hidden_dropout_prob: 0.1
      hidden_size: 512
      initializer_range: 0.02
      input_mask_dtype: int32
      intermediate_size: 4096
      intra_bottleneck_size: 1024
      key_query_shared_bottleneck: false
      max_sequence_length: 512
      normalization_type: layer_norm
      num_attention_heads: 4
      num_blocks: 24
      num_feedforward_networks: 1
      type_vocab_size: 2
      use_bottleneck_attention: false
      word_embed_size: 128
      word_vocab_size: 30522
    type: mobilebert
  mlm_activation: gelu
  mlm_initializer_range: 0.02
teacher_model_init_checkpoint: gs://**/uncased_L-24_H-1024_B-512_A-4_teacher/tf2_checkpoint/bert_model.ckpt-1
student_model_init_checkpoint: ''
train_datasest:
  block_length: 1
  cache: false
  cycle_length: null
  deterministic: null
  drop_remainder: true
  enable_tf_data_service: false
  global_batch_size: 2048
  input_path: gs://**/seq_512_mask_20/wikipedia.tfrecord*,gs://**/seq_512_mask_20/books.tfrecord*
  is_training: true
  max_predictions_per_seq: 20
  seq_length: 512
  sharding: true
  shuffle_buffer_size: 100
  tf_data_service_address: null
  tf_data_service_job_name: null
  tfds_as_supervised: false
  tfds_data_dir: ''
  tfds_name: ''
  tfds_skip_decoding_feature: ''
  tfds_split: ''
  use_next_sentence_label: true
  use_position_id: false
  use_v2_feature_names: false
eval_dataset:
  block_length: 1
  cache: false
  cycle_length: null
  deterministic: null
  drop_remainder: true
  enable_tf_data_service: false
  global_batch_size: 2048
  input_path: gs://**/seq_512_mask_20/wikipedia.tfrecord-00141-of-00500,gs://**/seq_512_mask_20/books.tfrecord-00141-of-00500
  is_training: false
  max_predictions_per_seq: 20
  seq_length: 512
  sharding: true
  shuffle_buffer_size: 100
  tf_data_service_address: null
  tf_data_service_job_name: null
  tfds_as_supervised: false
  tfds_data_dir: ''
  tfds_name: ''
  tfds_skip_decoding_feature: ''
  tfds_split: ''
  use_next_sentence_label: true
  use_position_id: false
  use_v2_feature_names: false
```
official/projects/edgetpu/nlp/experiments/mobilebert_edgetpu_xs.yaml (new file)

```yaml
layer_wise_distillation:
  num_steps: 30000
  warmup_steps: 0
  initial_learning_rate: 1.5e-3
  end_learning_rate: 1.5e-3
  decay_steps: 30000
end_to_end_distillation:
  num_steps: 585000
  warmup_steps: 20000
  initial_learning_rate: 1.5e-3
  end_learning_rate: 1.5e-7
  decay_steps: 585000
  distill_ground_truth_ratio: 0.5
optimizer:
  optimizer:
    lamb:
      beta_1: 0.9
      beta_2: 0.999
      clipnorm: 1.0
      epsilon: 1.0e-06
      exclude_from_layer_adaptation: null
      exclude_from_weight_decay: ['LayerNorm', 'bias', 'norm']
      global_clipnorm: null
      name: LAMB
      weight_decay_rate: 0.01
    type: lamb
orbit_config:
  eval_interval: 1000
  eval_steps: -1
  mode: train
  steps_per_loop: 1000
  total_steps: 825000
runtime:
  distribution_strategy: 'tpu'
student_model:
  cls_heads: [{'activation': 'tanh', 'cls_token_idx': 0, 'dropout_rate': 0.0,
               'inner_dim': 512, 'name': 'next_sentence', 'num_classes': 2}]
  encoder:
    mobilebert:
      attention_probs_dropout_prob: 0.1
      classifier_activation: false
      hidden_activation: relu
      hidden_dropout_prob: 0.0
      hidden_size: 512
      initializer_range: 0.02
      input_mask_dtype: int32
      intermediate_size: 1024
      intra_bottleneck_size: 256
      key_query_shared_bottleneck: true
      max_sequence_length: 512
      normalization_type: no_norm
      num_attention_heads: 4
      num_blocks: 8
      num_feedforward_networks: 4
      type_vocab_size: 2
      use_bottleneck_attention: false
      word_embed_size: 128
      word_vocab_size: 30522
    type: mobilebert
  mlm_activation: relu
  mlm_initializer_range: 0.02
teacher_model:
  cls_heads: []
  encoder:
    mobilebert:
      attention_probs_dropout_prob: 0.1
      classifier_activation: false
      hidden_activation: gelu
      hidden_dropout_prob: 0.1
      hidden_size: 512
      initializer_range: 0.02
      input_mask_dtype: int32
      intermediate_size: 4096
      intra_bottleneck_size: 1024
      key_query_shared_bottleneck: false
      max_sequence_length: 512
      normalization_type: layer_norm
      num_attention_heads: 4
      num_blocks: 24
      num_feedforward_networks: 1
      type_vocab_size: 2
      use_bottleneck_attention: false
      word_embed_size: 128
      word_vocab_size: 30522
    type: mobilebert
  mlm_activation: gelu
  mlm_initializer_range: 0.02
teacher_model_init_checkpoint: gs://**/uncased_L-24_H-1024_B-512_A-4_teacher/tf2_checkpoint/bert_model.ckpt-1
student_model_init_checkpoint: ''
train_datasest:
  block_length: 1
  cache: false
  cycle_length: null
  deterministic: null
  drop_remainder: true
  enable_tf_data_service: false
  global_batch_size: 2048
  input_path: gs://**/seq_512_mask_20/wikipedia.tfrecord*,gs://**/seq_512_mask_20/books.tfrecord*
  is_training: true
  max_predictions_per_seq: 20
  seq_length: 512
  sharding: true
  shuffle_buffer_size: 100
  tf_data_service_address: null
  tf_data_service_job_name: null
  tfds_as_supervised: false
  tfds_data_dir: ''
  tfds_name: ''
  tfds_skip_decoding_feature: ''
  tfds_split: ''
  use_next_sentence_label: true
  use_position_id: false
  use_v2_feature_names: false
eval_dataset:
  block_length: 1
  cache: false
  cycle_length: null
  deterministic: null
  drop_remainder: true
  enable_tf_data_service: false
  global_batch_size: 2048
  input_path: gs://**/seq_512_mask_20/wikipedia.tfrecord-00141-of-00500,gs://**/seq_512_mask_20/books.tfrecord-00141-of-00500
  is_training: false
  max_predictions_per_seq: 20
  seq_length: 512
  sharding: true
  shuffle_buffer_size: 100
  tf_data_service_address: null
  tf_data_service_job_name: null
  tfds_as_supervised: false
  tfds_data_dir: ''
  tfds_name: ''
  tfds_skip_decoding_feature: ''
  tfds_split: ''
  use_next_sentence_label: true
  use_position_id: false
  use_v2_feature_names: false
```
official/projects/edgetpu/nlp/mobilebert_edgetpu_trainer.py (new file; diff collapsed, not shown here)
official/projects/edgetpu/nlp/mobilebert_edgetpu_trainer_test.py (new file; diff collapsed, not shown here)
official/nlp/keras_nlp/layers/on_device_embedding.py → official/projects/edgetpu/nlp/modeling/__init__.py

```diff
@@ -12,8 +12,3 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Keras-based one-hot embedding layer."""
-
-from official.nlp.modeling import layers
-
-OnDeviceEmbedding = layers.OnDeviceEmbedding
```