Commit 8a802023 authored by hepj987

Initialize repository

# Test Preparation
## 1. Dataset Preparation
Download the GLUE dataset: https://pan.baidu.com/s/1tLd8opr08Nw5PzUBh7lXsQ
The MNLI dataset from GLUE is used for classification.
Extraction code: fyvy
Question answering data:
[train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
[dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
[evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
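If needed, these files can be fetched directly from the links above. A minimal sketch using wget (the raw-file URL for the evaluation script is assumed):
```
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json
# assumed raw URL of the evaluate-v1.1.py file linked above
wget https://raw.githubusercontent.com/allenai/bi-att-flow/master/squad/evaluate-v1.1.py
```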
## 2. Environment Setup
```
virtualenv -p python3 --system-site-packages venv_2
source venv_2/bin/activate
```
Install the Python dependencies
```
pip install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
pip install tensorflow-2.7.0-cp36-cp36m-linux_x86_64.whl
pip install horovod-0.21.3-cp36-cp36m-linux_x86_64.whl
pip install apex-0.1-cp36-cp36m-linux_x86_64.whl
```
Set environment variables
```
module rm compiler/rocm/2.9
export ROCM_PATH=/public/home/hepj/job_env/apps/dtk-21.10.1
export HIP_PATH=${ROCM_PATH}/hip
export AMDGPU_TARGETS="gfx900;gfx906"
export PATH=${ROCM_PATH}/bin:${ROCM_PATH}/llvm/bin:${ROCM_PATH}/hcc/bin:${ROCM_PATH}/hip/bin:$PATH
```
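Optionally, verify the installation before launching any jobs. A minimal sanity check, assuming the TensorFlow wheel above is installed in the active virtualenv:
```
# Print the TensorFlow version and the GPUs (DCUs) it can see.
python3 -c "import tensorflow as tf; print(tf.__version__); print(tf.config.list_physical_devices('GPU'))"
```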
## 3. MNLI Classification Test
### 3.1 Single-GPU Test (single precision)
#### 3.1.1 Data Conversion
TF 2.0 reads data differently from TF 1.0, so the dataset must be converted to tf_record format.
```
python ../data/create_finetuning_data.py \
--input_data_dir=/public/home/hepj/data/MNLI \
--vocab_file=/public/home/hepj/model/tf2.7.0_Bert/pre_tf2x/vocab.txt \
--train_data_output_path=/public/home/hepj/model/tf2.7.0_Bert/MNLI/train.tf_record \
--eval_data_output_path=/public/home/hepj/model/tf2.7.0_Bert/MNLI/eval.tf_record \
--meta_data_file_path=/public/home/hepj/model/tf2.7.0_Bert/MNLI/meta_data \
--fine_tuning_task_type=classification \
--max_seq_length=32 \
--classification_task_name=MNLI
```
#### 3.1.2 Model Conversion
TF 2.7.2 and TF 1.15.0 store and read model checkpoints in different formats. The officially published BERT checkpoints are generally TF 1.x models, so they must be converted first.
```
python3 tf2_encoder_checkpoint_converter.py \
--bert_config_file /public/home/hepj/model_source/uncased_L-12_H-768_A-12/bert_config.json \
--checkpoint_to_convert /public/home/hepj/model_source/uncased_L-12_H-768_A-12/bert_model.ckpt \
--converted_checkpoint_path pre_tf2x/
```
#### 3.1.3 bert_class.sh
```
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
export MIOPEN_ENABLE_LOGGING_CMD=1
export ROCBLAS_LAYER=3
module unload compiler/rocm/2.9
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
python3 run_classifier.py \
--mode=train_and_eval \
--input_meta_data_path=/public/home/hepj/model/tf2.7.0_Bert/MNLI/meta_data \
--train_data_path=/public/home/hepj/model/tf2.7.0_Bert/MNLI/train.tf_record \
--eval_data_path=/public/home/hepj/model/tf2.7.0_Bert/MNLI/eval.tf_record \
--bert_config_file=/public/home/hepj/model/tf2.7.0_Bert/pre_tf2x/bert_config.json \
--init_checkpoint=/public/home/hepj/model/tf2.7.0_Bert/pre_tf2x/bert_model.ckpt \
--train_batch_size=320 \
--eval_batch_size=32 \
--steps_per_loop=1000 \
--learning_rate=2e-5 \
--num_train_epochs=3 \
--model_dir=/public/home/hepj/model/tf2/out1 \
--distribution_strategy=mirrored
```
#### 3.1.4 Run
sh bert_class.sh
### 3.2 Four-GPU Test (single precision)
#### 3.2.1. Data Conversion
Same as the single-GPU test (3.1.1).
#### 3.2.2. Model Conversion
Same as the single-GPU test (3.1.2).
#### 3.2.3. bert_class4.sh
```
# --train_batch_size here is the global train_batch_size
# Launching multi-GPU training via mpirun currently has some issues
export HIP_VISIBLE_DEVICES=0,1,2,3
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
module unload compiler/rocm/2.9
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
python3 run_classifier.py \
--mode=train_and_eval \
--input_meta_data_path=/public/home/hepj/model/tf2.7.0_Bert/MNLI/meta_data \
--train_data_path=/public/home/hepj/model/tf2.7.0_Bert/MNLI/train.tf_record \
--eval_data_path=/public/home/hepj/model/tf2.7.0_Bert/MNLI/eval.tf_record \
--bert_config_file=/public/home/hepj/model/tf2.7.0_Bert/pre_tf2x/bert_config.json \
--init_checkpoint=/public/home/hepj/model/tf2.7.0_Bert/pre_tf2x/bert_model.ckpt \
--train_batch_size=1280 \
--eval_batch_size=32 \
--steps_per_loop=10 \
--learning_rate=2e-5 \
--num_train_epochs=3 \
--num_gpus=4 \
--model_dir=/public/home/hepj/outdir/tf2/class4 \
--distribution_strategy=mirrored
```
#### 3.2.4. Run
```
sh bert_class4.sh
```
## 4. SQuAD 1.1 Question Answering Test
### 4.1. Single-GPU Test (single precision)
#### 4.1.1. Data Conversion
```
python3 create_finetuning_data.py \
--squad_data_file=/public/home/hepj/model/model_source/sq1.1/train-v1.1.json \
--vocab_file=/public/home/hepj/model_source/bert-large-uncased-TF2/uncased_L-24_H-1024_A-16/vocab.txt \
--train_data_output_path=/public/home/hepj/model/tf2.7.0_Bert/squad1.1/train_new.tf_record \
--meta_data_file_path=/public/home/hepj/model/tf2.7.0_Bert/squad1.1/meta_data_new \
--eval_data_output_path=/public/home/hepj/model/tf2.7.0_Bert/squad1.1/eval_new.tf_record \
--fine_tuning_task_type=squad \
--do_lower_case=False \
--max_seq_length=384
```
#### 4.1.2. Model Conversion
```
python3 tf2_encoder_checkpoint_converter.py \
--bert_config_file /public/home/hepj/model/model_source/uncased_L-24_H-1024_A-16/bert_config.json \
--checkpoint_to_convert /public/home/hepj/model/model_source/uncased_L-24_H-1024_A-16/bert_model.ckpt \
--converted_checkpoint_path /public/home/hepj/model_source/bert-large-uncased-TF2/
```
#### 4.1.3. bert_squad.sh
```
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
export MIOPEN_ENABLE_LOGGING_CMD=1
export ROCBLAS_LAYER=3
module unload compiler/rocm/2.9
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
python3 run_squad_xuan.py \
--mode=train_and_eval \
--vocab_file=/public/home/hepj/model/model_source/uncased_L-24_H-1024_A-16/vocab.txt \
--bert_config_file=/public/home/hepj/model/model_source/uncased_L-24_H-1024_A-16/bert_config.json \
--input_meta_data_path=/public/home/hepj/model/tf2.7.0_Bert/squad1.1/meta_data \
--train_data_path=/public/home/hepj/model/tf2.7.0_Bert/squad1.1/train.tf_record \
--predict_file=/public/home/hepj/model/model_source/sq1.1/dev-v1.1.json \
--init_checkpoint=/public/home/hepj/model_source/bert-large-uncased-TF2/bert_model.ckpt \
--train_batch_size=4 \
--predict_batch_size=4 \
--learning_rate=2e-5 \
--log_steps=1 \
--num_gpus=1 \
--distribution_strategy=mirrored \
--model_dir=/public/home/hepj/model/tf2/squad1 \
--run_eagerly=False
```
#### 4.1.4. Run
```
sh bert_squad.sh
```
### 4.2. Four-GPU Test (single precision)
#### 4.2.1. Data Conversion
Same as the single-GPU test (4.1.1).
#### 4.2.2. Model Conversion
Same as the single-GPU test (4.1.2).
#### 4.2.3. bert_squad4.sh
```
# --train_batch_size here is the global train_batch_size
# Launching multi-GPU training via mpirun currently has some issues
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
module unload compiler/rocm/2.9
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
export HIP_VISIBLE_DEVICES=0,1,2,3
python3 run_squad_xuan.py \
--mode=train_and_eval \
--vocab_file=/public/home/hepj/model/model_source/uncased_L-24_H-1024_A-16/vocab.txt \
--bert_config_file=/public/home/hepj/model/model_source/uncased_L-24_H-1024_A-16/bert_config.json \
--input_meta_data_path=/public/home/hepj/model/tf2.7.0_Bert/squad1.1/meta_data \
--train_data_path=/public/home/hepj/model/tf2.7.0_Bert/squad1.1/train.tf_record \
--predict_file=/public/home/hepj/model/model_source/sq1.1/dev-v1.1.json \
--init_checkpoint=/public/home/hepj/model_source/bert-large-uncased-TF2/bert_model.ckpt \
--train_batch_size=16 \
--predict_batch_size=4 \
--learning_rate=2e-5 \
--log_steps=1 \
--num_gpus=4 \
--distribution_strategy=mirrored \
--model_dir=/public/home/hepj/outdir/tf2/squad4 \
--run_eagerly=False
```
#### 4.2.4. Run
```
sh bert_squad4.sh
```
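After training, the dev-set predictions written to `--model_dir` (`predictions.json`, see the SQuAD section of the README below) can be scored with the `evaluate-v1.1.py` script downloaded in Section 1. A sketch, assuming the paths from 4.2.3:
```
python3 evaluate-v1.1.py \
    /public/home/hepj/model/model_source/sq1.1/dev-v1.1.json \
    /public/home/hepj/outdir/tf2/squad4/predictions.json
```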
# BERT (Bidirectional Encoder Representations from Transformers)
The academic paper which describes BERT in detail and provides full results on a
number of tasks can be found here: https://arxiv.org/abs/1810.04805.
This repository contains TensorFlow 2.x implementation for BERT.
## Contents
* [Contents](#contents)
* [Pre-trained Models](#pre-trained-models)
* [Restoring from Checkpoints](#restoring-from-checkpoints)
* [Set Up](#set-up)
* [Process Datasets](#process-datasets)
* [Fine-tuning with BERT](#fine-tuning-with-bert)
* [Cloud GPUs and TPUs](#cloud-gpus-and-tpus)
* [Sentence and Sentence-pair Classification Tasks](#sentence-and-sentence-pair-classification-tasks)
* [SQuAD 1.1](#squad-1.1)
## Pre-trained Models
We have released both checkpoints and tf.hub modules as the pretrained models for
fine-tuning. They are TF 2.x compatible and are converted from the checkpoints
released in the TF 1.x official BERT repository
[google-research/bert](https://github.com/google-research/bert)
in order to stay consistent with the BERT paper.
### Access to Pretrained Checkpoints
Pretrained checkpoints can be found in the following links:
**Note: We have switched BERT implementation
to use Keras functional-style networks in [nlp/modeling](../modeling).
The new checkpoints are:**
* **[`BERT-Large, Uncased (Whole Word Masking)`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/wwm_uncased_L-24_H-1024_A-16.tar.gz)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Large, Cased (Whole Word Masking)`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/wwm_cased_L-24_H-1024_A-16.tar.gz)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Base, Uncased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/uncased_L-12_H-768_A-12.tar.gz)**:
12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Large, Uncased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16.tar.gz)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Base, Cased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/cased_L-12_H-768_A-12.tar.gz)**:
12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Large, Cased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/cased_L-24_H-1024_A-16.tar.gz)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
We recommend hosting checkpoints on Google Cloud Storage buckets when you use
Cloud GPU/TPU.
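For example, one of the checkpoints above can be unpacked and copied into your own bucket with `gsutil` (the bucket name and the unpacked directory name are placeholders here):
```shell
curl -O https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/uncased_L-12_H-768_A-12.tar.gz
tar -xzf uncased_L-12_H-768_A-12.tar.gz
# assumes the archive unpacks to a directory of the same name
gsutil cp -r uncased_L-12_H-768_A-12 gs://your-bucket/bert/uncased_L-12_H-768_A-12
```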
### Restoring from Checkpoints
`tf.train.Checkpoint` is used to manage model checkpoints in TF 2. To restore
weights from provided pre-trained checkpoints, you can use the following code:
```python
init_checkpoint='the pretrained model checkpoint path.'
model=tf.keras.Model() # Bert pre-trained model as feature extractor.
checkpoint = tf.train.Checkpoint(model=model)
checkpoint.restore(init_checkpoint)
```
Checkpoints featuring native serialized Keras models
(i.e. model.load()/load_weights()) will be available soon.
### Access to Pretrained hub modules
Pretrained tf.hub modules in TF 2.x SavedModel format can be found in the
following links:
* **[`BERT-Large, Uncased (Whole Word Masking)`](https://tfhub.dev/tensorflow/bert_en_wwm_uncased_L-24_H-1024_A-16/1)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Large, Cased (Whole Word Masking)`](https://tfhub.dev/tensorflow/bert_en_wwm_cased_L-24_H-1024_A-16/1)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Base, Uncased`](https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1)**:
12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Large, Uncased`](https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Base, Cased`](https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/1)**:
12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Large, Cased`](https://tfhub.dev/tensorflow/bert_en_cased_L-24_H-1024_A-16/1)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Base, Multilingual Cased`](https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/1)**:
104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Base, Chinese`](https://tfhub.dev/tensorflow/bert_zh_L-12_H-768_A-12/1)**:
Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads,
110M parameters
## Set Up
```shell
export PYTHONPATH="$PYTHONPATH:/path/to/models"
```
Install `tf-nightly` to get the latest updates:
```shell
pip install tf-nightly-gpu
```
With a TPU, GPU support is not necessary. First, you need to create a `tf-nightly`
TPU with the [ctpu tool](https://github.com/tensorflow/tpu/tree/master/tools/ctpu):
```shell
ctpu up -name <instance name> --tf-version="nightly"
```
Second, you need to install TF 2 `tf-nightly` on your VM:
```shell
pip install tf-nightly
```
## Process Datasets
### Pre-training
There is no change in how pre-training data is generated. Please use the script
[`../data/create_pretraining_data.py`](../data/create_pretraining_data.py),
which is essentially branched from the [BERT research repo](https://github.com/google-research/bert)
and adapted to TF2 symbols and Python 3, to produce the processed pre-training
data.
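A sketch of a typical invocation is shown below; the flag names follow the TF 1.x BERT repository that the script is branched from, and the input/output paths are placeholders, so check the script's `--help` output for the authoritative flag list:
```shell
python ../data/create_pretraining_data.py \
  --input_file=./your_corpus.txt \
  --output_file=${OUTPUT_DIR}/pretrain.tf_record \
  --vocab_file=${BERT_DIR}/vocab.txt \
  --do_lower_case=True \
  --max_seq_length=128 \
  --max_predictions_per_seq=20 \
  --masked_lm_prob=0.15 \
  --dupe_factor=5
```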
### Fine-tuning
To prepare the fine-tuning data for final model training, use the
[`../data/create_finetuning_data.py`](../data/create_finetuning_data.py) script.
The resulting datasets in `tf_record` format and the training meta data should
later be passed to the training or evaluation scripts. The task-specific
arguments are described in the following sections:
* GLUE
Users can download the
[GLUE data](https://gluebenchmark.com/tasks) by running
[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
and unpack it to some directory `$GLUE_DIR`.
Also, users can download a [Pretrained Checkpoint](#access-to-pretrained-checkpoints) and place it in some directory `$BERT_DIR` instead of using checkpoints on Google Cloud Storage.
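For instance, using the `download_glue_data.py` helper included in this repository (adjust the script path to your checkout):
```shell
python download_glue_data.py --data_dir ~/glue --tasks MNLI
```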
```shell
export GLUE_DIR=~/glue
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export TASK_NAME=MNLI
export OUTPUT_DIR=gs://some_bucket/datasets
python ../data/create_finetuning_data.py \
--input_data_dir=${GLUE_DIR}/${TASK_NAME}/ \
--vocab_file=${BERT_DIR}/vocab.txt \
--train_data_output_path=${OUTPUT_DIR}/${TASK_NAME}_train.tf_record \
--eval_data_output_path=${OUTPUT_DIR}/${TASK_NAME}_eval.tf_record \
--meta_data_file_path=${OUTPUT_DIR}/${TASK_NAME}_meta_data \
--fine_tuning_task_type=classification --max_seq_length=128 \
--classification_task_name=${TASK_NAME}
```
* SQUAD
The [SQuAD website](https://rajpurkar.github.io/SQuAD-explorer/) contains
detailed information about the SQuAD datasets and evaluation.
The necessary files can be found here:
* [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
* [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
* [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
* [train-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json)
* [dev-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json)
* [evaluate-v2.0.py](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)
```shell
export SQUAD_DIR=~/squad
export SQUAD_VERSION=v1.1
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export OUTPUT_DIR=gs://some_bucket/datasets
python ../data/create_finetuning_data.py \
--squad_data_file=${SQUAD_DIR}/train-${SQUAD_VERSION}.json \
--vocab_file=${BERT_DIR}/vocab.txt \
--train_data_output_path=${OUTPUT_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
--meta_data_file_path=${OUTPUT_DIR}/squad_${SQUAD_VERSION}_meta_data \
--fine_tuning_task_type=squad --max_seq_length=384
```
## Fine-tuning with BERT
### Cloud GPUs and TPUs
* Cloud Storage
The unzipped pre-trained model files can also be found in the Google Cloud
Storage folder `gs://cloud-tpu-checkpoints/bert/keras_bert`. For example:
```shell
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export MODEL_DIR=gs://some_bucket/my_output_dir
```
Currently, users are able to access `tf-nightly` TPUs, and the following TPU
script should run with `tf-nightly`.
* GPU -> TPU
Just add the following flags to `run_classifier.py` or `run_squad.py`:
```shell
--distribution_strategy=tpu
--tpu=grpc://${TPU_IP_ADDRESS}:8470
```
### Sentence and Sentence-pair Classification Tasks
This example code fine-tunes `BERT-Large` on the Microsoft Research Paraphrase
Corpus (MRPC), which contains only 3,600 examples and can be fine-tuned in a
few minutes on most GPUs.
We use the `BERT-Large` (uncased_L-24_H-1024_A-16) as an example throughout the
workflow.
For GPUs with 16GB of memory or less, you may try `BERT-Base`
(uncased_L-12_H-768_A-12).
```shell
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export MODEL_DIR=gs://some_bucket/my_output_dir
export GLUE_DIR=gs://some_bucket/datasets
export TASK=MRPC
python run_classifier.py \
--mode='train_and_eval' \
--input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
--train_data_path=${GLUE_DIR}/${TASK}_train.tf_record \
--eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
--bert_config_file=${BERT_DIR}/bert_config.json \
--init_checkpoint=${BERT_DIR}/bert_model.ckpt \
--train_batch_size=4 \
--eval_batch_size=4 \
--steps_per_loop=1 \
--learning_rate=2e-5 \
--num_train_epochs=3 \
--model_dir=${MODEL_DIR} \
--distribution_strategy=mirrored
```
Alternatively, instead of specifying `init_checkpoint`, you can specify
`hub_module_url` to employ a pretrained BERT hub module, e.g.,
` --hub_module_url=https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1`.
After training a model, to get predictions from the classifier, you can set
`--mode=predict` and provide the test set tfrecords to `--eval_data_path`.
Output will be written to a file called `test_results.tsv` in the output folder.
Each line contains the output for one sample; the columns are the class
probabilities.
```shell
python run_classifier.py \
--mode='predict' \
--input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
--eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
--bert_config_file=${BERT_DIR}/bert_config.json \
--eval_batch_size=4 \
--model_dir=${MODEL_DIR} \
--distribution_strategy=mirrored
```
To use a TPU, you only need to switch the distribution strategy type to `tpu`,
provide the TPU information, and use remote storage for model checkpoints.
```shell
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export TPU_IP_ADDRESS='???'
export MODEL_DIR=gs://some_bucket/my_output_dir
export GLUE_DIR=gs://some_bucket/datasets
export TASK=MRPC
python run_classifier.py \
--mode='train_and_eval' \
--input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
--train_data_path=${GLUE_DIR}/${TASK}_train.tf_record \
--eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
--bert_config_file=${BERT_DIR}/bert_config.json \
--init_checkpoint=${BERT_DIR}/bert_model.ckpt \
--train_batch_size=32 \
--eval_batch_size=32 \
--steps_per_loop=1000 \
--learning_rate=2e-5 \
--num_train_epochs=3 \
--model_dir=${MODEL_DIR} \
--distribution_strategy=tpu \
--tpu=grpc://${TPU_IP_ADDRESS}:8470
```
Note that we specify `steps_per_loop=1000` for TPU, because running a loop of
training steps inside a `tf.function` can significantly increase TPU utilization;
callbacks will not be called inside the loop.
### SQuAD 1.1
The Stanford Question Answering Dataset (SQuAD) is a popular question answering
benchmark dataset. See more on the [SQuAD website](https://rajpurkar.github.io/SQuAD-explorer/).
We use the `BERT-Large` (uncased_L-24_H-1024_A-16) as an example throughout the
workflow.
For GPUs with 16GB of memory or less, you may try `BERT-Base`
(uncased_L-12_H-768_A-12).
```shell
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export SQUAD_DIR=gs://some_bucket/datasets
export MODEL_DIR=gs://some_bucket/my_output_dir
export SQUAD_VERSION=v1.1
python run_squad.py \
--input_meta_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_meta_data \
--train_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
--predict_file=${SQUAD_DIR}/dev-v1.1.json \
--vocab_file=${BERT_DIR}/vocab.txt \
--bert_config_file=${BERT_DIR}/bert_config.json \
--init_checkpoint=${BERT_DIR}/bert_model.ckpt \
--train_batch_size=4 \
--predict_batch_size=4 \
--learning_rate=8e-5 \
--num_train_epochs=2 \
--model_dir=${MODEL_DIR} \
--distribution_strategy=mirrored
```
Similarly, you can replace the `init_checkpoint` FLAG with `hub_module_url` to
specify a hub module path.
`run_squad.py` writes the predictions for `--predict_file` by default. If you set
`--mode=predict` and provide the SQuAD test data, the script will generate
the prediction json file.
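A minimal predict-only sketch, reusing the flags from the training example above (paths and batch size are illustrative):
```shell
python run_squad.py \
  --mode=predict \
  --input_meta_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_meta_data \
  --predict_file=${SQUAD_DIR}/dev-v1.1.json \
  --vocab_file=${BERT_DIR}/vocab.txt \
  --bert_config_file=${BERT_DIR}/bert_config.json \
  --predict_batch_size=4 \
  --model_dir=${MODEL_DIR} \
  --distribution_strategy=mirrored
```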
To use a TPU, you need to switch the distribution strategy type to `tpu` and
provide the TPU information.
```shell
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export TPU_IP_ADDRESS='???'
export MODEL_DIR=gs://some_bucket/my_output_dir
export SQUAD_DIR=gs://some_bucket/datasets
export SQUAD_VERSION=v1.1
python run_squad.py \
--input_meta_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_meta_data \
--train_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
--predict_file=${SQUAD_DIR}/dev-v1.1.json \
--vocab_file=${BERT_DIR}/vocab.txt \
--bert_config_file=${BERT_DIR}/bert_config.json \
--init_checkpoint=${BERT_DIR}/bert_model.ckpt \
--train_batch_size=32 \
--learning_rate=8e-5 \
--num_train_epochs=2 \
--model_dir=${MODEL_DIR} \
--distribution_strategy=tpu \
--tpu=grpc://${TPU_IP_ADDRESS}:8470
```
The dev set predictions will be saved into a file called `predictions.json` in
the `model_dir`:
```shell
python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ./squad/predictions.json
```
# BERT FineTuning with Cloud TPU: Sentence and Sentence-Pair Classification Tasks (TF 2.1)
This tutorial shows you how to train the Bidirectional Encoder Representations from Transformers (BERT) model on Cloud TPU.
## Set up Cloud Storage and Compute Engine VM
1. [Open a cloud shell window](https://console.cloud.google.com/?cloudshell=true&_ga=2.11844148.-1612541229.1552429951)
2. Create a variable for the project's name:
```
export PROJECT_NAME=your-project-name
```
3. Configure `gcloud` command-line tool to use the project where you want to create Cloud TPU.
```
gcloud config set project ${PROJECT_NAME}
```
4. Create a Cloud Storage bucket using the following command:
```
gsutil mb -p ${PROJECT_NAME} -c standard -l europe-west4 -b on gs://your-bucket-name
```
This Cloud Storage bucket stores the data you use to train your model and the training results.
5. Launch a Compute Engine VM and Cloud TPU using the `ctpu up` command.
```
ctpu up --tpu-size=v3-8 \
--machine-type=n1-standard-8 \
--zone=europe-west4-a \
--tf-version=2.1 [optional flags: --project, --name]
```
6. The configuration you specified appears. Enter `y` to approve or `n` to cancel.
7. When the `ctpu up` command has finished executing, verify that your shell prompt has changed from `username@project` to `username@tpuname`. This change shows that you are now logged into your Compute Engine VM.
```
gcloud compute ssh vm-name --zone=europe-west4-a
(vm)$ export TPU_NAME=vm-name
```
As you continue these instructions, run each command that begins with `(vm)$` in your VM session window.
## Prepare the Dataset
1. From your Compute Engine virtual machine (VM), install requirements.txt.
```
(vm)$ cd /usr/share/models
(vm)$ sudo pip3 install -r official/requirements.txt
```
2. Optional: download `download_glue_data.py`.
This tutorial uses the General Language Understanding Evaluation (GLUE) benchmark to evaluate and analyze the performance of the model. The GLUE data is provided for this tutorial at `gs://cloud-tpu-checkpoints/bert/classification`.
## Define parameter values
Next, define several parameter values that are required when you train and evaluate your model:
```
(vm)$ export PYTHONPATH="$PYTHONPATH:/usr/share/tpu/models"
(vm)$ export STORAGE_BUCKET=gs://your-bucket-name
(vm)$ export BERT_BASE_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
(vm)$ export MODEL_DIR=${STORAGE_BUCKET}/bert-output
(vm)$ export GLUE_DIR=gs://cloud-tpu-checkpoints/bert/classification
(vm)$ export TASK=mnli
```
## Train the model
From your Compute Engine VM, run the following command.
```
(vm)$ python3 official/nlp/bert/run_classifier.py \
--mode='train_and_eval' \
--input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
--train_data_path=${GLUE_DIR}/${TASK}_train.tf_record \
--eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
--bert_config_file=$BERT_BASE_DIR/bert_config.json \
--init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
--train_batch_size=32 \
--eval_batch_size=32 \
--learning_rate=2e-5 \
--num_train_epochs=3 \
--model_dir=${MODEL_DIR} \
--distribution_strategy=tpu \
--tpu=${TPU_NAME}
```
## Verify your results
The training takes approximately 1 hour on a v3-8 TPU. When the script completes, you should see results similar to the following:
```
Training Summary:
{'train_loss': 0.28142181038856506,
'last_train_metrics': 0.9467429518699646,
'eval_metrics': 0.8599063158035278,
'total_training_steps': 36813}
```
## Clean up
To avoid incurring charges to your GCP account for the resources used in this topic:
1. Disconnect from the Compute Engine VM:
```
(vm)$ exit
```
2. In your Cloud Shell, run `ctpu delete` with the `--zone` flag you used when you set up the Cloud TPU to delete your Compute Engine VM and your Cloud TPU:
```
$ ctpu delete --zone=your-zone
```
3. Run `ctpu status` specifying your zone to make sure you have no instances allocated to avoid unnecessary charges for TPU usage. The deletion might take several minutes. A response like the one below indicates there are no more allocated instances:
```
$ ctpu status --zone=your-zone
```
4. Run `gsutil` as shown, replacing `your-bucket` with the name of the Cloud Storage bucket you created for this tutorial:
```
$ gsutil rm -r gs://your-bucket
```
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""BERT models that are compatible with TF 2.0."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gin
import tensorflow as tf
import tensorflow_hub as hub
from official.modeling import tf_utils
from official.nlp.albert import configs as albert_configs
from official.nlp.bert import configs
from official.nlp.modeling import models
from official.nlp.modeling import networks
class BertPretrainLossAndMetricLayer(tf.keras.layers.Layer):
"""Returns layer that computes custom loss and metrics for pretraining."""
def __init__(self, vocab_size, **kwargs):
super(BertPretrainLossAndMetricLayer, self).__init__(**kwargs)
self._vocab_size = vocab_size
self.config = {
'vocab_size': vocab_size,
}
def _add_metrics(self, lm_output, lm_labels, lm_label_weights,
lm_example_loss, sentence_output, sentence_labels,
next_sentence_loss):
"""Adds metrics."""
masked_lm_accuracy = tf.keras.metrics.sparse_categorical_accuracy(
lm_labels, lm_output)
numerator = tf.reduce_sum(masked_lm_accuracy * lm_label_weights)
denominator = tf.reduce_sum(lm_label_weights) + 1e-5
masked_lm_accuracy = numerator / denominator
self.add_metric(
masked_lm_accuracy, name='masked_lm_accuracy', aggregation='mean')
self.add_metric(lm_example_loss, name='lm_example_loss', aggregation='mean')
if sentence_labels is not None:
next_sentence_accuracy = tf.keras.metrics.sparse_categorical_accuracy(
sentence_labels, sentence_output)
self.add_metric(
next_sentence_accuracy,
name='next_sentence_accuracy',
aggregation='mean')
if next_sentence_loss is not None:
self.add_metric(
next_sentence_loss, name='next_sentence_loss', aggregation='mean')
def call(self,
lm_output_logits,
sentence_output_logits,
lm_label_ids,
lm_label_weights,
sentence_labels=None):
"""Implements call() for the layer."""
lm_label_weights = tf.cast(lm_label_weights, tf.float32)
lm_output_logits = tf.cast(lm_output_logits, tf.float32)
lm_prediction_losses = tf.keras.losses.sparse_categorical_crossentropy(
lm_label_ids, lm_output_logits, from_logits=True)
lm_numerator_loss = tf.reduce_sum(lm_prediction_losses * lm_label_weights)
lm_denominator_loss = tf.reduce_sum(lm_label_weights)
mask_label_loss = tf.math.divide_no_nan(lm_numerator_loss,
lm_denominator_loss)
if sentence_labels is not None:
sentence_output_logits = tf.cast(sentence_output_logits, tf.float32)
sentence_loss = tf.keras.losses.sparse_categorical_crossentropy(
sentence_labels, sentence_output_logits, from_logits=True)
sentence_loss = tf.reduce_mean(sentence_loss)
loss = mask_label_loss + sentence_loss
else:
sentence_loss = None
loss = mask_label_loss
batch_shape = tf.slice(tf.shape(lm_label_ids), [0], [1])
# TODO(hongkuny): Avoids the hack and switches add_loss.
final_loss = tf.fill(batch_shape, loss)
self._add_metrics(lm_output_logits, lm_label_ids, lm_label_weights,
mask_label_loss, sentence_output_logits, sentence_labels,
sentence_loss)
return final_loss
@gin.configurable
def get_transformer_encoder(bert_config,
sequence_length,
transformer_encoder_cls=None,
output_range=None):
"""Gets a 'TransformerEncoder' object.
Args:
bert_config: A 'modeling.BertConfig' or 'modeling.AlbertConfig' object.
sequence_length: Maximum sequence length of the training data.
transformer_encoder_cls: A EncoderScaffold class. If it is None, uses the
default BERT encoder implementation.
output_range: the sequence output range, [0, output_range). Default setting
is to return the entire sequence output.
Returns:
A networks.TransformerEncoder object.
"""
if transformer_encoder_cls is not None:
# TODO(hongkuny): evaluate if it is better to put cfg definition in gin.
embedding_cfg = dict(
vocab_size=bert_config.vocab_size,
type_vocab_size=bert_config.type_vocab_size,
hidden_size=bert_config.hidden_size,
seq_length=sequence_length,
max_seq_length=bert_config.max_position_embeddings,
initializer=tf.keras.initializers.TruncatedNormal(
stddev=bert_config.initializer_range),
dropout_rate=bert_config.hidden_dropout_prob,
)
hidden_cfg = dict(
num_attention_heads=bert_config.num_attention_heads,
intermediate_size=bert_config.intermediate_size,
intermediate_activation=tf_utils.get_activation(bert_config.hidden_act),
dropout_rate=bert_config.hidden_dropout_prob,
attention_dropout_rate=bert_config.attention_probs_dropout_prob,
kernel_initializer=tf.keras.initializers.TruncatedNormal(
stddev=bert_config.initializer_range),
)
kwargs = dict(
embedding_cfg=embedding_cfg,
hidden_cfg=hidden_cfg,
num_hidden_instances=bert_config.num_hidden_layers,
pooled_output_dim=bert_config.hidden_size,
pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(
stddev=bert_config.initializer_range))
# Relies on gin configuration to define the Transformer encoder arguments.
return transformer_encoder_cls(**kwargs)
kwargs = dict(
vocab_size=bert_config.vocab_size,
hidden_size=bert_config.hidden_size,
num_layers=bert_config.num_hidden_layers,
num_attention_heads=bert_config.num_attention_heads,
intermediate_size=bert_config.intermediate_size,
activation=tf_utils.get_activation(bert_config.hidden_act),
dropout_rate=bert_config.hidden_dropout_prob,
attention_dropout_rate=bert_config.attention_probs_dropout_prob,
sequence_length=sequence_length,
max_sequence_length=bert_config.max_position_embeddings,
type_vocab_size=bert_config.type_vocab_size,
embedding_width=bert_config.embedding_size,
initializer=tf.keras.initializers.TruncatedNormal(
stddev=bert_config.initializer_range))
if isinstance(bert_config, albert_configs.AlbertConfig):
return networks.AlbertTransformerEncoder(**kwargs)
else:
assert isinstance(bert_config, configs.BertConfig)
kwargs['output_range'] = output_range
return networks.TransformerEncoder(**kwargs)
def pretrain_model(bert_config,
seq_length,
max_predictions_per_seq,
initializer=None,
use_next_sentence_label=True,
return_core_pretrainer_model=False):
"""Returns model to be used for pre-training.
Args:
bert_config: Configuration that defines the core BERT model.
seq_length: Maximum sequence length of the training data.
max_predictions_per_seq: Maximum number of tokens in sequence to mask out
and use for pretraining.
initializer: Initializer for weights in BertPretrainer.
use_next_sentence_label: Whether to use the next sentence label.
return_core_pretrainer_model: Whether to also return the `BertPretrainer`
object.
Returns:
A Tuple of (1) Pretraining model, (2) core BERT submodel from which to
save weights after pretraining, and (3) optional core `BertPretrainer`
object if argument `return_core_pretrainer_model` is True.
"""
input_word_ids = tf.keras.layers.Input(
shape=(seq_length,), name='input_word_ids', dtype=tf.int32)
input_mask = tf.keras.layers.Input(
shape=(seq_length,), name='input_mask', dtype=tf.int32)
input_type_ids = tf.keras.layers.Input(
shape=(seq_length,), name='input_type_ids', dtype=tf.int32)
masked_lm_positions = tf.keras.layers.Input(
shape=(max_predictions_per_seq,),
name='masked_lm_positions',
dtype=tf.int32)
masked_lm_ids = tf.keras.layers.Input(
shape=(max_predictions_per_seq,), name='masked_lm_ids', dtype=tf.int32)
masked_lm_weights = tf.keras.layers.Input(
shape=(max_predictions_per_seq,),
name='masked_lm_weights',
dtype=tf.int32)
if use_next_sentence_label:
next_sentence_labels = tf.keras.layers.Input(
shape=(1,), name='next_sentence_labels', dtype=tf.int32)
else:
next_sentence_labels = None
transformer_encoder = get_transformer_encoder(bert_config, seq_length)
if initializer is None:
initializer = tf.keras.initializers.TruncatedNormal(
stddev=bert_config.initializer_range)
pretrainer_model = models.BertPretrainer(
network=transformer_encoder,
embedding_table=transformer_encoder.get_embedding_table(),
num_classes=2, # The next sentence prediction label has two classes.
activation=tf_utils.get_activation(bert_config.hidden_act),
num_token_predictions=max_predictions_per_seq,
initializer=initializer,
output='logits')
outputs = pretrainer_model(
[input_word_ids, input_mask, input_type_ids, masked_lm_positions])
lm_output = outputs['masked_lm']
sentence_output = outputs['classification']
pretrain_loss_layer = BertPretrainLossAndMetricLayer(
vocab_size=bert_config.vocab_size)
output_loss = pretrain_loss_layer(lm_output, sentence_output, masked_lm_ids,
masked_lm_weights, next_sentence_labels)
inputs = {
'input_word_ids': input_word_ids,
'input_mask': input_mask,
'input_type_ids': input_type_ids,
'masked_lm_positions': masked_lm_positions,
'masked_lm_ids': masked_lm_ids,
'masked_lm_weights': masked_lm_weights,
}
if use_next_sentence_label:
inputs['next_sentence_labels'] = next_sentence_labels
keras_model = tf.keras.Model(inputs=inputs, outputs=output_loss)
if return_core_pretrainer_model:
return keras_model, transformer_encoder, pretrainer_model
else:
return keras_model, transformer_encoder
def squad_model(bert_config,
max_seq_length,
initializer=None,
hub_module_url=None,
hub_module_trainable=True):
"""Returns BERT Squad model along with core BERT model to import weights.
Args:
bert_config: BertConfig, the config defines the core Bert model.
max_seq_length: integer, the maximum input sequence length.
initializer: Initializer for the final dense layer in the span labeler.
Defaulted to TruncatedNormal initializer.
hub_module_url: TF-Hub path/url to Bert module.
hub_module_trainable: True to finetune layers in the hub module.
Returns:
A tuple of (1) keras model that outputs start logits and end logits and
(2) the core BERT transformer encoder.
"""
if initializer is None:
initializer = tf.keras.initializers.TruncatedNormal(
stddev=bert_config.initializer_range)
if not hub_module_url:
bert_encoder = get_transformer_encoder(bert_config, max_seq_length)
return models.BertSpanLabeler(
network=bert_encoder, initializer=initializer), bert_encoder
input_word_ids = tf.keras.layers.Input(
shape=(max_seq_length,), dtype=tf.int32, name='input_word_ids')
input_mask = tf.keras.layers.Input(
shape=(max_seq_length,), dtype=tf.int32, name='input_mask')
input_type_ids = tf.keras.layers.Input(
shape=(max_seq_length,), dtype=tf.int32, name='input_type_ids')
core_model = hub.KerasLayer(hub_module_url, trainable=hub_module_trainable)
pooled_output, sequence_output = core_model(
[input_word_ids, input_mask, input_type_ids])
bert_encoder = tf.keras.Model(
inputs={
'input_word_ids': input_word_ids,
'input_mask': input_mask,
'input_type_ids': input_type_ids,
},
outputs=[sequence_output, pooled_output],
name='core_model')
return models.BertSpanLabeler(
network=bert_encoder, initializer=initializer), bert_encoder
def classifier_model(bert_config,
num_labels,
max_seq_length=None,
final_layer_initializer=None,
hub_module_url=None,
hub_module_trainable=True):
"""BERT classifier model in functional API style.
Construct a Keras model for predicting `num_labels` outputs from an input with
maximum sequence length `max_seq_length`.
Args:
bert_config: BertConfig or AlbertConfig, the config defines the core BERT or
ALBERT model.
num_labels: integer, the number of classes.
max_seq_length: integer, the maximum input sequence length.
final_layer_initializer: Initializer for final dense layer. Defaulted
TruncatedNormal initializer.
hub_module_url: TF-Hub path/url to Bert module.
hub_module_trainable: True to finetune layers in the hub module.
Returns:
Combined prediction model (words, mask, type) -> (one-hot labels)
BERT sub-model (words, mask, type) -> (bert_outputs)
"""
if final_layer_initializer is not None:
initializer = final_layer_initializer
else:
initializer = tf.keras.initializers.TruncatedNormal(
stddev=bert_config.initializer_range)
if not hub_module_url:
bert_encoder = get_transformer_encoder(
bert_config, max_seq_length, output_range=1)
return models.BertClassifier(
bert_encoder,
num_classes=num_labels,
dropout_rate=bert_config.hidden_dropout_prob,
initializer=initializer), bert_encoder
input_word_ids = tf.keras.layers.Input(
shape=(max_seq_length,), dtype=tf.int32, name='input_word_ids')
input_mask = tf.keras.layers.Input(
shape=(max_seq_length,), dtype=tf.int32, name='input_mask')
input_type_ids = tf.keras.layers.Input(
shape=(max_seq_length,), dtype=tf.int32, name='input_type_ids')
bert_model = hub.KerasLayer(hub_module_url, trainable=hub_module_trainable)
pooled_output, _ = bert_model([input_word_ids, input_mask, input_type_ids])
output = tf.keras.layers.Dropout(rate=bert_config.hidden_dropout_prob)(
pooled_output)
output = tf.keras.layers.Dense(
num_labels, kernel_initializer=initializer, name='output')(
output)
return tf.keras.Model(
inputs={
'input_word_ids': input_word_ids,
'input_mask': input_mask,
'input_type_ids': input_type_ids
},
outputs=output), bert_model
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from official.nlp.bert import bert_models
from official.nlp.bert import configs as bert_configs
from official.nlp.modeling import networks
class BertModelsTest(tf.test.TestCase):
def setUp(self):
super(BertModelsTest, self).setUp()
self._bert_test_config = bert_configs.BertConfig(
attention_probs_dropout_prob=0.0,
hidden_act='gelu',
hidden_dropout_prob=0.0,
hidden_size=16,
initializer_range=0.02,
intermediate_size=32,
max_position_embeddings=128,
num_attention_heads=2,
num_hidden_layers=2,
type_vocab_size=2,
vocab_size=30522)
def test_pretrain_model(self):
model, encoder = bert_models.pretrain_model(
self._bert_test_config,
seq_length=5,
max_predictions_per_seq=2,
initializer=None,
use_next_sentence_label=True)
self.assertIsInstance(model, tf.keras.Model)
self.assertIsInstance(encoder, networks.TransformerEncoder)
# model has one scalar output: loss value.
self.assertEqual(model.output.shape.as_list(), [None,])
# Expect two output from encoder: sequence and classification output.
self.assertIsInstance(encoder.output, list)
self.assertLen(encoder.output, 2)
# shape should be [batch size, seq_length, hidden_size]
self.assertEqual(encoder.output[0].shape.as_list(), [None, 5, 16])
# shape should be [batch size, hidden_size]
self.assertEqual(encoder.output[1].shape.as_list(), [None, 16])
def test_squad_model(self):
model, core_model = bert_models.squad_model(
self._bert_test_config,
max_seq_length=5,
initializer=None,
hub_module_url=None,
hub_module_trainable=None)
self.assertIsInstance(model, tf.keras.Model)
self.assertIsInstance(core_model, tf.keras.Model)
# Expect two output from model: start positions and end positions
self.assertIsInstance(model.output, list)
self.assertLen(model.output, 2)
# shape should be [batch size, seq_length]
self.assertEqual(model.output[0].shape.as_list(), [None, 5])
# shape should be [batch size, seq_length]
self.assertEqual(model.output[1].shape.as_list(), [None, 5])
# Expect two output from core_model: sequence and classification output.
self.assertIsInstance(core_model.output, list)
self.assertLen(core_model.output, 2)
# shape should be [batch size, seq_length, hidden_size]
self.assertEqual(core_model.output[0].shape.as_list(), [None, 5, 16])
# shape should be [batch size, hidden_size]
self.assertEqual(core_model.output[1].shape.as_list(), [None, 16])
def test_classifier_model(self):
model, core_model = bert_models.classifier_model(
self._bert_test_config,
num_labels=3,
max_seq_length=5,
final_layer_initializer=None,
hub_module_url=None,
hub_module_trainable=None)
self.assertIsInstance(model, tf.keras.Model)
self.assertIsInstance(core_model, tf.keras.Model)
# model has one classification output with num_labels=3.
self.assertEqual(model.output.shape.as_list(), [None, 3])
# Expect two output from core_model: sequence and classification output.
self.assertIsInstance(core_model.output, list)
self.assertLen(core_model.output, 2)
# shape should be [batch size, 1, hidden_size]
self.assertEqual(core_model.output[0].shape.as_list(), [None, 1, 16])
# shape should be [batch size, hidden_size]
self.assertEqual(core_model.output[1].shape.as_list(), [None, 16])
if __name__ == '__main__':
tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Defining common flags used across all BERT models/applications."""
from absl import flags
import tensorflow as tf
from official.utils import hyperparams_flags
from official.utils.flags import core as flags_core
def define_common_bert_flags():
"""Define common flags for BERT tasks."""
flags_core.define_base(
data_dir=False,
model_dir=True,
clean=False,
train_epochs=False,
epochs_between_evals=False,
stop_threshold=False,
batch_size=False,
num_gpu=True,
export_dir=False,
distribution_strategy=True,
run_eagerly=True)
flags_core.define_distribution()
flags.DEFINE_string('bert_config_file', None,
'Bert configuration file to define core bert layers.')
flags.DEFINE_string(
'model_export_path', None,
      'Path to the directory where the trained model will be '
'exported.')
flags.DEFINE_string('tpu', '', 'TPU address to connect to.')
flags.DEFINE_string(
'init_checkpoint', None,
'Initial checkpoint (usually from a pre-trained BERT model).')
flags.DEFINE_integer('num_train_epochs', 3,
'Total number of training epochs to perform.')
flags.DEFINE_integer(
'steps_per_loop', None,
'Number of steps per graph-mode loop. Only training step '
'happens inside the loop. Callbacks will not be called '
'inside. If not set the value will be configured depending on the '
'devices available.')
flags.DEFINE_float('learning_rate', 5e-5,
'The initial learning rate for Adam.')
flags.DEFINE_float('end_lr', 0.0,
'The end learning rate for learning rate decay.')
flags.DEFINE_string('optimizer_type', 'adamw',
'The type of optimizer to use for training (adamw|lamb)')
flags.DEFINE_boolean(
'scale_loss', False,
'Whether to divide the loss by number of replica inside the per-replica '
'loss function.')
flags.DEFINE_boolean(
'use_keras_compile_fit', False,
'If True, uses Keras compile/fit() API for training logic. Otherwise '
'use custom training loop.')
flags.DEFINE_string(
'hub_module_url', None, 'TF-Hub path/url to Bert module. '
'If specified, init_checkpoint flag should not be used.')
flags.DEFINE_bool('hub_module_trainable', True,
'True to make keras layers in the hub module trainable.')
flags.DEFINE_string('sub_model_export_name', None,
'If set, `sub_model` checkpoints are exported into '
'FLAGS.model_dir/FLAGS.sub_model_export_name.')
flags_core.define_log_steps()
# Adds flags for mixed precision and multi-worker training.
flags_core.define_performance(
num_parallel_calls=False,
inter_op=False,
intra_op=False,
synthetic_data=False,
max_train_steps=False,
dtype=True,
dynamic_loss_scale=True,
loss_scale=True,
all_reduce_alg=True,
num_packs=False,
tf_gpu_thread_mode=True,
datasets_num_private_threads=True,
enable_xla=True,
fp16_implementation=True,
)
# Adds gin configuration flags.
hyperparams_flags.define_gin_flags()
def dtype():
return flags_core.get_tf_dtype(flags.FLAGS)
def use_float16():
return flags_core.get_tf_dtype(flags.FLAGS) == tf.float16
def use_graph_rewrite():
return flags.FLAGS.fp16_implementation == 'graph_rewrite'
def get_loss_scale():
return flags_core.get_loss_scale(flags.FLAGS, default_for_fp16='dynamic')
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""The main BERT model and related functions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import copy
import json
import six
import tensorflow as tf
class BertConfig(object):
"""Configuration for `BertModel`."""
def __init__(self,
vocab_size,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
initializer_range=0.02,
embedding_size=None,
backward_compatible=True):
"""Constructs BertConfig.
Args:
vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
hidden_size: Size of the encoder layers and the pooler layer.
num_hidden_layers: Number of hidden layers in the Transformer encoder.
num_attention_heads: Number of attention heads for each attention layer in
the Transformer encoder.
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler.
hidden_dropout_prob: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.
max_position_embeddings: The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
`BertModel`.
initializer_range: The stdev of the truncated_normal_initializer for
initializing all weight matrices.
embedding_size: (Optional) width of the factorized word embeddings.
backward_compatible: Boolean, whether the variables shape are compatible
with checkpoints converted from TF 1.x BERT.
"""
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.embedding_size = embedding_size
self.backward_compatible = backward_compatible
@classmethod
def from_dict(cls, json_object):
"""Constructs a `BertConfig` from a Python dictionary of parameters."""
config = BertConfig(vocab_size=None)
for (key, value) in six.iteritems(json_object):
config.__dict__[key] = value
return config
@classmethod
def from_json_file(cls, json_file):
"""Constructs a `BertConfig` from a json file of parameters."""
with tf.io.gfile.GFile(json_file, "r") as reader:
text = reader.read()
return cls.from_dict(json.loads(text))
def to_dict(self):
"""Serializes this instance to a Python dictionary."""
output = copy.deepcopy(self.__dict__)
return output
def to_json_string(self):
"""Serializes this instance to a JSON string."""
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
''' Script for downloading all GLUE data.
Note: for legal reasons, we are unable to host MRPC.
You can either use the version hosted by the SentEval team, which is already tokenized,
or you can download the original data from (https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi) and extract the data from it manually.
For Windows users, you can run the .msi file. For Mac and Linux users, consider an external library such as 'cabextract' (see below for an example).
You should then rename and place specific files in a folder (see below for an example).
mkdir MRPC
cabextract MSRParaphraseCorpus.msi -d MRPC
cat MRPC/_2DEC3DBE877E4DB192D17C0256E90F1D | tr -d $'\r' > MRPC/msr_paraphrase_train.txt
cat MRPC/_D7B391F9EAFF4B1B8BCE8F21B20B1B61 | tr -d $'\r' > MRPC/msr_paraphrase_test.txt
rm MRPC/_*
rm MSRParaphraseCorpus.msi
1/30/19: It looks like SentEval is no longer hosting their extracted and tokenized MRPC data, so you'll need to download the data from the original source for now.
2/11/19: It looks like SentEval actually *is* hosting the extracted data. Hooray!
'''
import os
import sys
import shutil
import argparse
import tempfile
import io
import urllib.request
import zipfile
TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "QNLI", "RTE", "WNLI", "diagnostic"]
TASK2PATH = {"CoLA":'https://dl.fbaipublicfiles.com/glue/data/CoLA.zip',
"SST":'https://dl.fbaipublicfiles.com/glue/data/SST-2.zip',
"QQP":'https://dl.fbaipublicfiles.com/glue/data/STS-B.zip',
"STS":'https://dl.fbaipublicfiles.com/glue/data/QQP-clean.zip',
"MNLI":'https://dl.fbaipublicfiles.com/glue/data/MNLI.zip',
"QNLI":'https://dl.fbaipublicfiles.com/glue/data/QNLIv2.zip',
"RTE":'https://dl.fbaipublicfiles.com/glue/data/RTE.zip',
"WNLI":'https://dl.fbaipublicfiles.com/glue/data/WNLI.zip',
"diagnostic":'https://dl.fbaipublicfiles.com/glue/data/AX.tsv'}
MRPC_TRAIN = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt'
MRPC_TEST = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt'
def download_and_extract(task, data_dir):
print("Downloading and extracting %s..." % task)
if task == "MNLI":
print("\tNote (12/10/20): This script no longer downloads SNLI. You will need to manually download and format the data to use SNLI.")
data_file = "%s.zip" % task
urllib.request.urlretrieve(TASK2PATH[task], data_file)
with zipfile.ZipFile(data_file) as zip_ref:
zip_ref.extractall(data_dir)
os.remove(data_file)
print("\tCompleted!")
def format_mrpc(data_dir, path_to_data):
print("Processing MRPC...")
mrpc_dir = os.path.join(data_dir, "MRPC")
if not os.path.isdir(mrpc_dir):
os.mkdir(mrpc_dir)
if path_to_data:
mrpc_train_file = os.path.join(path_to_data, "msr_paraphrase_train.txt")
mrpc_test_file = os.path.join(path_to_data, "msr_paraphrase_test.txt")
else:
try:
mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
            urllib.request.urlretrieve(MRPC_TRAIN, mrpc_train_file)
            urllib.request.urlretrieve(MRPC_TEST, mrpc_test_file)
except urllib.error.HTTPError:
print("Error downloading MRPC")
return
assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file
assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file
with io.open(mrpc_test_file, encoding='utf-8') as data_fh, \
io.open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding='utf-8') as test_fh:
header = data_fh.readline()
test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
for idx, row in enumerate(data_fh):
label, id1, id2, s1, s2 = row.strip().split('\t')
test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))
try:
        urllib.request.urlretrieve(TASK2PATH["MRPC"], os.path.join(mrpc_dir, "dev_ids.tsv"))
    except (KeyError, urllib.error.HTTPError):
print("\tError downloading standard development IDs for MRPC. You will need to manually split your data.")
return
dev_ids = []
with io.open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding='utf-8') as ids_fh:
for row in ids_fh:
dev_ids.append(row.strip().split('\t'))
with io.open(mrpc_train_file, encoding='utf-8') as data_fh, \
io.open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding='utf-8') as train_fh, \
io.open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding='utf-8') as dev_fh:
header = data_fh.readline()
train_fh.write(header)
dev_fh.write(header)
for row in data_fh:
label, id1, id2, s1, s2 = row.strip().split('\t')
if [id1, id2] in dev_ids:
dev_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
else:
train_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
print("\tCompleted!")
def download_diagnostic(data_dir):
print("Downloading and extracting diagnostic...")
if not os.path.isdir(os.path.join(data_dir, "diagnostic")):
os.mkdir(os.path.join(data_dir, "diagnostic"))
data_file = os.path.join(data_dir, "diagnostic", "diagnostic.tsv")
urllib.request.urlretrieve(TASK2PATH["diagnostic"], data_file)
print("\tCompleted!")
return
def get_tasks(task_names):
task_names = task_names.split(',')
if "all" in task_names:
tasks = TASKS
else:
tasks = []
for task_name in task_names:
assert task_name in TASKS, "Task %s not found!" % task_name
tasks.append(task_name)
return tasks
def main(arguments):
parser = argparse.ArgumentParser()
parser.add_argument('--data_dir', help='directory to save data to', type=str, default='glue_data')
parser.add_argument('--tasks', help='tasks to download data for as a comma separated string',
type=str, default='all')
    parser.add_argument('--path_to_mrpc', help='path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_test.txt',
type=str, default='')
args = parser.parse_args(arguments)
if not os.path.isdir(args.data_dir):
os.mkdir(args.data_dir)
tasks = get_tasks(args.tasks)
for task in tasks:
if task == 'MRPC':
format_mrpc(args.data_dir, args.path_to_mrpc)
elif task == 'diagnostic':
download_diagnostic(args.data_dir)
else:
download_and_extract(task, args.data_dir)
if __name__ == '__main__':
sys.exit(main(sys.argv[1:]))
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A script to export the BERT core model as a TF-Hub SavedModel."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
from typing import Text
from official.nlp.bert import bert_models
from official.nlp.bert import configs
FLAGS = flags.FLAGS
flags.DEFINE_string("bert_config_file", None,
"Bert configuration file to define core bert layers.")
flags.DEFINE_string("model_checkpoint_path", None,
"File path to TF model checkpoint.")
flags.DEFINE_string("export_path", None, "TF-Hub SavedModel destination path.")
flags.DEFINE_string("vocab_file", None,
"The vocabulary file that the BERT model was trained on.")
flags.DEFINE_bool("do_lower_case", None, "Whether to lowercase. If None, "
"do_lower_case will be enabled if 'uncased' appears in the "
"name of --vocab_file")
def create_bert_model(bert_config: configs.BertConfig) -> tf.keras.Model:
"""Creates a BERT keras core model from BERT configuration.
Args:
bert_config: A `BertConfig` to create the core model.
Returns:
A keras model.
"""
# Adds input layers just as placeholders.
input_word_ids = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name="input_word_ids")
input_mask = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name="input_mask")
input_type_ids = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name="input_type_ids")
transformer_encoder = bert_models.get_transformer_encoder(
bert_config, sequence_length=None)
sequence_output, pooled_output = transformer_encoder(
[input_word_ids, input_mask, input_type_ids])
# To keep consistent with legacy hub modules, the outputs are
# "pooled_output" and "sequence_output".
return tf.keras.Model(
inputs=[input_word_ids, input_mask, input_type_ids],
outputs=[pooled_output, sequence_output]), transformer_encoder
def export_bert_tfhub(bert_config: configs.BertConfig,
model_checkpoint_path: Text, hub_destination: Text,
vocab_file: Text, do_lower_case: bool = None):
"""Restores a tf.keras.Model and saves for TF-Hub."""
# If do_lower_case is not explicit, default to checking whether "uncased" is
# in the vocab file name
if do_lower_case is None:
do_lower_case = "uncased" in vocab_file
logging.info("Using do_lower_case=%s based on name of vocab_file=%s",
do_lower_case, vocab_file)
core_model, encoder = create_bert_model(bert_config)
checkpoint = tf.train.Checkpoint(model=encoder)
checkpoint.restore(model_checkpoint_path).assert_consumed()
core_model.vocab_file = tf.saved_model.Asset(vocab_file)
core_model.do_lower_case = tf.Variable(do_lower_case, trainable=False)
core_model.save(hub_destination, include_optimizer=False, save_format="tf")
def main(_):
bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file)
export_bert_tfhub(bert_config, FLAGS.model_checkpoint_path, FLAGS.export_path,
FLAGS.vocab_file, FLAGS.do_lower_case)
if __name__ == "__main__":
app.run(main)
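# A minimal consumption sketch for the SavedModel exported above, assuming
# tensorflow_hub is installed and export_path points at --export_path:
#
#   import tensorflow_hub as hub
#   hub_layer = hub.KerasLayer(export_path, trainable=True)
#   pooled_output, sequence_output = hub_layer(
#       [input_word_ids, input_mask, input_type_ids])
#   # The three inputs are int32 tensors of token ids, input mask and segment ids.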
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests official.nlp.bert.export_tfhub."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from official.nlp.bert import configs
from official.nlp.bert import export_tfhub
class ExportTfhubTest(tf.test.TestCase):
def test_export_tfhub(self):
# Exports a savedmodel for TF-Hub
hidden_size = 16
bert_config = configs.BertConfig(
vocab_size=100,
hidden_size=hidden_size,
intermediate_size=32,
max_position_embeddings=128,
num_attention_heads=2,
num_hidden_layers=1)
bert_model, encoder = export_tfhub.create_bert_model(bert_config)
model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
checkpoint = tf.train.Checkpoint(model=encoder)
checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)
vocab_file = os.path.join(self.get_temp_dir(), "uncased_vocab.txt")
with tf.io.gfile.GFile(vocab_file, "w") as f:
f.write("dummy content")
hub_destination = os.path.join(self.get_temp_dir(), "hub")
export_tfhub.export_bert_tfhub(bert_config, model_checkpoint_path,
hub_destination, vocab_file)
# Restores a hub KerasLayer.
hub_layer = hub.KerasLayer(hub_destination, trainable=True)
if hasattr(hub_layer, "resolved_object"):
# Checks meta attributes.
self.assertTrue(hub_layer.resolved_object.do_lower_case.numpy())
with tf.io.gfile.GFile(
hub_layer.resolved_object.vocab_file.asset_path.numpy()) as f:
self.assertEqual("dummy content", f.read())
# Checks the hub KerasLayer.
for source_weight, hub_weight in zip(bert_model.trainable_weights,
hub_layer.trainable_weights):
self.assertAllClose(source_weight.numpy(), hub_weight.numpy())
seq_length = 10
dummy_ids = np.zeros((2, seq_length), dtype=np.int32)
hub_outputs = hub_layer([dummy_ids, dummy_ids, dummy_ids])
source_outputs = bert_model([dummy_ids, dummy_ids, dummy_ids])
# The outputs of hub module are "pooled_output" and "sequence_output",
# while the outputs of the encoder are in reversed order, i.e.,
# "sequence_output" and "pooled_output".
encoder_outputs = reversed(encoder([dummy_ids, dummy_ids, dummy_ids]))
self.assertEqual(hub_outputs[0].shape, (2, hidden_size))
self.assertEqual(hub_outputs[1].shape, (2, seq_length, hidden_size))
for source_output, hub_output, encoder_output in zip(
source_outputs, hub_outputs, encoder_outputs):
self.assertAllClose(source_output.numpy(), hub_output.numpy())
self.assertAllClose(source_output.numpy(), encoder_output.numpy())
# Test that training=True makes a difference (activates dropout).
def _dropout_mean_stddev(training, num_runs=20):
input_ids = np.array([[14, 12, 42, 95, 99]], np.int32)
inputs = [input_ids, np.ones_like(input_ids), np.zeros_like(input_ids)]
outputs = np.concatenate(
[hub_layer(inputs, training=training)[0] for _ in range(num_runs)])
return np.mean(np.std(outputs, axis=0))
self.assertLess(_dropout_mean_stddev(training=False), 1e-6)
self.assertGreater(_dropout_mean_stddev(training=True), 1e-3)
# Test propagation of seq_length in shape inference.
input_word_ids = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
input_mask = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
input_type_ids = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
pooled_output, sequence_output = hub_layer(
[input_word_ids, input_mask, input_type_ids])
self.assertEqual(pooled_output.shape.as_list(), [None, hidden_size])
self.assertEqual(sequence_output.shape.as_list(),
[None, seq_length, hidden_size])
if __name__ == "__main__":
tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""BERT model input pipelines."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
def decode_record(record, name_to_features):
"""Decodes a record to a TensorFlow example."""
example = tf.io.parse_single_example(record, name_to_features)
# tf.Example only supports tf.int64, but the TPU only supports tf.int32.
# So cast all int64 to int32.
for name in list(example.keys()):
t = example[name]
if t.dtype == tf.int64:
t = tf.cast(t, tf.int32)
example[name] = t
return example
def single_file_dataset(input_file, name_to_features):
"""Creates a single-file dataset to be passed for BERT custom training."""
# For training, we want a lot of parallel reading and shuffling.
# For eval, we want no shuffling and parallel reading doesn't matter.
d = tf.data.TFRecordDataset(input_file)
d = d.map(
lambda record: decode_record(record, name_to_features),
num_parallel_calls=tf.data.experimental.AUTOTUNE)
# When `input_file` is a path to a single file or a list
# containing a single path, disable auto sharding so that
# same input file is sent to all workers.
if isinstance(input_file, str) or len(input_file) == 1:
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = (
tf.data.experimental.AutoShardPolicy.OFF)
d = d.with_options(options)
return d
def create_pretrain_dataset(input_patterns,
seq_length,
max_predictions_per_seq,
batch_size,
is_training=True,
input_pipeline_context=None,
use_next_sentence_label=True,
use_position_id=False,
output_fake_labels=True):
"""Creates input dataset from (tf)records files for pretraining."""
name_to_features = {
'input_ids':
tf.io.FixedLenFeature([seq_length], tf.int64),
'input_mask':
tf.io.FixedLenFeature([seq_length], tf.int64),
'segment_ids':
tf.io.FixedLenFeature([seq_length], tf.int64),
'masked_lm_positions':
tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64),
'masked_lm_ids':
tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64),
'masked_lm_weights':
tf.io.FixedLenFeature([max_predictions_per_seq], tf.float32),
}
if use_next_sentence_label:
name_to_features['next_sentence_labels'] = tf.io.FixedLenFeature([1],
tf.int64)
if use_position_id:
name_to_features['position_ids'] = tf.io.FixedLenFeature([seq_length],
tf.int64)
for input_pattern in input_patterns:
if not tf.io.gfile.glob(input_pattern):
raise ValueError('%s does not match any files.' % input_pattern)
dataset = tf.data.Dataset.list_files(input_patterns, shuffle=is_training)
if input_pipeline_context and input_pipeline_context.num_input_pipelines > 1:
dataset = dataset.shard(input_pipeline_context.num_input_pipelines,
input_pipeline_context.input_pipeline_id)
if is_training:
dataset = dataset.repeat()
# We set shuffle buffer to exactly match total number of
# training files to ensure that training data is well shuffled.
input_files = []
for input_pattern in input_patterns:
input_files.extend(tf.io.gfile.glob(input_pattern))
dataset = dataset.shuffle(len(input_files))
# In parallel, create a tf record dataset for each training file.
# cycle_length = 8 means that up to 8 files will be read and deserialized in
# parallel. You may want to increase this number if you have a large number of
# CPU cores.
dataset = dataset.interleave(
tf.data.TFRecordDataset,
cycle_length=8,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
if is_training:
dataset = dataset.shuffle(100)
decode_fn = lambda record: decode_record(record, name_to_features)
dataset = dataset.map(
decode_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
def _select_data_from_record(record):
"""Filter out features to use for pretraining."""
x = {
'input_word_ids': record['input_ids'],
'input_mask': record['input_mask'],
'input_type_ids': record['segment_ids'],
'masked_lm_positions': record['masked_lm_positions'],
'masked_lm_ids': record['masked_lm_ids'],
'masked_lm_weights': record['masked_lm_weights'],
}
if use_next_sentence_label:
x['next_sentence_labels'] = record['next_sentence_labels']
if use_position_id:
x['position_ids'] = record['position_ids']
# TODO(hongkuny): Remove the fake labels after migrating bert pretraining.
if output_fake_labels:
return (x, record['masked_lm_weights'])
else:
return x
dataset = dataset.map(
_select_data_from_record,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.batch(batch_size, drop_remainder=is_training)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
return dataset
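# A minimal usage sketch (the file pattern and sizes are placeholders):
#
#   ds = create_pretrain_dataset(
#       ['pretrain_data/part-*.tf_record'], seq_length=128,
#       max_predictions_per_seq=20, batch_size=32)
#   features, fake_labels = next(iter(ds))
#   # features['input_word_ids'] has shape (32, 128) and
#   # features['masked_lm_positions'] has shape (32, 20).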
def create_classifier_dataset(file_path,
seq_length,
batch_size,
is_training=True,
input_pipeline_context=None,
label_type=tf.int64,
include_sample_weights=False):
"""Creates input dataset from (tf)records files for train/eval."""
name_to_features = {
'input_ids': tf.io.FixedLenFeature([seq_length], tf.int64),
'input_mask': tf.io.FixedLenFeature([seq_length], tf.int64),
'segment_ids': tf.io.FixedLenFeature([seq_length], tf.int64),
'label_ids': tf.io.FixedLenFeature([], label_type),
}
if include_sample_weights:
name_to_features['weight'] = tf.io.FixedLenFeature([], tf.float32)
dataset = single_file_dataset(file_path, name_to_features)
# The dataset is always sharded by number of hosts.
# num_input_pipelines is the number of hosts rather than number of cores.
if input_pipeline_context and input_pipeline_context.num_input_pipelines > 1:
dataset = dataset.shard(input_pipeline_context.num_input_pipelines,
input_pipeline_context.input_pipeline_id)
def _select_data_from_record(record):
x = {
'input_word_ids': record['input_ids'],
'input_mask': record['input_mask'],
'input_type_ids': record['segment_ids']
}
y = record['label_ids']
if include_sample_weights:
w = record['weight']
return (x, y, w)
return (x, y)
if is_training:
dataset = dataset.shuffle(100)
dataset = dataset.repeat()
dataset = dataset.map(
_select_data_from_record,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.batch(batch_size, drop_remainder=is_training)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
return dataset
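# A minimal usage sketch (the file path and sizes are placeholders):
#
#   ds = create_classifier_dataset(
#       'MNLI/eval.tf_record', seq_length=32, batch_size=8, is_training=False)
#   features, labels = next(iter(ds))
#   # features is a dict with 'input_word_ids', 'input_mask' and 'input_type_ids',
#   # each of shape (8, 32); labels has shape (8,).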
def create_squad_dataset(file_path,
seq_length,
batch_size,
is_training=True,
input_pipeline_context=None):
"""Creates input dataset from (tf)records files for train/eval."""
name_to_features = {
'input_ids': tf.io.FixedLenFeature([seq_length], tf.int64),
'input_mask': tf.io.FixedLenFeature([seq_length], tf.int64),
'segment_ids': tf.io.FixedLenFeature([seq_length], tf.int64),
}
if is_training:
name_to_features['start_positions'] = tf.io.FixedLenFeature([], tf.int64)
name_to_features['end_positions'] = tf.io.FixedLenFeature([], tf.int64)
else:
name_to_features['unique_ids'] = tf.io.FixedLenFeature([], tf.int64)
dataset = single_file_dataset(file_path, name_to_features)
# The dataset is always sharded by number of hosts.
# num_input_pipelines is the number of hosts rather than number of cores.
if input_pipeline_context and input_pipeline_context.num_input_pipelines > 1:
dataset = dataset.shard(input_pipeline_context.num_input_pipelines,
input_pipeline_context.input_pipeline_id)
def _select_data_from_record(record):
"""Dispatches record to features and labels."""
x, y = {}, {}
for name, tensor in record.items():
if name in ('start_positions', 'end_positions'):
y[name] = tensor
elif name == 'input_ids':
x['input_word_ids'] = tensor
elif name == 'segment_ids':
x['input_type_ids'] = tensor
else:
x[name] = tensor
return (x, y)
if is_training:
dataset = dataset.shuffle(100)
dataset = dataset.repeat()
dataset = dataset.map(
_select_data_from_record,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.batch(batch_size, drop_remainder=True)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
return dataset
def create_retrieval_dataset(file_path,
seq_length,
batch_size,
input_pipeline_context=None):
"""Creates input dataset from (tf)records files for scoring."""
name_to_features = {
'input_ids': tf.io.FixedLenFeature([seq_length], tf.int64),
'input_mask': tf.io.FixedLenFeature([seq_length], tf.int64),
'segment_ids': tf.io.FixedLenFeature([seq_length], tf.int64),
'int_iden': tf.io.FixedLenFeature([1], tf.int64),
}
dataset = single_file_dataset(file_path, name_to_features)
# The dataset is always sharded by number of hosts.
# num_input_pipelines is the number of hosts rather than number of cores.
if input_pipeline_context and input_pipeline_context.num_input_pipelines > 1:
dataset = dataset.shard(input_pipeline_context.num_input_pipelines,
input_pipeline_context.input_pipeline_id)
def _select_data_from_record(record):
x = {
'input_word_ids': record['input_ids'],
'input_mask': record['input_mask'],
'input_type_ids': record['segment_ids']
}
y = record['int_iden']
return (x, y)
dataset = dataset.map(
_select_data_from_record,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.batch(batch_size, drop_remainder=False)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
return dataset
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities to save models."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import os
from absl import logging
import tensorflow as tf
import typing
def export_bert_model(model_export_path: typing.Text,
model: tf.keras.Model,
checkpoint_dir: typing.Optional[typing.Text] = None,
restore_model_using_load_weights: bool = False) -> None:
"""Export BERT model for serving which does not include the optimizer.
Arguments:
model_export_path: Path to which exported model will be saved.
model: Keras model object to export.
checkpoint_dir: Path from which model weights will be loaded, if
specified.
restore_model_using_load_weights: Whether to use checkpoint.restore() API
for custom checkpoint or to use model.load_weights() API.
There are 2 different ways to save checkpoints. One is using
tf.train.Checkpoint and another is using Keras model.save_weights().
Custom training loop implementation uses tf.train.Checkpoint API
and Keras ModelCheckpoint callback internally uses model.save_weights()
API. Since these two APIs cannot be used together, model loading logic
must take into account how the model checkpoint was saved.
Raises:
ValueError when either model_export_path or model is not specified.
"""
if not model_export_path:
raise ValueError('model_export_path must be specified.')
if not isinstance(model, tf.keras.Model):
raise ValueError('model must be a tf.keras.Model object.')
if checkpoint_dir:
# Keras compile/fit() was used to save checkpoint using
# model.save_weights().
if restore_model_using_load_weights:
model_weight_path = os.path.join(checkpoint_dir, 'checkpoint')
assert tf.io.gfile.exists(model_weight_path)
model.load_weights(model_weight_path)
# tf.train.Checkpoint API was used via custom training loop logic.
else:
checkpoint = tf.train.Checkpoint(model=model)
# Restores the model from latest checkpoint.
latest_checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
assert latest_checkpoint_file
logging.info('Checkpoint file %s found and restoring from '
'checkpoint', latest_checkpoint_file)
checkpoint.restore(
latest_checkpoint_file).assert_existing_objects_matched()
model.save(model_export_path, include_optimizer=False, save_format='tf')
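# A minimal usage sketch, assuming the checkpoint was written by the custom
# training loop via tf.train.Checkpoint (paths and the model object are
# placeholders):
#
#   export_bert_model('/tmp/bert_export', model=classifier_model,
#                     checkpoint_dir='/tmp/bert_ckpt',
#                     restore_model_using_load_weights=False)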
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for official.modeling.training.model_training_utils."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl import logging
from absl.testing import parameterized
from absl.testing.absltest import mock
import numpy as np
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.nlp.bert import model_training_utils
def eager_strategy_combinations():
return combinations.combine(
distribution=[
strategy_combinations.default_strategy,
strategy_combinations.tpu_strategy,
strategy_combinations.one_device_strategy_gpu,
strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
strategy_combinations.mirrored_strategy_with_two_gpus,
],
mode='eager',
)
def eager_gpu_strategy_combinations():
return combinations.combine(
distribution=[
strategy_combinations.default_strategy,
strategy_combinations.one_device_strategy_gpu,
strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
strategy_combinations.mirrored_strategy_with_two_gpus,
],
mode='eager',
)
def create_fake_data_input_fn(batch_size, features_shape, num_classes):
"""Creates a dummy input function with the given feature and label shapes.
Args:
batch_size: integer.
features_shape: list[int]. Feature shape for an individual example.
num_classes: integer. Number of labels.
Returns:
An input function that is usable in the executor.
"""
def _dataset_fn(input_context=None):
"""An input function for generating fake data."""
local_batch_size = input_context.get_per_replica_batch_size(batch_size)
features = np.random.rand(64, *features_shape)
labels = np.random.randint(2, size=[64, num_classes])
# Convert the inputs to a Dataset.
dataset = tf.data.Dataset.from_tensor_slices((features, labels))
dataset = dataset.shard(input_context.num_input_pipelines,
input_context.input_pipeline_id)
def _assign_dtype(features, labels):
features = tf.cast(features, tf.float32)
labels = tf.cast(labels, tf.float32)
return features, labels
# Shuffle, repeat, and batch the examples.
dataset = dataset.map(_assign_dtype)
dataset = dataset.shuffle(64).repeat()
dataset = dataset.batch(local_batch_size, drop_remainder=True)
dataset = dataset.prefetch(buffer_size=64)
return dataset
return _dataset_fn
def create_model_fn(input_shape, num_classes, use_float16=False):
def _model_fn():
"""A one-layer softmax model suitable for testing."""
input_layer = tf.keras.layers.Input(shape=input_shape)
x = tf.keras.layers.Dense(num_classes, activation='relu')(input_layer)
output_layer = tf.keras.layers.Dense(num_classes, activation='softmax')(x)
sub_model = tf.keras.models.Model(input_layer, x, name='sub_model')
model = tf.keras.models.Model(input_layer, output_layer, name='model')
model.add_metric(
tf.reduce_mean(input_layer), name='mean_input', aggregation='mean')
model.optimizer = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9)
if use_float16:
model.optimizer = (
tf.keras.mixed_precision.experimental.LossScaleOptimizer(
model.optimizer, loss_scale='dynamic'))
return model, sub_model
return _model_fn
def metric_fn():
"""Gets a tf.keras metric object."""
return tf.keras.metrics.CategoricalAccuracy(name='accuracy', dtype=tf.float32)
def summaries_with_matching_keyword(keyword, summary_dir):
"""Yields summary protos matching given keyword from event file."""
event_paths = tf.io.gfile.glob(os.path.join(summary_dir, 'events*'))
for event in tf.compat.v1.train.summary_iterator(event_paths[-1]):
if event.summary is not None:
for value in event.summary.value:
if keyword in value.tag:
logging.error(event)
yield event.summary
def check_eventfile_for_keyword(keyword, summary_dir):
"""Checks event files for the keyword."""
return any(summaries_with_matching_keyword(keyword, summary_dir))
class RecordingCallback(tf.keras.callbacks.Callback):
def __init__(self):
self.batch_begin = [] # (batch, logs)
self.batch_end = [] # (batch, logs)
self.epoch_begin = [] # (epoch, logs)
self.epoch_end = [] # (epoch, logs)
def on_batch_begin(self, batch, logs=None):
self.batch_begin.append((batch, logs))
def on_batch_end(self, batch, logs=None):
self.batch_end.append((batch, logs))
def on_epoch_begin(self, epoch, logs=None):
self.epoch_begin.append((epoch, logs))
def on_epoch_end(self, epoch, logs=None):
self.epoch_end.append((epoch, logs))
class ModelTrainingUtilsTest(tf.test.TestCase, parameterized.TestCase):
def setUp(self):
super(ModelTrainingUtilsTest, self).setUp()
self._model_fn = create_model_fn(input_shape=[128], num_classes=3)
def run_training(self, strategy, model_dir, steps_per_loop, run_eagerly):
input_fn = create_fake_data_input_fn(
batch_size=8, features_shape=[128], num_classes=3)
model_training_utils.run_customized_training_loop(
strategy=strategy,
model_fn=self._model_fn,
loss_fn=tf.keras.losses.categorical_crossentropy,
model_dir=model_dir,
steps_per_epoch=20,
steps_per_loop=steps_per_loop,
epochs=2,
train_input_fn=input_fn,
eval_input_fn=input_fn,
eval_steps=10,
init_checkpoint=None,
sub_model_export_name='my_submodel_name',
metric_fn=metric_fn,
custom_callbacks=None,
run_eagerly=run_eagerly)
@combinations.generate(eager_strategy_combinations())
def test_train_eager_single_step(self, distribution):
model_dir = self.get_temp_dir()
if isinstance(distribution, tf.distribute.experimental.TPUStrategy):
with self.assertRaises(ValueError):
self.run_training(
distribution, model_dir, steps_per_loop=1, run_eagerly=True)
else:
self.run_training(
distribution, model_dir, steps_per_loop=1, run_eagerly=True)
@combinations.generate(eager_gpu_strategy_combinations())
def test_train_eager_mixed_precision(self, distribution):
model_dir = self.get_temp_dir()
policy = tf.keras.mixed_precision.experimental.Policy('mixed_float16')
tf.keras.mixed_precision.experimental.set_policy(policy)
self._model_fn = create_model_fn(
input_shape=[128], num_classes=3, use_float16=True)
self.run_training(
distribution, model_dir, steps_per_loop=1, run_eagerly=True)
@combinations.generate(eager_strategy_combinations())
def test_train_check_artifacts(self, distribution):
model_dir = self.get_temp_dir()
self.run_training(
distribution, model_dir, steps_per_loop=10, run_eagerly=False)
# Two checkpoints should be saved after two epochs.
files = map(os.path.basename,
tf.io.gfile.glob(os.path.join(model_dir, 'ctl_step_*index')))
self.assertCountEqual(['ctl_step_20.ckpt-1.index',
'ctl_step_40.ckpt-2.index'], files)
# Three submodel checkpoints should be saved after two epochs (one after
# each epoch plus one final).
files = map(os.path.basename,
tf.io.gfile.glob(os.path.join(model_dir,
'my_submodel_name*index')))
self.assertCountEqual(['my_submodel_name.ckpt-3.index',
'my_submodel_name_step_20.ckpt-1.index',
'my_submodel_name_step_40.ckpt-2.index'], files)
self.assertNotEmpty(
tf.io.gfile.glob(
os.path.join(model_dir, 'summaries/training_summary*')))
# Loss and accuracy values should be written into summaries.
self.assertTrue(
check_eventfile_for_keyword('loss',
os.path.join(model_dir, 'summaries/train')))
self.assertTrue(
check_eventfile_for_keyword('accuracy',
os.path.join(model_dir, 'summaries/train')))
self.assertTrue(
check_eventfile_for_keyword('mean_input',
os.path.join(model_dir, 'summaries/train')))
self.assertTrue(
check_eventfile_for_keyword('accuracy',
os.path.join(model_dir, 'summaries/eval')))
self.assertTrue(
check_eventfile_for_keyword('mean_input',
os.path.join(model_dir, 'summaries/eval')))
@combinations.generate(eager_strategy_combinations())
def test_train_check_callbacks(self, distribution):
model_dir = self.get_temp_dir()
callback = RecordingCallback()
callbacks = [callback]
input_fn = create_fake_data_input_fn(
batch_size=8, features_shape=[128], num_classes=3)
model_training_utils.run_customized_training_loop(
strategy=distribution,
model_fn=self._model_fn,
loss_fn=tf.keras.losses.categorical_crossentropy,
model_dir=model_dir,
steps_per_epoch=20,
num_eval_per_epoch=4,
steps_per_loop=10,
epochs=2,
train_input_fn=input_fn,
eval_input_fn=input_fn,
eval_steps=10,
init_checkpoint=None,
metric_fn=metric_fn,
custom_callbacks=callbacks,
run_eagerly=False)
self.assertEqual(callback.epoch_begin, [(1, {}), (2, {})])
epoch_ends, epoch_end_infos = zip(*callback.epoch_end)
self.assertEqual(list(epoch_ends), [1, 2, 2])
for info in epoch_end_infos:
self.assertIn('accuracy', info)
self.assertEqual(callback.batch_begin, [(0, {}), (5, {}), (10, {}),
(15, {}), (20, {}), (25, {}),
(30, {}), (35, {})])
batch_ends, batch_end_infos = zip(*callback.batch_end)
self.assertEqual(list(batch_ends), [4, 9, 14, 19, 24, 29, 34, 39])
for info in batch_end_infos:
self.assertIn('loss', info)
@combinations.generate(
combinations.combine(
distribution=[
strategy_combinations.one_device_strategy_gpu,
],
mode='eager',
))
def test_train_check_artifacts_non_chief(self, distribution):
# We shouldn't export artifacts on non-chief workers. Since there's no easy
# way to test with real MultiWorkerMirroredStrategy, we patch the strategy
# to make it as if it's MultiWorkerMirroredStrategy on non-chief workers.
extended = distribution.extended
with mock.patch.object(extended.__class__, 'should_checkpoint',
new_callable=mock.PropertyMock, return_value=False), \
mock.patch.object(extended.__class__, 'should_save_summary',
new_callable=mock.PropertyMock, return_value=False):
model_dir = self.get_temp_dir()
self.run_training(
distribution, model_dir, steps_per_loop=10, run_eagerly=False)
self.assertEmpty(tf.io.gfile.listdir(model_dir))
if __name__ == '__main__':
tf.test.main()
#export GLUE_DIR=./glue
#export BERT_DIR=./bert_en_cased_L-12_H-768_A-12_1/assets
#export TASK_NAME=MNLI
#export OUTPUT_DIR=./glue_finetuning
#python ../data/create_finetuning_data.py \
# --input_data_dir=${GLUE_DIR}/${TASK_NAME}/ \
# --vocab_file=${BERT_DIR}/vocab.txt \
# --train_data_output_path=${OUTPUT_DIR}/${TASK_NAME}_train.tf_record \
# --eval_data_output_path=${OUTPUT_DIR}/${TASK_NAME}_eval.tf_record \
# --meta_data_file_path=${OUTPUT_DIR}/${TASK_NAME}_meta_data \
# --fine_tuning_task_type=classification --max_seq_length=128 \
# --classification_task_name=${TASK_NAME}
#
export BERT_DIR=./bert_en_cased_L-12_H-768_A-12_1/assets
export MODEL_DIR=./model_dir
export GLUE_DIR=./glue_finetuning
export TASK=MNLI
export HIP_VISIBLE_DEVICES=0,1,2,3
python3 run_classifier.py \
--mode='train_and_eval' \
--input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
--train_data_path=${GLUE_DIR}/${TASK}_train.tf_record \
--eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
--bert_config_file=${BERT_DIR}/bert_config.json \
--train_batch_size=4 \
--eval_batch_size=4 \
--steps_per_loop=1 \
--learning_rate=2e-5 \
--num_train_epochs=3 \
--model_dir=${MODEL_DIR} \
--num_gpus=4 \
--distribution_strategy=multi_worker_mirrored
# --dtype=fp16 \
# --fp16_implementation=graph_rewrite \
# --distribution_strategy=mirrored /MultiWorkerMirroredStrategy
#--init_checkpoint=${BERT_DIR}/bert_model.ckpt \
#--enable_xla=true \
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""BERT classification or regression finetuning runner in TF 2.x."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import json
import math
import os
from absl import app
from absl import flags
from absl import logging
import gin
import tensorflow as tf
import sys
sys.path.append("/public/home/xuanbaby/DL-TensorFlow/models_r2.3.0")
from official.modeling import performance
from official.nlp import optimization
from official.nlp.bert import bert_models
from official.nlp.bert import common_flags
from official.nlp.bert import configs as bert_configs
from official.nlp.bert import input_pipeline
from official.nlp.bert import model_saving_utils
from official.utils.misc import distribution_utils
from official.utils.misc import keras_utils
flags.DEFINE_enum(
'mode', 'train_and_eval', ['train_and_eval', 'export_only', 'predict'],
'One of {"train_and_eval", "export_only", "predict"}. `train_and_eval`: '
'trains the model and evaluates in the meantime. '
'`export_only`: will take the latest checkpoint inside '
'model_dir and export a `SavedModel`. `predict`: takes a checkpoint and '
'restores the model to output predictions on the test set.')
flags.DEFINE_string('train_data_path', None,
'Path to training data for BERT classifier.')
flags.DEFINE_string('eval_data_path', None,
'Path to evaluation data for BERT classifier.')
flags.DEFINE_string(
'input_meta_data_path', None,
'Path to file that contains meta data about input '
'to be used for training and evaluation.')
flags.DEFINE_string('predict_checkpoint_path', None,
'Path to the checkpoint for predictions.')
flags.DEFINE_integer(
'num_eval_per_epoch', 1,
'Number of evaluations per epoch. The purpose of this flag is to provide '
'more granular evaluation scores and checkpoints. For example, if original '
'data has N samples and num_eval_per_epoch is n, then each epoch will be '
'evaluated every N/n samples.')
flags.DEFINE_integer('train_batch_size', 32, 'Batch size for training.')
flags.DEFINE_integer('eval_batch_size', 32, 'Batch size for evaluation.')
common_flags.define_common_bert_flags()
FLAGS = flags.FLAGS
LABEL_TYPES_MAP = {'int': tf.int64, 'float': tf.float32}
def get_loss_fn(num_classes):
"""Gets the classification loss function."""
def classification_loss_fn(labels, logits):
"""Classification loss."""
labels = tf.squeeze(labels)
log_probs = tf.nn.log_softmax(logits, axis=-1)
one_hot_labels = tf.one_hot(
tf.cast(labels, dtype=tf.int32), depth=num_classes, dtype=tf.float32)
per_example_loss = -tf.reduce_sum(
tf.cast(one_hot_labels, dtype=tf.float32) * log_probs, axis=-1)
return tf.reduce_mean(per_example_loss)
return classification_loss_fn
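# A minimal sketch of what classification_loss_fn computes (values are
# illustrative only; num_classes=3 is an assumption):
#
#   loss_fn = get_loss_fn(num_classes=3)
#   labels = tf.constant([1, 0])                 # sparse gold labels
#   logits = tf.constant([[0.1, 2.0, -1.0],
#                         [1.5, 0.0, 0.2]])
#   loss = loss_fn(labels, logits)
#   # loss is the mean over examples of -log softmax(logits)[label].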
def get_dataset_fn(input_file_pattern,
max_seq_length,
global_batch_size,
is_training,
label_type=tf.int64,
include_sample_weights=False):
"""Gets a closure to create a dataset."""
def _dataset_fn(ctx=None):
"""Returns tf.data.Dataset for distributed BERT pretraining."""
batch_size = ctx.get_per_replica_batch_size(
global_batch_size) if ctx else global_batch_size
dataset = input_pipeline.create_classifier_dataset(
tf.io.gfile.glob(input_file_pattern),
max_seq_length,
batch_size,
is_training=is_training,
input_pipeline_context=ctx,
label_type=label_type,
include_sample_weights=include_sample_weights)
return dataset
return _dataset_fn
def run_bert_classifier(strategy,
bert_config,
input_meta_data,
model_dir,
epochs,
steps_per_epoch,
steps_per_loop,
eval_steps,
warmup_steps,
initial_lr,
init_checkpoint,
train_input_fn,
eval_input_fn,
training_callbacks=True,
custom_callbacks=None,
custom_metrics=None):
"""Run BERT classifier training using low-level API."""
max_seq_length = input_meta_data['max_seq_length']
num_classes = input_meta_data.get('num_labels', 1)
is_regression = num_classes == 1
def _get_classifier_model():
"""Gets a classifier model."""
classifier_model, core_model = (
bert_models.classifier_model(
bert_config,
num_classes,
max_seq_length,
hub_module_url=FLAGS.hub_module_url,
hub_module_trainable=FLAGS.hub_module_trainable))
optimizer = optimization.create_optimizer(initial_lr,
steps_per_epoch * epochs,
warmup_steps, FLAGS.end_lr,
FLAGS.optimizer_type)
classifier_model.optimizer = performance.configure_optimizer(
optimizer,
use_float16=common_flags.use_float16(),
use_graph_rewrite=common_flags.use_graph_rewrite())
return classifier_model, core_model
# tf.keras.losses objects accept optional sample_weight arguments (e.g. coming
# from the dataset) to compute a weighted loss, as used for the regression
# tasks. The classification tasks, which use the custom get_loss_fn, do not
# accept sample weights.
loss_fn = (tf.keras.losses.MeanSquaredError() if is_regression
else get_loss_fn(num_classes))
# Defines evaluation metrics function, which will create metrics in the
# correct device and strategy scope.
if custom_metrics:
metric_fn = custom_metrics
elif is_regression:
metric_fn = functools.partial(
tf.keras.metrics.MeanSquaredError,
'mean_squared_error',
dtype=tf.float32)
else:
metric_fn = functools.partial(
tf.keras.metrics.SparseCategoricalAccuracy,
'accuracy',
dtype=tf.float32)
# Start training using Keras compile/fit API.
logging.info('Training using TF 2.x Keras compile/fit API with '
'distribution strategy.')
return run_keras_compile_fit(
model_dir,
strategy,
_get_classifier_model,
train_input_fn,
eval_input_fn,
loss_fn,
metric_fn,
init_checkpoint,
epochs,
steps_per_epoch,
steps_per_loop,
eval_steps,
training_callbacks=training_callbacks,
custom_callbacks=custom_callbacks)
def run_keras_compile_fit(model_dir,
strategy,
model_fn,
train_input_fn,
eval_input_fn,
loss_fn,
metric_fn,
init_checkpoint,
epochs,
steps_per_epoch,
steps_per_loop,
eval_steps,
training_callbacks=True,
custom_callbacks=None):
"""Runs BERT classifier model using Keras compile/fit API."""
with strategy.scope():
training_dataset = train_input_fn()
evaluation_dataset = eval_input_fn() if eval_input_fn else None
bert_model, sub_model = model_fn()
optimizer = bert_model.optimizer
if init_checkpoint:
checkpoint = tf.train.Checkpoint(model=sub_model)
checkpoint.restore(init_checkpoint).assert_existing_objects_matched()
if not isinstance(metric_fn, (list, tuple)):
metric_fn = [metric_fn]
bert_model.compile(
optimizer=optimizer,
loss=loss_fn,
metrics=[fn() for fn in metric_fn],
experimental_steps_per_execution=steps_per_loop)
summary_dir = os.path.join(model_dir, 'summaries')
# summary_callback = tf.keras.callbacks.TensorBoard(summary_dir)
summary_callback = tf.keras.callbacks.TensorBoard(summary_dir, profile_batch=0)
checkpoint = tf.train.Checkpoint(model=bert_model, optimizer=optimizer)
checkpoint_manager = tf.train.CheckpointManager(
checkpoint,
directory=model_dir,
max_to_keep=None,
step_counter=optimizer.iterations,
checkpoint_interval=0)
checkpoint_callback = keras_utils.SimpleCheckpoint(checkpoint_manager)
if training_callbacks:
if custom_callbacks is not None:
custom_callbacks += [summary_callback, checkpoint_callback]
else:
custom_callbacks = [summary_callback, checkpoint_callback]
#xuan
#tf.keras.callbacks.TerminateOnNaN(custom_callbacks)
history = bert_model.fit(
x=training_dataset,
validation_data=evaluation_dataset,
steps_per_epoch=steps_per_epoch,
epochs=epochs,
validation_steps=eval_steps,
callbacks=custom_callbacks)
stats = {'total_training_steps': steps_per_epoch * epochs}
if 'loss' in history.history:
stats['train_loss'] = history.history['loss'][-1]
if 'val_accuracy' in history.history:
stats['eval_metrics'] = history.history['val_accuracy'][-1]
return bert_model, stats
def get_predictions_and_labels(strategy,
trained_model,
eval_input_fn,
return_probs=False):
"""Obtains predictions of trained model on evaluation data.
Note that the list of labels is returned along with the predictions because the
order changes when distributing the dataset over TPU pods.
Args:
strategy: Distribution strategy.
trained_model: Trained model with preloaded weights.
eval_input_fn: Input function for evaluation data.
return_probs: Whether to return probabilities of classes.
Returns:
predictions: List of predictions.
labels: List of gold labels corresponding to predictions.
"""
@tf.function
def test_step(iterator):
"""Computes predictions on distributed devices."""
def _test_step_fn(inputs):
"""Replicated predictions."""
inputs, labels = inputs
logits = trained_model(inputs, training=False)
probabilities = tf.nn.softmax(logits)
return probabilities, labels
outputs, labels = strategy.run(_test_step_fn, args=(next(iterator),))
# outputs: current batch logits as a tuple of shard logits
outputs = tf.nest.map_structure(strategy.experimental_local_results,
outputs)
labels = tf.nest.map_structure(strategy.experimental_local_results, labels)
return outputs, labels
def _run_evaluation(test_iterator):
"""Runs evaluation steps."""
preds, golds = list(), list()
try:
with tf.experimental.async_scope():
while True:
probabilities, labels = test_step(test_iterator)
for cur_probs, cur_labels in zip(probabilities, labels):
if return_probs:
preds.extend(cur_probs.numpy().tolist())
else:
preds.extend(tf.math.argmax(cur_probs, axis=1).numpy())
golds.extend(cur_labels.numpy().tolist())
except (StopIteration, tf.errors.OutOfRangeError):
tf.experimental.async_clear_error()
return preds, golds
test_iter = iter(
strategy.experimental_distribute_datasets_from_function(eval_input_fn))
predictions, labels = _run_evaluation(test_iter)
return predictions, labels
def export_classifier(model_export_path, input_meta_data, bert_config,
model_dir):
"""Exports a trained model as a `SavedModel` for inference.
Args:
model_export_path: a string specifying the path to the SavedModel directory.
input_meta_data: dictionary containing meta data about input and model.
bert_config: Bert configuration file to define core bert layers.
model_dir: The directory where the model weights and training/evaluation
summaries are stored.
Raises:
ValueError: if model_export_path or model_dir is not specified (empty string or None).
"""
if not model_export_path:
raise ValueError('Export path is not specified: %s' % model_export_path)
if not model_dir:
raise ValueError('Model directory is not specified: %s' % model_dir)
# Export uses float32 for now, even if training uses mixed precision.
tf.keras.mixed_precision.experimental.set_policy('float32')
classifier_model = bert_models.classifier_model(
bert_config, input_meta_data.get('num_labels', 1))[0]
model_saving_utils.export_bert_model(
model_export_path, model=classifier_model, checkpoint_dir=model_dir)
def run_bert(strategy,
input_meta_data,
model_config,
train_input_fn=None,
eval_input_fn=None,
init_checkpoint=None,
custom_callbacks=None,
custom_metrics=None):
"""Run BERT training."""
# Enables XLA in Session Config. Should not be set for TPU.
keras_utils.set_session_config(FLAGS.enable_xla)
performance.set_mixed_precision_policy(common_flags.dtype())
epochs = FLAGS.num_train_epochs * FLAGS.num_eval_per_epoch
train_data_size = (
input_meta_data['train_data_size'] // FLAGS.num_eval_per_epoch)
steps_per_epoch = int(train_data_size / FLAGS.train_batch_size)
warmup_steps = int(epochs * train_data_size * 0.1 / FLAGS.train_batch_size)
eval_steps = int(
math.ceil(input_meta_data['eval_data_size'] / FLAGS.eval_batch_size))
if not strategy:
raise ValueError('Distribution strategy has not been specified.')
if not custom_callbacks:
custom_callbacks = []
if FLAGS.log_steps:
custom_callbacks.append(
keras_utils.TimeHistory(
batch_size=FLAGS.train_batch_size,
log_steps=FLAGS.log_steps,
logdir=FLAGS.model_dir))
trained_model, _ = run_bert_classifier(
strategy,
model_config,
input_meta_data,
FLAGS.model_dir,
epochs,
steps_per_epoch,
FLAGS.steps_per_loop,
eval_steps,
warmup_steps,
FLAGS.learning_rate,
init_checkpoint or FLAGS.init_checkpoint,
train_input_fn,
eval_input_fn,
custom_callbacks=custom_callbacks,
custom_metrics=custom_metrics)
if FLAGS.model_export_path:
model_saving_utils.export_bert_model(
FLAGS.model_export_path, model=trained_model)
return trained_model
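# A worked example of the step arithmetic in run_bert (392,702 is the MNLI
# training-set size; the other values are assumptions):
#   train_data_size = 392702, train_batch_size = 320,
#   num_train_epochs = 3, num_eval_per_epoch = 1
#     steps_per_epoch = int(392702 / 320)           = 1227
#     warmup_steps    = int(3 * 392702 * 0.1 / 320) = 368
#     total steps     = steps_per_epoch * epochs    = 3681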
def custom_main(custom_callbacks=None, custom_metrics=None):
"""Run classification or regression.
Args:
custom_callbacks: list of tf.keras.Callbacks passed to training loop.
custom_metrics: list of metrics passed to the training loop.
"""
gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param)
physical_devices = tf.config.list_physical_devices('GPU')
if len(physical_devices) > 0:
for device in physical_devices:
tf.config.experimental.set_memory_growth(device, True)
print('{} memory growth: {}'.format(device, tf.config.experimental.get_memory_growth(device)))
else:
print("Not enough GPU hardware devices available")
with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
input_meta_data = json.loads(reader.read().decode('utf-8'))
label_type = LABEL_TYPES_MAP[input_meta_data.get('label_type', 'int')]
include_sample_weights = input_meta_data.get('has_sample_weights', False)
if not FLAGS.model_dir:
FLAGS.model_dir = '/tmp/bert20/'
bert_config = bert_configs.BertConfig.from_json_file(FLAGS.bert_config_file)
if FLAGS.mode == 'export_only':
export_classifier(FLAGS.model_export_path, input_meta_data, bert_config,
FLAGS.model_dir)
return
strategy = distribution_utils.get_distribution_strategy(
distribution_strategy=FLAGS.distribution_strategy,
num_gpus=FLAGS.num_gpus,
all_reduce_alg="nccl",
num_packs=1,
tpu_address=FLAGS.tpu)
eval_input_fn = get_dataset_fn(
FLAGS.eval_data_path,
input_meta_data['max_seq_length'],
FLAGS.eval_batch_size,
is_training=False,
label_type=label_type,
include_sample_weights=include_sample_weights)
if FLAGS.mode == 'predict':
with strategy.scope():
classifier_model = bert_models.classifier_model(
bert_config, input_meta_data['num_labels'])[0]
checkpoint = tf.train.Checkpoint(model=classifier_model)
latest_checkpoint_file = (
FLAGS.predict_checkpoint_path or
tf.train.latest_checkpoint(FLAGS.model_dir))
assert latest_checkpoint_file
logging.info('Checkpoint file %s found and restoring from '
'checkpoint', latest_checkpoint_file)
checkpoint.restore(
latest_checkpoint_file).assert_existing_objects_matched()
preds, _ = get_predictions_and_labels(
strategy, classifier_model, eval_input_fn, return_probs=True)
output_predict_file = os.path.join(FLAGS.model_dir, 'test_results.tsv')
with tf.io.gfile.GFile(output_predict_file, 'w') as writer:
logging.info('***** Predict results *****')
for probabilities in preds:
output_line = '\t'.join(
str(class_probability)
for class_probability in probabilities) + '\n'
writer.write(output_line)
return
if FLAGS.mode != 'train_and_eval':
raise ValueError('Unsupported mode is specified: %s' % FLAGS.mode)
train_input_fn = get_dataset_fn(
FLAGS.train_data_path,
input_meta_data['max_seq_length'],
FLAGS.train_batch_size,
is_training=True,
label_type=label_type,
include_sample_weights=include_sample_weights)
run_bert(
strategy,
input_meta_data,
bert_config,
train_input_fn,
eval_input_fn,
custom_callbacks=custom_callbacks,
custom_metrics=custom_metrics)
def main(_):
custom_main(custom_callbacks=None, custom_metrics=None)
if __name__ == '__main__':
flags.mark_flag_as_required('bert_config_file')
flags.mark_flag_as_required('input_meta_data_path')
flags.mark_flag_as_required('model_dir')
app.run(main)