# Test Preparation
## 1. Dataset Preparation
Download the GLUE dataset from https://pan.baidu.com/s/1tLd8opr08Nw5PzUBh7lXsQ (extraction code: fyvy).
Classification uses the MNLI dataset from GLUE.
Question answering data:
[train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
[dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
[evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
## 2. Environment Setup
```
virtualenv -p python3 --system-site-packages venv_2
source venv_2/bin/activate
```
Install the Python dependencies:
```
pip install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
pip install tensorflow-2.7.0-cp36-cp36m-linux_x86_64.whl
pip install horovod-0.21.3-cp36-cp36m-linux_x86_64.whl
pip install apex-0.1-cp36-cp36m-linux_x86_64.whl
```
Set environment variables:
```
module rm compiler/rocm/2.9
export ROCM_PATH=/public/home/hepj/job_env/apps/dtk-21.10.1
export HIP_PATH=${ROCM_PATH}/hip
export AMDGPU_TARGETS="gfx900;gfx906"
export PATH=${ROCM_PATH}/bin:${ROCM_PATH}/llvm/bin:${ROCM_PATH}/hcc/bin:${ROCM_PATH}/hip/bin:$PATH
```
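Before running anything, it helps to confirm that TensorFlow imports cleanly and sees the GPUs. A minimal sanity check (assuming the wheels above installed into the active virtualenv):
```python
# Sanity check: confirm TensorFlow loads and the ROCm GPUs are visible.
import tensorflow as tf

print(tf.__version__)                          # expect 2.7.0
print(tf.config.list_physical_devices('GPU'))  # expect one entry per GPU
```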
## 3. MNLI Classification Test
### 3.1 Single-GPU Test (FP32)
#### 3.1.1 Data Conversion
TF 2.x reads data differently from TF 1.x, so the dataset must first be converted to the tf_record format.
```
python ../data/create_finetuning_data.py \
--input_data_dir=/public/home/hepj/data/MNLI \
--vocab_file=/public/home/hepj/model/tf2.7.0_Bert/pre_tf2x/vocab.txt \
--train_data_output_path=/public/home/hepj/model/tf2.7.0_Bert/MNLI/train.tf_record \
--eval_data_output_path=/public/home/hepj/model/tf2.7.0_Bert/MNLI/eval.tf_record \
--meta_data_file_path=/public/home/hepj/model/tf2.7.0_Bert/MNLI/meta_data \
--fine_tuning_task_type=classification \
--max_seq_length=32 \
--classification_task_name=MNLI
```
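To verify the conversion, you can parse a record back out of the generated file. A minimal sketch, assuming the standard BERT classifier feature names (`input_ids`, `input_mask`, `segment_ids`, `label_ids`) and the `max_seq_length=32` used above:
```python
# Inspect the first example in the generated train.tf_record file.
import tensorflow as tf

path = '/public/home/hepj/model/tf2.7.0_Bert/MNLI/train.tf_record'
features = {
    'input_ids': tf.io.FixedLenFeature([32], tf.int64),
    'input_mask': tf.io.FixedLenFeature([32], tf.int64),
    'segment_ids': tf.io.FixedLenFeature([32], tf.int64),
    'label_ids': tf.io.FixedLenFeature([], tf.int64),
}
for raw in tf.data.TFRecordDataset(path).take(1):
    example = tf.io.parse_single_example(raw, features)
    print({name: tensor.shape for name, tensor in example.items()})
```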
#### 3.1.2 Model Conversion
TF 2.7.2 and TF 1.15.0 store and load models in different formats; the officially released BERT checkpoints are generally TF 1.x models and must be converted.
```
python3 tf2_encoder_checkpoint_converter.py \
--bert_config_file /public/home/hepj/model_source/uncased_L-12_H-768_A-12/bert_config.json \
--checkpoint_to_convert /public/home/hepj/model_source/uncased_L-12_H-768_A-12/bert_model.ckpt \
--converted_checkpoint_path pre_tf2x/
```
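To confirm the conversion worked, you can list the variables stored in the converted checkpoint. A minimal sketch (the `pre_tf2x/bert_model.ckpt` prefix is an assumption, matching the `--init_checkpoint` used by the training scripts below):
```python
# List variable names and shapes in the converted TF2 checkpoint.
import tensorflow as tf

reader = tf.train.load_checkpoint('pre_tf2x/bert_model.ckpt')
for name, shape in sorted(reader.get_variable_to_shape_map().items()):
    print(name, shape)
```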
#### 3.1.3 bert_class.sh
```
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
export MIOPEN_ENABLE_LOGGING_CMD=1
export ROCBLAS_LAYER=3
module unload compiler/rocm/2.9
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
python3 run_classifier.py \
--mode=train_and_eval \
--input_meta_data_path=/public/home/hepj/model/tf2.7.0_Bert/MNLI/meta_data \
--train_data_path=/public/home/hepj/model/tf2.7.0_Bert/MNLI/train.tf_record \
--eval_data_path=/public/home/hepj/model/tf2.7.0_Bert/MNLI/eval.tf_record \
--bert_config_file=/public/home/hepj/model/tf2.7.0_Bert/pre_tf2x/bert_config.json \
--init_checkpoint=/public/home/hepj/model/tf2.7.0_Bert/pre_tf2x/bert_model.ckpt \
--train_batch_size=320 \
--eval_batch_size=32 \
--steps_per_loop=1000 \
--learning_rate=2e-5 \
--num_train_epochs=3 \
--model_dir=/public/home/hepj/model/tf2/out1 \
--distribution_strategy=mirrored
```
#### 3.1.4 Run
```
sh bert_class.sh
```
### 3.2 Four-GPU Test (FP32)
#### 3.2.1. Data Conversion
Same as the single-GPU case (3.1.1).
#### 3.2.2. Model Conversion
Same as the single-GPU case (3.1.2).
#### 3.2.3. bert_class4.sh
```
# --train_batch_size here is the global train_batch_size
# Launching multi-GPU runs via mpirun still has some issues
export HIP_VISIBLE_DEVICES=0,1,2,3
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
module unload compiler/rocm/2.9
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
python3 run_classifier.py \
--mode=train_and_eval \
--input_meta_data_path=/public/home/hepj/model/tf2.7.0_Bert/MNLI/meta_data \
--train_data_path=/public/home/hepj/model/tf2.7.0_Bert/MNLI/train.tf_record \
--eval_data_path=/public/home/hepj/model/tf2.7.0_Bert/MNLI/eval.tf_record \
--bert_config_file=/public/home/hepj/model/tf2.7.0_Bert/pre_tf2x/bert_config.json \
--init_checkpoint=/public/home/hepj/model/tf2.7.0_Bert/pre_tf2x/bert_model.ckpt \
--train_batch_size=1280 \
--eval_batch_size=32 \
--steps_per_loop=10 \
--learning_rate=2e-5 \
--num_train_epochs=3 \
--num_gpus=4 \
--model_dir=/public/home/hepj/outdir/tf2/class4 \
--distribution_strategy=mirrored
```
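As the comment in the script notes, `--train_batch_size` is the global batch size: under `MirroredStrategy`, each replica processes `global_batch_size / num_replicas` examples per step. A minimal sketch of that relationship:
```python
# Global vs. per-replica batch size under MirroredStrategy.
import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()  # uses all visible GPUs
global_batch_size = 1280                     # the --train_batch_size above
per_replica = global_batch_size // strategy.num_replicas_in_sync
print(strategy.num_replicas_in_sync, per_replica)  # expect 4 and 320 on 4 GPUs
```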
#### 3.2.4. Run
```
sh bert_class4.sh
```
## 4. SQuAD 1.1 Question Answering Test
### 4.1. Single-GPU Test (FP32)
#### 4.1.1. Data Conversion
```
python3 create_finetuning_data.py \
--squad_data_file=/public/home/hepj/model/model_source/sq1.1/train-v1.1.json \
--vocab_file=/public/home/hepj/model_source/bert-large-uncased-TF2/uncased_L-24_H-1024_A-16/vocab.txt \
--train_data_output_path=/public/home/hepj/model/tf2.7.0_Bert/squad1.1/train_new.tf_record \
--meta_data_file_path=/public/home/hepj/model/tf2.7.0_Bert/squad1.1/meta_data_new \
--eval_data_output_path=/public/home/hepj/model/tf2.7.0_Bert/squad1.1/eval_new.tf_record \
--fine_tuning_task_type=squad \
--do_lower_case=False \
--max_seq_length=384
```
#### 4.1.2. Model Conversion
```
python3 tf2_encoder_checkpoint_converter.py \
--bert_config_file /public/home/hepj/model/model_source/uncased_L-24_H-1024_A-16/bert_config.json \
--checkpoint_to_convert /public/home/hepj/model/model_source/uncased_L-24_H-1024_A-16/bert_model.ckpt \
--converted_checkpoint_path /public/home/hepj/model_source/bert-large-uncased-TF2/
```
#### 4.1.3. bert_squad.sh
```
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
export MIOPEN_ENABLE_LOGGING_CMD=1
export ROCBLAS_LAYER=3
module unload compiler/rocm/2.9
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
python3 run_squad_xuan.py \
--mode=train_and_eval \
--vocab_file=/public/home/hepj/model/model_source/uncased_L-24_H-1024_A-16/vocab.txt \
--bert_config_file=/public/home/hepj/model/model_source/uncased_L-24_H-1024_A-16/bert_config.json \
--input_meta_data_path=/public/home/hepj/model/tf2.7.0_Bert/squad1.1/meta_data \
--train_data_path=/public/home/hepj/model/tf2.7.0_Bert/squad1.1/train.tf_record \
--predict_file=/public/home/hepj/model/model_source/sq1.1/dev-v1.1.json \
--init_checkpoint=/public/home/hepj/model_source/bert-large-uncased-TF2/bert_model.ckpt \
--train_batch_size=4 \
--predict_batch_size=4 \
--learning_rate=2e-5 \
--log_steps=1 \
--num_gpus=1 \
--distribution_strategy=mirrored \
--model_dir=/public/home/hepj/model/tf2/squad1 \
--run_eagerly=False
```
#### 4.1.4. Run
```
sh bert_squad.sh
```
### 4.2. Four-GPU Test (FP32)
#### 4.2.1. Data Conversion
Same as the single-GPU case (4.1.1).
#### 4.2.2. Model Conversion
Same as the single-GPU case (4.1.2).
#### 4.2.3. bert_squad4.sh
```
# --train_batch_size here is the global train_batch_size
# Launching multi-GPU runs via mpirun still has some issues
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
module unload compiler/rocm/2.9
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
export HIP_VISIBLE_DEVICES=0,1,2,3
python3 run_squad_xuan.py \
--mode=train_and_eval \
--vocab_file=/public/home/hepj/model/model_source/uncased_L-24_H-1024_A-16/vocab.txt \
--bert_config_file=/public/home/hepj/model/model_source/uncased_L-24_H-1024_A-16/bert_config.json \
--input_meta_data_path=/public/home/hepj/model/tf2.7.0_Bert/squad1.1/meta_data \
--train_data_path=/public/home/hepj/model/tf2.7.0_Bert/squad1.1/train.tf_record \
--predict_file=/public/home/hepj/model/model_source/sq1.1/dev-v1.1.json \
--init_checkpoint=/public/home/hepj/model_source/bert-large-uncased-TF2/bert_model.ckpt \
--train_batch_size=16 \
--predict_batch_size=4 \
--learning_rate=2e-5 \
--log_steps=1 \
--num_gpus=4 \
--distribution_strategy=mirrored \
--model_dir=/public/home/hepj/outdir/tf2/squad4 \
--run_eagerly=False
```
#### 4.2.4. Run
```
sh bert_squad4.sh
```
# BERT (Bidirectional Encoder Representations from Transformers)
The academic paper which describes BERT in detail and provides full results on a
number of tasks can be found here: https://arxiv.org/abs/1810.04805.
This repository contains a TensorFlow 2.x implementation of BERT.
## Contents
* [Contents](#contents)
* [Pre-trained Models](#pre-trained-models)
* [Restoring from Checkpoints](#restoring-from-checkpoints)
* [Set Up](#set-up)
* [Process Datasets](#process-datasets)
* [Fine-tuning with BERT](#fine-tuning-with-bert)
* [Cloud GPUs and TPUs](#cloud-gpus-and-tpus)
* [Sentence and Sentence-pair Classification Tasks](#sentence-and-sentence-pair-classification-tasks)
* [SQuAD 1.1](#squad-1.1)
## Pre-trained Models
We release both checkpoints and tf.hub modules as the pretrained models for
fine-tuning. They are TF 2.x compatible and converted from the checkpoints
released in the TF 1.x official BERT repository
[google-research/bert](https://github.com/google-research/bert)
in order to stay consistent with the BERT paper.
### Access to Pretrained Checkpoints
Pretrained checkpoints can be found in the following links:
**Note: We have switched the BERT implementation
to use Keras functional-style networks in [nlp/modeling](../modeling).
The new checkpoints are:**
* **[`BERT-Large, Uncased (Whole Word Masking)`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/wwm_uncased_L-24_H-1024_A-16.tar.gz)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Large, Cased (Whole Word Masking)`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/wwm_cased_L-24_H-1024_A-16.tar.gz)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Base, Uncased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/uncased_L-12_H-768_A-12.tar.gz)**:
12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Large, Uncased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16.tar.gz)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Base, Cased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/cased_L-12_H-768_A-12.tar.gz)**:
12-layer, 768-hidden, 12-heads , 110M parameters
* **[`BERT-Large, Cased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/cased_L-24_H-1024_A-16.tar.gz)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
We recommend hosting checkpoints on Google Cloud Storage buckets when you use
Cloud GPUs/TPUs.
### Restoring from Checkpoints
`tf.train.Checkpoint` is used to manage model checkpoints in TF 2. To restore
weights from provided pre-trained checkpoints, you can use the following code:
```python
init_checkpoint = 'path/to/pretrained/checkpoint'  # the pretrained model checkpoint path.
model = tf.keras.Model()  # BERT pre-trained model as feature extractor.
checkpoint = tf.train.Checkpoint(model=model)
checkpoint.restore(init_checkpoint)
```
Checkpoints featuring native serialized Keras models
(i.e. model.load()/load_weights()) will be available soon.
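`restore` also returns a status object; when you intentionally restore only a sub-model, a hedged way to silence warnings about objects that are absent from the checkpoint is `expect_partial()` (continuing the snippet above):
```python
status = checkpoint.restore(init_checkpoint)
status.expect_partial()  # suppress warnings for objects not in the checkpoint
```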
### Access to Pretrained Hub Modules
Pretrained tf.hub modules in TF 2.x SavedModel format can be found in the
following links:
* **[`BERT-Large, Uncased (Whole Word Masking)`](https://tfhub.dev/tensorflow/bert_en_wwm_uncased_L-24_H-1024_A-16/1)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Large, Cased (Whole Word Masking)`](https://tfhub.dev/tensorflow/bert_en_wwm_cased_L-24_H-1024_A-16/1)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Base, Uncased`](https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1)**:
12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Large, Uncased`](https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Base, Cased`](https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/1)**:
12-layer, 768-hidden, 12-heads , 110M parameters
* **[`BERT-Large, Cased`](https://tfhub.dev/tensorflow/bert_en_cased_L-24_H-1024_A-16/1)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Base, Multilingual Cased`](https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/1)**:
104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Base, Chinese`](https://tfhub.dev/tensorflow/bert_zh_L-12_H-768_A-12/1)**:
Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads,
110M parameters
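To consume one of these hub modules directly, you can wrap it in a `hub.KerasLayer`, mirroring what `squad_model` in this repository does; these v1 modules return `(pooled_output, sequence_output)` when called on the three int32 inputs. A minimal sketch (`max_seq_length=128` is an arbitrary choice):
```python
# Wrap a pretrained BERT hub module as a Keras layer.
import tensorflow as tf
import tensorflow_hub as hub

max_seq_length = 128
input_word_ids = tf.keras.layers.Input(
    shape=(max_seq_length,), dtype=tf.int32, name='input_word_ids')
input_mask = tf.keras.layers.Input(
    shape=(max_seq_length,), dtype=tf.int32, name='input_mask')
input_type_ids = tf.keras.layers.Input(
    shape=(max_seq_length,), dtype=tf.int32, name='input_type_ids')

bert_layer = hub.KerasLayer(
    'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1',
    trainable=True)
pooled_output, sequence_output = bert_layer(
    [input_word_ids, input_mask, input_type_ids])
```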
## Set Up
```shell
export PYTHONPATH="$PYTHONPATH:/path/to/models"
```
Install `tf-nightly` to get latest updates:
```shell
pip install tf-nightly-gpu
```
With a TPU, GPU support is not necessary. First, you need to create a `tf-nightly`
TPU with the [ctpu tool](https://github.com/tensorflow/tpu/tree/master/tools/ctpu):
```shell
ctpu up --name=<instance name> --tf-version="nightly"
```
Second, you need to install TF 2 `tf-nightly` on your VM:
```shell
pip install tf-nightly
```
## Process Datasets
### Pre-training
Generating pre-training data is unchanged. Please use the script
[`../data/create_pretraining_data.py`](../data/create_pretraining_data.py),
which is essentially branched from the [BERT research repo](https://github.com/google-research/bert)
and adapted for TF2 symbols and Python 3 compatibility, to get processed
pre-training data.
### Fine-tuning
To prepare the fine-tuning data for final model training, use the
[`../data/create_finetuning_data.py`](../data/create_finetuning_data.py) script.
The resulting datasets in `tf_record` format and the training meta data should
later be passed to the training or evaluation scripts. The task-specific
arguments are described in the following sections:
* GLUE
Users can download the
[GLUE data](https://gluebenchmark.com/tasks) by running
[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
and unpack it to some directory `$GLUE_DIR`.
Also, users can download a [pretrained checkpoint](#access-to-pretrained-checkpoints) and place it in some directory `$BERT_DIR` instead of using checkpoints on Google Cloud Storage.
```shell
export GLUE_DIR=~/glue
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export TASK_NAME=MNLI
export OUTPUT_DIR=gs://some_bucket/datasets
python ../data/create_finetuning_data.py \
--input_data_dir=${GLUE_DIR}/${TASK_NAME}/ \
--vocab_file=${BERT_DIR}/vocab.txt \
--train_data_output_path=${OUTPUT_DIR}/${TASK_NAME}_train.tf_record \
--eval_data_output_path=${OUTPUT_DIR}/${TASK_NAME}_eval.tf_record \
--meta_data_file_path=${OUTPUT_DIR}/${TASK_NAME}_meta_data \
--fine_tuning_task_type=classification --max_seq_length=128 \
--classification_task_name=${TASK_NAME}
```
* SQUAD
The [SQuAD website](https://rajpurkar.github.io/SQuAD-explorer/) contains
detailed information about the SQuAD datasets and evaluation.
The necessary files can be found here:
* [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
* [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
* [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
* [train-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json)
* [dev-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json)
* [evaluate-v2.0.py](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)
```shell
export SQUAD_DIR=~/squad
export SQUAD_VERSION=v1.1
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export OUTPUT_DIR=gs://some_bucket/datasets
python ../data/create_finetuning_data.py \
--squad_data_file=${SQUAD_DIR}/train-${SQUAD_VERSION}.json \
--vocab_file=${BERT_DIR}/vocab.txt \
--train_data_output_path=${OUTPUT_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
--meta_data_file_path=${OUTPUT_DIR}/squad_${SQUAD_VERSION}_meta_data \
--fine_tuning_task_type=squad --max_seq_length=384
```
## Fine-tuning with BERT
### Cloud GPUs and TPUs
* Cloud Storage
The unzipped pre-trained model files can also be found in the Google Cloud
Storage folder `gs://cloud-tpu-checkpoints/bert/keras_bert`. For example:
```shell
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export MODEL_DIR=gs://some_bucket/my_output_dir
```
Currently, users are able to access `tf-nightly` TPUs, and the following TPU
script should run with `tf-nightly`.
* GPU -> TPU
Just add the following flags to `run_classifier.py` or `run_squad.py`:
```shell
--distribution_strategy=tpu
--tpu=grpc://${TPU_IP_ADDRESS}:8470
```
### Sentence and Sentence-pair Classification Tasks
This example code fine-tunes `BERT-Large` on the Microsoft Research Paraphrase
Corpus (MRPC), which contains only 3,600 examples and can be fine-tuned in a
few minutes on most GPUs.
We use the `BERT-Large` (uncased_L-24_H-1024_A-16) as an example throughout the
workflow.
For GPU memory of 16GB or smaller, you may try to use `BERT-Base`
(uncased_L-12_H-768_A-12).
```shell
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export MODEL_DIR=gs://some_bucket/my_output_dir
export GLUE_DIR=gs://some_bucket/datasets
export TASK=MRPC
python run_classifier.py \
--mode='train_and_eval' \
--input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
--train_data_path=${GLUE_DIR}/${TASK}_train.tf_record \
--eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
--bert_config_file=${BERT_DIR}/bert_config.json \
--init_checkpoint=${BERT_DIR}/bert_model.ckpt \
--train_batch_size=4 \
--eval_batch_size=4 \
--steps_per_loop=1 \
--learning_rate=2e-5 \
--num_train_epochs=3 \
--model_dir=${MODEL_DIR} \
--distribution_strategy=mirrored
```
Alternatively, instead of specifying `init_checkpoint`, you can specify
`hub_module_url` to employ a pretrained BERT hub module, e.g.,
` --hub_module_url=https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1`.
After training a model, to get predictions from the classifier, you can set
`--mode=predict` and pass the test set tfrecords to `--eval_data_path`.
Output will be written to a file called `test_results.tsv` in the output
folder; each line contains the output for one sample, and the columns are the
class probabilities.
```shell
python run_classifier.py \
--mode='predict' \
--input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
--eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
--bert_config_file=${BERT_DIR}/bert_config.json \
--eval_batch_size=4 \
--model_dir=${MODEL_DIR} \
--distribution_strategy=mirrored
```
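To consume `test_results.tsv`, a minimal sketch that reads the tab-separated class probabilities and picks the argmax label for each sample (assuming, per the description above, one row per example and no header):
```python
# Read test_results.tsv and take the argmax class per example.
import csv

with open('test_results.tsv') as f:
    for row in csv.reader(f, delimiter='\t'):
        probs = [float(p) for p in row]
        print(probs.index(max(probs)), probs)
```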
To use TPU, you only need to switch distribution strategy type to `tpu` with TPU
information and use remote storage for model checkpoints.
```shell
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export TPU_IP_ADDRESS='???'
export MODEL_DIR=gs://some_bucket/my_output_dir
export GLUE_DIR=gs://some_bucket/datasets
export TASK=MRPC
python run_classifier.py \
--mode='train_and_eval' \
--input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
--train_data_path=${GLUE_DIR}/${TASK}_train.tf_record \
--eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
--bert_config_file=${BERT_DIR}/bert_config.json \
--init_checkpoint=${BERT_DIR}/bert_model.ckpt \
--train_batch_size=32 \
--eval_batch_size=32 \
--steps_per_loop=1000 \
--learning_rate=2e-5 \
--num_train_epochs=3 \
--model_dir=${MODEL_DIR} \
--distribution_strategy=tpu \
--tpu=grpc://${TPU_IP_ADDRESS}:8470
```
Note that we specify `steps_per_loop=1000` for TPU, because running a loop of
training steps inside a `tf.function` can significantly increase TPU
utilization; callbacks will not be called inside the loop.
### SQuAD 1.1
The Stanford Question Answering Dataset (SQuAD) is a popular question answering
benchmark dataset. See more in [SQuAD website](https://rajpurkar.github.io/SQuAD-explorer/).
We use the `BERT-Large` (uncased_L-24_H-1024_A-16) as an example throughout the
workflow.
For GPU memory of 16GB or smaller, you may try to use `BERT-Base`
(uncased_L-12_H-768_A-12).
```shell
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export SQUAD_DIR=gs://some_bucket/datasets
export MODEL_DIR=gs://some_bucket/my_output_dir
export SQUAD_VERSION=v1.1
python run_squad.py \
--input_meta_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_meta_data \
--train_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
--predict_file=${SQUAD_DIR}/dev-v1.1.json \
--vocab_file=${BERT_DIR}/vocab.txt \
--bert_config_file=${BERT_DIR}/bert_config.json \
--init_checkpoint=${BERT_DIR}/bert_model.ckpt \
--train_batch_size=4 \
--predict_batch_size=4 \
--learning_rate=8e-5 \
--num_train_epochs=2 \
--model_dir=${MODEL_DIR} \
--distribution_strategy=mirrored
```
Similarly, you can replace the `init_checkpoint` FLAG with `hub_module_url` to
specify a hub module path.
`run_squad.py` writes the predictions for `--predict_file` by default. If you
set `--mode=predict` and provide the SQuAD test data, the script will generate
the prediction json file.
To use TPU, you need to switch the distribution strategy type to `tpu` with TPU
information.
```shell
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export TPU_IP_ADDRESS='???'
export MODEL_DIR=gs://some_bucket/my_output_dir
export SQUAD_DIR=gs://some_bucket/datasets
export SQUAD_VERSION=v1.1
python run_squad.py \
--input_meta_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_meta_data \
--train_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
--predict_file=${SQUAD_DIR}/dev-v1.1.json \
--vocab_file=${BERT_DIR}/vocab.txt \
--bert_config_file=${BERT_DIR}/bert_config.json \
--init_checkpoint=${BERT_DIR}/bert_model.ckpt \
--train_batch_size=32 \
--learning_rate=8e-5 \
--num_train_epochs=2 \
--model_dir=${MODEL_DIR} \
--distribution_strategy=tpu \
--tpu=grpc://${TPU_IP_ADDRESS}:8470
```
The dev set predictions will be saved into a file called `predictions.json` in
the `model_dir`. To evaluate them:
```shell
python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ./squad/predictions.json
```
# BERT FineTuning with Cloud TPU: Sentence and Sentence-Pair Classification Tasks (TF 2.1)
This tutorial shows you how to train the Bidirectional Encoder Representations from Transformers (BERT) model on Cloud TPU.
## Set up Cloud Storage and Compute Engine VM
1. [Open a cloud shell window](https://console.cloud.google.com/?cloudshell=true&_ga=2.11844148.-1612541229.1552429951)
2. Create a variable for the project's name:
```
export PROJECT_NAME=your-project-name
```
3. Configure the `gcloud` command-line tool to use the project where you want to create the Cloud TPU.
```
gcloud config set project ${PROJECT_NAME}
```
4. Create a Cloud Storage bucket using the following command:
```
gsutil mb -p ${PROJECT_NAME} -c standard -l europe-west4 -b on gs://your-bucket-name
```
This Cloud Storage bucket stores the data you use to train your model and the training results.
5. Launch a Compute Engine VM and Cloud TPU using the `ctpu up` command.
```
ctpu up --tpu-size=v3-8 \
--machine-type=n1-standard-8 \
--zone=europe-west4-a \
--tf-version=2.1 [optional flags: --project, --name]
```
6. The configuration you specified appears. Enter `y` to approve or `n` to cancel.
7. When the `ctpu up` command has finished executing, verify that your shell prompt has changed from `username@project` to `username@tpuname`. This change shows that you are now logged into your Compute Engine VM.
```
gcloud compute ssh vm-name --zone=europe-west4-a
(vm)$ export TPU_NAME=vm-name
```
As you continue these instructions, run each command that begins with `(vm)$` in your VM session window.
## Prepare the Dataset
1. From your Compute Engine virtual machine (VM), install the dependencies listed in requirements.txt.
```
(vm)$ cd /usr/share/models
(vm)$ sudo pip3 install -r official/requirements.txt
```
2. Optional: download `download_glue_data.py`.
This tutorial uses the General Language Understanding Evaluation (GLUE) benchmark to evaluate and analyze the performance of the model. The GLUE data is provided for this tutorial at gs://cloud-tpu-checkpoints/bert/classification.
## Define parameter values
Next, define several parameter values that are required when you train and evaluate your model:
```
(vm)$ export PYTHONPATH="$PYTHONPATH:/usr/share/tpu/models"
(vm)$ export STORAGE_BUCKET=gs://your-bucket-name
(vm)$ export BERT_BASE_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
(vm)$ export MODEL_DIR=${STORAGE_BUCKET}/bert-output
(vm)$ export GLUE_DIR=gs://cloud-tpu-checkpoints/bert/classification
(vm)$ export TASK=mnli
```
## Train the model
From your Compute Engine VM, run the following command.
```
(vm)$ python3 official/nlp/bert/run_classifier.py \
--mode='train_and_eval' \
--input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
--train_data_path=${GLUE_DIR}/${TASK}_train.tf_record \
--eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
--bert_config_file=$BERT_BASE_DIR/bert_config.json \
--init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
--train_batch_size=32 \
--eval_batch_size=32 \
--learning_rate=2e-5 \
--num_train_epochs=3 \
--model_dir=${MODEL_DIR} \
--distribution_strategy=tpu \
--tpu=${TPU_NAME}
```
## Verify your results
The training takes approximately 1 hour on a v3-8 TPU. When the script completes, you should see results similar to the following:
```
Training Summary:
{'train_loss': 0.28142181038856506,
'last_train_metrics': 0.9467429518699646,
'eval_metrics': 0.8599063158035278,
'total_training_steps': 36813}
```
## Clean up
To avoid incurring charges to your GCP account for the resources used in this topic:
1. Disconnect from the Compute Engine VM:
```
(vm)$ exit
```
2. In your Cloud Shell, run `ctpu delete` with the `--zone` flag you used when you set up the Cloud TPU to delete your Compute Engine VM and your Cloud TPU:
```
$ ctpu delete --zone=your-zone
```
3. Run `ctpu status`, specifying your zone, to make sure you have no instances allocated and to avoid unnecessary charges for TPU usage. The deletion might take several minutes. A response like the one below indicates there are no more allocated instances:
```
$ ctpu status --zone=your-zone
```
4. Run `gsutil` as shown, replacing `your-bucket` with the name of the Cloud Storage bucket you created for this tutorial:
```
$ gsutil rm -r gs://your-bucket
```
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""BERT models that are compatible with TF 2.0."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gin
import tensorflow as tf
import tensorflow_hub as hub
from official.modeling import tf_utils
from official.nlp.albert import configs as albert_configs
from official.nlp.bert import configs
from official.nlp.modeling import models
from official.nlp.modeling import networks
class BertPretrainLossAndMetricLayer(tf.keras.layers.Layer):
"""Returns layer that computes custom loss and metrics for pretraining."""
def __init__(self, vocab_size, **kwargs):
super(BertPretrainLossAndMetricLayer, self).__init__(**kwargs)
self._vocab_size = vocab_size
self.config = {
'vocab_size': vocab_size,
}
def _add_metrics(self, lm_output, lm_labels, lm_label_weights,
lm_example_loss, sentence_output, sentence_labels,
next_sentence_loss):
"""Adds metrics."""
masked_lm_accuracy = tf.keras.metrics.sparse_categorical_accuracy(
lm_labels, lm_output)
numerator = tf.reduce_sum(masked_lm_accuracy * lm_label_weights)
denominator = tf.reduce_sum(lm_label_weights) + 1e-5
masked_lm_accuracy = numerator / denominator
self.add_metric(
masked_lm_accuracy, name='masked_lm_accuracy', aggregation='mean')
self.add_metric(lm_example_loss, name='lm_example_loss', aggregation='mean')
if sentence_labels is not None:
next_sentence_accuracy = tf.keras.metrics.sparse_categorical_accuracy(
sentence_labels, sentence_output)
self.add_metric(
next_sentence_accuracy,
name='next_sentence_accuracy',
aggregation='mean')
if next_sentence_loss is not None:
self.add_metric(
next_sentence_loss, name='next_sentence_loss', aggregation='mean')
def call(self,
lm_output_logits,
sentence_output_logits,
lm_label_ids,
lm_label_weights,
sentence_labels=None):
"""Implements call() for the layer."""
lm_label_weights = tf.cast(lm_label_weights, tf.float32)
lm_output_logits = tf.cast(lm_output_logits, tf.float32)
lm_prediction_losses = tf.keras.losses.sparse_categorical_crossentropy(
lm_label_ids, lm_output_logits, from_logits=True)
lm_numerator_loss = tf.reduce_sum(lm_prediction_losses * lm_label_weights)
lm_denominator_loss = tf.reduce_sum(lm_label_weights)
mask_label_loss = tf.math.divide_no_nan(lm_numerator_loss,
lm_denominator_loss)
if sentence_labels is not None:
sentence_output_logits = tf.cast(sentence_output_logits, tf.float32)
sentence_loss = tf.keras.losses.sparse_categorical_crossentropy(
sentence_labels, sentence_output_logits, from_logits=True)
sentence_loss = tf.reduce_mean(sentence_loss)
loss = mask_label_loss + sentence_loss
else:
sentence_loss = None
loss = mask_label_loss
batch_shape = tf.slice(tf.shape(lm_label_ids), [0], [1])
# TODO(hongkuny): Avoids the hack and switches add_loss.
final_loss = tf.fill(batch_shape, loss)
self._add_metrics(lm_output_logits, lm_label_ids, lm_label_weights,
mask_label_loss, sentence_output_logits, sentence_labels,
sentence_loss)
return final_loss
@gin.configurable
def get_transformer_encoder(bert_config,
sequence_length,
transformer_encoder_cls=None,
output_range=None):
"""Gets a 'TransformerEncoder' object.
Args:
bert_config: A 'modeling.BertConfig' or 'modeling.AlbertConfig' object.
sequence_length: Maximum sequence length of the training data.
transformer_encoder_cls: A EncoderScaffold class. If it is None, uses the
default BERT encoder implementation.
output_range: the sequence output range, [0, output_range). Default setting
is to return the entire sequence output.
Returns:
A networks.TransformerEncoder object.
"""
if transformer_encoder_cls is not None:
# TODO(hongkuny): evaluate if it is better to put cfg definition in gin.
embedding_cfg = dict(
vocab_size=bert_config.vocab_size,
type_vocab_size=bert_config.type_vocab_size,
hidden_size=bert_config.hidden_size,
seq_length=sequence_length,
max_seq_length=bert_config.max_position_embeddings,
initializer=tf.keras.initializers.TruncatedNormal(
stddev=bert_config.initializer_range),
dropout_rate=bert_config.hidden_dropout_prob,
)
hidden_cfg = dict(
num_attention_heads=bert_config.num_attention_heads,
intermediate_size=bert_config.intermediate_size,
intermediate_activation=tf_utils.get_activation(bert_config.hidden_act),
dropout_rate=bert_config.hidden_dropout_prob,
attention_dropout_rate=bert_config.attention_probs_dropout_prob,
kernel_initializer=tf.keras.initializers.TruncatedNormal(
stddev=bert_config.initializer_range),
)
kwargs = dict(
embedding_cfg=embedding_cfg,
hidden_cfg=hidden_cfg,
num_hidden_instances=bert_config.num_hidden_layers,
pooled_output_dim=bert_config.hidden_size,
pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(
stddev=bert_config.initializer_range))
# Relies on gin configuration to define the Transformer encoder arguments.
return transformer_encoder_cls(**kwargs)
kwargs = dict(
vocab_size=bert_config.vocab_size,
hidden_size=bert_config.hidden_size,
num_layers=bert_config.num_hidden_layers,
num_attention_heads=bert_config.num_attention_heads,
intermediate_size=bert_config.intermediate_size,
activation=tf_utils.get_activation(bert_config.hidden_act),
dropout_rate=bert_config.hidden_dropout_prob,
attention_dropout_rate=bert_config.attention_probs_dropout_prob,
sequence_length=sequence_length,
max_sequence_length=bert_config.max_position_embeddings,
type_vocab_size=bert_config.type_vocab_size,
embedding_width=bert_config.embedding_size,
initializer=tf.keras.initializers.TruncatedNormal(
stddev=bert_config.initializer_range))
if isinstance(bert_config, albert_configs.AlbertConfig):
return networks.AlbertTransformerEncoder(**kwargs)
else:
assert isinstance(bert_config, configs.BertConfig)
kwargs['output_range'] = output_range
return networks.TransformerEncoder(**kwargs)
def pretrain_model(bert_config,
seq_length,
max_predictions_per_seq,
initializer=None,
use_next_sentence_label=True,
return_core_pretrainer_model=False):
"""Returns model to be used for pre-training.
Args:
bert_config: Configuration that defines the core BERT model.
seq_length: Maximum sequence length of the training data.
max_predictions_per_seq: Maximum number of tokens in sequence to mask out
and use for pretraining.
initializer: Initializer for weights in BertPretrainer.
use_next_sentence_label: Whether to use the next sentence label.
return_core_pretrainer_model: Whether to also return the `BertPretrainer`
object.
Returns:
A Tuple of (1) Pretraining model, (2) core BERT submodel from which to
save weights after pretraining, and (3) optional core `BertPretrainer`
object if argument `return_core_pretrainer_model` is True.
"""
input_word_ids = tf.keras.layers.Input(
shape=(seq_length,), name='input_word_ids', dtype=tf.int32)
input_mask = tf.keras.layers.Input(
shape=(seq_length,), name='input_mask', dtype=tf.int32)
input_type_ids = tf.keras.layers.Input(
shape=(seq_length,), name='input_type_ids', dtype=tf.int32)
masked_lm_positions = tf.keras.layers.Input(
shape=(max_predictions_per_seq,),
name='masked_lm_positions',
dtype=tf.int32)
masked_lm_ids = tf.keras.layers.Input(
shape=(max_predictions_per_seq,), name='masked_lm_ids', dtype=tf.int32)
masked_lm_weights = tf.keras.layers.Input(
shape=(max_predictions_per_seq,),
name='masked_lm_weights',
dtype=tf.int32)
if use_next_sentence_label:
next_sentence_labels = tf.keras.layers.Input(
shape=(1,), name='next_sentence_labels', dtype=tf.int32)
else:
next_sentence_labels = None
transformer_encoder = get_transformer_encoder(bert_config, seq_length)
if initializer is None:
initializer = tf.keras.initializers.TruncatedNormal(
stddev=bert_config.initializer_range)
pretrainer_model = models.BertPretrainer(
network=transformer_encoder,
embedding_table=transformer_encoder.get_embedding_table(),
num_classes=2, # The next sentence prediction label has two classes.
activation=tf_utils.get_activation(bert_config.hidden_act),
num_token_predictions=max_predictions_per_seq,
initializer=initializer,
output='logits')
outputs = pretrainer_model(
[input_word_ids, input_mask, input_type_ids, masked_lm_positions])
lm_output = outputs['masked_lm']
sentence_output = outputs['classification']
pretrain_loss_layer = BertPretrainLossAndMetricLayer(
vocab_size=bert_config.vocab_size)
output_loss = pretrain_loss_layer(lm_output, sentence_output, masked_lm_ids,
masked_lm_weights, next_sentence_labels)
inputs = {
'input_word_ids': input_word_ids,
'input_mask': input_mask,
'input_type_ids': input_type_ids,
'masked_lm_positions': masked_lm_positions,
'masked_lm_ids': masked_lm_ids,
'masked_lm_weights': masked_lm_weights,
}
if use_next_sentence_label:
inputs['next_sentence_labels'] = next_sentence_labels
keras_model = tf.keras.Model(inputs=inputs, outputs=output_loss)
if return_core_pretrainer_model:
return keras_model, transformer_encoder, pretrainer_model
else:
return keras_model, transformer_encoder
def squad_model(bert_config,
max_seq_length,
initializer=None,
hub_module_url=None,
hub_module_trainable=True):
"""Returns BERT Squad model along with core BERT model to import weights.
Args:
bert_config: BertConfig, the config defines the core Bert model.
max_seq_length: integer, the maximum input sequence length.
initializer: Initializer for the final dense layer in the span labeler.
Defaulted to TruncatedNormal initializer.
hub_module_url: TF-Hub path/url to Bert module.
hub_module_trainable: True to finetune layers in the hub module.
Returns:
A tuple of (1) keras model that outputs start logits and end logits and
(2) the core BERT transformer encoder.
"""
if initializer is None:
initializer = tf.keras.initializers.TruncatedNormal(
stddev=bert_config.initializer_range)
if not hub_module_url:
bert_encoder = get_transformer_encoder(bert_config, max_seq_length)
return models.BertSpanLabeler(
network=bert_encoder, initializer=initializer), bert_encoder
input_word_ids = tf.keras.layers.Input(
shape=(max_seq_length,), dtype=tf.int32, name='input_word_ids')
input_mask = tf.keras.layers.Input(
shape=(max_seq_length,), dtype=tf.int32, name='input_mask')
input_type_ids = tf.keras.layers.Input(
shape=(max_seq_length,), dtype=tf.int32, name='input_type_ids')
core_model = hub.KerasLayer(hub_module_url, trainable=hub_module_trainable)
pooled_output, sequence_output = core_model(
[input_word_ids, input_mask, input_type_ids])
bert_encoder = tf.keras.Model(
inputs={
'input_word_ids': input_word_ids,
'input_mask': input_mask,
'input_type_ids': input_type_ids,
},
outputs=[sequence_output, pooled_output],
name='core_model')
return models.BertSpanLabeler(
network=bert_encoder, initializer=initializer), bert_encoder
def classifier_model(bert_config,
num_labels,
max_seq_length=None,
final_layer_initializer=None,
hub_module_url=None,
hub_module_trainable=True):
"""BERT classifier model in functional API style.
Construct a Keras model for predicting `num_labels` outputs from an input with
maximum sequence length `max_seq_length`.
Args:
bert_config: BertConfig or AlbertConfig, the config defines the core BERT or
ALBERT model.
num_labels: integer, the number of classes.
max_seq_length: integer, the maximum input sequence length.
final_layer_initializer: Initializer for final dense layer. Defaulted
TruncatedNormal initializer.
hub_module_url: TF-Hub path/url to Bert module.
hub_module_trainable: True to finetune layers in the hub module.
Returns:
Combined prediction model (words, mask, type) -> (one-hot labels)
BERT sub-model (words, mask, type) -> (bert_outputs)
"""
if final_layer_initializer is not None:
initializer = final_layer_initializer
else:
initializer = tf.keras.initializers.TruncatedNormal(
stddev=bert_config.initializer_range)
if not hub_module_url:
bert_encoder = get_transformer_encoder(
bert_config, max_seq_length, output_range=1)
return models.BertClassifier(
bert_encoder,
num_classes=num_labels,
dropout_rate=bert_config.hidden_dropout_prob,
initializer=initializer), bert_encoder
input_word_ids = tf.keras.layers.Input(
shape=(max_seq_length,), dtype=tf.int32, name='input_word_ids')
input_mask = tf.keras.layers.Input(
shape=(max_seq_length,), dtype=tf.int32, name='input_mask')
input_type_ids = tf.keras.layers.Input(
shape=(max_seq_length,), dtype=tf.int32, name='input_type_ids')
bert_model = hub.KerasLayer(hub_module_url, trainable=hub_module_trainable)
pooled_output, _ = bert_model([input_word_ids, input_mask, input_type_ids])
output = tf.keras.layers.Dropout(rate=bert_config.hidden_dropout_prob)(
pooled_output)
output = tf.keras.layers.Dense(
num_labels, kernel_initializer=initializer, name='output')(
output)
return tf.keras.Model(
inputs={
'input_word_ids': input_word_ids,
'input_mask': input_mask,
'input_type_ids': input_type_ids
},
outputs=output), bert_model
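A minimal usage sketch for the builders above, assuming the `official` package is on `PYTHONPATH` and a `bert_config.json` is available locally (`num_labels=3` matches MNLI):
```python
# Build a BERT classifier and its core encoder from a config file.
from official.nlp.bert import bert_models
from official.nlp.bert import configs

bert_config = configs.BertConfig.from_json_file('bert_config.json')
classifier, encoder = bert_models.classifier_model(
    bert_config, num_labels=3, max_seq_length=128)
classifier.summary()
```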
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from official.nlp.bert import bert_models
from official.nlp.bert import configs as bert_configs
from official.nlp.modeling import networks
class BertModelsTest(tf.test.TestCase):
def setUp(self):
super(BertModelsTest, self).setUp()
self._bert_test_config = bert_configs.BertConfig(
attention_probs_dropout_prob=0.0,
hidden_act='gelu',
hidden_dropout_prob=0.0,
hidden_size=16,
initializer_range=0.02,
intermediate_size=32,
max_position_embeddings=128,
num_attention_heads=2,
num_hidden_layers=2,
type_vocab_size=2,
vocab_size=30522)
def test_pretrain_model(self):
model, encoder = bert_models.pretrain_model(
self._bert_test_config,
seq_length=5,
max_predictions_per_seq=2,
initializer=None,
use_next_sentence_label=True)
self.assertIsInstance(model, tf.keras.Model)
self.assertIsInstance(encoder, networks.TransformerEncoder)
# model has one scalar output: loss value.
self.assertEqual(model.output.shape.as_list(), [None,])
# Expect two output from encoder: sequence and classification output.
self.assertIsInstance(encoder.output, list)
self.assertLen(encoder.output, 2)
# shape should be [batch size, seq_length, hidden_size]
self.assertEqual(encoder.output[0].shape.as_list(), [None, 5, 16])
# shape should be [batch size, hidden_size]
self.assertEqual(encoder.output[1].shape.as_list(), [None, 16])
def test_squad_model(self):
model, core_model = bert_models.squad_model(
self._bert_test_config,
max_seq_length=5,
initializer=None,
hub_module_url=None,
hub_module_trainable=None)
self.assertIsInstance(model, tf.keras.Model)
self.assertIsInstance(core_model, tf.keras.Model)
# Expect two output from model: start positions and end positions
self.assertIsInstance(model.output, list)
self.assertLen(model.output, 2)
# shape should be [batch size, seq_length]
self.assertEqual(model.output[0].shape.as_list(), [None, 5])
# shape should be [batch size, seq_length]
self.assertEqual(model.output[1].shape.as_list(), [None, 5])
# Expect two output from core_model: sequence and classification output.
self.assertIsInstance(core_model.output, list)
self.assertLen(core_model.output, 2)
# shape should be [batch size, seq_length, hidden_size]
self.assertEqual(core_model.output[0].shape.as_list(), [None, 5, 16])
# shape should be [batch size, hidden_size]
self.assertEqual(core_model.output[1].shape.as_list(), [None, 16])
def test_classifier_model(self):
model, core_model = bert_models.classifier_model(
self._bert_test_config,
num_labels=3,
max_seq_length=5,
final_layer_initializer=None,
hub_module_url=None,
hub_module_trainable=None)
self.assertIsInstance(model, tf.keras.Model)
self.assertIsInstance(core_model, tf.keras.Model)
# model has one classification output with num_labels=3.
self.assertEqual(model.output.shape.as_list(), [None, 3])
# Expect two output from core_model: sequence and classification output.
self.assertIsInstance(core_model.output, list)
self.assertLen(core_model.output, 2)
# shape should be [batch size, 1, hidden_size]
self.assertEqual(core_model.output[0].shape.as_list(), [None, 1, 16])
# shape should be [batch size, hidden_size]
self.assertEqual(core_model.output[1].shape.as_list(), [None, 16])
if __name__ == '__main__':
tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Defining common flags used across all BERT models/applications."""
from absl import flags
import tensorflow as tf
from official.utils import hyperparams_flags
from official.utils.flags import core as flags_core
def define_common_bert_flags():
"""Define common flags for BERT tasks."""
flags_core.define_base(
data_dir=False,
model_dir=True,
clean=False,
train_epochs=False,
epochs_between_evals=False,
stop_threshold=False,
batch_size=False,
num_gpu=True,
export_dir=False,
distribution_strategy=True,
run_eagerly=True)
flags_core.define_distribution()
flags.DEFINE_string('bert_config_file', None,
'Bert configuration file to define core bert layers.')
flags.DEFINE_string(
'model_export_path', None,
'Path to the directory, where the trained model will be '
'exported.')
flags.DEFINE_string('tpu', '', 'TPU address to connect to.')
flags.DEFINE_string(
'init_checkpoint', None,
'Initial checkpoint (usually from a pre-trained BERT model).')
flags.DEFINE_integer('num_train_epochs', 3,
'Total number of training epochs to perform.')
flags.DEFINE_integer(
'steps_per_loop', None,
'Number of steps per graph-mode loop. Only training step '
'happens inside the loop. Callbacks will not be called '
'inside. If not set the value will be configured depending on the '
'devices available.')
flags.DEFINE_float('learning_rate', 5e-5,
'The initial learning rate for Adam.')
flags.DEFINE_float('end_lr', 0.0,
'The end learning rate for learning rate decay.')
flags.DEFINE_string('optimizer_type', 'adamw',
'The type of optimizer to use for training (adamw|lamb)')
flags.DEFINE_boolean(
'scale_loss', False,
'Whether to divide the loss by number of replica inside the per-replica '
'loss function.')
flags.DEFINE_boolean(
'use_keras_compile_fit', False,
'If True, uses Keras compile/fit() API for training logic. Otherwise '
'use custom training loop.')
flags.DEFINE_string(
'hub_module_url', None, 'TF-Hub path/url to Bert module. '
'If specified, init_checkpoint flag should not be used.')
flags.DEFINE_bool('hub_module_trainable', True,
'True to make keras layers in the hub module trainable.')
flags.DEFINE_string('sub_model_export_name', None,
'If set, `sub_model` checkpoints are exported into '
'FLAGS.model_dir/FLAGS.sub_model_export_name.')
flags_core.define_log_steps()
# Adds flags for mixed precision and multi-worker training.
flags_core.define_performance(
num_parallel_calls=False,
inter_op=False,
intra_op=False,
synthetic_data=False,
max_train_steps=False,
dtype=True,
dynamic_loss_scale=True,
loss_scale=True,
all_reduce_alg=True,
num_packs=False,
tf_gpu_thread_mode=True,
datasets_num_private_threads=True,
enable_xla=True,
fp16_implementation=True,
)
# Adds gin configuration flags.
hyperparams_flags.define_gin_flags()
def dtype():
return flags_core.get_tf_dtype(flags.FLAGS)
def use_float16():
return flags_core.get_tf_dtype(flags.FLAGS) == tf.float16
def use_graph_rewrite():
return flags.FLAGS.fp16_implementation == 'graph_rewrite'
def get_loss_scale():
return flags_core.get_loss_scale(flags.FLAGS, default_for_fp16='dynamic')
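A minimal sketch of wiring these flags into an entry point (the `main` body is a placeholder; assumes the `official` package is on `PYTHONPATH`):
```python
# Minimal entry point that registers the common BERT flags.
from absl import app
from absl import flags
from official.nlp.bert import common_flags

common_flags.define_common_bert_flags()
FLAGS = flags.FLAGS

def main(_):
    print('model_dir:', FLAGS.model_dir)
    print('learning_rate:', FLAGS.learning_rate)

if __name__ == '__main__':
    app.run(main)
```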
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""The main BERT model and related functions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import copy
import json
import six
import tensorflow as tf
class BertConfig(object):
"""Configuration for `BertModel`."""
def __init__(self,
vocab_size,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
initializer_range=0.02,
embedding_size=None,
backward_compatible=True):
"""Constructs BertConfig.
Args:
vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
hidden_size: Size of the encoder layers and the pooler layer.
num_hidden_layers: Number of hidden layers in the Transformer encoder.
num_attention_heads: Number of attention heads for each attention layer in
the Transformer encoder.
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler.
hidden_dropout_prob: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.
max_position_embeddings: The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
`BertModel`.
initializer_range: The stdev of the truncated_normal_initializer for
initializing all weight matrices.
embedding_size: (Optional) width of the factorized word embeddings.
backward_compatible: Boolean, whether the variables shape are compatible
with checkpoints converted from TF 1.x BERT.
"""
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.embedding_size = embedding_size
self.backward_compatible = backward_compatible
@classmethod
def from_dict(cls, json_object):
"""Constructs a `BertConfig` from a Python dictionary of parameters."""
config = BertConfig(vocab_size=None)
for (key, value) in six.iteritems(json_object):
config.__dict__[key] = value
return config
@classmethod
def from_json_file(cls, json_file):
"""Constructs a `BertConfig` from a json file of parameters."""
with tf.io.gfile.GFile(json_file, "r") as reader:
text = reader.read()
return cls.from_dict(json.loads(text))
def to_dict(self):
"""Serializes this instance to a Python dictionary."""
output = copy.deepcopy(self.__dict__)
return output
def to_json_string(self):
"""Serializes this instance to a JSON string."""
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
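A short usage sketch for `BertConfig` (assumes a standard `bert_config.json` shipped with one of the pretrained checkpoints):
```python
# Round-trip a BertConfig through JSON.
config = BertConfig.from_json_file('bert_config.json')
print(config.hidden_size, config.num_hidden_layers)
print(config.to_json_string())
```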
nohup: ignoring input
/public/home/xuanbaby/rocm3.9-python3.6.8-tf1.15/bin/python3
/public/home/xuanbaby/rocm3.9-python3.6.8-tf1.15/lib/python3.6/site-packages/absl/flags/_validators.py:359: UserWarning: Flag --model_dir has a non-None default value; therefore, mark_flag_as_required will pass even if flag is not specified in the command line!
'command line!' % flag_name)
2021-04-20 13:52:19.004111: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libamdhip64.so
2021-04-20 13:52:21.340214: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1734] Found device 0 with properties:
pciBusID: 0000:04:00.0 name: Device 66a1 ROCm AMD GPU ISA: gfx906
coreClock: 1.7GHz coreCount: 64 deviceMemorySize: 15.98GiB deviceMemoryBandwidth: 953.67GiB/s
2021-04-20 13:52:21.340441: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1734] Found device 1 with properties:
pciBusID: 0000:26:00.0 name: Device 66a1 ROCm AMD GPU ISA: gfx906
coreClock: 1.7GHz coreCount: 64 deviceMemorySize: 15.98GiB deviceMemoryBandwidth: 953.67GiB/s
2021-04-20 13:52:21.340561: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1734] Found device 2 with properties:
pciBusID: 0000:43:00.0 name: Device 66a1 ROCm AMD GPU ISA: gfx906
coreClock: 1.7GHz coreCount: 64 deviceMemorySize: 15.98GiB deviceMemoryBandwidth: 953.67GiB/s
2021-04-20 13:52:21.340675: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1734] Found device 3 with properties:
pciBusID: 0000:63:00.0 name: Device 66a1 ROCm AMD GPU ISA: gfx906
coreClock: 1.7GHz coreCount: 64 deviceMemorySize: 15.98GiB deviceMemoryBandwidth: 953.67GiB/s
2021-04-20 13:52:22.094548: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library librocblas.so
2021-04-20 13:52:22.625555: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libMIOpen.so
2021-04-20 13:52:24.123987: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library librocfft.so
2021-04-20 13:52:24.244932: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library librocrand.so
2021-04-20 13:52:24.246858: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0, 1, 2, 3
2021-04-20 13:52:24.541775: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 1999885000 Hz
2021-04-20 13:52:24.546191: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x500d950 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2021-04-20 13:52:24.546299: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version
2021-04-20 13:52:24.593943: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x507a4c0 initialized for platform ROCM (this does not guarantee that XLA will be used). Devices:
2021-04-20 13:52:24.594019: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Device 66a1, AMDGPU ISA version: gfx906
2021-04-20 13:52:24.594056: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (1): Device 66a1, AMDGPU ISA version: gfx906
2021-04-20 13:52:24.594089: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (2): Device 66a1, AMDGPU ISA version: gfx906
2021-04-20 13:52:24.594121: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (3): Device 66a1, AMDGPU ISA version: gfx906
2021-04-20 13:52:29.917480: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1734] Found device 0 with properties:
pciBusID: 0000:04:00.0 name: Device 66a1 ROCm AMD GPU ISA: gfx906
coreClock: 1.7GHz coreCount: 64 deviceMemorySize: 15.98GiB deviceMemoryBandwidth: 953.67GiB/s
2021-04-20 13:52:29.917676: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1734] Found device 1 with properties:
pciBusID: 0000:26:00.0 name: Device 66a1 ROCm AMD GPU ISA: gfx906
coreClock: 1.7GHz coreCount: 64 deviceMemorySize: 15.98GiB deviceMemoryBandwidth: 953.67GiB/s
2021-04-20 13:52:29.917799: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1734] Found device 2 with properties:
pciBusID: 0000:43:00.0 name: Device 66a1 ROCm AMD GPU ISA: gfx906
coreClock: 1.7GHz coreCount: 64 deviceMemorySize: 15.98GiB deviceMemoryBandwidth: 953.67GiB/s
2021-04-20 13:52:29.917916: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1734] Found device 3 with properties:
pciBusID: 0000:63:00.0 name: Device 66a1 ROCm AMD GPU ISA: gfx906
coreClock: 1.7GHz coreCount: 64 deviceMemorySize: 15.98GiB deviceMemoryBandwidth: 953.67GiB/s
2021-04-20 13:52:29.917992: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library librocblas.so
2021-04-20 13:52:29.918038: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libMIOpen.so
2021-04-20 13:52:29.918082: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library librocfft.so
2021-04-20 13:52:29.918125: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library librocrand.so
2021-04-20 13:52:29.918676: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0, 1, 2, 3
2021-04-20 13:52:29.920415: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1257] Device interconnect StreamExecutor with strength 1 edge matrix:
2021-04-20 13:52:29.920463: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1263] 0 1 2 3
2021-04-20 13:52:29.920498: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1276] 0: N Y Y Y
2021-04-20 13:52:29.920530: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1276] 1: Y N Y Y
2021-04-20 13:52:29.920564: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1276] 2: Y Y N Y
2021-04-20 13:52:29.920596: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1276] 3: Y Y Y N
2021-04-20 13:52:29.932072: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1402] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 15385 MB memory) -> physical GPU (device: 0, name: Device 66a1, pci bus id: 0000:04:00.0)
2021-04-20 13:52:29.939375: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1402] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:1 with 15385 MB memory) -> physical GPU (device: 1, name: Device 66a1, pci bus id: 0000:26:00.0)
2021-04-20 13:52:29.944123: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1402] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:2 with 15385 MB memory) -> physical GPU (device: 2, name: Device 66a1, pci bus id: 0000:43:00.0)
2021-04-20 13:52:29.948151: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1402] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:3 with 15385 MB memory) -> physical GPU (device: 3, name: Device 66a1, pci bus id: 0000:63:00.0)
I0420 13:52:29.980878 47739561595328 mirrored_strategy.py:341] Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3')
I0420 13:52:30.059932 47739561595328 run_squad_helper.py:236] Training using customized training loop with distribution strategy.
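For reference, the MirroredStrategy reported above corresponds to the standard TF2 pattern of building the model under the strategy scope. This is a generic sketch, not the run script's actual code; the stand-in model and optimizer are illustrative:

```
import tensorflow as tf

# Generic MirroredStrategy pattern matching the log above: all four
# gfx906 devices are visible, so each training step is replicated on
# every GPU and gradients are all-reduced across them.
strategy = tf.distribute.MirroredStrategy()
print('replicas in sync:', strategy.num_replicas_in_sync)  # 4 here

with strategy.scope():
    # Stand-in model; the run script builds the BERT SQuAD model here.
    model = tf.keras.Sequential([tf.keras.layers.Dense(2)])
    optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
```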
W0420 13:52:30.061461 47739561595328 device_compatibility_check.py:111] Mixed precision compatibility check (mixed_float16): WARNING
Your GPUs may run slowly with dtype policy mixed_float16 because they do not have compute capability of at least 7.0. Your GPUs:
Device 66a1, no compute capability (probably not an Nvidia GPU) (x4)
See https://developer.nvidia.com/cuda-gpus for a list of GPUs and their compute capabilities.
If you will use compatible GPU(s) not attached to this host, e.g. by running a multi-worker model, you can ignore this warning. This message will only be logged once
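This warning is expected here: the compatibility check looks for a CUDA compute capability of at least 7.0, and ROCm devices report none, so gfx906 always trips it regardless of the card's actual fp16 throughput. If mixed_float16 proves slow on this hardware, the dtype policy can be pinned back to float32 before the model is built. A minimal sketch (TF >= 2.4 API; older builds use tf.keras.mixed_precision.experimental.set_policy, and how the policy is actually chosen in run_squad_xuan.py is an assumption):

```
import tensorflow as tf

# Assumption: the run script exposes a switch like this. On gfx906 the
# compute-capability warning above fires either way, so opt in to
# mixed precision only after benchmarking it on this GPU.
use_fp16 = False
tf.keras.mixed_precision.set_global_policy(
    'mixed_float16' if use_fp16 else 'float32')
```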
W0420 13:52:30.061690 47739561595328 deprecation.py:323] From /public/home/xuanbaby/DL-TensorFlow/models_r2.3.0/official/nlp/bert/run_squad_helper.py:295: run_customized_training_loop (from official.nlp.bert.model_training_utils) is deprecated and will be removed in a future version.
Instructions for updating:
This function is deprecated. Please use Keras compile/fit instead.
I0420 13:52:30.061841 47739561595328 model_training_utils.py:228] steps_per_loop not specified. Using steps_per_loop=1
2021-04-20 13:52:31.177854: I tensorflow/core/common_runtime/gpu_fusion_pass.cc:508] ROCm Fusion is enabled.
(the same "ROCm Fusion is enabled" line repeats 8 more times; elided)
I0420 13:52:37.452952 47739561595328 optimization.py:89] using Adamw optimizer
I0420 13:52:37.461878 47739561595328 model_training_utils.py:273] Checkpoint file /public/home/xuanbaby/DL-TensorFlow/models_r2.3.0/official/nlp/bert/pre_tf2x/bert_model.ckpt found and restoring from initial checkpoint for core model.
I0420 13:52:43.295127 47739561595328 model_training_utils.py:276] Loading from checkpoint file completed
I0420 13:52:43.351998 47739561595328 model_training_utils.py:448] Checkpoint file /public/home/xuanbaby/DL-TensorFlow/models_r2.3.0/official/nlp/bert/model_squad/ctl_step_2100.ckpt-3 found and restoring from checkpoint
I0420 13:52:49.726183 47739561595328 model_training_utils.py:450] Loading from checkpoint file completed
I0420 13:52:52.106794 47739561595328 model_training_utils.py:49] Saving model as TF checkpoint: /public/home/xuanbaby/DL-TensorFlow/models_r2.3.0/official/nlp/bert/model_squad/ctl_step_2100.ckpt-4
PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU') memory growth: True
PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU') memory growth: True
PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU') memory growth: True
PhysicalDevice(name='/physical_device:GPU:3', device_type='GPU') memory growth: True
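The four "memory growth: True" lines are printed by the script after enabling on-demand allocation, which keeps TensorFlow from reserving all ~16 GiB per device up front. The standard pattern behind them (tf.config is the real API; the print mirrors the log format):

```
import tensorflow as tf

# Must run before any op touches the GPUs.
for gpu in tf.config.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)
    print(gpu, 'memory growth:',
          tf.config.experimental.get_memory_growth(gpu))
```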
Traceback (most recent call last):
File "run_squad_xuan.py", line 163, in <module>
app.run(main)
File "/public/home/xuanbaby/rocm3.9-python3.6.8-tf1.15/lib/python3.6/site-packages/absl/app.py", line 300, in run
_run_main(main, args)
File "/public/home/xuanbaby/rocm3.9-python3.6.8-tf1.15/lib/python3.6/site-packages/absl/app.py", line 251, in _run_main
sys.exit(main(argv))
File "run_squad_xuan.py", line 140, in main
sub_model_export_name=FLAGS.sub_model_export_name,
File "run_squad_xuan.py", line 60, in train_squad
sub_model_export_name=sub_model_export_name)
File "/public/home/xuanbaby/DL-TensorFlow/models_r2.3.0/official/nlp/bert/run_squad_helper.py", line 295, in train_squad
post_allreduce_callbacks=[clip_by_global_norm_callback])
File "/public/home/xuanbaby/rocm3.9-python3.6.8-tf1.15/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 324, in new_func
return func(*args, **kwargs)
File "/public/home/xuanbaby/DL-TensorFlow/models_r2.3.0/official/nlp/bert/model_training_utils.py", line 552, in run_customized_training_loop
callback_list.on_epoch_end(int(current_step / steps_per_epoch), logs)
File "/public/home/xuanbaby/rocm3.9-python3.6.8-tf1.15/lib/python3.6/site-packages/tensorflow/python/keras/callbacks.py", line 416, in on_epoch_end
callback.on_epoch_end(epoch, numpy_logs)
File "/public/home/xuanbaby/DL-TensorFlow/models_r2.3.0/official/utils/misc/keras_utils.py", line 146, in on_epoch_end
epoch_run_time = time.time() - self.epoch_start
AttributeError: 'TimeHistory' object has no attribute 'epoch_start'
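The AttributeError is a callback-ordering bug: the run resumed from ctl_step_2100.ckpt-3 with current_step already at an epoch boundary, so run_customized_training_loop calls on_epoch_end before any on_epoch_begin has set self.epoch_start. A defensive sketch of the fix (the real TimeHistory in official/utils/misc/keras_utils.py takes more constructor arguments; only the guard matters here):

```
import time
import tensorflow as tf

class TimeHistory(tf.keras.callbacks.Callback):
    """Sketch: time each epoch, tolerating a resume that skips
    on_epoch_begin."""

    def __init__(self):
        super().__init__()
        self.epoch_start = None  # set lazily; may stay None on resume

    def on_epoch_begin(self, epoch, logs=None):
        self.epoch_start = time.time()

    def on_epoch_end(self, epoch, logs=None):
        if self.epoch_start is None:
            return  # resumed at an epoch boundary; nothing to time
        print('epoch %d took %.2fs'
              % (epoch, time.time() - self.epoch_start))
        self.epoch_start = None
```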
W0420 13:52:52.709877 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer.beta_1
W0420 13:52:52.710175 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer.beta_2
W0420 13:52:52.710281 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer.decay
W0420 13:52:52.710438 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'm' for (root).model.layer_with_weights-0.layer_with_weights-0.embeddings
W0420 13:52:52.710539 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'm' for (root).model.layer_with_weights-0.layer_with_weights-1.embeddings
W0420 13:52:52.710636 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'm' for (root).model.layer_with_weights-0.layer_with_weights-2.embeddings
W0420 13:52:52.710731 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'm' for (root).model.layer_with_weights-0.layer_with_weights-3.gamma
W0420 13:52:52.710827 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'm' for (root).model.layer_with_weights-0.layer_with_weights-3.beta
W0420 13:52:52.710921 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'm' for (root).model.layer_with_weights-1.layer_with_weights-0.kernel
W0420 13:52:52.711015 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'm' for (root).model.layer_with_weights-1.layer_with_weights-0.bias
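The long run of "Unresolved object in checkpoint" messages that follows (elided below) is TF2's standard warning that ctl_step_2100.ckpt-3 contains Adam slot variables (the 'm' and 'v' moments) which nothing in the freshly restored object graph has claimed yet. They are harmless when only the model weights matter; the usual way to acknowledge such a partial restore is expect_partial(). A generic sketch (tf.train.Checkpoint is the real API; the stand-in model is illustrative):

```
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(2)])  # stand-in
ckpt = tf.train.Checkpoint(model=model)

# expect_partial() suppresses "Unresolved object in checkpoint"
# warnings for values (here, the optimizer's m/v slots) that this
# restore deliberately leaves unmatched.
ckpt.restore('model_squad/ctl_step_2100.ckpt-3').expect_partial()
```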
(identical "Unresolved object in checkpoint" warnings repeat for the optimizer's 'm' slot of every remaining weight: the attention, layer-norm, intermediate and output dense kernels and biases of transformer layers 4-15, plus their query/key/value projections, and then again for the 'v' slot of the same variables; the bulk is elided here, and the log resumes below partway through the 'v' warnings)
W0420 13:52:52.735693 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-10._attention_output_dense.kernel
W0420 13:52:52.735786 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-10._attention_output_dense.bias
W0420 13:52:52.735878 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-10._attention_layer_norm.gamma
W0420 13:52:52.735969 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-10._attention_layer_norm.beta
W0420 13:52:52.736063 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-10._intermediate_dense.kernel
W0420 13:52:52.736155 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-10._intermediate_dense.bias
W0420 13:52:52.736248 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-10._output_dense.kernel
W0420 13:52:52.736353 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-10._output_dense.bias
W0420 13:52:52.736449 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-10._output_layer_norm.gamma
W0420 13:52:52.736541 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-10._output_layer_norm.beta
W0420 13:52:52.736634 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-11._attention_output_dense.kernel
W0420 13:52:52.736726 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-11._attention_output_dense.bias
W0420 13:52:52.736820 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-11._attention_layer_norm.gamma
W0420 13:52:52.736912 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-11._attention_layer_norm.beta
W0420 13:52:52.737005 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-11._intermediate_dense.kernel
W0420 13:52:52.737098 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-11._intermediate_dense.bias
W0420 13:52:52.737189 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-11._output_dense.kernel
W0420 13:52:52.737283 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-11._output_dense.bias
W0420 13:52:52.737388 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-11._output_layer_norm.gamma
W0420 13:52:52.737483 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-11._output_layer_norm.beta
W0420 13:52:52.737575 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-12._attention_output_dense.kernel
W0420 13:52:52.737677 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-12._attention_output_dense.bias
W0420 13:52:52.737770 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-12._attention_layer_norm.gamma
W0420 13:52:52.737869 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-12._attention_layer_norm.beta
W0420 13:52:52.737961 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-12._intermediate_dense.kernel
W0420 13:52:52.738054 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-12._intermediate_dense.bias
W0420 13:52:52.738152 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-12._output_dense.kernel
W0420 13:52:52.738244 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-12._output_dense.bias
W0420 13:52:52.738345 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-12._output_layer_norm.gamma
W0420 13:52:52.738439 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-12._output_layer_norm.beta
W0420 13:52:52.738533 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-13._attention_output_dense.kernel
W0420 13:52:52.738626 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-13._attention_output_dense.bias
W0420 13:52:52.738719 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-13._attention_layer_norm.gamma
W0420 13:52:52.738811 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-13._attention_layer_norm.beta
W0420 13:52:52.738904 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-13._intermediate_dense.kernel
W0420 13:52:52.738998 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-13._intermediate_dense.bias
W0420 13:52:52.739090 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-13._output_dense.kernel
W0420 13:52:52.739183 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-13._output_dense.bias
W0420 13:52:52.739274 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-13._output_layer_norm.gamma
W0420 13:52:52.739382 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-13._output_layer_norm.beta
W0420 13:52:52.739475 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-14._attention_output_dense.kernel
W0420 13:52:52.739569 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-14._attention_output_dense.bias
W0420 13:52:52.739670 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-14._attention_layer_norm.gamma
W0420 13:52:52.739765 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-14._attention_layer_norm.beta
W0420 13:52:52.739858 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-14._intermediate_dense.kernel
W0420 13:52:52.739952 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-14._intermediate_dense.bias
W0420 13:52:52.740044 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-14._output_dense.kernel
W0420 13:52:52.740138 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-14._output_dense.bias
W0420 13:52:52.740232 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-14._output_layer_norm.gamma
W0420 13:52:52.740325 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-14._output_layer_norm.beta
W0420 13:52:52.740429 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-15._attention_output_dense.kernel
W0420 13:52:52.740523 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-15._attention_output_dense.bias
W0420 13:52:52.740616 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-15._attention_layer_norm.gamma
W0420 13:52:52.740709 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-15._attention_layer_norm.beta
W0420 13:52:52.740808 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-15._intermediate_dense.kernel
W0420 13:52:52.740900 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-15._intermediate_dense.bias
W0420 13:52:52.740994 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-15._output_dense.kernel
W0420 13:52:52.741086 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-15._output_dense.bias
W0420 13:52:52.741180 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-15._output_layer_norm.gamma
W0420 13:52:52.741272 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-15._output_layer_norm.beta
W0420 13:52:52.741376 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-4._attention_layer._query_dense.kernel
W0420 13:52:52.741470 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-4._attention_layer._query_dense.bias
W0420 13:52:52.741570 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-4._attention_layer._key_dense.kernel
W0420 13:52:52.741663 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-4._attention_layer._key_dense.bias
W0420 13:52:52.741757 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-4._attention_layer._value_dense.kernel
W0420 13:52:52.741851 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-4._attention_layer._value_dense.bias
W0420 13:52:52.741944 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-5._attention_layer._query_dense.kernel
W0420 13:52:52.742038 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-5._attention_layer._query_dense.bias
W0420 13:52:52.742130 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-5._attention_layer._key_dense.kernel
W0420 13:52:52.742225 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-5._attention_layer._key_dense.bias
W0420 13:52:52.742328 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-5._attention_layer._value_dense.kernel
W0420 13:52:52.742469 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-5._attention_layer._value_dense.bias
W0420 13:52:52.742562 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-6._attention_layer._query_dense.kernel
W0420 13:52:52.742656 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-6._attention_layer._query_dense.bias
W0420 13:52:52.742750 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-6._attention_layer._key_dense.kernel
W0420 13:52:52.742849 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-6._attention_layer._key_dense.bias
W0420 13:52:52.742942 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-6._attention_layer._value_dense.kernel
W0420 13:52:52.743038 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-6._attention_layer._value_dense.bias
W0420 13:52:52.743132 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-7._attention_layer._query_dense.kernel
W0420 13:52:52.743224 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-7._attention_layer._query_dense.bias
W0420 13:52:52.743319 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-7._attention_layer._key_dense.kernel
W0420 13:52:52.743422 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-7._attention_layer._key_dense.bias
W0420 13:52:52.743521 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-7._attention_layer._value_dense.kernel
W0420 13:52:52.743614 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-7._attention_layer._value_dense.bias
W0420 13:52:52.743708 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-8._attention_layer._query_dense.kernel
W0420 13:52:52.743805 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-8._attention_layer._query_dense.bias
W0420 13:52:52.743901 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-8._attention_layer._key_dense.kernel
W0420 13:52:52.743995 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-8._attention_layer._key_dense.bias
W0420 13:52:52.744090 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-8._attention_layer._value_dense.kernel
W0420 13:52:52.744182 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-8._attention_layer._value_dense.bias
W0420 13:52:52.744275 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-9._attention_layer._query_dense.kernel
W0420 13:52:52.744378 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-9._attention_layer._query_dense.bias
W0420 13:52:52.744474 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-9._attention_layer._key_dense.kernel
W0420 13:52:52.744567 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-9._attention_layer._key_dense.bias
W0420 13:52:52.744659 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-9._attention_layer._value_dense.kernel
W0420 13:52:52.744753 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-9._attention_layer._value_dense.bias
W0420 13:52:52.744845 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-10._attention_layer._query_dense.kernel
W0420 13:52:52.744941 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-10._attention_layer._query_dense.bias
W0420 13:52:52.745035 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-10._attention_layer._key_dense.kernel
W0420 13:52:52.745130 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-10._attention_layer._key_dense.bias
W0420 13:52:52.745221 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-10._attention_layer._value_dense.kernel
W0420 13:52:52.745319 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-10._attention_layer._value_dense.bias
W0420 13:52:52.745426 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-11._attention_layer._query_dense.kernel
W0420 13:52:52.745520 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-11._attention_layer._query_dense.bias
W0420 13:52:52.745615 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-11._attention_layer._key_dense.kernel
W0420 13:52:52.745707 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-11._attention_layer._key_dense.bias
W0420 13:52:52.745802 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-11._attention_layer._value_dense.kernel
W0420 13:52:52.745896 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-11._attention_layer._value_dense.bias
W0420 13:52:52.745989 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-12._attention_layer._query_dense.kernel
W0420 13:52:52.746080 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-12._attention_layer._query_dense.bias
W0420 13:52:52.746174 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-12._attention_layer._key_dense.kernel
W0420 13:52:52.746266 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-12._attention_layer._key_dense.bias
W0420 13:52:52.746369 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-12._attention_layer._value_dense.kernel
W0420 13:52:52.746463 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-12._attention_layer._value_dense.bias
W0420 13:52:52.746556 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-13._attention_layer._query_dense.kernel
W0420 13:52:52.746648 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-13._attention_layer._query_dense.bias
W0420 13:52:52.746741 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-13._attention_layer._key_dense.kernel
W0420 13:52:52.746839 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-13._attention_layer._key_dense.bias
W0420 13:52:52.746932 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-13._attention_layer._value_dense.kernel
W0420 13:52:52.747025 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-13._attention_layer._value_dense.bias
W0420 13:52:52.747119 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-14._attention_layer._query_dense.kernel
W0420 13:52:52.747211 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-14._attention_layer._query_dense.bias
W0420 13:52:52.747310 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-14._attention_layer._key_dense.kernel
W0420 13:52:52.747417 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-14._attention_layer._key_dense.bias
W0420 13:52:52.747510 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-14._attention_layer._value_dense.kernel
W0420 13:52:52.747605 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-14._attention_layer._value_dense.bias
W0420 13:52:52.747700 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-15._attention_layer._query_dense.kernel
W0420 13:52:52.747795 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-15._attention_layer._query_dense.bias
W0420 13:52:52.747888 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-15._attention_layer._key_dense.kernel
W0420 13:52:52.747983 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-15._attention_layer._key_dense.bias
W0420 13:52:52.748075 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-15._attention_layer._value_dense.kernel
W0420 13:52:52.748171 47739561595328 util.py:150] Unresolved object in checkpoint: (root).optimizer's state 'v' for (root).model.layer_with_weights-0.layer_with_weights-15._attention_layer._value_dense.bias
W0420 13:52:52.748269 47739561595328 util.py:158] A checkpoint was restored (e.g. tf.train.Checkpoint.restore or tf.keras.Model.load_weights) but not all checkpointed values were used. See above for specific issues. Use expect_partial() on the load status object, e.g. tf.train.Checkpoint.restore(...).expect_partial(), to silence these warnings, or use assert_consumed() to make the check explicit. See https://www.tensorflow.org/guide/checkpoint#loading_mechanics for details.
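These warnings are benign when the optimizer state is intentionally left unrestored. A minimal sketch of silencing them with `expect_partial()`, using a hypothetical toy model and path (not the BERT classifier itself):

```
import tensorflow as tf

# Hypothetical toy model standing in for the BERT classifier: fine-tuning
# checkpoints store Adam slot variables ('m'/'v') alongside the model weights.
model = tf.keras.Sequential([tf.keras.layers.Dense(4, input_shape=(8,))])
model.compile(optimizer=tf.keras.optimizers.Adam(), loss="mse")
model.fit(tf.zeros((2, 8)), tf.zeros((2, 4)), verbose=0)  # creates the slots
ckpt_path = tf.train.Checkpoint(
    model=model, optimizer=model.optimizer).save("/tmp/demo_ckpt")

# Restoring only the model leaves the optimizer slots unresolved, which is
# exactly what triggers the warnings above; expect_partial() declares that
# this is intended and silences them.
tf.train.Checkpoint(model=model).restore(ckpt_path).expect_partial()
```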
''' Script for downloading all GLUE data.
Note: for legal reasons, we are unable to host MRPC.
You can either use the version hosted by the SentEval team, which is already tokenized,
or you can download the original data from (https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi) and extract the data from it manually.
For Windows users, you can run the .msi file. For Mac and Linux users, consider an external library such as 'cabextract' (see below for an example).
You should then rename and place specific files in a folder (see below for an example).
mkdir MRPC
cabextract MSRParaphraseCorpus.msi -d MRPC
cat MRPC/_2DEC3DBE877E4DB192D17C0256E90F1D | tr -d $'\r' > MRPC/msr_paraphrase_train.txt
cat MRPC/_D7B391F9EAFF4B1B8BCE8F21B20B1B61 | tr -d $'\r' > MRPC/msr_paraphrase_test.txt
rm MRPC/_*
rm MSRParaphraseCorpus.msi
1/30/19: It looks like SentEval is no longer hosting their extracted and tokenized MRPC data, so you'll need to download the data from the original source for now.
2/11/19: It looks like SentEval actually *is* hosting the extracted data. Hooray!
'''
import os
import sys
import io
import shutil
import argparse
import tempfile
import urllib.request
import zipfile
TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "QNLI", "RTE", "WNLI", "diagnostic"]
TASK2PATH = {"CoLA":'https://dl.fbaipublicfiles.com/glue/data/CoLA.zip',
"SST":'https://dl.fbaipublicfiles.com/glue/data/SST-2.zip',
"QQP":'https://dl.fbaipublicfiles.com/glue/data/STS-B.zip',
"STS":'https://dl.fbaipublicfiles.com/glue/data/QQP-clean.zip',
"MNLI":'https://dl.fbaipublicfiles.com/glue/data/MNLI.zip',
"QNLI":'https://dl.fbaipublicfiles.com/glue/data/QNLIv2.zip',
"RTE":'https://dl.fbaipublicfiles.com/glue/data/RTE.zip',
"WNLI":'https://dl.fbaipublicfiles.com/glue/data/WNLI.zip',
"diagnostic":'https://dl.fbaipublicfiles.com/glue/data/AX.tsv'}
MRPC_TRAIN = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt'
MRPC_TEST = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt'
def download_and_extract(task, data_dir):
print("Downloading and extracting %s..." % task)
if task == "MNLI":
print("\tNote (12/10/20): This script no longer downloads SNLI. You will need to manually download and format the data to use SNLI.")
data_file = "%s.zip" % task
urllib.request.urlretrieve(TASK2PATH[task], data_file)
with zipfile.ZipFile(data_file) as zip_ref:
zip_ref.extractall(data_dir)
os.remove(data_file)
print("\tCompleted!")
def format_mrpc(data_dir, path_to_data):
print("Processing MRPC...")
mrpc_dir = os.path.join(data_dir, "MRPC")
if not os.path.isdir(mrpc_dir):
os.mkdir(mrpc_dir)
if path_to_data:
mrpc_train_file = os.path.join(path_to_data, "msr_paraphrase_train.txt")
mrpc_test_file = os.path.join(path_to_data, "msr_paraphrase_test.txt")
else:
try:
mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
            urllib.request.urlretrieve(MRPC_TRAIN, mrpc_train_file)
            urllib.request.urlretrieve(MRPC_TEST, mrpc_test_file)
except urllib.error.HTTPError:
print("Error downloading MRPC")
return
assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file
assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file
with io.open(mrpc_test_file, encoding='utf-8') as data_fh, \
io.open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding='utf-8') as test_fh:
header = data_fh.readline()
test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
for idx, row in enumerate(data_fh):
label, id1, id2, s1, s2 = row.strip().split('\t')
test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))
try:
        urllib.request.urlretrieve(TASK2PATH["MRPC"], os.path.join(mrpc_dir, "dev_ids.tsv"))
    except (KeyError, urllib.error.HTTPError):
print("\tError downloading standard development IDs for MRPC. You will need to manually split your data.")
return
dev_ids = []
with io.open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding='utf-8') as ids_fh:
for row in ids_fh:
dev_ids.append(row.strip().split('\t'))
with io.open(mrpc_train_file, encoding='utf-8') as data_fh, \
io.open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding='utf-8') as train_fh, \
io.open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding='utf-8') as dev_fh:
header = data_fh.readline()
train_fh.write(header)
dev_fh.write(header)
for row in data_fh:
label, id1, id2, s1, s2 = row.strip().split('\t')
if [id1, id2] in dev_ids:
dev_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
else:
train_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
print("\tCompleted!")
def download_diagnostic(data_dir):
print("Downloading and extracting diagnostic...")
if not os.path.isdir(os.path.join(data_dir, "diagnostic")):
os.mkdir(os.path.join(data_dir, "diagnostic"))
data_file = os.path.join(data_dir, "diagnostic", "diagnostic.tsv")
urllib.request.urlretrieve(TASK2PATH["diagnostic"], data_file)
print("\tCompleted!")
return
def get_tasks(task_names):
task_names = task_names.split(',')
if "all" in task_names:
tasks = TASKS
else:
tasks = []
for task_name in task_names:
assert task_name in TASKS, "Task %s not found!" % task_name
tasks.append(task_name)
return tasks
def main(arguments):
parser = argparse.ArgumentParser()
parser.add_argument('--data_dir', help='directory to save data to', type=str, default='glue_data')
parser.add_argument('--tasks', help='tasks to download data for as a comma separated string',
type=str, default='all')
    parser.add_argument('--path_to_mrpc', help='path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_test.txt',
type=str, default='')
args = parser.parse_args(arguments)
if not os.path.isdir(args.data_dir):
os.mkdir(args.data_dir)
tasks = get_tasks(args.tasks)
for task in tasks:
if task == 'MRPC':
format_mrpc(args.data_dir, args.path_to_mrpc)
elif task == 'diagnostic':
download_diagnostic(args.data_dir)
else:
download_and_extract(task, args.data_dir)
if __name__ == '__main__':
sys.exit(main(sys.argv[1:]))
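Assuming the script above is saved as download_glue_data.py (the filename is not shown in this dump), a typical invocation that fetches only the MNLI data used for the classification test would be:

```
python3 download_glue_data.py --data_dir glue_data --tasks MNLI
```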
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A script to export the BERT core model as a TF-Hub SavedModel."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
from typing import Text
from official.nlp.bert import bert_models
from official.nlp.bert import configs
FLAGS = flags.FLAGS
flags.DEFINE_string("bert_config_file", None,
"Bert configuration file to define core bert layers.")
flags.DEFINE_string("model_checkpoint_path", None,
"File path to TF model checkpoint.")
flags.DEFINE_string("export_path", None, "TF-Hub SavedModel destination path.")
flags.DEFINE_string("vocab_file", None,
"The vocabulary file that the BERT model was trained on.")
flags.DEFINE_bool("do_lower_case", None, "Whether to lowercase. If None, "
"do_lower_case will be enabled if 'uncased' appears in the "
"name of --vocab_file")
def create_bert_model(bert_config: configs.BertConfig) -> tf.keras.Model:
"""Creates a BERT keras core model from BERT configuration.
Args:
bert_config: A `BertConfig` to create the core model.
Returns:
A keras model.
"""
# Adds input layers just as placeholders.
input_word_ids = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name="input_word_ids")
input_mask = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name="input_mask")
input_type_ids = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name="input_type_ids")
transformer_encoder = bert_models.get_transformer_encoder(
bert_config, sequence_length=None)
sequence_output, pooled_output = transformer_encoder(
[input_word_ids, input_mask, input_type_ids])
# To keep consistent with legacy hub modules, the outputs are
# "pooled_output" and "sequence_output".
return tf.keras.Model(
inputs=[input_word_ids, input_mask, input_type_ids],
outputs=[pooled_output, sequence_output]), transformer_encoder
def export_bert_tfhub(bert_config: configs.BertConfig,
model_checkpoint_path: Text, hub_destination: Text,
vocab_file: Text, do_lower_case: bool = None):
"""Restores a tf.keras.Model and saves for TF-Hub."""
# If do_lower_case is not explicit, default to checking whether "uncased" is
# in the vocab file name
if do_lower_case is None:
do_lower_case = "uncased" in vocab_file
logging.info("Using do_lower_case=%s based on name of vocab_file=%s",
do_lower_case, vocab_file)
core_model, encoder = create_bert_model(bert_config)
checkpoint = tf.train.Checkpoint(model=encoder)
checkpoint.restore(model_checkpoint_path).assert_consumed()
core_model.vocab_file = tf.saved_model.Asset(vocab_file)
core_model.do_lower_case = tf.Variable(do_lower_case, trainable=False)
core_model.save(hub_destination, include_optimizer=False, save_format="tf")
def main(_):
bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file)
export_bert_tfhub(bert_config, FLAGS.model_checkpoint_path, FLAGS.export_path,
FLAGS.vocab_file, FLAGS.do_lower_case)
if __name__ == "__main__":
app.run(main)
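A hedged example invocation (all paths are placeholders; the script name export_tfhub.py is assumed from the module official.nlp.bert.export_tfhub referenced by the test below):

```
python3 export_tfhub.py \
  --bert_config_file=/path/to/uncased_L-12_H-768_A-12/bert_config.json \
  --model_checkpoint_path=/path/to/bert_model.ckpt \
  --vocab_file=/path/to/vocab.txt \
  --export_path=/path/to/bert_hub_model
```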
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests official.nlp.bert.export_tfhub."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from official.nlp.bert import configs
from official.nlp.bert import export_tfhub
class ExportTfhubTest(tf.test.TestCase):
def test_export_tfhub(self):
# Exports a savedmodel for TF-Hub
hidden_size = 16
bert_config = configs.BertConfig(
vocab_size=100,
hidden_size=hidden_size,
intermediate_size=32,
max_position_embeddings=128,
num_attention_heads=2,
num_hidden_layers=1)
bert_model, encoder = export_tfhub.create_bert_model(bert_config)
model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
checkpoint = tf.train.Checkpoint(model=encoder)
checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)
vocab_file = os.path.join(self.get_temp_dir(), "uncased_vocab.txt")
with tf.io.gfile.GFile(vocab_file, "w") as f:
f.write("dummy content")
hub_destination = os.path.join(self.get_temp_dir(), "hub")
export_tfhub.export_bert_tfhub(bert_config, model_checkpoint_path,
hub_destination, vocab_file)
# Restores a hub KerasLayer.
hub_layer = hub.KerasLayer(hub_destination, trainable=True)
if hasattr(hub_layer, "resolved_object"):
# Checks meta attributes.
self.assertTrue(hub_layer.resolved_object.do_lower_case.numpy())
with tf.io.gfile.GFile(
hub_layer.resolved_object.vocab_file.asset_path.numpy()) as f:
self.assertEqual("dummy content", f.read())
# Checks the hub KerasLayer.
for source_weight, hub_weight in zip(bert_model.trainable_weights,
hub_layer.trainable_weights):
self.assertAllClose(source_weight.numpy(), hub_weight.numpy())
seq_length = 10
dummy_ids = np.zeros((2, seq_length), dtype=np.int32)
hub_outputs = hub_layer([dummy_ids, dummy_ids, dummy_ids])
source_outputs = bert_model([dummy_ids, dummy_ids, dummy_ids])
# The outputs of hub module are "pooled_output" and "sequence_output",
    # while the outputs of the encoder are in reversed order, i.e.,
# "sequence_output" and "pooled_output".
encoder_outputs = reversed(encoder([dummy_ids, dummy_ids, dummy_ids]))
self.assertEqual(hub_outputs[0].shape, (2, hidden_size))
self.assertEqual(hub_outputs[1].shape, (2, seq_length, hidden_size))
for source_output, hub_output, encoder_output in zip(
source_outputs, hub_outputs, encoder_outputs):
self.assertAllClose(source_output.numpy(), hub_output.numpy())
self.assertAllClose(source_output.numpy(), encoder_output.numpy())
# Test that training=True makes a difference (activates dropout).
def _dropout_mean_stddev(training, num_runs=20):
input_ids = np.array([[14, 12, 42, 95, 99]], np.int32)
inputs = [input_ids, np.ones_like(input_ids), np.zeros_like(input_ids)]
outputs = np.concatenate(
[hub_layer(inputs, training=training)[0] for _ in range(num_runs)])
return np.mean(np.std(outputs, axis=0))
self.assertLess(_dropout_mean_stddev(training=False), 1e-6)
self.assertGreater(_dropout_mean_stddev(training=True), 1e-3)
# Test propagation of seq_length in shape inference.
input_word_ids = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
input_mask = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
input_type_ids = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
pooled_output, sequence_output = hub_layer(
[input_word_ids, input_mask, input_type_ids])
self.assertEqual(pooled_output.shape.as_list(), [None, hidden_size])
self.assertEqual(sequence_output.shape.as_list(),
[None, seq_length, hidden_size])
if __name__ == "__main__":
tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""BERT model input pipelines."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
def decode_record(record, name_to_features):
"""Decodes a record to a TensorFlow example."""
example = tf.io.parse_single_example(record, name_to_features)
# tf.Example only supports tf.int64, but the TPU only supports tf.int32.
# So cast all int64 to int32.
for name in list(example.keys()):
t = example[name]
if t.dtype == tf.int64:
t = tf.cast(t, tf.int32)
example[name] = t
return example
def single_file_dataset(input_file, name_to_features):
"""Creates a single-file dataset to be passed for BERT custom training."""
# For training, we want a lot of parallel reading and shuffling.
# For eval, we want no shuffling and parallel reading doesn't matter.
d = tf.data.TFRecordDataset(input_file)
d = d.map(
lambda record: decode_record(record, name_to_features),
num_parallel_calls=tf.data.experimental.AUTOTUNE)
# When `input_file` is a path to a single file or a list
# containing a single path, disable auto sharding so that
# same input file is sent to all workers.
if isinstance(input_file, str) or len(input_file) == 1:
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = (
tf.data.experimental.AutoShardPolicy.OFF)
d = d.with_options(options)
return d
def create_pretrain_dataset(input_patterns,
seq_length,
max_predictions_per_seq,
batch_size,
is_training=True,
input_pipeline_context=None,
use_next_sentence_label=True,
use_position_id=False,
output_fake_labels=True):
"""Creates input dataset from (tf)records files for pretraining."""
name_to_features = {
'input_ids':
tf.io.FixedLenFeature([seq_length], tf.int64),
'input_mask':
tf.io.FixedLenFeature([seq_length], tf.int64),
'segment_ids':
tf.io.FixedLenFeature([seq_length], tf.int64),
'masked_lm_positions':
tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64),
'masked_lm_ids':
tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64),
'masked_lm_weights':
tf.io.FixedLenFeature([max_predictions_per_seq], tf.float32),
}
if use_next_sentence_label:
name_to_features['next_sentence_labels'] = tf.io.FixedLenFeature([1],
tf.int64)
if use_position_id:
name_to_features['position_ids'] = tf.io.FixedLenFeature([seq_length],
tf.int64)
for input_pattern in input_patterns:
if not tf.io.gfile.glob(input_pattern):
raise ValueError('%s does not match any files.' % input_pattern)
dataset = tf.data.Dataset.list_files(input_patterns, shuffle=is_training)
if input_pipeline_context and input_pipeline_context.num_input_pipelines > 1:
dataset = dataset.shard(input_pipeline_context.num_input_pipelines,
input_pipeline_context.input_pipeline_id)
if is_training:
dataset = dataset.repeat()
# We set shuffle buffer to exactly match total number of
# training files to ensure that training data is well shuffled.
input_files = []
for input_pattern in input_patterns:
input_files.extend(tf.io.gfile.glob(input_pattern))
dataset = dataset.shuffle(len(input_files))
  # In parallel, create a TFRecord dataset for each training file.
# cycle_length = 8 means that up to 8 files will be read and deserialized in
# parallel. You may want to increase this number if you have a large number of
# CPU cores.
dataset = dataset.interleave(
tf.data.TFRecordDataset,
cycle_length=8,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
if is_training:
dataset = dataset.shuffle(100)
decode_fn = lambda record: decode_record(record, name_to_features)
dataset = dataset.map(
decode_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
def _select_data_from_record(record):
"""Filter out features to use for pretraining."""
x = {
'input_word_ids': record['input_ids'],
'input_mask': record['input_mask'],
'input_type_ids': record['segment_ids'],
'masked_lm_positions': record['masked_lm_positions'],
'masked_lm_ids': record['masked_lm_ids'],
'masked_lm_weights': record['masked_lm_weights'],
}
if use_next_sentence_label:
x['next_sentence_labels'] = record['next_sentence_labels']
if use_position_id:
x['position_ids'] = record['position_ids']
# TODO(hongkuny): Remove the fake labels after migrating bert pretraining.
if output_fake_labels:
return (x, record['masked_lm_weights'])
else:
return x
dataset = dataset.map(
_select_data_from_record,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.batch(batch_size, drop_remainder=is_training)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
return dataset
def create_classifier_dataset(file_path,
seq_length,
batch_size,
is_training=True,
input_pipeline_context=None,
label_type=tf.int64,
include_sample_weights=False):
"""Creates input dataset from (tf)records files for train/eval."""
name_to_features = {
'input_ids': tf.io.FixedLenFeature([seq_length], tf.int64),
'input_mask': tf.io.FixedLenFeature([seq_length], tf.int64),
'segment_ids': tf.io.FixedLenFeature([seq_length], tf.int64),
'label_ids': tf.io.FixedLenFeature([], label_type),
}
if include_sample_weights:
name_to_features['weight'] = tf.io.FixedLenFeature([], tf.float32)
dataset = single_file_dataset(file_path, name_to_features)
# The dataset is always sharded by number of hosts.
# num_input_pipelines is the number of hosts rather than number of cores.
if input_pipeline_context and input_pipeline_context.num_input_pipelines > 1:
dataset = dataset.shard(input_pipeline_context.num_input_pipelines,
input_pipeline_context.input_pipeline_id)
def _select_data_from_record(record):
x = {
'input_word_ids': record['input_ids'],
'input_mask': record['input_mask'],
'input_type_ids': record['segment_ids']
}
y = record['label_ids']
if include_sample_weights:
w = record['weight']
return (x, y, w)
return (x, y)
if is_training:
dataset = dataset.shuffle(100)
dataset = dataset.repeat()
dataset = dataset.map(
_select_data_from_record,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.batch(batch_size, drop_remainder=is_training)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
return dataset
def create_squad_dataset(file_path,
seq_length,
batch_size,
is_training=True,
input_pipeline_context=None):
"""Creates input dataset from (tf)records files for train/eval."""
name_to_features = {
'input_ids': tf.io.FixedLenFeature([seq_length], tf.int64),
'input_mask': tf.io.FixedLenFeature([seq_length], tf.int64),
'segment_ids': tf.io.FixedLenFeature([seq_length], tf.int64),
}
if is_training:
name_to_features['start_positions'] = tf.io.FixedLenFeature([], tf.int64)
name_to_features['end_positions'] = tf.io.FixedLenFeature([], tf.int64)
else:
name_to_features['unique_ids'] = tf.io.FixedLenFeature([], tf.int64)
dataset = single_file_dataset(file_path, name_to_features)
# The dataset is always sharded by number of hosts.
# num_input_pipelines is the number of hosts rather than number of cores.
if input_pipeline_context and input_pipeline_context.num_input_pipelines > 1:
dataset = dataset.shard(input_pipeline_context.num_input_pipelines,
input_pipeline_context.input_pipeline_id)
def _select_data_from_record(record):
"""Dispatches record to features and labels."""
x, y = {}, {}
for name, tensor in record.items():
if name in ('start_positions', 'end_positions'):
y[name] = tensor
elif name == 'input_ids':
x['input_word_ids'] = tensor
elif name == 'segment_ids':
x['input_type_ids'] = tensor
else:
x[name] = tensor
return (x, y)
if is_training:
dataset = dataset.shuffle(100)
dataset = dataset.repeat()
dataset = dataset.map(
_select_data_from_record,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.batch(batch_size, drop_remainder=True)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
return dataset
def create_retrieval_dataset(file_path,
seq_length,
batch_size,
input_pipeline_context=None):
"""Creates input dataset from (tf)records files for scoring."""
name_to_features = {
'input_ids': tf.io.FixedLenFeature([seq_length], tf.int64),
'input_mask': tf.io.FixedLenFeature([seq_length], tf.int64),
'segment_ids': tf.io.FixedLenFeature([seq_length], tf.int64),
'int_iden': tf.io.FixedLenFeature([1], tf.int64),
}
dataset = single_file_dataset(file_path, name_to_features)
# The dataset is always sharded by number of hosts.
# num_input_pipelines is the number of hosts rather than number of cores.
if input_pipeline_context and input_pipeline_context.num_input_pipelines > 1:
dataset = dataset.shard(input_pipeline_context.num_input_pipelines,
input_pipeline_context.input_pipeline_id)
def _select_data_from_record(record):
x = {
'input_word_ids': record['input_ids'],
'input_mask': record['input_mask'],
'input_type_ids': record['segment_ids']
}
y = record['int_iden']
return (x, y)
dataset = dataset.map(
_select_data_from_record,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.batch(batch_size, drop_remainder=False)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
return dataset
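A minimal sketch of driving create_classifier_dataset above (the record path and sizes are placeholders echoing the MNLI conversion step, which used max_seq_length=32):

```
dataset = create_classifier_dataset(
    file_path="/path/to/MNLI/train.tf_record",
    seq_length=32,
    batch_size=32,
    is_training=True)
features, labels = next(iter(dataset))
print(features["input_word_ids"].shape, labels.shape)  # (32, 32) (32,)
```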
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities to save models."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import os
from absl import logging
import tensorflow as tf
import typing
def export_bert_model(model_export_path: typing.Text,
model: tf.keras.Model,
checkpoint_dir: typing.Optional[typing.Text] = None,
restore_model_using_load_weights: bool = False) -> None:
"""Export BERT model for serving which does not include the optimizer.
Arguments:
model_export_path: Path to which exported model will be saved.
model: Keras model object to export.
checkpoint_dir: Path from which model weights will be loaded, if
specified.
restore_model_using_load_weights: Whether to use checkpoint.restore() API
for custom checkpoint or to use model.load_weights() API.
There are 2 different ways to save checkpoints. One is using
tf.train.Checkpoint and another is using Keras model.save_weights().
Custom training loop implementation uses tf.train.Checkpoint API
and Keras ModelCheckpoint callback internally uses model.save_weights()
      API. Since these two APIs cannot be used together, model loading logic
      must take into account how the model checkpoint was saved.
Raises:
ValueError when either model_export_path or model is not specified.
"""
if not model_export_path:
raise ValueError('model_export_path must be specified.')
if not isinstance(model, tf.keras.Model):
raise ValueError('model must be a tf.keras.Model object.')
if checkpoint_dir:
# Keras compile/fit() was used to save checkpoint using
# model.save_weights().
if restore_model_using_load_weights:
model_weight_path = os.path.join(checkpoint_dir, 'checkpoint')
assert tf.io.gfile.exists(model_weight_path)
model.load_weights(model_weight_path)
# tf.train.Checkpoint API was used via custom training loop logic.
else:
checkpoint = tf.train.Checkpoint(model=model)
# Restores the model from latest checkpoint.
latest_checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
assert latest_checkpoint_file
logging.info('Checkpoint file %s found and restoring from '
'checkpoint', latest_checkpoint_file)
checkpoint.restore(
latest_checkpoint_file).assert_existing_objects_matched()
model.save(model_export_path, include_optimizer=False, save_format='tf')
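A minimal usage sketch of export_bert_model (a hypothetical toy model stands in for the BERT classifier; with no checkpoint_dir the model's in-memory weights are exported as-is):

```
import tensorflow as tf

# Toy stand-in model; in the real flow this is the Keras BERT classifier.
toy_model = tf.keras.Sequential([tf.keras.layers.Dense(2, input_shape=(4,))])
# Writes a SavedModel without optimizer state, per the function above.
export_bert_model("/tmp/exported_toy", model=toy_model)
```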
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A light weight utilities to train NLP models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import os
import tempfile
from absl import logging
import tensorflow as tf
from tensorflow.python.util import deprecation
from official.staging.training import grad_utils
from official.utils.misc import distribution_utils
_SUMMARY_TXT = 'training_summary.txt'
_MIN_SUMMARY_STEPS = 10
def _should_export_checkpoint(strategy):
return (not strategy) or strategy.extended.should_checkpoint
def _should_export_summary(strategy):
return (not strategy) or strategy.extended.should_save_summary
def _save_checkpoint(strategy, checkpoint, model_dir, checkpoint_prefix):
"""Saves model to with provided checkpoint prefix."""
if _should_export_checkpoint(strategy):
checkpoint_path = os.path.join(model_dir, checkpoint_prefix)
saved_path = checkpoint.save(checkpoint_path)
logging.info('Saving model as TF checkpoint: %s', saved_path)
else:
    # In multi-worker training we need every worker to save a checkpoint,
    # because variables can trigger synchronization on read and
    # synchronization needs all workers to participate. To avoid workers
    # overwriting each other, we save to a temporary directory on non-chief
    # workers.
tmp_dir = tempfile.mkdtemp()
checkpoint.save(os.path.join(tmp_dir, 'ckpt'))
tf.io.gfile.rmtree(tmp_dir)
return
def _get_input_iterator(input_fn, strategy):
"""Returns distributed dataset iterator."""
  # When training with TPU pods, datasets need to be cloned across
  # workers. Since a Dataset instance cannot be cloned in eager mode, we
  # instead pass a callable that returns a dataset.
if not callable(input_fn):
raise ValueError('`input_fn` should be a closure that returns a dataset.')
iterator = iter(
strategy.experimental_distribute_datasets_from_function(input_fn))
return iterator
def _float_metric_value(metric):
"""Gets the value of a float-value keras metric."""
return metric.result().numpy().astype(float)
def steps_to_run(current_step, steps_per_epoch, steps_per_loop):
"""Calculates steps to run on device."""
if steps_per_loop <= 0:
    raise ValueError('steps_per_loop should be a positive integer.')
if steps_per_loop == 1:
return steps_per_loop
remainder_in_epoch = current_step % steps_per_epoch
if remainder_in_epoch != 0:
return min(steps_per_epoch - remainder_in_epoch, steps_per_loop)
else:
return steps_per_loop
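# Worked example (illustrative, not part of the upstream file): with
# steps_per_epoch=100 and steps_per_loop=40, the host loop runs 40, 40 and
# then 20 steps, so a single loop never crosses an epoch boundary.
def _example_steps_to_run_schedule():
  """Hypothetical sketch demonstrating the clamping behaviour above."""
  schedule = []
  current_step = 0
  while current_step < 100:
    steps = steps_to_run(current_step, steps_per_epoch=100, steps_per_loop=40)
    schedule.append(steps)
    current_step += steps
  return schedule  # [40, 40, 20]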
def write_txt_summary(training_summary, summary_dir):
"""Writes a summary text file to record stats."""
if not tf.io.gfile.exists(summary_dir):
tf.io.gfile.mkdir(summary_dir)
summary_path = os.path.join(summary_dir, _SUMMARY_TXT)
with tf.io.gfile.GFile(summary_path, 'wb') as f:
logging.info('Training Summary: \n%s', str(training_summary))
f.write(json.dumps(training_summary, indent=4))
@deprecation.deprecated(
None, 'This function is deprecated. Please use Keras compile/fit instead.')
def run_customized_training_loop(
# pylint: disable=invalid-name
_sentinel=None,
# pylint: enable=invalid-name
strategy=None,
model_fn=None,
loss_fn=None,
scale_loss=True,
model_dir=None,
train_input_fn=None,
steps_per_epoch=None,
num_eval_per_epoch=1,
steps_per_loop=None,
epochs=1,
eval_input_fn=None,
eval_steps=None,
metric_fn=None,
init_checkpoint=None,
custom_callbacks=None,
run_eagerly=False,
sub_model_export_name=None,
explicit_allreduce=False,
pre_allreduce_callbacks=None,
post_allreduce_callbacks=None,
train_summary_interval=0):
"""Run BERT pretrain model training using low-level API.
Arguments:
_sentinel: Used to prevent positional parameters. Internal, do not use.
strategy: Distribution strategy on which to run low level training loop.
model_fn: Function that returns a tuple (model, sub_model). Caller of this
function should add optimizer to the `model` via calling
`model.compile()` API or manually setting `model.optimizer` attribute.
      Second element of the returned tuple (sub_model) is an optional sub
      model to be used for an initial checkpoint, if provided.
loss_fn: Function with signature func(labels, logits) and returns a loss
tensor.
scale_loss: Whether to divide the raw loss by number of replicas before
gradients calculation.
model_dir: Model directory used during training for restoring/saving model
weights.
train_input_fn: Function that returns a tf.data.Dataset used for training.
steps_per_epoch: Number of steps to run per epoch. At the end of each
epoch, model checkpoint will be saved and evaluation will be conducted
if evaluation dataset is provided.
num_eval_per_epoch: Number of evaluations per epoch.
steps_per_loop: Number of steps per graph-mode loop. In order to reduce
communication in eager context, training logs are printed every
steps_per_loop.
epochs: Number of epochs to train.
eval_input_fn: Function that returns evaluation dataset. If none,
evaluation is skipped.
eval_steps: Number of steps to run evaluation. Required if `eval_input_fn`
is not none.
metric_fn: A metrics function that returns a Keras Metric object to record
evaluation result using evaluation dataset or with training dataset
after every epoch.
init_checkpoint: Optional checkpoint to load to `sub_model` returned by
`model_fn`.
custom_callbacks: A list of Keras Callbacks objects to run during
training. More specifically, `on_train_begin(), on_train_end(),
on_batch_begin()`, `on_batch_end()`, `on_epoch_begin()`,
`on_epoch_end()` methods are invoked during training.
Note that some metrics may be missing from `logs`.
    run_eagerly: Whether to run model training in pure eager execution. This
      should be disabled for TPUStrategy.
sub_model_export_name: If not None, will export `sub_model` returned by
`model_fn` into checkpoint files. The name of intermediate checkpoint
file is {sub_model_export_name}_step_{step}.ckpt and the last
      checkpoint's name is {sub_model_export_name}.ckpt; if None, `sub_model`
will not be exported as checkpoint.
explicit_allreduce: Whether to explicitly perform gradient allreduce,
instead of relying on implicit allreduce in optimizer.apply_gradients().
default is False. For now, if training using FP16 mixed precision,
explicit allreduce will aggregate gradients in FP16 format. For TPU and
GPU training using FP32, explicit allreduce will aggregate gradients in
FP32 format.
    pre_allreduce_callbacks: A list of callback functions that take gradient
      and model variable pairs as input, manipulate them, and return new
      gradient and model variable pairs. The callback functions will be
      invoked in the list order and before gradients are allreduced. With
      mixed precision training, the pre_allreduce_callbacks will be applied
      on scaled_gradients. Default is no callbacks. Only used when
      explicit_allreduce=True.
    post_allreduce_callbacks: A list of callback functions that take
      gradient and model variable pairs as input, manipulate them, and
      return new gradient and model variable pairs. The callback functions
      will be invoked in the list order and right before gradients are
      applied to variables for updates. Default is no callbacks. Only used
      when explicit_allreduce=True.
train_summary_interval: Step interval for training summaries. If the value
is a negative number, then training summaries are not enabled.
Returns:
Trained model.
Raises:
ValueError: (1) When model returned by `model_fn` does not have optimizer
attribute or when required parameters are set to none. (2) eval args are
not specified correctly. (3) metric_fn must be a callable if specified.
(4) sub_model_checkpoint_name is specified, but `sub_model` returned
by `model_fn` is None.
"""
if _sentinel is not None:
raise ValueError('only call `run_customized_training_loop()` '
'with named arguments.')
required_arguments = [
strategy, model_fn, loss_fn, model_dir, steps_per_epoch, train_input_fn
]
steps_between_evals = int(steps_per_epoch / num_eval_per_epoch)
if [arg for arg in required_arguments if arg is None]:
raise ValueError('`strategy`, `model_fn`, `loss_fn`, `model_dir`, '
'`steps_per_epoch` and `train_input_fn` are required '
'parameters.')
if not steps_per_loop:
if tf.config.list_logical_devices('TPU'):
      # One can't fully utilize a TPU with steps_per_loop=1, so in this case
      # we default to a more useful value.
steps_per_loop = min(1000, steps_between_evals)
else:
steps_per_loop = 1
logging.info('steps_per_loop not specified. Using steps_per_loop=%d',
steps_per_loop)
if steps_per_loop > steps_between_evals:
logging.warning(
'steps_per_loop: %d is specified to be greater than '
' steps_between_evals: %d, we will use steps_between_evals as'
' steps_per_loop.', steps_per_loop, steps_between_evals)
steps_per_loop = steps_between_evals
assert tf.executing_eagerly()
if run_eagerly:
if isinstance(strategy, tf.distribute.experimental.TPUStrategy):
raise ValueError(
'TPUStrategy should not run eagerly as it heavily relies on graph'
' optimization for the distributed system.')
if eval_input_fn and eval_steps is None:
raise ValueError(
        '`eval_steps` is required when `eval_input_fn` is not none.')
if metric_fn and not callable(metric_fn):
raise ValueError(
'if `metric_fn` is specified, metric_fn must be a callable.')
total_training_steps = steps_per_epoch * epochs
train_iterator = _get_input_iterator(train_input_fn, strategy)
eval_loss_metric = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)
with distribution_utils.get_strategy_scope(strategy):
# To correctly place the model weights on accelerators,
# model and optimizer should be created in scope.
model, sub_model = model_fn()
if not hasattr(model, 'optimizer'):
raise ValueError('User should set optimizer attribute to model '
'inside `model_fn`.')
if sub_model_export_name and sub_model is None:
raise ValueError('sub_model_export_name is specified as %s, but '
'sub_model is None.' % sub_model_export_name)
callback_list = tf.keras.callbacks.CallbackList(
callbacks=custom_callbacks, model=model)
optimizer = model.optimizer
if init_checkpoint:
logging.info(
'Checkpoint file %s found and restoring from '
'initial checkpoint for core model.', init_checkpoint)
checkpoint = tf.train.Checkpoint(model=sub_model)
checkpoint.restore(init_checkpoint).assert_existing_objects_matched()
logging.info('Loading from checkpoint file completed')
train_loss_metric = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)
eval_metrics = [metric_fn()] if metric_fn else []
# If evaluation is required, make a copy of metric as it will be used by
# both train and evaluation.
train_metrics = [
metric.__class__.from_config(metric.get_config())
for metric in eval_metrics
]
# Create summary writers
if _should_export_summary(strategy):
summary_dir = os.path.join(model_dir, 'summaries')
else:
      # In multi-worker training we need every worker to write summaries,
      # because variables can trigger synchronization on read and
      # synchronization needs all workers to participate.
summary_dir = tempfile.mkdtemp()
eval_summary_writer = tf.summary.create_file_writer(
os.path.join(summary_dir, 'eval'))
last_summary_step = 0
if steps_per_loop >= _MIN_SUMMARY_STEPS and train_summary_interval >= 0:
# Only writes summary when the stats are collected sufficiently over
# enough steps.
train_summary_writer = tf.summary.create_file_writer(
os.path.join(summary_dir, 'train'))
else:
train_summary_writer = tf.summary.create_noop_writer()
# Collects training variables.
training_vars = model.trainable_variables
def _replicated_step(inputs):
"""Replicated training step."""
inputs, labels = inputs
with tf.GradientTape() as tape:
model_outputs = model(inputs, training=True)
loss = loss_fn(labels, model_outputs)
# Raw loss is used for reporting in metrics/logs.
raw_loss = loss
if scale_loss:
# Scales down the loss for gradients to be invariant from replicas.
loss = loss / strategy.num_replicas_in_sync
if explicit_allreduce:
grad_utils.minimize_using_explicit_allreduce(tape, optimizer, loss,
training_vars,
pre_allreduce_callbacks,
post_allreduce_callbacks)
else:
if isinstance(optimizer,
tf.keras.mixed_precision.experimental.LossScaleOptimizer):
with tape:
scaled_loss = optimizer.get_scaled_loss(loss)
scaled_grads = tape.gradient(scaled_loss, training_vars)
grads = optimizer.get_unscaled_gradients(scaled_grads)
else:
grads = tape.gradient(loss, training_vars)
optimizer.apply_gradients(zip(grads, training_vars))
# For reporting, the metric takes the mean of losses.
train_loss_metric.update_state(raw_loss)
for metric in train_metrics:
metric.update_state(labels, model_outputs)
@tf.function
def train_steps(iterator, steps):
"""Performs distributed training steps in a loop.
Args:
iterator: the distributed iterator of training datasets.
        steps: a tf.int32 integer tensor specifying the number of steps to
          run inside the host training loop.
Raises:
ValueError: Any of the arguments or tensor shapes are invalid.
"""
if not isinstance(steps, tf.Tensor):
        raise ValueError('steps should be a Tensor. A Python object may '
                         'cause retracing.')
for _ in tf.range(steps):
strategy.run(_replicated_step, args=(next(iterator),))
def train_single_step(iterator):
"""Performs a distributed training step.
Args:
iterator: the distributed iterator of training datasets.
Raises:
ValueError: Any of the arguments or tensor shapes are invalid.
"""
strategy.run(_replicated_step, args=(next(iterator),))
def test_step(iterator):
"""Calculates evaluation metrics on distributed devices."""
def _test_step_fn(inputs):
"""Replicated accuracy calculation."""
inputs, labels = inputs
model_outputs = model(inputs, training=False)
for metric in eval_metrics:
metric.update_state(labels, model_outputs)
return model_outputs, labels
outputs, labels = strategy.run(_test_step_fn, args=(next(iterator),))
outputs = tf.nest.map_structure(strategy.experimental_local_results,
outputs)
labels = tf.nest.map_structure(strategy.experimental_local_results,
labels)
return outputs, labels
if not run_eagerly:
train_single_step = tf.function(train_single_step)
test_step = tf.function(test_step)
def _run_evaluation(current_training_step, test_iterator):
"""Runs validation steps and aggregate metrics.
Args:
current_training_step: tf.int32 tensor containing the current step.
test_iterator: distributed iterator of test datasets.
Returns:
      A dict of metric names and values.
"""
      # The last batch of the evaluation is often smaller than previous
      # ones. Moreover, on some replicas in distributed settings it might
      # even be empty. Therefore, different from the way training_loss is
      # calculated, we need to gather all the logits and labels here to
      # calculate the evaluation loss outside.
loss_list, loss_weights = list(), list()
for _ in range(eval_steps):
outputs, labels = test_step(test_iterator)
for cur_logits, cur_labels in zip(outputs, labels):
# This is to handle cases when cur_labels is not a single tensor,
# but a dict of tensors.
cur_weight = tf.shape(tf.nest.flatten(cur_labels)[0])[0]
if cur_weight != 0:
loss_list.append(loss_fn(cur_labels, cur_logits).numpy())
loss_weights.append(cur_weight)
# The sample_weights are the actual number of examples in each batch,
# a summation of numbers of examples in each replica if using
# distributed training.
eval_loss_metric.update_state(loss_list, sample_weight=loss_weights)
logs = {}
with eval_summary_writer.as_default():
for metric in [eval_loss_metric] + eval_metrics + model.metrics:
metric_value = _float_metric_value(metric)
logs[metric.name] = metric_value
logging.info('Step: [%d] Validation %s = %f', current_training_step,
metric.name, metric_value)
tf.summary.scalar(
metric.name, metric_value, step=current_training_step)
eval_summary_writer.flush()
return logs
# Training loop starts here.
checkpoint = tf.train.Checkpoint(
model=model, optimizer=optimizer, global_step=optimizer.iterations)
sub_model_checkpoint = tf.train.Checkpoint(
model=sub_model,
global_step=optimizer.iterations) if sub_model_export_name else None
latest_checkpoint_file = tf.train.latest_checkpoint(model_dir)
if latest_checkpoint_file:
logging.info('Checkpoint file %s found and restoring from '
'checkpoint', latest_checkpoint_file)
checkpoint.restore(latest_checkpoint_file)
logging.info('Loading from checkpoint file completed')
current_step = optimizer.iterations.numpy()
checkpoint_name = 'ctl_step_{step}.ckpt'
logs = {}
callback_list.on_train_begin()
while current_step < total_training_steps and not model.stop_training:
if current_step % steps_per_epoch == 0:
callback_list.on_epoch_begin(
int(current_step / steps_per_epoch) + 1)
      # Training loss/metrics take an average over the steps inside the
      # micro training loop. We reset their values before each round.
train_loss_metric.reset_states()
for metric in train_metrics + model.metrics:
metric.reset_states()
callback_list.on_batch_begin(current_step)
# Runs several steps in the host while loop.
steps = steps_to_run(current_step, steps_between_evals, steps_per_loop)
if tf.config.list_physical_devices('GPU'):
# TODO(zongweiz): merge with train_steps once tf.while_loop
# GPU performance bugs are fixed.
for _ in range(steps):
train_single_step(train_iterator)
else:
# Converts steps to a Tensor to avoid tf.function retracing.
train_steps(train_iterator, tf.convert_to_tensor(steps, dtype=tf.int32))
train_loss = _float_metric_value(train_loss_metric)
current_step += steps
# Updates training logging.
training_status = 'Train Step: %d/%d / loss = %s' % (
current_step, total_training_steps, train_loss)
if current_step >= last_summary_step + train_summary_interval:
summary_writer = train_summary_writer
last_summary_step = current_step
else:
summary_writer = tf.summary.create_noop_writer()
with summary_writer.as_default():
if callable(optimizer.learning_rate):
tf.summary.scalar(
'learning_rate',
optimizer.learning_rate(current_step),
step=current_step)
tf.summary.scalar(train_loss_metric.name, train_loss, step=current_step)
for metric in train_metrics + model.metrics:
metric_value = _float_metric_value(metric)
training_status += ' %s = %f' % (metric.name, metric_value)
tf.summary.scalar(metric.name, metric_value, step=current_step)
summary_writer.flush()
logging.info(training_status)
      # If evaluation is not needed, we only call on_batch_end with
      # train_loss; this ensures we get granular global_step/sec on
      # TensorBoard.
if current_step % steps_between_evals:
callback_list.on_batch_end(current_step - 1, {'loss': train_loss})
else:
# Save a submodel with the step in the file name after each epoch.
if sub_model_export_name:
_save_checkpoint(
strategy, sub_model_checkpoint, model_dir,
'%s_step_%d.ckpt' % (sub_model_export_name, current_step))
# Save model checkpoints and run validation steps after each epoch
# (with the exception of the final epoch which is handled after the
# training loop).
if current_step < total_training_steps:
_save_checkpoint(strategy, checkpoint, model_dir,
checkpoint_name.format(step=current_step))
if eval_input_fn:
logging.info('Running evaluation after step: %s.', current_step)
logs = _run_evaluation(current_step,
_get_input_iterator(eval_input_fn, strategy))
# Re-initialize evaluation metric.
eval_loss_metric.reset_states()
for metric in eval_metrics + model.metrics:
metric.reset_states()
# We add train_loss here rather than call on_batch_end twice to make
# sure that no duplicated values are generated.
logs['loss'] = train_loss
callback_list.on_batch_end(current_step - 1, logs)
# Calls on_epoch_end after each real epoch ends to prevent mis-calculation
# of training steps.
if current_step % steps_per_epoch == 0:
callback_list.on_epoch_end(int(current_step / steps_per_epoch), logs)
if sub_model_export_name:
_save_checkpoint(strategy, sub_model_checkpoint, model_dir,
'%s.ckpt' % sub_model_export_name)
_save_checkpoint(strategy, checkpoint, model_dir,
checkpoint_name.format(step=current_step))
if eval_input_fn:
logging.info('Running final evaluation after training is complete.')
logs = _run_evaluation(current_step,
_get_input_iterator(eval_input_fn, strategy))
callback_list.on_epoch_end(int(current_step / steps_per_epoch), logs)
training_summary = {
'total_training_steps': total_training_steps,
'train_loss': _float_metric_value(train_loss_metric),
}
for metric in model.metrics:
training_summary[metric.name] = _float_metric_value(metric)
if eval_metrics:
# TODO(hongkuny): Cleans up summary reporting in text.
training_summary['last_train_metrics'] = _float_metric_value(
train_metrics[0])
training_summary['eval_metrics'] = _float_metric_value(eval_metrics[0])
write_txt_summary(training_summary, summary_dir)
if not _should_export_summary(strategy):
tf.io.gfile.rmtree(summary_dir)
callback_list.on_train_end()
return model
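# Illustrative invocation sketch (not part of the upstream file); the
# argument values are placeholders and `my_model_fn`/`my_loss_fn`/
# `my_input_fn` are hypothetical callables supplied by the caller.
def _example_run_training_loop(my_model_fn, my_loss_fn, my_input_fn):
  """Hypothetical sketch of the named-argument calling convention."""
  strategy = tf.distribute.get_strategy()  # default (no-op) strategy
  return run_customized_training_loop(
      strategy=strategy,
      model_fn=my_model_fn,        # must attach an optimizer to the model
      loss_fn=my_loss_fn,
      model_dir='/tmp/ctl_model',  # hypothetical model directory
      train_input_fn=my_input_fn,
      steps_per_epoch=100,
      steps_per_loop=10,
      epochs=1)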
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for official.modeling.training.model_training_utils."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl import logging
from absl.testing import parameterized
from absl.testing.absltest import mock
import numpy as np
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.nlp.bert import model_training_utils
def eager_strategy_combinations():
return combinations.combine(
distribution=[
strategy_combinations.default_strategy,
strategy_combinations.tpu_strategy,
strategy_combinations.one_device_strategy_gpu,
strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
strategy_combinations.mirrored_strategy_with_two_gpus,
],
mode='eager',
)
def eager_gpu_strategy_combinations():
return combinations.combine(
distribution=[
strategy_combinations.default_strategy,
strategy_combinations.one_device_strategy_gpu,
strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
strategy_combinations.mirrored_strategy_with_two_gpus,
],
mode='eager',
)
def create_fake_data_input_fn(batch_size, features_shape, num_classes):
"""Creates a dummy input function with the given feature and label shapes.
Args:
batch_size: integer.
features_shape: list[int]. Feature shape for an individual example.
num_classes: integer. Number of labels.
Returns:
An input function that is usable in the executor.
"""
def _dataset_fn(input_context=None):
"""An input function for generating fake data."""
local_batch_size = input_context.get_per_replica_batch_size(batch_size)
features = np.random.rand(64, *features_shape)
labels = np.random.randint(2, size=[64, num_classes])
# Convert the inputs to a Dataset.
dataset = tf.data.Dataset.from_tensor_slices((features, labels))
dataset = dataset.shard(input_context.num_input_pipelines,
input_context.input_pipeline_id)
def _assign_dtype(features, labels):
features = tf.cast(features, tf.float32)
labels = tf.cast(labels, tf.float32)
return features, labels
# Shuffle, repeat, and batch the examples.
dataset = dataset.map(_assign_dtype)
dataset = dataset.shuffle(64).repeat()
dataset = dataset.batch(local_batch_size, drop_remainder=True)
dataset = dataset.prefetch(buffer_size=64)
return dataset
return _dataset_fn
def create_model_fn(input_shape, num_classes, use_float16=False):
def _model_fn():
"""A one-layer softmax model suitable for testing."""
input_layer = tf.keras.layers.Input(shape=input_shape)
x = tf.keras.layers.Dense(num_classes, activation='relu')(input_layer)
output_layer = tf.keras.layers.Dense(num_classes, activation='softmax')(x)
sub_model = tf.keras.models.Model(input_layer, x, name='sub_model')
model = tf.keras.models.Model(input_layer, output_layer, name='model')
model.add_metric(
tf.reduce_mean(input_layer), name='mean_input', aggregation='mean')
model.optimizer = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9)
if use_float16:
model.optimizer = (
tf.keras.mixed_precision.experimental.LossScaleOptimizer(
model.optimizer, loss_scale='dynamic'))
return model, sub_model
return _model_fn
def metric_fn():
"""Gets a tf.keras metric object."""
return tf.keras.metrics.CategoricalAccuracy(name='accuracy', dtype=tf.float32)
def summaries_with_matching_keyword(keyword, summary_dir):
"""Yields summary protos matching given keyword from event file."""
event_paths = tf.io.gfile.glob(os.path.join(summary_dir, 'events*'))
for event in tf.compat.v1.train.summary_iterator(event_paths[-1]):
if event.summary is not None:
for value in event.summary.value:
if keyword in value.tag:
logging.error(event)
yield event.summary
def check_eventfile_for_keyword(keyword, summary_dir):
"""Checks event files for the keyword."""
return any(summaries_with_matching_keyword(keyword, summary_dir))
class RecordingCallback(tf.keras.callbacks.Callback):
def __init__(self):
self.batch_begin = [] # (batch, logs)
self.batch_end = [] # (batch, logs)
self.epoch_begin = [] # (epoch, logs)
self.epoch_end = [] # (epoch, logs)
def on_batch_begin(self, batch, logs=None):
self.batch_begin.append((batch, logs))
def on_batch_end(self, batch, logs=None):
self.batch_end.append((batch, logs))
def on_epoch_begin(self, epoch, logs=None):
self.epoch_begin.append((epoch, logs))
def on_epoch_end(self, epoch, logs=None):
self.epoch_end.append((epoch, logs))
class ModelTrainingUtilsTest(tf.test.TestCase, parameterized.TestCase):
def setUp(self):
super(ModelTrainingUtilsTest, self).setUp()
self._model_fn = create_model_fn(input_shape=[128], num_classes=3)
def run_training(self, strategy, model_dir, steps_per_loop, run_eagerly):
input_fn = create_fake_data_input_fn(
batch_size=8, features_shape=[128], num_classes=3)
model_training_utils.run_customized_training_loop(
strategy=strategy,
model_fn=self._model_fn,
loss_fn=tf.keras.losses.categorical_crossentropy,
model_dir=model_dir,
steps_per_epoch=20,
steps_per_loop=steps_per_loop,
epochs=2,
train_input_fn=input_fn,
eval_input_fn=input_fn,
eval_steps=10,
init_checkpoint=None,
sub_model_export_name='my_submodel_name',
metric_fn=metric_fn,
custom_callbacks=None,
run_eagerly=run_eagerly)
@combinations.generate(eager_strategy_combinations())
def test_train_eager_single_step(self, distribution):
model_dir = self.get_temp_dir()
if isinstance(distribution, tf.distribute.experimental.TPUStrategy):
with self.assertRaises(ValueError):
self.run_training(
distribution, model_dir, steps_per_loop=1, run_eagerly=True)
else:
self.run_training(
distribution, model_dir, steps_per_loop=1, run_eagerly=True)
@combinations.generate(eager_gpu_strategy_combinations())
def test_train_eager_mixed_precision(self, distribution):
model_dir = self.get_temp_dir()
policy = tf.keras.mixed_precision.experimental.Policy('mixed_float16')
tf.keras.mixed_precision.experimental.set_policy(policy)
self._model_fn = create_model_fn(
input_shape=[128], num_classes=3, use_float16=True)
self.run_training(
distribution, model_dir, steps_per_loop=1, run_eagerly=True)
@combinations.generate(eager_strategy_combinations())
def test_train_check_artifacts(self, distribution):
model_dir = self.get_temp_dir()
self.run_training(
distribution, model_dir, steps_per_loop=10, run_eagerly=False)
# Two checkpoints should be saved after two epochs.
files = map(os.path.basename,
tf.io.gfile.glob(os.path.join(model_dir, 'ctl_step_*index')))
self.assertCountEqual(['ctl_step_20.ckpt-1.index',
'ctl_step_40.ckpt-2.index'], files)
# Three submodel checkpoints should be saved after two epochs (one after
# each epoch plus one final).
files = map(os.path.basename,
tf.io.gfile.glob(os.path.join(model_dir,
'my_submodel_name*index')))
self.assertCountEqual(['my_submodel_name.ckpt-3.index',
'my_submodel_name_step_20.ckpt-1.index',
'my_submodel_name_step_40.ckpt-2.index'], files)
self.assertNotEmpty(
tf.io.gfile.glob(
os.path.join(model_dir, 'summaries/training_summary*')))
# Loss and accuracy values should be written into summaries.
self.assertTrue(
check_eventfile_for_keyword('loss',
os.path.join(model_dir, 'summaries/train')))
self.assertTrue(
check_eventfile_for_keyword('accuracy',
os.path.join(model_dir, 'summaries/train')))
self.assertTrue(
check_eventfile_for_keyword('mean_input',
os.path.join(model_dir, 'summaries/train')))
self.assertTrue(
check_eventfile_for_keyword('accuracy',
os.path.join(model_dir, 'summaries/eval')))
self.assertTrue(
check_eventfile_for_keyword('mean_input',
os.path.join(model_dir, 'summaries/eval')))
@combinations.generate(eager_strategy_combinations())
def test_train_check_callbacks(self, distribution):
model_dir = self.get_temp_dir()
callback = RecordingCallback()
callbacks = [callback]
input_fn = create_fake_data_input_fn(
batch_size=8, features_shape=[128], num_classes=3)
model_training_utils.run_customized_training_loop(
strategy=distribution,
model_fn=self._model_fn,
loss_fn=tf.keras.losses.categorical_crossentropy,
model_dir=model_dir,
steps_per_epoch=20,
num_eval_per_epoch=4,
steps_per_loop=10,
epochs=2,
train_input_fn=input_fn,
eval_input_fn=input_fn,
eval_steps=10,
init_checkpoint=None,
metric_fn=metric_fn,
custom_callbacks=callbacks,
run_eagerly=False)
self.assertEqual(callback.epoch_begin, [(1, {}), (2, {})])
epoch_ends, epoch_end_infos = zip(*callback.epoch_end)
self.assertEqual(list(epoch_ends), [1, 2, 2])
for info in epoch_end_infos:
self.assertIn('accuracy', info)
self.assertEqual(callback.batch_begin, [(0, {}), (5, {}), (10, {}),
(15, {}), (20, {}), (25, {}),
(30, {}), (35, {})])
batch_ends, batch_end_infos = zip(*callback.batch_end)
self.assertEqual(list(batch_ends), [4, 9, 14, 19, 24, 29, 34, 39])
for info in batch_end_infos:
self.assertIn('loss', info)
@combinations.generate(
combinations.combine(
distribution=[
strategy_combinations.one_device_strategy_gpu,
],
mode='eager',
))
def test_train_check_artifacts_non_chief(self, distribution):
# We shouldn't export artifacts on non-chief workers. Since there's no easy
# way to test with real MultiWorkerMirroredStrategy, we patch the strategy
# to make it as if it's MultiWorkerMirroredStrategy on non-chief workers.
extended = distribution.extended
with mock.patch.object(extended.__class__, 'should_checkpoint',
new_callable=mock.PropertyMock, return_value=False), \
mock.patch.object(extended.__class__, 'should_save_summary',
new_callable=mock.PropertyMock, return_value=False):
model_dir = self.get_temp_dir()
self.run_training(
distribution, model_dir, steps_per_loop=10, run_eagerly=False)
self.assertEmpty(tf.io.gfile.listdir(model_dir))
if __name__ == '__main__':
tf.test.main()
#export GLUE_DIR=./glue
#export BERT_DIR=./bert_en_cased_L-12_H-768_A-12_1/assets
#export TASK_NAME=MNLI
#export OUTPUT_DIR=./glue_finetuning
#python ../data/create_finetuning_data.py \
# --input_data_dir=${GLUE_DIR}/${TASK_NAME}/ \
# --vocab_file=${BERT_DIR}/vocab.txt \
# --train_data_output_path=${OUTPUT_DIR}/${TASK_NAME}_train.tf_record \
# --eval_data_output_path=${OUTPUT_DIR}/${TASK_NAME}_eval.tf_record \
# --meta_data_file_path=${OUTPUT_DIR}/${TASK_NAME}_meta_data \
# --fine_tuning_task_type=classification --max_seq_length=128 \
# --classification_task_name=${TASK_NAME}
#
export BERT_DIR=./bert_en_cased_L-12_H-768_A-12_1/assets
export MODEL_DIR=./model_dir
export GLUE_DIR=./glue_finetuning
export TASK=MNLI
export HIP_VISIBLE_DEVICES=0,1,2,3
python3 run_classifier.py \
--mode='train_and_eval' \
--input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
--train_data_path=${GLUE_DIR}/${TASK}_train.tf_record \
--eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
--bert_config_file=${BERT_DIR}/bert_config.json \
--train_batch_size=4 \
--eval_batch_size=4 \
--steps_per_loop=1 \
--learning_rate=2e-5 \
--num_train_epochs=3 \
--model_dir=${MODEL_DIR} \
--num_gpus=4 \
--distribution_strategy=multi_worker_mirrored
# --dtype=fp16 \
# --fp16_implementation=graph_rewrite \
# --distribution_strategy=mirrored   (alternative to multi_worker_mirrored)
#--init_checkpoint=${BERT_DIR}/bert_model.ckpt \
#--enable_xla=true \
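# Illustrative variant (an assumption, not verified in this repository):
# fp16 mixed precision would use the two commented dtype flags above
# together, i.e. --dtype=fp16 with --fp16_implementation=graph_rewrite.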
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""BERT classification or regression finetuning runner in TF 2.x."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import json
import math
import os
from absl import app
from absl import flags
from absl import logging
import gin
import tensorflow as tf
import sys
sys.path.append("/public/home/xuanbaby/DL-TensorFlow/models_r2.3.0")
from official.modeling import performance
from official.nlp import optimization
from official.nlp.bert import bert_models
from official.nlp.bert import common_flags
from official.nlp.bert import configs as bert_configs
from official.nlp.bert import input_pipeline
from official.nlp.bert import model_saving_utils
from official.utils.misc import distribution_utils
from official.utils.misc import keras_utils
flags.DEFINE_enum(
'mode', 'train_and_eval', ['train_and_eval', 'export_only', 'predict'],
'One of {"train_and_eval", "export_only", "predict"}. `train_and_eval`: '
'trains the model and evaluates in the meantime. '
'`export_only`: will take the latest checkpoint inside '
'model_dir and export a `SavedModel`. `predict`: takes a checkpoint and '
'restores the model to output predictions on the test set.')
flags.DEFINE_string('train_data_path', None,
'Path to training data for BERT classifier.')
flags.DEFINE_string('eval_data_path', None,
'Path to evaluation data for BERT classifier.')
flags.DEFINE_string(
'input_meta_data_path', None,
'Path to file that contains meta data about input '
'to be used for training and evaluation.')
flags.DEFINE_string('predict_checkpoint_path', None,
'Path to the checkpoint for predictions.')
flags.DEFINE_integer(
'num_eval_per_epoch', 1,
'Number of evaluations per epoch. The purpose of this flag is to provide '
'more granular evaluation scores and checkpoints. For example, if original '
'data has N samples and num_eval_per_epoch is n, then each epoch will be '
'evaluated every N/n samples.')
flags.DEFINE_integer('train_batch_size', 32, 'Batch size for training.')
flags.DEFINE_integer('eval_batch_size', 32, 'Batch size for evaluation.')
common_flags.define_common_bert_flags()
FLAGS = flags.FLAGS
LABEL_TYPES_MAP = {'int': tf.int64, 'float': tf.float32}
def get_loss_fn(num_classes):
"""Gets the classification loss function."""
def classification_loss_fn(labels, logits):
"""Classification loss."""
labels = tf.squeeze(labels)
log_probs = tf.nn.log_softmax(logits, axis=-1)
one_hot_labels = tf.one_hot(
tf.cast(labels, dtype=tf.int32), depth=num_classes, dtype=tf.float32)
per_example_loss = -tf.reduce_sum(
tf.cast(one_hot_labels, dtype=tf.float32) * log_probs, axis=-1)
return tf.reduce_mean(per_example_loss)
return classification_loss_fn
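# Minimal sanity-check sketch for the closure above (illustrative, not part
# of the upstream file; the logits/labels values are made up).
def _example_classification_loss():
  """Hypothetical helper: mean cross-entropy for a 3-class toy batch."""
  loss_fn = get_loss_fn(num_classes=3)
  logits = tf.constant([[2.0, 0.5, 0.1], [0.2, 1.5, 0.3]])
  labels = tf.constant([[0], [1]], dtype=tf.int64)
  return loss_fn(labels, logits)  # scalar mean over the two examples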
def get_dataset_fn(input_file_pattern,
max_seq_length,
global_batch_size,
is_training,
label_type=tf.int64,
include_sample_weights=False):
"""Gets a closure to create a dataset."""
def _dataset_fn(ctx=None):
"""Returns tf.data.Dataset for distributed BERT pretraining."""
batch_size = ctx.get_per_replica_batch_size(
global_batch_size) if ctx else global_batch_size
dataset = input_pipeline.create_classifier_dataset(
tf.io.gfile.glob(input_file_pattern),
max_seq_length,
batch_size,
is_training=is_training,
input_pipeline_context=ctx,
label_type=label_type,
include_sample_weights=include_sample_weights)
return dataset
return _dataset_fn
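# Illustrative usage sketch (the file pattern and sizes below are
# placeholders, not paths from this repository).
def _example_train_input_fn():
  """Hypothetical closure matching the MNLI training data layout."""
  return get_dataset_fn(
      '/tmp/MNLI/train.tf_record',  # hypothetical input_file_pattern
      max_seq_length=128,
      global_batch_size=32,
      is_training=True,
      label_type=tf.int64)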
def run_bert_classifier(strategy,
bert_config,
input_meta_data,
model_dir,
epochs,
steps_per_epoch,
steps_per_loop,
eval_steps,
warmup_steps,
initial_lr,
init_checkpoint,
train_input_fn,
eval_input_fn,
training_callbacks=True,
custom_callbacks=None,
custom_metrics=None):
"""Run BERT classifier training using low-level API."""
max_seq_length = input_meta_data['max_seq_length']
num_classes = input_meta_data.get('num_labels', 1)
is_regression = num_classes == 1
def _get_classifier_model():
"""Gets a classifier model."""
classifier_model, core_model = (
bert_models.classifier_model(
bert_config,
num_classes,
max_seq_length,
hub_module_url=FLAGS.hub_module_url,
hub_module_trainable=FLAGS.hub_module_trainable))
optimizer = optimization.create_optimizer(initial_lr,
steps_per_epoch * epochs,
warmup_steps, FLAGS.end_lr,
FLAGS.optimizer_type)
classifier_model.optimizer = performance.configure_optimizer(
optimizer,
use_float16=common_flags.use_float16(),
use_graph_rewrite=common_flags.use_graph_rewrite())
return classifier_model, core_model
  # tf.keras.losses objects accept optional sample_weight arguments (e.g.
  # coming from the dataset) to compute weighted loss, as used for the
  # regression tasks. The classification tasks, using the custom
  # get_loss_fn, don't accept sample weights though.
loss_fn = (tf.keras.losses.MeanSquaredError() if is_regression
else get_loss_fn(num_classes))
# Defines evaluation metrics function, which will create metrics in the
# correct device and strategy scope.
if custom_metrics:
metric_fn = custom_metrics
elif is_regression:
metric_fn = functools.partial(
tf.keras.metrics.MeanSquaredError,
'mean_squared_error',
dtype=tf.float32)
else:
metric_fn = functools.partial(
tf.keras.metrics.SparseCategoricalAccuracy,
'accuracy',
dtype=tf.float32)
# Start training using Keras compile/fit API.
logging.info('Training using TF 2.x Keras compile/fit API with '
'distribution strategy.')
return run_keras_compile_fit(
model_dir,
strategy,
_get_classifier_model,
train_input_fn,
eval_input_fn,
loss_fn,
metric_fn,
init_checkpoint,
epochs,
steps_per_epoch,
steps_per_loop,
eval_steps,
training_callbacks=training_callbacks,
custom_callbacks=custom_callbacks)
def run_keras_compile_fit(model_dir,
strategy,
model_fn,
train_input_fn,
eval_input_fn,
loss_fn,
metric_fn,
init_checkpoint,
epochs,
steps_per_epoch,
steps_per_loop,
eval_steps,
training_callbacks=True,
custom_callbacks=None):
"""Runs BERT classifier model using Keras compile/fit API."""
with strategy.scope():
training_dataset = train_input_fn()
evaluation_dataset = eval_input_fn() if eval_input_fn else None
bert_model, sub_model = model_fn()
optimizer = bert_model.optimizer
if init_checkpoint:
checkpoint = tf.train.Checkpoint(model=sub_model)
checkpoint.restore(init_checkpoint).assert_existing_objects_matched()
if not isinstance(metric_fn, (list, tuple)):
metric_fn = [metric_fn]
bert_model.compile(
optimizer=optimizer,
loss=loss_fn,
metrics=[fn() for fn in metric_fn],
experimental_steps_per_execution=steps_per_loop)
summary_dir = os.path.join(model_dir, 'summaries')
# summary_callback = tf.keras.callbacks.TensorBoard(summary_dir)
summary_callback = tf.keras.callbacks.TensorBoard(summary_dir, profile_batch=0)
checkpoint = tf.train.Checkpoint(model=bert_model, optimizer=optimizer)
checkpoint_manager = tf.train.CheckpointManager(
checkpoint,
directory=model_dir,
max_to_keep=None,
step_counter=optimizer.iterations,
checkpoint_interval=0)
checkpoint_callback = keras_utils.SimpleCheckpoint(checkpoint_manager)
if training_callbacks:
if custom_callbacks is not None:
custom_callbacks += [summary_callback, checkpoint_callback]
else:
custom_callbacks = [summary_callback, checkpoint_callback]
#xuan
#tf.keras.callbacks.TerminateOnNaN(custom_callbacks)
history = bert_model.fit(
x=training_dataset,
validation_data=evaluation_dataset,
steps_per_epoch=steps_per_epoch,
epochs=epochs,
validation_steps=eval_steps,
callbacks=custom_callbacks)
stats = {'total_training_steps': steps_per_epoch * epochs}
if 'loss' in history.history:
stats['train_loss'] = history.history['loss'][-1]
if 'val_accuracy' in history.history:
stats['eval_metrics'] = history.history['val_accuracy'][-1]
return bert_model, stats
def get_predictions_and_labels(strategy,
trained_model,
eval_input_fn,
return_probs=False):
"""Obtains predictions of trained model on evaluation data.
  Note that the list of labels is returned along with the predictions
  because the order changes when distributing the dataset over TPU pods.
Args:
strategy: Distribution strategy.
trained_model: Trained model with preloaded weights.
eval_input_fn: Input function for evaluation data.
return_probs: Whether to return probabilities of classes.
Returns:
predictions: List of predictions.
labels: List of gold labels corresponding to predictions.
"""
@tf.function
def test_step(iterator):
"""Computes predictions on distributed devices."""
def _test_step_fn(inputs):
"""Replicated predictions."""
inputs, labels = inputs
logits = trained_model(inputs, training=False)
probabilities = tf.nn.softmax(logits)
return probabilities, labels
outputs, labels = strategy.run(_test_step_fn, args=(next(iterator),))
# outputs: current batch logits as a tuple of shard logits
outputs = tf.nest.map_structure(strategy.experimental_local_results,
outputs)
labels = tf.nest.map_structure(strategy.experimental_local_results, labels)
return outputs, labels
def _run_evaluation(test_iterator):
"""Runs evaluation steps."""
preds, golds = list(), list()
try:
with tf.experimental.async_scope():
while True:
probabilities, labels = test_step(test_iterator)
for cur_probs, cur_labels in zip(probabilities, labels):
if return_probs:
preds.extend(cur_probs.numpy().tolist())
else:
preds.extend(tf.math.argmax(cur_probs, axis=1).numpy())
golds.extend(cur_labels.numpy().tolist())
except (StopIteration, tf.errors.OutOfRangeError):
tf.experimental.async_clear_error()
return preds, golds
test_iter = iter(
strategy.experimental_distribute_datasets_from_function(eval_input_fn))
predictions, labels = _run_evaluation(test_iter)
return predictions, labels
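# Illustrative usage sketch (an assumption: `strategy`, `trained_model` and
# `eval_input_fn` come from an earlier run; label shapes depend on the
# dataset, so no post-processing is attempted here).
def _example_collect_predictions(strategy, trained_model, eval_input_fn):
  """Hypothetical helper that gathers class ids and gold labels."""
  predictions, labels = get_predictions_and_labels(
      strategy, trained_model, eval_input_fn, return_probs=False)
  return predictions, labels  # parallel lists in iteration order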
def export_classifier(model_export_path, input_meta_data, bert_config,
model_dir):
"""Exports a trained model as a `SavedModel` for inference.
Args:
model_export_path: a string specifying the path to the SavedModel directory.
input_meta_data: dictionary containing meta data about input and model.
bert_config: Bert configuration file to define core bert layers.
model_dir: The directory where the model weights and training/evaluation
summaries are stored.
  Raises:
    ValueError: when the export path or the model directory is not specified
      (empty string or None).
"""
if not model_export_path:
raise ValueError('Export path is not specified: %s' % model_export_path)
  if not model_dir:
    raise ValueError('Model directory is not specified: %s' % model_dir)
# Export uses float32 for now, even if training uses mixed precision.
tf.keras.mixed_precision.experimental.set_policy('float32')
classifier_model = bert_models.classifier_model(
bert_config, input_meta_data.get('num_labels', 1))[0]
model_saving_utils.export_bert_model(
model_export_path, model=classifier_model, checkpoint_dir=model_dir)
def run_bert(strategy,
input_meta_data,
model_config,
train_input_fn=None,
eval_input_fn=None,
init_checkpoint=None,
custom_callbacks=None,
custom_metrics=None):
"""Run BERT training."""
# Enables XLA in Session Config. Should not be set for TPU.
keras_utils.set_session_config(FLAGS.enable_xla)
performance.set_mixed_precision_policy(common_flags.dtype())
epochs = FLAGS.num_train_epochs * FLAGS.num_eval_per_epoch
train_data_size = (
input_meta_data['train_data_size'] // FLAGS.num_eval_per_epoch)
steps_per_epoch = int(train_data_size / FLAGS.train_batch_size)
warmup_steps = int(epochs * train_data_size * 0.1 / FLAGS.train_batch_size)
eval_steps = int(
math.ceil(input_meta_data['eval_data_size'] / FLAGS.eval_batch_size))
if not strategy:
raise ValueError('Distribution strategy has not been specified.')
if not custom_callbacks:
custom_callbacks = []
if FLAGS.log_steps:
custom_callbacks.append(
keras_utils.TimeHistory(
batch_size=FLAGS.train_batch_size,
log_steps=FLAGS.log_steps,
logdir=FLAGS.model_dir))
trained_model, _ = run_bert_classifier(
strategy,
model_config,
input_meta_data,
FLAGS.model_dir,
epochs,
steps_per_epoch,
FLAGS.steps_per_loop,
eval_steps,
warmup_steps,
FLAGS.learning_rate,
init_checkpoint or FLAGS.init_checkpoint,
train_input_fn,
eval_input_fn,
custom_callbacks=custom_callbacks,
custom_metrics=custom_metrics)
if FLAGS.model_export_path:
model_saving_utils.export_bert_model(
FLAGS.model_export_path, model=trained_model)
return trained_model
def custom_main(custom_callbacks=None, custom_metrics=None):
"""Run classification or regression.
Args:
custom_callbacks: list of tf.keras.Callbacks passed to training loop.
custom_metrics: list of metrics passed to the training loop.
"""
gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param)
physical_devices = tf.config.list_physical_devices('GPU')
if len(physical_devices) > 0:
for device in physical_devices:
tf.config.experimental.set_memory_growth(device, True)
print('{} memory growth: {}'.format(device, tf.config.experimental.get_memory_growth(device)))
else:
print("Not enough GPU hardware devices available")
with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
input_meta_data = json.loads(reader.read().decode('utf-8'))
label_type = LABEL_TYPES_MAP[input_meta_data.get('label_type', 'int')]
include_sample_weights = input_meta_data.get('has_sample_weights', False)
if not FLAGS.model_dir:
FLAGS.model_dir = '/tmp/bert20/'
bert_config = bert_configs.BertConfig.from_json_file(FLAGS.bert_config_file)
if FLAGS.mode == 'export_only':
export_classifier(FLAGS.model_export_path, input_meta_data, bert_config,
FLAGS.model_dir)
return
strategy = distribution_utils.get_distribution_strategy(
distribution_strategy=FLAGS.distribution_strategy,
num_gpus=FLAGS.num_gpus,
all_reduce_alg="nccl",
num_packs=1,
tpu_address=FLAGS.tpu)
eval_input_fn = get_dataset_fn(
FLAGS.eval_data_path,
input_meta_data['max_seq_length'],
FLAGS.eval_batch_size,
is_training=False,
label_type=label_type,
include_sample_weights=include_sample_weights)
if FLAGS.mode == 'predict':
with strategy.scope():
classifier_model = bert_models.classifier_model(
bert_config, input_meta_data['num_labels'])[0]
checkpoint = tf.train.Checkpoint(model=classifier_model)
latest_checkpoint_file = (
FLAGS.predict_checkpoint_path or
tf.train.latest_checkpoint(FLAGS.model_dir))
assert latest_checkpoint_file
logging.info('Checkpoint file %s found and restoring from '
'checkpoint', latest_checkpoint_file)
checkpoint.restore(
latest_checkpoint_file).assert_existing_objects_matched()
preds, _ = get_predictions_and_labels(
strategy, classifier_model, eval_input_fn, return_probs=True)
output_predict_file = os.path.join(FLAGS.model_dir, 'test_results.tsv')
with tf.io.gfile.GFile(output_predict_file, 'w') as writer:
logging.info('***** Predict results *****')
for probabilities in preds:
output_line = '\t'.join(
str(class_probability)
for class_probability in probabilities) + '\n'
writer.write(output_line)
return
if FLAGS.mode != 'train_and_eval':
raise ValueError('Unsupported mode is specified: %s' % FLAGS.mode)
train_input_fn = get_dataset_fn(
FLAGS.train_data_path,
input_meta_data['max_seq_length'],
FLAGS.train_batch_size,
is_training=True,
label_type=label_type,
include_sample_weights=include_sample_weights)
run_bert(
strategy,
input_meta_data,
bert_config,
train_input_fn,
eval_input_fn,
custom_callbacks=custom_callbacks,
custom_metrics=custom_metrics)
def main(_):
custom_main(custom_callbacks=None, custom_metrics=None)
if __name__ == '__main__':
flags.mark_flag_as_required('bert_config_file')
flags.mark_flag_as_required('input_meta_data_path')
flags.mark_flag_as_required('model_dir')
app.run(main)