Commit 32e4ca51 authored by qianyj

Update code to v2.11.0

parents 9485aa1d 71060f67
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
@@ -17,8 +17,8 @@
import os
from absl import logging
-from official.nlp.bert import tokenization
from official.nlp.data import classifier_data_lib
+from official.nlp.tools import tokenization
class BuccProcessor(classifier_data_lib.DataProcessor):
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
@@ -25,7 +25,7 @@ import six
from absl import logging
import tensorflow as tf
-from official.nlp.bert import tokenization
+from official.nlp.tools import tokenization
class SquadExample(object):
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
@@ -28,7 +28,7 @@ from absl import logging
import numpy as np
import tensorflow as tf
-from official.nlp.bert import tokenization
+from official.nlp.tools import tokenization
class SquadExample(object):
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
@@ -19,8 +19,8 @@ import os
from absl import logging
import tensorflow as tf
-from official.nlp.bert import tokenization
from official.nlp.data import classifier_data_lib
+from official.nlp.tools import tokenization
# A negative label id for the padding label, which will not contribute
# to loss/metrics in training.
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
@@ -19,8 +19,8 @@ import random
from absl.testing import parameterized
import tensorflow as tf
-from official.nlp.bert import tokenization
from official.nlp.data import tagging_data_lib
+from official.nlp.tools import tokenization
def _create_fake_file(filename, labels, is_test):
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
@@ -36,7 +36,7 @@ from sentencepiece import SentencePieceTrainer
FLAGS = flags.FLAGS
flags.DEFINE_string("output_model_path", None,
-                    "Path to save the the sentencepiece model.")
+                    "Path to save the sentencepiece model.")
flags.mark_flag_as_required("output_model_path")
flags.DEFINE_string("tfds_dir", None, "Directory of the tfds.")
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
This directory contains guides to help users train NLP models.
1. [Training guide](train.md) explains the steps to follow for training NLP
models on GPU and TPU.
2. [Pretrained_models guide](pretrained_models.md) explains how to load
pre-trained NLP models (baselines and checkpoints) that can be fine-tuned
further depending on the application.
3. [TF-Hub guide](tfhub.md) explains how to use TF-NLP's
[export_tfhub](https://github.com/tensorflow/models/blob/master/official/nlp/tools/export_tfhub.py)
tool to export pre-trained Transformer encoders to the SavedModel format,
suitable for publication on TF Hub.
# Model Garden NLP Common Training Driver
[train.py](https://github.com/tensorflow/models/blob/master/official/nlp/train.py)
is the common training driver that supports multiple
NLP tasks (e.g., pre-training, GLUE and SQuAD fine-tuning) and multiple
models (e.g., BERT, ALBERT, MobileBERT).
## Experiment Configuration
[train.py](https://github.com/tensorflow/models/blob/master/official/nlp/train.py)
is driven by configs defined by the [ExperimentConfig](https://github.com/tensorflow/models/blob/master/official/core/config_definitions.py)
including configurations for `task`, `trainer` and `runtime`. The pre-defined
NLP related [ExperimentConfig](https://github.com/tensorflow/models/blob/master/official/core/config_definitions.py) can be found in
[configs/experiment_configs.py](https://github.com/tensorflow/models/blob/master/official/nlp/configs/experiment_configs.py).
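Concretely, any field in these nested configs can be overridden from the
command line. The following is only a hedged sketch (the experiment name and
keys are illustrative; valid keys depend on the experiment you select):

```shell
# Sketch: override nested ExperimentConfig fields as comma-separated
# dotted key=value pairs passed via --params_override.
export PARAMS=task.train_data.global_batch_size=32,trainer.train_steps=10000
python3 train.py \
  --experiment=bert/sentence_prediction \
  --mode=train \
  --model_dir=$OUTPUT_DIR \
  --params_override=$PARAMS
```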
......
setting `task.validation_data.input_path` in `PARAMS`.
## Run on Cloud TPUs
Next, we will describe how to run
[train.py](https://github.com/tensorflow/models/blob/master/official/nlp/train.py)
on Cloud TPUs.
### Setup
First, you need to create a `tf-nightly` TPU with
......
pip3 install --user -r official/requirements.txt
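The TPU creation command itself is elided above. Assuming a TPU VM workflow
(TPU name, zone, accelerator type, and runtime version below are placeholders),
it might look like:

```shell
# Sketch only: create a TPU VM running a nightly TensorFlow build.
gcloud compute tpus tpu-vm create ${TPU_NAME} \
  --zone=us-central1-b \
  --accelerator-type=v3-8 \
  --version=tpu-vm-tf-nightly
```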
### Fine-tuning Sentence Classification with BERT from TF-Hub
<details>
This example fine-tunes BERT-base from TF-Hub on the Multi-Genre Natural
Language Inference (MultiNLI) corpus using TPUs.
First, you can prepare the fine-tuning data using
......
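The exact command is elided above; as a hedged sketch, preparing MultiNLI data
with the repo's
[create_finetuning_data.py](https://github.com/tensorflow/models/blob/master/official/nlp/data/create_finetuning_data.py)
script typically takes this shape (paths are placeholders; verify the flag
names against the script):

```shell
# Sketch: generate MNLI train/eval tf_record files plus a metadata file.
export GLUE_DIR=gs://some_bucket/glue_data
export BERT_DIR=gs://some_bucket/bert/uncased_L-12_H-768_A-12
python3 official/nlp/data/create_finetuning_data.py \
  --input_data_dir=${GLUE_DIR}/MNLI \
  --vocab_file=${BERT_DIR}/vocab.txt \
  --train_data_output_path=${OUTPUT_DIR}/mnli_train.tf_record \
  --eval_data_output_path=${OUTPUT_DIR}/mnli_eval.tf_record \
  --meta_data_file_path=${OUTPUT_DIR}/metadata \
  --fine_tuning_task_type=classification \
  --classification_task_name=MNLI
```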
You can monitor the training progress in the console and find the output
models in `$OUTPUT_DIR`.
</details>
### Fine-tuning SQuAD with a pre-trained BERT checkpoint
<details>
This example fine-tunes a pre-trained BERT checkpoint on the
Stanford Question Answering Dataset (SQuAD) using TPUs.
The [SQuAD website](https://rajpurkar.github.io/SQuAD-explorer/) contains
......
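The SQuAD commands are elided above; for flavor, a hedged sketch of generating
SQuAD training data with the same script (flag names assumed; check the
script) might be:

```shell
# Sketch: convert the SQuAD json file into tf_record files for fine-tuning.
export SQUAD_DIR=gs://some_bucket/squad
python3 official/nlp/data/create_finetuning_data.py \
  --squad_data_file=${SQUAD_DIR}/train-v1.1.json \
  --vocab_file=${BERT_DIR}/vocab.txt \
  --train_data_output_path=${OUTPUT_DIR}/squad_train.tf_record \
  --meta_data_file_path=${OUTPUT_DIR}/squad_metadata \
  --fine_tuning_task_type=squad
```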
</details>
### Pre-train a BERT from scratch
This example pre-trains a BERT model with Wikipedia and Books datasets used by
the original BERT paper.
The [BERT repo](https://github.com/tensorflow/models/blob/master/official/nlp/data/create_pretraining_data.py)
contains detailed information about the Wikipedia dump and
[BookCorpus](https://yknzhu.wixsite.com/mbweb). Of course, the pre-training
recipe is generic and you can apply the same recipe to your own corpus.
Please use the script
[`create_pretraining_data.py`](https://github.com/tensorflow/models/blob/master/official/nlp/data/create_pretraining_data.py),
essentially branched from the [BERT research repo](https://github.com/google-research/bert),
to generate processed pre-training data; it has been adapted to TF2 symbols
and Python 3 compatibility.
Running the pre-training script requires an input and output directory, as well
as a vocab file. Note that `max_seq_length` will need to match the sequence
length parameter you specify when you run pre-training.
```shell
export WORKING_DIR='local disk or cloud location'
export BERT_DIR='local disk or cloud location'
python models/official/nlp/data/create_pretraining_data.py \
--input_file=$WORKING_DIR/input/input.txt \
--output_file=$WORKING_DIR/output/tf_examples.tfrecord \
--vocab_file=$BERT_DIR/wwm_uncased_L-24_H-1024_A-16/vocab.txt \
--do_lower_case=True \
--max_seq_length=512 \
--max_predictions_per_seq=76 \
--masked_lm_prob=0.15 \
--random_seed=12345 \
--dupe_factor=5
```
Then, you can update the yaml configuration file, e.g.
`configs/experiments/wiki_books_pretrain.yaml`, to specify your data paths and
update the masking-related hyperparameters to match how the pretraining data
was generated. When your data has multiple shards, you can use `*` to match
multiple files.
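For instance, the data section of the yaml might take the following shape
(paths and values are illustrative; compare against the file shipped in the
repo, and keep the masking fields consistent with the flags passed to
`create_pretraining_data.py`):

```
task:
  train_data:
    input_path: 'gs://some_bucket/output/tf_examples.tfrecord*'
    seq_length: 512
    max_predictions_per_seq: 76
    global_batch_size: 512
```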
To train different BERT sizes, you need to adjust:
```
model:
cls_heads: [{activation: tanh, cls_token_idx: 0, dropout_rate: 0.1, inner_dim: 768, name: next_sentence, num_classes: 2}]
```
to match the hidden dimensions.
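For example, moving from BERT-base to BERT-large (hidden size 1024) would
change `inner_dim` accordingly; a sketch under that assumption:

```
model:
  cls_heads: [{activation: tanh, cls_token_idx: 0, dropout_rate: 0.1, inner_dim: 1024, name: next_sentence, num_classes: 2}]
```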
Then, you can start the training and evaluation job, which runs the
[`bert/pretraining`](https://github.com/tensorflow/models/blob/master/official/nlp/configs/pretraining_experiments.py#L51)
experiment:
```shell
export OUTPUT_DIR=gs://some_bucket/my_output_dir
export PARAMS=$PARAMS,runtime.distribution_strategy=tpu
python3 train.py \
--experiment=bert/pretraining \
--mode=train_and_eval \
--model_dir=$OUTPUT_DIR \
--config_file=configs/models/bert_en_uncased_base.yaml \
--config_file=configs/experiments/wiki_books_pretrain.yaml \
--tpu=${TPU_NAME} \
--params_override=$PARAMS
```
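As with the fine-tuning examples, you can monitor progress in the console or,
assuming the trainer writes the usual summaries to the model directory, with
TensorBoard:

```shell
# Follow the training/eval curves written to the model directory.
tensorboard --logdir=$OUTPUT_DIR
```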
Note: More examples about pre-training with TFDS datasets will come soon.
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
@@ -55,9 +55,9 @@ EVAL_METRIC_MAP = {
    'AX': 'matthews_corrcoef',
    'COLA': 'matthews_corrcoef',
    'MNLI': 'cls_accuracy',
-    'MRPC': 'cls_accuracy',
+    'MRPC': 'f1',
    'QNLI': 'cls_accuracy',
-    'QQP': 'cls_accuracy',
+    'QQP': 'f1',
    'RTE': 'cls_accuracy',
    'SST-2': 'cls_accuracy',
    'STS-B': 'pearson_spearman_corr',
......
@@ -93,11 +93,16 @@ def _override_exp_config_by_flags(exp_config, input_meta_data):
        binary_helper.override_sentence_prediction_task_config,
        num_classes=input_meta_data['num_labels'],
        metric_type='matthews_corrcoef')
-  elif FLAGS.task_name in ('MNLI', 'MRPC', 'QNLI', 'QQP', 'RTE', 'SST-2',
+  elif FLAGS.task_name in ('MNLI', 'QNLI', 'RTE', 'SST-2',
                           'WNLI'):
    override_task_cfg_fn = functools.partial(
        binary_helper.override_sentence_prediction_task_config,
        num_classes=input_meta_data['num_labels'])
+  elif FLAGS.task_name in ('QQP', 'MRPC'):
+    override_task_cfg_fn = functools.partial(
+        binary_helper.override_sentence_prediction_task_config,
+        metric_type='f1',
+        num_classes=input_meta_data['num_labels'])
  elif FLAGS.task_name in ('STS-B',):
    override_task_cfg_fn = functools.partial(
        binary_helper.override_sentence_prediction_task_config,
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......