Commit 32e4ca51 authored by qianyj

Update code to v2.11.0

parents 9485aa1d 71060f67
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
@@ -17,8 +17,8 @@
import os
from absl import logging
-from official.nlp.bert import tokenization
from official.nlp.data import classifier_data_lib
+from official.nlp.tools import tokenization
class BuccProcessor(classifier_data_lib.DataProcessor):
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
@@ -25,7 +25,7 @@ import six
from absl import logging
import tensorflow as tf
-from official.nlp.bert import tokenization
+from official.nlp.tools import tokenization
class SquadExample(object):
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
@@ -28,7 +28,7 @@ from absl import logging
import numpy as np
import tensorflow as tf
-from official.nlp.bert import tokenization
+from official.nlp.tools import tokenization
class SquadExample(object):
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
@@ -19,8 +19,8 @@ import os
from absl import logging
import tensorflow as tf
-from official.nlp.bert import tokenization
from official.nlp.data import classifier_data_lib
+from official.nlp.tools import tokenization
# A negative label id for the padding label, which will not contribute
# to loss/metrics in training.
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
@@ -19,8 +19,8 @@ import random
from absl.testing import parameterized
import tensorflow as tf
-from official.nlp.bert import tokenization
from official.nlp.data import tagging_data_lib
+from official.nlp.tools import tokenization
def _create_fake_file(filename, labels, is_test):
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
@@ -36,7 +36,7 @@ from sentencepiece import SentencePieceTrainer
FLAGS = flags.FLAGS
flags.DEFINE_string("output_model_path", None,
-                    "Path to save the the sentencepiece model.")
+                    "Path to save the sentencepiece model.")
flags.mark_flag_as_required("output_model_path")
flags.DEFINE_string("tfds_dir", None, "Directory of the tfds.")
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
This directory contains guides to help users train NLP models.
1. [Training guide](train.md) explains the steps to follow for training NLP
models on GPU and TPU.
2. [Pretrained_models guide](pretrained_models.md) explains how to load
pre-trained NLP models (baselines and checkpoints) that can be fine-tuned
further depending on the application.
3. [TF-Hub guide](tfhub.md) explains how to use TF-NLP's
[export_tfhub](https://github.com/tensorflow/models/blob/master/official/nlp/tools/export_tfhub.py)
tool to export pre-trained Transformer encoders to the SavedModel format,
suitable for publication on TF Hub.
# Model Garden NLP Common Training Driver
[train.py](https://github.com/tensorflow/models/blob/master/official/nlp/train.py)
is the common training driver that supports multiple
NLP tasks (e.g., pre-training, GLUE and SQuAD fine-tuning) and multiple
models (e.g., BERT, ALBERT, MobileBERT).
## Experiment Configuration
[train.py](https://github.com/tensorflow/models/blob/master/official/nlp/train.py)
is driven by configs defined by the [ExperimentConfig](https://github.com/tensorflow/models/blob/master/official/core/config_definitions.py)
including configurations for `task`, `trainer` and `runtime`. The pre-defined
NLP related [ExperimentConfig](https://github.com/tensorflow/models/blob/master/official/core/config_definitions.py) can be found in
[configs/experiment_configs.py](https://github.com/tensorflow/models/blob/master/official/nlp/configs/experiment_configs.py).
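Concretely, any field in these nested configs can be overridden from the
command line. The following is only a hedged sketch (the experiment name and
keys are illustrative; valid keys depend on the experiment you select):

```shell
# Sketch: override nested ExperimentConfig fields as comma-separated
# dotted key=value pairs passed via --params_override.
export PARAMS=task.train_data.global_batch_size=32,trainer.train_steps=10000
python3 train.py \
  --experiment=bert/sentence_prediction \
  --mode=train \
  --model_dir=$OUTPUT_DIR \
  --params_override=$PARAMS
```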
......
setting `task.validation_data.input_path` in `PARAMS`.
## Run on Cloud TPUs
Next, we will describe how to run
[train.py](https://github.com/tensorflow/models/blob/master/official/nlp/train.py)
on Cloud TPUs.
### Setup
First, you need to create a `tf-nightly` TPU with
......
pip3 install --user -r official/requirements.txt
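The TPU creation command itself is elided above. Assuming a TPU VM workflow
(TPU name, zone, accelerator type, and runtime version below are placeholders),
it might look like:

```shell
# Sketch only: create a TPU VM running a nightly TensorFlow build.
gcloud compute tpus tpu-vm create ${TPU_NAME} \
  --zone=us-central1-b \
  --accelerator-type=v3-8 \
  --version=tpu-vm-tf-nightly
```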
### Fine-tuning Sentence Classification with BERT from TF-Hub
<details>
This example fine-tunes BERT-base from TF-Hub on the Multi-Genre Natural
Language Inference (MultiNLI) corpus using TPUs.
First, you can prepare the fine-tuning data using
......
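The exact command is elided above; as a hedged sketch, preparing MultiNLI data
with the repo's
[create_finetuning_data.py](https://github.com/tensorflow/models/blob/master/official/nlp/data/create_finetuning_data.py)
script typically takes this shape (paths are placeholders; verify the flag
names against the script):

```shell
# Sketch: generate MNLI train/eval tf_record files plus a metadata file.
export GLUE_DIR=gs://some_bucket/glue_data
export BERT_DIR=gs://some_bucket/bert/uncased_L-12_H-768_A-12
python3 official/nlp/data/create_finetuning_data.py \
  --input_data_dir=${GLUE_DIR}/MNLI \
  --vocab_file=${BERT_DIR}/vocab.txt \
  --train_data_output_path=${OUTPUT_DIR}/mnli_train.tf_record \
  --eval_data_output_path=${OUTPUT_DIR}/mnli_eval.tf_record \
  --meta_data_file_path=${OUTPUT_DIR}/metadata \
  --fine_tuning_task_type=classification \
  --classification_task_name=MNLI
```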
You can monitor the training progress in the console and find the output
models in `$OUTPUT_DIR`.
</details>
### Fine-tuning SQuAD with a pre-trained BERT checkpoint
<details>
This example fine-tunes a pre-trained BERT checkpoint on the
Stanford Question Answering Dataset (SQuAD) using TPUs.
The [SQuAD website](https://rajpurkar.github.io/SQuAD-explorer/) contains
......
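The SQuAD commands are elided above; for flavor, a hedged sketch of generating
SQuAD training data with the same script (flag names assumed; check the
script) might be:

```shell
# Sketch: convert the SQuAD json file into tf_record files for fine-tuning.
export SQUAD_DIR=gs://some_bucket/squad
python3 official/nlp/data/create_finetuning_data.py \
  --squad_data_file=${SQUAD_DIR}/train-v1.1.json \
  --vocab_file=${BERT_DIR}/vocab.txt \
  --train_data_output_path=${OUTPUT_DIR}/squad_train.tf_record \
  --meta_data_file_path=${OUTPUT_DIR}/squad_metadata \
  --fine_tuning_task_type=squad
```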
</details>
### Pre-train a BERT from scratch
This example pre-trains a BERT model with Wikipedia and Books datasets used by
the original BERT paper.
The [BERT repo](https://github.com/tensorflow/models/blob/master/official/nlp/data/create_pretraining_data.py)
contains detailed information about the Wikipedia dump and
[BookCorpus](https://yknzhu.wixsite.com/mbweb). Of course, the pre-training
recipe is generic and you can apply the same recipe to your own corpus.
Please use the script
[`create_pretraining_data.py`](https://github.com/tensorflow/models/blob/master/official/nlp/data/create_pretraining_data.py),
essentially branched from the [BERT research repo](https://github.com/google-research/bert),
to generate processed pre-training data; it has been adapted to TF2 symbols
and Python 3 compatibility.
Running the pre-training script requires an input and output directory, as well
as a vocab file. Note that `max_seq_length` will need to match the sequence
length parameter you specify when you run pre-training.
```shell
export WORKING_DIR='local disk or cloud location'
export BERT_DIR='local disk or cloud location'
python models/official/nlp/data/create_pretraining_data.py \
--input_file=$WORKING_DIR/input/input.txt \
--output_file=$WORKING_DIR/output/tf_examples.tfrecord \
--vocab_file=$BERT_DIR/wwm_uncased_L-24_H-1024_A-16/vocab.txt \
--do_lower_case=True \
--max_seq_length=512 \
--max_predictions_per_seq=76 \
--masked_lm_prob=0.15 \
--random_seed=12345 \
--dupe_factor=5
```
Then, you can update the yaml configuration file, e.g.
`configs/experiments/wiki_books_pretrain.yaml`, to specify your data paths and
update the masking-related hyperparameters to match how the pretraining data
was generated. When your data has multiple shards, you can use `*` to match
multiple files.
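For instance, the data section of the yaml might take the following shape
(paths and values are illustrative; compare against the file shipped in the
repo, and keep the masking fields consistent with the flags passed to
`create_pretraining_data.py`):

```
task:
  train_data:
    input_path: 'gs://some_bucket/output/tf_examples.tfrecord*'
    seq_length: 512
    max_predictions_per_seq: 76
    global_batch_size: 512
```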
To train different BERT sizes, you need to adjust:
```
model:
cls_heads: [{activation: tanh, cls_token_idx: 0, dropout_rate: 0.1, inner_dim: 768, name: next_sentence, num_classes: 2}]
```
to match the hidden dimensions.
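For example, moving from BERT-base to BERT-large (hidden size 1024) would
change `inner_dim` accordingly; a sketch under that assumption:

```
model:
  cls_heads: [{activation: tanh, cls_token_idx: 0, dropout_rate: 0.1, inner_dim: 1024, name: next_sentence, num_classes: 2}]
```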
Then, you can start the training and evaluation job, which runs the
[`bert/pretraining`](https://github.com/tensorflow/models/blob/master/official/nlp/configs/pretraining_experiments.py#L51)
experiment:
```shell
export OUTPUT_DIR=gs://some_bucket/my_output_dir
export PARAMS=$PARAMS,runtime.distribution_strategy=tpu
python3 train.py \
--experiment=bert/pretraining \
--mode=train_and_eval \
--model_dir=$OUTPUT_DIR \
--config_file=configs/models/bert_en_uncased_base.yaml \
--config_file=configs/experiments/wiki_books_pretrain.yaml \
--tpu=${TPU_NAME} \
--params_override=$PARAMS
```
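As with the fine-tuning examples, you can monitor progress in the console or,
assuming the trainer writes the usual summaries to the model directory, with
TensorBoard:

```shell
# Follow the training/eval curves written to the model directory.
tensorboard --logdir=$OUTPUT_DIR
```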
Note: More examples about pre-training with TFDS datasets will come soon.
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
@@ -55,9 +55,9 @@ EVAL_METRIC_MAP = {
    'AX': 'matthews_corrcoef',
    'COLA': 'matthews_corrcoef',
    'MNLI': 'cls_accuracy',
-    'MRPC': 'cls_accuracy',
+    'MRPC': 'f1',
    'QNLI': 'cls_accuracy',
-    'QQP': 'cls_accuracy',
+    'QQP': 'f1',
    'RTE': 'cls_accuracy',
    'SST-2': 'cls_accuracy',
    'STS-B': 'pearson_spearman_corr',
......
@@ -93,11 +93,16 @@ def _override_exp_config_by_flags(exp_config, input_meta_data):
        binary_helper.override_sentence_prediction_task_config,
        num_classes=input_meta_data['num_labels'],
        metric_type='matthews_corrcoef')
-  elif FLAGS.task_name in ('MNLI', 'MRPC', 'QNLI', 'QQP', 'RTE', 'SST-2',
+  elif FLAGS.task_name in ('MNLI', 'QNLI', 'RTE', 'SST-2',
                           'WNLI'):
    override_task_cfg_fn = functools.partial(
        binary_helper.override_sentence_prediction_task_config,
        num_classes=input_meta_data['num_labels'])
+  elif FLAGS.task_name in ('QQP', 'MRPC'):
+    override_task_cfg_fn = functools.partial(
+        binary_helper.override_sentence_prediction_task_config,
+        metric_type='f1',
+        num_classes=input_meta_data['num_labels'])
  elif FLAGS.task_name in ('STS-B',):
    override_task_cfg_fn = functools.partial(
        binary_helper.override_sentence_prediction_task_config,
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......