# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A script to train sentencepiece model from tensorflow datasets.
Reserved tokens:
pad: 0,
eos: 1,
unk: 2
(bos is not reserved)
"""
import os
import tempfile
from typing import List, Tuple
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
import tensorflow_datasets as tfds
from sentencepiece import SentencePieceTrainer
FLAGS = flags.FLAGS
flags.DEFINE_string("output_model_path", None,
"Path to save the the sentencepiece model.")
flags.mark_flag_as_required("output_model_path")
flags.DEFINE_string("tfds_dir", None, "Directory of the tfds.")
flags.DEFINE_string("tfds_name", "wmt14_translate/de-en",
"Name of the dataset we generate vacabulay from.")
flags.DEFINE_string("tfds_split", "train", "Split of the dataset.")
flags.DEFINE_integer("vocab_size", 32000, "Size of vocabulary.")
flags.DEFINE_integer(
"max_char", -1,
"Maximum number of characters to use. "
"If a non-positive number is provided, all sentences are used.")
flags.DEFINE_string("model_type", "bpe",
"Model algorithm: unigram, bpe, word or char.")
flags.DEFINE_float("character_coverage", 0.9995,
"Character coverage to determine the minimum symbols")
flags.DEFINE_list(
"data_keys", ["en", "de"],
"Comma-separated list of keys to use for training the vocabulary.")
def dump_chars_to_textfile(dataset: tf.data.Dataset,
data_keys: Tuple[str, ...],
max_char: int = -1):
"""Writes part of a TFDS sentence dataset to lines in a text file.
Args:
dataset: tf.data.Dataset containing string data.
data_keys: keys of the dataset features to dump.
max_char: maximum number of characters to dump to the text file.
Returns:
The name of the temp file containing the dumped dataset bytes.
"""
ds_iter = dataset.as_numpy_iterator()
with tempfile.NamedTemporaryFile(delete=False) as outfp:
char_count = 0
while True:
example = next(ds_iter, None)
if example is None or (
max_char > 0 and char_count > max_char):
break
for k in data_keys:
line = example[k] + b"\n"
char_count += len(line)
outfp.write(line)
return outfp.name
def train_sentencepiece(
file_path: str,
model_path: str,
vocab_size: int,
character_coverage: float,
model_type: str):
"""Train SentencePiece tokenizer from subset of tf dataset.
Args:
file_path: path of data to train sentencepiece.
model_path: path of model file to save vocab model to.
vocab_size: size of vocab tokens to train.
character_coverage: amount of characters covered by the model, good defaults
are 0.9995 for languages with rich character set like Japanese or Chinese
and 1.0 for other languages with small character set.
model_type: type of sentencepiece vocab to train.
Returns:
path to the trained sentencepiece vocabulary model.
"""
argstr = " ".join([
f"--input={file_path}", f"--vocab_size={vocab_size}",
f"--character_coverage={character_coverage}",
f"--model_prefix={model_path}", f"--model_type={model_type}",
"--bos_id=-1", "--pad_id=0", "--eos_id=1", "--unk_id=2"
])
SentencePieceTrainer.Train(argstr)
def main(argv: List[str]):
del argv
builder = tfds.builder(FLAGS.tfds_name, data_dir=FLAGS.tfds_dir)
ds = builder.as_dataset(split=FLAGS.tfds_split)
tmp_filename = dump_chars_to_textfile(ds, FLAGS.data_keys, FLAGS.max_char)
logging.info("Sentencepiece model will be placed here: %s",
FLAGS.output_model_path)
train_sentencepiece(tmp_filename,
FLAGS.output_model_path,
FLAGS.vocab_size,
FLAGS.character_coverage,
FLAGS.model_type)
os.remove(tmp_filename)
if __name__ == "__main__":
app.run(main)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Input pipeline for the transformer model to read, filter, and batch examples.
Batching scheme
Prior to batching, elements in the dataset are grouped by length (max between
'inputs' and 'targets' length). Each group is then batched such that:
group_batch_size * length <= batch_size.
Another way to view batch_size is the maximum number of tokens in each batch.
Once batched, each element in the dataset will have the shape:
{'inputs': [group_batch_size, padded_input_length],
'targets': [group_batch_size, padded_target_length]}
Lengths are padded to the longest 'inputs' or 'targets' sequence in the batch
(padded_input_length and padded_target_length can be different).
This batching scheme decreases the fraction of padding tokens per training
batch, thus improving the training speed significantly.
"""
from typing import Dict, Optional
import dataclasses
import tensorflow as tf
import tensorflow_text as tftxt
from official.core import config_definitions as cfg
from official.core import input_reader
from official.nlp.data import data_loader
from official.nlp.data import data_loader_factory
# Example grouping constants. Defines length boundaries for each group.
# These values are the defaults used in Tensor2Tensor.
_MIN_BOUNDARY = 8
_BOUNDARY_SCALE = 1.1
def _get_example_length(example):
"""Returns the maximum length between the example inputs and targets."""
length = tf.maximum(tf.shape(example[0])[0], tf.shape(example[1])[0])
return length
def _create_min_max_boundaries(max_length,
min_boundary=_MIN_BOUNDARY,
boundary_scale=_BOUNDARY_SCALE):
"""Create min and max boundary lists up to max_length.
For example, when max_length=24, min_boundary=4 and boundary_scale=2, the
returned values will be:
buckets_min = [0, 4, 8, 16]
buckets_max = [4, 8, 16, 25]
Args:
max_length: The maximum length of example in dataset.
min_boundary: Minimum length in boundary.
boundary_scale: Amount to scale consecutive boundaries in the list.
Returns:
min and max boundary lists
"""
# Create bucket boundaries list by scaling the previous boundary or adding 1
# (to ensure increasing boundary sizes).
bucket_boundaries = []
x = min_boundary
while x < max_length:
bucket_boundaries.append(x)
x = max(x + 1, int(x * boundary_scale))
# Create min and max boundary lists from the initial list.
buckets_min = [0] + bucket_boundaries
buckets_max = bucket_boundaries + [max_length + 1]
return buckets_min, buckets_max
def _batch_examples(dataset, batch_size, max_length):
"""Group examples by similar lengths, and return batched dataset.
Each batch of similar-length examples are padded to the same length, and may
have different number of elements in each batch, such that:
group_batch_size * padded_length <= batch_size.
This decreases the number of padding tokens per batch, which improves the
training speed.
Args:
dataset: Dataset of unbatched examples.
batch_size: Max number of tokens per batch of examples.
max_length: Max number of tokens in an example input or target sequence.
Returns:
Dataset of batched examples with similar lengths.
"""
# Get the min and max boundary lists used to bucket examples by length. These
# are used to calculate the `bucket_id`, which is the index at which:
# buckets_min[bucket_id] <= len(example) < buckets_max[bucket_id]
# Note that using both min and max lists improves the performance.
buckets_min, buckets_max = _create_min_max_boundaries(max_length)
# Create list of batch sizes for each bucket_id, so that
# bucket_batch_size[bucket_id] * buckets_max[bucket_id] <= batch_size
bucket_batch_sizes = [int(batch_size) // x for x in buckets_max]
# Validates bucket batch sizes.
if any(bucket_batch_size <= 0 for bucket_batch_size in bucket_batch_sizes):
raise ValueError(
'The token budget, global batch size, is too small to yield a non-zero '
'bucket window: %s' % str(bucket_batch_sizes))
# bucket_id will be a tensor, so convert this list to a tensor as well.
bucket_batch_sizes = tf.constant(bucket_batch_sizes, dtype=tf.int64)
def example_to_bucket_id(example):
"""Return int64 bucket id for this example, calculated based on length."""
example_input = example['inputs']
example_target = example['targets']
seq_length = _get_example_length((example_input, example_target))
conditions_c = tf.logical_and(
tf.less_equal(buckets_min, seq_length), tf.less(seq_length,
buckets_max))
bucket_id = tf.reduce_min(tf.where(conditions_c))
return bucket_id
def window_size_fn(bucket_id):
"""Return number of examples to be grouped when given a bucket id."""
return bucket_batch_sizes[bucket_id]
def batching_fn(bucket_id, grouped_dataset):
"""Batch and add padding to a dataset of elements with similar lengths."""
bucket_batch_size = window_size_fn(bucket_id)
# Batch the dataset and add padding so that all input sequences in the
# examples have the same length, and all target sequences have the same
# lengths as well. Resulting lengths of inputs and targets can differ.
padded_shapes = dict([
(name, [None] * len(spec.shape))
for name, spec in grouped_dataset.element_spec.items()
])
return grouped_dataset.padded_batch(bucket_batch_size, padded_shapes)
return dataset.apply(
tf.data.experimental.group_by_window(
key_func=example_to_bucket_id,
reduce_func=batching_fn,
window_size=None,
window_size_func=window_size_fn))
@dataclasses.dataclass
class WMTDataConfig(cfg.DataConfig):
"""Data config for WMT translation."""
max_seq_length: int = 64
static_batch: bool = False
sentencepiece_model_path: str = ''
src_lang: str = ''
tgt_lang: str = ''
transform_and_batch: bool = True
has_unique_id: bool = False
@data_loader_factory.register_data_loader_cls(WMTDataConfig)
class WMTDataLoader(data_loader.DataLoader):
"""A class to load dataset for WMT translation task."""
def __init__(self, params: WMTDataConfig):
self._params = params
self._max_seq_length = params.max_seq_length
self._static_batch = params.static_batch
self._global_batch_size = params.global_batch_size
if self._params.transform_and_batch:
self._tokenizer = tftxt.SentencepieceTokenizer(
model=tf.io.gfile.GFile(params.sentencepiece_model_path, 'rb').read(),
add_eos=True)
def _decode(self, record: tf.Tensor):
"""Decodes a serialized tf.Example."""
name_to_features = {
self._params.src_lang: tf.io.FixedLenFeature([], tf.string),
self._params.tgt_lang: tf.io.FixedLenFeature([], tf.string),
}
if self._params.has_unique_id:
name_to_features['unique_id'] = tf.io.FixedLenFeature([], tf.int64)
example = tf.io.parse_single_example(record, name_to_features)
# tf.Example only supports tf.int64, but the TPU only supports tf.int32.
# So cast all int64 to int32.
for name in example:
t = example[name]
if t.dtype == tf.int64:
t = tf.cast(t, tf.int32)
example[name] = t
return example
def _tokenize(self, inputs) -> Dict[str, tf.Tensor]:
tokenized_inputs = {}
for k, v in inputs.items():
if k == self._params.src_lang:
tokenized_inputs['inputs'] = self._tokenizer.tokenize(v)
elif k == self._params.tgt_lang:
tokenized_inputs['targets'] = self._tokenizer.tokenize(v)
else:
tokenized_inputs[k] = v
return tokenized_inputs
def _filter_max_length(self, inputs):
return tf.logical_and(
tf.shape(inputs['inputs'])[0] <= self._max_seq_length,
tf.shape(inputs['targets'])[0] <= self._max_seq_length)
def _maybe_truncate(self, inputs):
truncated_inputs = {}
for k, v in inputs.items():
if k == 'inputs' or k == 'targets':
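# Sequences longer than max_seq_length keep their first max_seq_length - 1
# tokens and get EOS (id 1 in this setup) appended as the final token.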
truncated_inputs[k] = tf.pad(
v[:self._max_seq_length - 1], [[0, 1]],
constant_values=1) if tf.shape(v)[0] > self._max_seq_length else v
else:
truncated_inputs[k] = v
return truncated_inputs
def _tokenize_bucketize_and_batch(
self,
dataset,
input_context: Optional[tf.distribute.InputContext] = None):
dataset = dataset.map(
self._tokenize, num_parallel_calls=tf.data.experimental.AUTOTUNE)
if self._params.is_training:
dataset = dataset.filter(self._filter_max_length)
else:
dataset = dataset.map(
self._maybe_truncate,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
per_replica_batch_size = input_context.get_per_replica_batch_size(
self._global_batch_size) if input_context else self._global_batch_size
if self._static_batch:
padded_shapes = {}
for name, _ in dataset.element_spec.items():
if name == 'unique_id':
padded_shapes[name] = []
else:
padded_shapes[name] = [self._max_seq_length]
batch_size = per_replica_batch_size
if self._params.is_training:
batch_size = int(batch_size // self._max_seq_length)
dataset = dataset.padded_batch(
batch_size,
padded_shapes,
drop_remainder=True)
else:
# Group and batch such that each batch has examples of similar length.
dataset = _batch_examples(dataset, per_replica_batch_size,
self._max_seq_length)
# Prefetch the next element to improve speed of input pipeline.
dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
return dataset
def load(self, input_context: Optional[tf.distribute.InputContext] = None):
"""Returns a tf.dataset.Dataset."""
decoder_fn = None
# Only decode for TFRecords.
if self._params.input_path:
decoder_fn = self._decode
def _identity(
dataset, input_context: Optional[tf.distribute.InputContext] = None):
del input_context
return dataset
transform_and_batch_fn = _identity
if self._params.transform_and_batch:
transform_and_batch_fn = self._tokenize_bucketize_and_batch
reader = input_reader.InputReader(
params=self._params,
decoder_fn=decoder_fn,
transform_and_batch_fn=transform_and_batch_fn)
return reader.read(input_context)
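# Minimal usage sketch (hypothetical paths): construct a config, then load.
#   config = WMTDataConfig(
#       input_path='/path/to/train.tfrecord',
#       global_batch_size=4096, is_training=True,
#       src_lang='en', tgt_lang='de',
#       sentencepiece_model_path='/path/to/sp.model')
#   dataset = WMTDataLoader(config).load()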
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for official.nlp.data.wmt_dataloader."""
import os
from absl.testing import parameterized
import tensorflow as tf
from sentencepiece import SentencePieceTrainer
from official.nlp.data import wmt_dataloader
def _generate_line_file(filepath, lines):
with tf.io.gfile.GFile(filepath, 'w') as f:
for l in lines:
f.write('{}\n'.format(l))
def _generate_record_file(filepath, src_lines, tgt_lines, unique_id=False):
writer = tf.io.TFRecordWriter(filepath)
for i, (src, tgt) in enumerate(zip(src_lines, tgt_lines)):
features = {
'en': tf.train.Feature(
bytes_list=tf.train.BytesList(
value=[src.encode()])),
'reverse_en': tf.train.Feature(
bytes_list=tf.train.BytesList(
value=[tgt.encode()])),
}
if unique_id:
features['unique_id'] = tf.train.Feature(
int64_list=tf.train.Int64List(value=[i]))
example = tf.train.Example(
features=tf.train.Features(
feature=features))
writer.write(example.SerializeToString())
writer.close()
def _train_sentencepiece(input_path, vocab_size, model_path, eos_id=1):
argstr = ' '.join([
f'--input={input_path}', f'--vocab_size={vocab_size}',
'--character_coverage=0.995',
f'--model_prefix={model_path}', '--model_type=bpe',
'--bos_id=-1', '--pad_id=0', f'--eos_id={eos_id}', '--unk_id=2'
])
SentencePieceTrainer.Train(argstr)
class WMTDataLoaderTest(tf.test.TestCase, parameterized.TestCase):
def setUp(self):
super(WMTDataLoaderTest, self).setUp()
self._temp_dir = self.get_temp_dir()
src_lines = [
'abc ede fg',
'bbcd ef a g',
'de f a a g'
]
tgt_lines = [
'dd cc a ef g',
'bcd ef a g',
'gef cd ba'
]
self._record_train_input_path = os.path.join(self._temp_dir, 'train.record')
_generate_record_file(self._record_train_input_path, src_lines, tgt_lines)
self._record_test_input_path = os.path.join(self._temp_dir, 'test.record')
_generate_record_file(self._record_test_input_path, src_lines, tgt_lines,
unique_id=True)
self._sentencepiece_input_path = os.path.join(self._temp_dir, 'inputs.txt')
_generate_line_file(self._sentencepiece_input_path, src_lines + tgt_lines)
sentencepiece_model_prefix = os.path.join(self._temp_dir, 'sp')
_train_sentencepiece(self._sentencepiece_input_path, 20,
sentencepiece_model_prefix)
self._sentencepiece_model_path = '{}.model'.format(
sentencepiece_model_prefix)
@parameterized.named_parameters(
('train_static', True, True, 100, (2, 35)),
('train_non_static', True, False, 100, (12, 7)),
('non_train_static', False, True, 3, (3, 35)),
('non_train_non_static', False, False, 50, (2, 7)),)
def test_load_dataset(
self, is_training, static_batch, batch_size, expected_shape):
data_config = wmt_dataloader.WMTDataConfig(
input_path=self._record_train_input_path
if is_training else self._record_test_input_path,
max_seq_length=35,
global_batch_size=batch_size,
is_training=is_training,
static_batch=static_batch,
src_lang='en',
tgt_lang='reverse_en',
sentencepiece_model_path=self._sentencepiece_model_path)
dataset = wmt_dataloader.WMTDataLoader(data_config).load()
examples = next(iter(dataset))
inputs, targets = examples['inputs'], examples['targets']
self.assertEqual(inputs.shape, expected_shape)
self.assertEqual(targets.shape, expected_shape)
def test_load_dataset_raise_invalid_window(self):
batch_tokens_size = 10 # this is too small to form buckets.
data_config = wmt_dataloader.WMTDataConfig(
input_path=self._record_train_input_path,
max_seq_length=100,
global_batch_size=batch_tokens_size,
is_training=True,
static_batch=False,
src_lang='en',
tgt_lang='reverse_en',
sentencepiece_model_path=self._sentencepiece_model_path)
with self.assertRaisesRegex(
ValueError, 'The token budget, global batch size, is too small.*'):
_ = wmt_dataloader.WMTDataLoader(data_config).load()
if __name__ == '__main__':
tf.test.main()
# Pre-trained Models
We provide a large collection of baselines and checkpoints for NLP pre-trained
models.
## How to Load Pretrained Models
### How to Initialize from Checkpoint
**Note:** TF-HUB/SavedModel is the preferred way to distribute models, as it is
self-contained. Please consider using TF-HUB for fine-tuning tasks first.
If you use the [NLP training library](train.md),
you can specify the checkpoint path directly when launching your job. For
example, to initialize the model from the checkpoint, you can specify
`--params_override=task.init_checkpoint=PATH_TO_INIT_CKPT` as:
```shell
python3 train.py \
--params_override=task.init_checkpoint=PATH_TO_INIT_CKPT
```
### How to load TF-HUB SavedModel
Finetuning tasks such as question answering (SQuAD) and sentence
prediction (GLUE) support loading a model from TF-HUB. These built-in tasks
support a specific `task.hub_module_url` parameter. To set this parameter,
replace `--params_override=task.init_checkpoint=...` with
`--params_override=task.hub_module_url=TF_HUB_URL`, like below:
```shell
python3 train.py \
--params_override=task.hub_module_url=https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3
```
## BERT
Public BERT pre-trained models released by the BERT authors.
We released both checkpoints and TF-Hub modules as the pretrained models for
fine-tuning. They are TF 2.x compatible and converted from the checkpoints
released in the TF 1.x official BERT repository
[google-research/bert](https://github.com/google-research/bert)
in order to stay consistent with the BERT paper.
### Checkpoints
Model | Configuration | Training Data | Checkpoint & Vocabulary | TF-HUB SavedModels
---------------------------------------- | :--------------------------: | ------------: | ----------------------: | ------:
BERT-base uncased English | uncased_L-12_H-768_A-12 | Wiki + Books | [uncased_L-12_H-768_A-12](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/v3/uncased_L-12_H-768_A-12.tar.gz) | [`BERT-Base, Uncased`](https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/)
BERT-base cased English | cased_L-12_H-768_A-12 | Wiki + Books | [cased_L-12_H-768_A-12](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/v3/cased_L-12_H-768_A-12.tar.gz) | [`BERT-Base, Cased`](https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/)
BERT-large uncased English | uncased_L-24_H-1024_A-16 | Wiki + Books | [uncased_L-24_H-1024_A-16](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/v3/uncased_L-24_H-1024_A-16.tar.gz) | [`BERT-Large, Uncased`](https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/)
BERT-large cased English | cased_L-24_H-1024_A-16 | Wiki + Books | [cased_L-24_H-1024_A-16](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/v3/cased_L-24_H-1024_A-16.tar.gz) | [`BERT-Large, Cased`](https://tfhub.dev/tensorflow/bert_en_cased_L-24_H-1024_A-16/)
BERT-large, Uncased (Whole Word Masking) | wwm_uncased_L-24_H-1024_A-16 | Wiki + Books | [wwm_uncased_L-24_H-1024_A-16](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/v3/wwm_uncased_L-24_H-1024_A-16.tar.gz) | [`BERT-Large, Uncased (Whole Word Masking)`](https://tfhub.dev/tensorflow/bert_en_wwm_uncased_L-24_H-1024_A-16/)
BERT-large, Cased (Whole Word Masking) | wwm_cased_L-24_H-1024_A-16 | Wiki + Books | [wwm_cased_L-24_H-1024_A-16](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/v3/wwm_cased_L-24_H-1024_A-16.tar.gz) | [`BERT-Large, Cased (Whole Word Masking)`](https://tfhub.dev/tensorflow/bert_en_wwm_cased_L-24_H-1024_A-16/)
BERT-base MultiLingual | multi_cased_L-12_H-768_A-12 | Wiki + Books | [multi_cased_L-12_H-768_A-12](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/v3/multi_cased_L-12_H-768_A-12.tar.gz) | [`BERT-Base, Multilingual Cased`](https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/)
BERT-base Chinese | chinese_L-12_H-768_A-12 | Wiki + Books | [chinese_L-12_H-768_A-12](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/v3/chinese_L-12_H-768_A-12.tar.gz) | [`BERT-Base, Chinese`](https://tfhub.dev/tensorflow/bert_zh_L-12_H-768_A-12/)
You may explore more in the TF-Hub BERT collection:
https://tfhub.dev/google/collections/bert/1
### BERT variants
We also have pretrained BERT models with variants in both network architecture
and training methodologies. These models achieve higher downstream accuracy
scores.
Model | Configuration | Training Data | TF-HUB SavedModels | Comment
-------------------------------- | :----------------------: | -----------------------: | ------------------------------------------------------------------------------------: | ------:
BERT-base talking heads + ggelu | uncased_L-12_H-768_A-12 | Wiki + Books | [talkheads_ggelu_base](https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1) | BERT-base trained with [talking heads attention](https://arxiv.org/abs/2003.02436) and [gated GeLU](https://arxiv.org/abs/2002.05202).
BERT-large talking heads + ggelu | uncased_L-24_H-1024_A-16 | Wiki + Books | [talkheads_ggelu_large](https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_large/1) | BERT-large trained with [talking heads attention](https://arxiv.org/abs/2003.02436) and [gated GeLU](https://arxiv.org/abs/2002.05202).
LAMBERT-large uncased English | uncased_L-24_H-1024_A-16 | Wiki + Books | [lambert](https://tfhub.dev/tensorflow/lambert_en_uncased_L-24_H-1024_A-16/1) | BERT trained with LAMB and techniques from RoBERTa.
# Exporting a pre-trained Encoder to TF Hub
## Overview
This doc explains how to use TF-NLP's
[export_tfhub](https://github.com/tensorflow/models/blob/master/official/nlp/tools/export_tfhub.py)
tool to export pre-trained Transformer encoders to SavedModels suitable for
publication on TF Hub. (For the steps after that, see TF Hub's
[publisher guide](https://www.tensorflow.org/hub/publish).)
For testing purposes, those SavedModels can also be used from their export
locations on the filesystem.
On TF Hub, Transformer encoders for text come as a pair of SavedModels:
* The preprocessing model applies a tokenizer with a fixed vocab plus some
additional logic to turn text into Transformer inputs.
* The encoder model (or "model" for short) applies the pre-trained Transformer
encoder.
TF Hub defines
[Common APIs](https://www.tensorflow.org/hub/common_saved_model_apis/text#transformer-encoders)
for all SavedModels of those two respective types, encapsulating the particular
choice of preprocessing logic and Encoder architecture.
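For orientation, here is a minimal sketch of using such a pair together (the
handles below are the uncased BERT-Base models referenced elsewhere in this
doc; any pair implementing the Common APIs works the same way):
```python
import tensorflow as tf
import tensorflow_hub as hub

# Load the matching preprocessing and encoder SavedModels.
preprocessor = hub.load(
    "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
encoder = hub.load(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3")

# Preprocessing turns raw strings into the inputs the encoder expects.
sentences = tf.constant(["TF Hub encoders come with a preprocessing model."])
encoder_outputs = encoder(preprocessor(sentences))
pooled = encoder_outputs["pooled_output"]  # Shape [batch_size, width]
```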
## Exporting the Encoder
There is a choice between exporting just the encoder, or the encoder plus the
prediction head for the masked language model (MLM) task from pre-training.
Exporting just the encoder suffices for many straightforward applications.
### Exporting the Encoder alone
To export an encoder-only model, you can set `--export_type=model` and run the
tool like this:
```shell
python official/nlp/tools/export_tfhub.py \
--encoder_config_file=${BERT_DIR:?}/bert_encoder.yaml \
--model_checkpoint_path=${BERT_DIR:?}/bert_model.ckpt \
--vocab_file=${BERT_DIR:?}/vocab.txt \
--export_type=model \
--export_path=/tmp/bert_model
```
The flag `--encoder_config_file` refers to a YAML file representing the
[encoders.EncoderConfig](https://github.com/tensorflow/models/search?q=EncoderConfig+path%3Aofficial%2Fnlp%2Fconfigs+filename%3Aencoders.py)
dataclass, which supports multiple encoders (e.g., BERT, ALBERT). Instead of
`--encoder_config_file`, you can set `--bert_config_file` to a legacy
`bert_config.json` file to export a BERT model. If the model definition involves
[GIN](https://github.com/google/gin-config), the flags `--gin_file` and
`--gin_params` must be set accordingly, consistent with pre-training.
The `--model_checkpoint_path` refers to an object-based (TF2) checkpoint written
by
[BertPretrainerV2](https://github.com/tensorflow/models/search?q=BertPretrainerV2+filename%3Abert_pretrainer.py),
or any other checkpoint that can be restored to
`tf.train.Checkpoint(encoder=encoder)` for the encoder defined by the config
flags. Legacy checkpoints with `model=` instead of `encoder=` are also supported
for now.
The exported SavedModel expects dict inputs and outputs as follows, implementing
a specialization of the respective
[Common SavedModel API](https://www.tensorflow.org/hub/common_saved_model_apis/text#transformer-encoders):
```python
encoder = hub.load(...)
encoder_inputs = dict(
input_word_ids=..., # Shape [batch, seq_length], dtype=int32
input_mask=..., # Shape [batch, seq_length], dtype=int32
input_type_ids=..., # Shape [batch, seq_length], dtype=int32
)
encoder_outputs = encoder(encoder_inputs)
assert encoder_outputs.keys() == {
"pooled_output", # Shape [batch_size, width], dtype=float32
"default", # Alias for "pooled_output" (aligns with other models)
"sequence_output", # Shape [batch_size, seq_length, width], dtype=float32
"encoder_outputs", # List of Tensors with outputs of all transformer layers
}
```
The encoder's pooler layer is restored from the `--model_checkpoint_path`.
However, unlike classic BERT, `BertPretrainerV2` does not train the pooler layer
of the encoder. You have three options to handle that:
* Set flag `--copy_pooler_dense_to_encoder` to copy the pooling layer from the
`ClassificationHead` passed to `BertPretrainerV2` for the next sentence
prediction task. This mimics classic BERT, but is not recommended for new
models (see next item).
* Leave flag `--copy_pooler_dense_to_encoder` unset and export the untrained,
randomly initialized pooling layer of the encoder. Folklore (as of 2020) has
it that an untrained pooler gets fine-tuned better than a pre-trained
pooler, so this is the default.
* Leave flag `--copy_pooler_dense_to_encoder` unset and perform your own
initialization of the pooling layer before export. For example, Google's
[BERT Experts](https://tfhub.dev/google/collections/experts/bert/1)
published in October 2020 initialize it to the identity map, reporting equal
gains when fine-tuning and more predictable behavior when not.
In any case, at this time, the export tool requires the encoder model to *have*
a `pooled_output`, whether trained or not. (This can be revised in the future.)
The encoder model does not include any preprocessing logic, but for the benefit
of users who take preprocessing into their own hands, the relevant information
is attached from the flags `--vocab_file` or `--sp_model_file`, respectively,
and `--do_lower_case`, which need to be set in exactly the same way as for the
preprocessing model (see below).
The exported SavedModel stores the resulting values as attributes on its root
object:
```python
encoder = hub.load(...)
# Gets the filename of the respective tf.saved_model.Asset object.
if hasattr(encoder, "vocab_file"):
print("Wordpiece vocab at", encoder.vocab_file.asset_path.numpy())
elif hasattr(encoder, "sp_model_file"):
print("SentencePiece model at", encoder.sp_model_file.asset_path.numpy())
# Gets the value of a scalar bool tf.Variable.
print("...using do_lower_case =", encoder.do_lower_case.numpy())
```
New users are encouraged to ignore these attributes and use the preprocessing
model instead. However, there are legacy users, as well as advanced users who
require access to the full vocab.
### Exporting the Encoder with a Masked Language Model head
To export an encoder and the masked language model it was trained with, first
read the preceding section about exporting just the encoder. All the
explanations there on setting the right flags apply here as well, up to the
following differences.
The masked language model is added to the export by changing flag
`--export_type` from `model` to `model_with_mlm`, so the export command looks
like this:
```shell
python official/nlp/tools/export_tfhub.py \
--encoder_config_file=${BERT_DIR:?}/bert_encoder.yaml \
--model_checkpoint_path=${BERT_DIR:?}/bert_model.ckpt \
--vocab_file=${BERT_DIR:?}/vocab.txt \
--export_type=model_with_mlm \
--export_path=/tmp/bert_model
```
The `--model_checkpoint_path` refers to an object-based (TF2) checkpoint written
by
[BertPretrainerV2](https://github.com/tensorflow/models/search?q=BertPretrainerV2+filename%3Abert_pretrainer.py),
or any other checkpoint that can be restored to
`tf.train.Checkpoint(**BertPretrainerV2(...).checkpoint_items)` with the encoder
defined by the config flags.
This is a more comprehensive requirement on the checkpoint than for
`--export_type=model`; not all Transformer encoders and not all pre-training
techniques can satisfy it. For example,
[ELECTRA](https://arxiv.org/abs/2003.10555) uses the BERT architecture but is
pre-trained without an MLM task.
The root object of the exported SavedModel is called in the same way as above.
In addition, the SavedModel has an `mlm` subobject that can be called as follows
to output an `mlm_logits` tensor as well:
```python
mlm_inputs = dict(
input_word_ids=..., # Shape [batch, seq_length], dtype=int32
input_mask=..., # Shape [batch, seq_length], dtype=int32
input_type_ids=..., # Shape [batch, seq_length], dtype=int32
masked_lm_positions=..., # Shape [batch, num_predictions], dtype=int32
)
mlm_outputs = encoder.mlm(mlm_inputs)
assert mlm_outputs.keys() == {
"pooled_output", # Shape [batch, width], dtype=float32
"sequence_output", # Shape [batch, seq_length, width], dtype=float32
"encoder_outputs", # List of Tensors with outputs of all transformer layers
"mlm_logits" # Shape [batch, num_predictions, vocab_size], dtype=float32
}
```
The extra subobject imposes a moderate size overhead.
### Exporting from a TF1 BERT checkpoint
A BERT model trained with the
[original BERT implementation for TF1](https://github.com/google-research/bert)
can be exported after converting its checkpoint with the
[tf2_encoder_checkpoint_converter](https://github.com/tensorflow/models/blob/master/official/nlp/bert/tf2_encoder_checkpoint_converter.py)
tool.
After that, run
[export_tfhub](https://github.com/tensorflow/models/blob/master/official/nlp/tools/export_tfhub.py)
per the instructions above on the converted checkpoint. Do not set
`--copy_pooler_dense_to_encoder`, because the pooler layer is part of the
converted encoder. For `--vocab_file` and `--do_lower_case`, the values from TF1
BERT can be used verbatim.
## Exporting the preprocessing model
You can skip this step if TF Hub already has a preprocessing model that does
exactly what your encoder needs (same tokenizer, same vocab, same normalization
settings (`do_lower_case`)). You can inspect its collection of
[Transformer Encoders for Text](https://tfhub.dev/google/collections/transformer_encoders_text/1)
and click through to models with a similar input domain to find their
preprocessing models.
To export the preprocessing model, set `--export_type=preprocessing` and run the
export tool like this:
```shell
python official/nlp/tools/export_tfhub.py \
--vocab_file=${BERT_DIR:?}/vocab.txt \
--do_lower_case=True \
--export_type=preprocessing \
--export_path=/tmp/bert_preprocessing
```
Note: Set flag `--experimental_disable_assert_in_preprocessing` when exporting
for users of the public TensorFlow 2.4.x releases to avoid a fatal op placement
issue when preprocessing is used within Dataset.map() on TPU workers.
This is not an issue with TF 2.3 and TF 2.5+.
Flag `--vocab_file` specifies the vocab file used with
[BertTokenizer](https://github.com/tensorflow/models/search?q=BertTokenizer+filename%3Atext_layers.py).
For models that use the
[SentencepieceTokenizer](https://github.com/tensorflow/models/search?q=SentencepieceTokenizer+filename%3Atext_layers.py),
set flag `--sp_model_file` instead.
The boolean flag `--do_lower_case` controls text normalization (as in the
respective tokenizer classes, so it's a bit more than just smashing case). If
unset, do_lower_case will be enabled if 'uncased' appears in --vocab_file, or
unconditionally if --sp_model_file is set, mimicking the conventions of BERT and
ALBERT, respectively. For programmatic use, or if in doubt, it's best to set
`--do_lower_case` explicitly.
If the definition of preprocessing involved
[GIN](https://github.com/google/gin-config),
the flags `--gin_file` and `--gin_params` would have to be set accordingly,
consistent with pre-training. (At the time of this writing, no such GIN
configurables exist in the code.)
The exported SavedModel can be called in the following way for a single segment
input.
```python
preprocessor = hub.load(...)
text_input = ... # Shape [batch_size], dtype=tf.string
encoder_inputs = preprocessor(text_input, seq_length=seq_length)
assert encoder_inputs.keys() == {
"input_word_ids", # Shape [batch_size, seq_length], dtype=int32
"input_mask", # Shape [batch_size, seq_length], dtype=int32
"input_type_ids" # Shape [batch_size, seq_length], dtype=int32
}
```
Flag `--default_seq_length` controls the value of `seq_length` if that argument
is omitted in the usage example above. The flag defaults to 128, because
multiples of 128 work best for Cloud TPUs, yet the cost of attention computation
grows quadratically with `seq_length`.
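For instance, to pack inputs to 256 tokens instead of the default (same
`preprocessor` and `text_input` as above):
```python
encoder_inputs = preprocessor(text_input, seq_length=256)
```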
Beyond this example, the exported SavedModel implements the full preprocessor
interface for text embeddings with preprocessed inputs and for Transformer
encoders from TF Hub's
[Common APIs for text](https://www.tensorflow.org/hub/common_saved_model_apis/text).
Please see
[tfhub.dev/tensorflow/bert_en_uncased_preprocess](https://tfhub.dev/tensorflow/bert_en_uncased_preprocess)
for the full documentation of one preprocessing model exported with this tool,
especially how custom trimming of inputs can happen between `.tokenize` and
`.bert_pack_inputs`.
Using the `encoder.mlm()` interface requires masking of tokenized inputs in
user code. The necessary information about the vocabulary encapsulated in the
preprocessing model can be obtained like this (uniformly across tokenizers):
```python
special_tokens_dict = preprocessor.tokenize.get_special_tokens_dict()
vocab_size = int(special_tokens_dict["vocab_size"])
padding_id = int(special_tokens_dict["padding_id"]) # [PAD] or <pad>
start_of_sequence_id = int(special_tokens_dict["start_of_sequence_id"]) # [CLS]
end_of_segment_id = int(special_tokens_dict["end_of_segment_id"]) # [SEP]
mask_id = int(special_tokens_dict["mask_id"]) # [MASK]
```
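As an illustration only, here is a minimal sketch that masks one position per
sequence and queries the MLM head. It assumes the `encoder`, `encoder_inputs`,
and `mask_id` from the examples above; the actual masking strategy is up to
user code:
```python
import tensorflow as tf

# Mask position 1 (the token right after the start-of-sequence token).
input_word_ids = encoder_inputs["input_word_ids"]  # [batch, seq_length]
batch_size = tf.shape(input_word_ids)[0]
masked_lm_positions = tf.ones([batch_size, 1], dtype=tf.int32)
scatter_indices = tf.stack(
    [tf.range(batch_size), masked_lm_positions[:, 0]], axis=1)
masked_word_ids = tf.tensor_scatter_nd_update(
    input_word_ids, scatter_indices, tf.fill([batch_size], mask_id))

mlm_outputs = encoder.mlm(dict(
    input_word_ids=masked_word_ids,
    input_mask=encoder_inputs["input_mask"],
    input_type_ids=encoder_inputs["input_type_ids"],
    masked_lm_positions=masked_lm_positions))
mlm_logits = mlm_outputs["mlm_logits"]  # [batch, 1, vocab_size]
```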
## Testing the exported models
Please test your SavedModels before publication by fine-tuning them on a
suitable task and comparing performance and accuracy to a baseline experiment
built from equivalent Python code.
The
[trainer doc](https://github.com/tensorflow/models/blob/master/official/nlp/docs/train.md)
has instructions for running BERT on MNLI and other tasks from the GLUE
benchmark.
# Model Garden NLP Common Training Driver
[train.py](https://github.com/tensorflow/models/blob/master/official/nlp/train.py) is the common training driver that supports multiple
NLP tasks (e.g., pre-training, GLUE and SQuAD fine-tuning) and multiple
models (e.g., BERT, ALBERT, MobileBERT).
## Experiment Configuration
[train.py](https://github.com/tensorflow/models/blob/master/official/nlp/train.py) is driven by configs defined by [ExperimentConfig](https://github.com/tensorflow/models/blob/master/official/core/config_definitions.py),
including configurations for `task`, `trainer` and `runtime`. The pre-defined
NLP-related [ExperimentConfig](https://github.com/tensorflow/models/blob/master/official/core/config_definitions.py) instances can be found in
[configs/experiment_configs.py](https://github.com/tensorflow/models/blob/master/official/nlp/configs/experiment_configs.py).
## Experiment Registry
We use an [experiment registry](https://github.com/tensorflow/models/blob/master/official/core/exp_factory.py) to build a mapping
from experiment type to experiment configuration instance. For example,
[configs/finetuning_experiments.py](https://github.com/tensorflow/models/blob/master/official/nlp/configs/finetuning_experiments.py)
registers the `bert/sentence_prediction` and `bert/squad` experiments. Users can
pass the `--experiment` flag to invoke a registered experiment configuration,
e.g., `--experiment=bert/sentence_prediction`.
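For illustration, registering a new experiment looks roughly like the sketch
below (the experiment name and config contents are illustrative placeholders):
```python
from official.core import config_definitions as cfg
from official.core import exp_factory


# Makes 'my_project/my_experiment' selectable via --experiment=... .
@exp_factory.register_config_factory('my_project/my_experiment')
def my_experiment() -> cfg.ExperimentConfig:
  # A real registration would populate the task/trainer/runtime fields here.
  return cfg.ExperimentConfig(
      task=cfg.TaskConfig(),
      trainer=cfg.TrainerConfig(),
      runtime=cfg.RuntimeConfig())
```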
## Overriding Configuration via Yaml and FLAGS
The registered experiment configuration can be overridden by one or
multiple YAML files provided via the `--config_file` flag. For example:
```shell
--config_file=configs/experiments/glue_mnli_matched.yaml \
--config_file=configs/models/bert_en_uncased_base.yaml
```
In addition, the experiment configuration can be further overridden via the
`--params_override` flag. For example:
```shell
--params_override=task.train_data.input_path=/some/path,task.hub_module_url=/some/tfhub
```
## Run on Cloud TPUs
Next, we describe how to run [train.py](https://github.com/tensorflow/models/blob/master/official/nlp/train.py) on Cloud TPUs.
### Setup
First, you need to create a `tf-nightly` TPU with the
[ctpu tool](https://github.com/tensorflow/tpu/tree/master/tools/ctpu):
```shell
export TPU_NAME=YOUR_TPU_NAME
ctpu up -name $TPU_NAME --tf-version=nightly --tpu-size=YOUR_TPU_SIZE --project=YOUR_PROJECT
```
and then install Model Garden and required dependencies:
```shell
git clone https://github.com/tensorflow/models.git
export PYTHONPATH=$PYTHONPATH:/path/to/models
pip3 install --user -r official/requirements.txt
```
### Fine-tuning Sentence Classification with BERT from TF-Hub
This example fine-tunes BERT-base from TF-Hub on the Multi-Genre Natural
Language Inference (MultiNLI) corpus using TPUs.
First, prepare the fine-tuning data using the
[`create_finetuning_data.py`](https://github.com/tensorflow/models/blob/master/official/nlp/data/create_finetuning_data.py) script.
For GLUE tasks, you can (1) download the
[GLUE data](https://gluebenchmark.com/tasks) by running
[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
and unpack it to some directory `$GLUE_DIR`, (2) prepare the vocabulary file,
and (3) run the following command:
```shell
export GLUE_DIR=~/glue
export VOCAB_FILE=~/uncased_L-12_H-768_A-12/vocab.txt
export TASK_NAME=MNLI
export OUTPUT_DATA_DIR=gs://some_bucket/datasets
python3 data/create_finetuning_data.py \
--input_data_dir=${GLUE_DIR}/${TASK_NAME}/ \
--vocab_file=${VOCAB_FILE} \
--train_data_output_path=${OUTPUT_DATA_DIR}/${TASK_NAME}_train.tf_record \
--eval_data_output_path=${OUTPUT_DATA_DIR}/${TASK_NAME}_eval.tf_record \
--meta_data_file_path=${OUTPUT_DATA_DIR}/${TASK_NAME}_meta_data \
--fine_tuning_task_type=classification --max_seq_length=128 \
--classification_task_name=${TASK_NAME}
```
The resulting training and evaluation datasets in `tf_record` format will later
be passed to [train.py](train.py). Support for reading datasets from
tensorflow_datasets (TFDS) and pre-processing with tf.text is coming soon.
Then you can execute the following commands to start the training and evaluation
job.
```shell
export INPUT_DATA_DIR=gs://some_bucket/datasets
export OUTPUT_DIR=gs://some_bucket/my_output_dir
# See tfhub BERT collection for more tfhub models:
# https://tfhub.dev/google/collections/bert/1
export BERT_HUB_URL=https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3
# Override the configurations by FLAGS. Alternatively, you can directly edit
# `configs/experiments/glue_mnli_matched.yaml` to specify corresponding fields.
export PARAMS=task.train_data.input_path=$INPUT_DATA_DIR/mnli_train.tf_record
export PARAMS=$PARAMS,task.validation_data.input_path=$INPUT_DATA_DIR/mnli_eval.tf_record
export PARAMS=$PARAMS,task.hub_module_url=$BERT_HUB_URL
export PARAMS=$PARAMS,runtime.distribution_strategy=tpu
python3 train.py \
--experiment=bert/sentence_prediction \
--mode=train_and_eval \
--model_dir=$OUTPUT_DIR \
--config_file=configs/experiments/glue_mnli_matched.yaml \
--tfhub_cache_dir=$OUTPUT_DIR/hub_cache \
--tpu=${TPU_NAME} \
--params_override=$PARAMS
```
You can monitor the training progress in the console and find the output
models in `$OUTPUT_DIR`.
### Fine-tuning SQuAD with a pre-trained BERT checkpoint
This example fine-tunes a pre-trained BERT checkpoint on the
Stanford Question Answering Dataset (SQuAD) using TPUs.
The [SQuAD website](https://rajpurkar.github.io/SQuAD-explorer/) contains
detailed information about the SQuAD datasets and evaluation. After downloading
the SQuAD datasets and the [pre-trained BERT checkpoints](https://github.com/tensorflow/models/blob/master/official/nlp/docs/pretrained_models.md),
you can run the following command to prepare the `tf_record` files:
```shell
export SQUAD_DIR=~/squad
export BERT_DIR=~/uncased_L-12_H-768_A-12
export OUTPUT_DATA_DIR=gs://some_bucket/datasets
python3 create_finetuning_data.py \
--squad_data_file=${SQUAD_DIR}/train-v1.1.json \
--vocab_file=${BERT_DIR}/vocab.txt \
--train_data_output_path=${OUTPUT_DATA_DIR}/train.tf_record \
--meta_data_file_path=${OUTPUT_DATA_DIR}/squad_meta_data \
--fine_tuning_task_type=squad --max_seq_length=384
```
Note: To create fine-tuning data with SQuAD 2.0, you need to add flag `--version_2_with_negative=True`.
Then, you can start the training and evaluation jobs:
```shell
export SQUAD_DIR=~/squad
export INPUT_DATA_DIR=gs://some_bucket/datasets
export OUTPUT_DIR=gs://some_bucket/my_output_dir
# See the following link for more pre-trained checkpoints:
# https://github.com/tensorflow/models/blob/master/official/nlp/docs/pretrained_models.md
export BERT_DIR=~/uncased_L-12_H-768_A-12
# Override the configurations by FLAGS. Alternatively, you can directly edit
# `configs/experiments/squad_v1.1.yaml` to specify corresponding fields.
# Also note that the training data is the pre-processed tf_record file, while
# the validation file is the raw json file.
export PARAMS=task.train_data.input_path=$INPUT_DATA_DIR/train.tf_record
export PARAMS=$PARAMS,task.validation_data.input_path=$SQUAD_DIR/dev-v1.1.json
export PARAMS=$PARAMS,task.validation_data.vocab_file=$BERT_DIR/vocab.txt
export PARAMS=$PARAMS,task.init_checkpoint=$BERT_DIR/bert_model.ckpt
export PARAMS=$PARAMS,runtime.distribution_strategy=tpu
python3 train.py \
--experiment=bert/squad \
--mode=train_and_eval \
--model_dir=$OUTPUT_DIR \
--config_file=configs/experiments/squad_v1.1.yaml \
--tpu=${TPU_NAME} \
--params_override=$PARAMS
```
Note: More examples about pre-training will come soon.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The helper for finetuning binaries."""
import json
import math
import sys
from typing import Any, Dict, List, Optional
from absl import logging
import tensorflow as tf
from official.core import config_definitions as cfg
from official.modeling import hyperparams
from official.nlp.configs import encoders
from official.nlp.data import question_answering_dataloader
from official.nlp.data import sentence_prediction_dataloader
from official.nlp.data import tagging_dataloader
from official.nlp.tasks import question_answering
from official.nlp.tasks import sentence_prediction
from official.nlp.tasks import tagging
def override_trainer_cfg(trainer_cfg: cfg.TrainerConfig, learning_rate: float,
num_epoch: int, global_batch_size: int,
warmup_ratio: float, training_data_size: int,
eval_data_size: int, num_eval_per_epoch: int,
best_checkpoint_export_subdir: str,
best_checkpoint_eval_metric: str,
best_checkpoint_metric_comp: str):
"""Overrides a `cfg.TrainerConfig` object."""
steps_per_epoch = training_data_size // global_batch_size
train_steps = steps_per_epoch * num_epoch
# TODO(b/165081095): always set to -1 after the bug is resolved.
if eval_data_size:
eval_steps = int(math.ceil(eval_data_size / global_batch_size))
else:
eval_steps = -1 # exhaust the validation data.
warmup_steps = int(train_steps * warmup_ratio)
validation_interval = steps_per_epoch // num_eval_per_epoch
trainer_cfg.override({
'optimizer_config': {
'learning_rate': {
'type': 'polynomial',
'polynomial': {
'decay_steps': train_steps,
'initial_learning_rate': learning_rate,
'end_learning_rate': 0,
}
},
'optimizer': {
'type': 'adamw',
},
'warmup': {
'polynomial': {
'warmup_steps': warmup_steps,
},
'type': 'polynomial',
},
},
'train_steps': train_steps,
'validation_interval': validation_interval,
'validation_steps': eval_steps,
'best_checkpoint_export_subdir': best_checkpoint_export_subdir,
'best_checkpoint_eval_metric': best_checkpoint_eval_metric,
'best_checkpoint_metric_comp': best_checkpoint_metric_comp,
})
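# Worked example (illustrative numbers): with training_data_size=392702 (MNLI),
# global_batch_size=32 and num_epoch=3, steps_per_epoch = 392702 // 32 = 12271
# and train_steps = 36813; warmup_ratio=0.1 then gives 3681 warmup steps.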
def load_model_config_file(model_config_file: str) -> Dict[str, Any]:
"""Loads bert config json file or `encoders.EncoderConfig` in yaml file."""
if not model_config_file:
# model_config_file may be empty when using tf.hub.
return {}
try:
encoder_config = encoders.EncoderConfig()
encoder_config = hyperparams.override_params_dict(
encoder_config, model_config_file, is_strict=True)
logging.info('Load encoder_config yaml file from %s.', model_config_file)
return encoder_config.as_dict()
except KeyError:
pass
logging.info('Load bert config json file from %s', model_config_file)
with tf.io.gfile.GFile(model_config_file, 'r') as reader:
text = reader.read()
config = json.loads(text)
def get_value(key1, key2):
if key1 in config and key2 in config:
raise ValueError('Unexpected that both %s and %s are in config.' %
(key1, key2))
return config[key1] if key1 in config else config[key2]
def get_value_or_none(key):
return config[key] if key in config else None
# Support both legacy bert_config attributes and the new config attributes.
return {
'bert': {
'attention_dropout_rate':
get_value('attention_dropout_rate',
'attention_probs_dropout_prob'),
'dropout_rate':
get_value('dropout_rate', 'hidden_dropout_prob'),
'hidden_activation':
get_value('hidden_activation', 'hidden_act'),
'hidden_size':
config['hidden_size'],
'embedding_size':
get_value_or_none('embedding_size'),
'initializer_range':
config['initializer_range'],
'intermediate_size':
config['intermediate_size'],
'max_position_embeddings':
config['max_position_embeddings'],
'num_attention_heads':
config['num_attention_heads'],
'num_layers':
get_value('num_layers', 'num_hidden_layers'),
'type_vocab_size':
config['type_vocab_size'],
'vocab_size':
config['vocab_size'],
}
}
def override_sentence_prediction_task_config(
task_cfg: sentence_prediction.SentencePredictionConfig,
model_config_file: str,
init_checkpoint: str,
hub_module_url: str,
global_batch_size: int,
train_input_path: str,
validation_input_path: str,
seq_length: int,
num_classes: int,
metric_type: Optional[str] = 'accuracy',
label_type: Optional[str] = 'int'):
"""Overrides a `SentencePredictionConfig` object."""
task_cfg.override({
'init_checkpoint': init_checkpoint,
'metric_type': metric_type,
'model': {
'num_classes': num_classes,
'encoder': load_model_config_file(model_config_file),
},
'hub_module_url': hub_module_url,
'train_data': {
'drop_remainder': True,
'global_batch_size': global_batch_size,
'input_path': train_input_path,
'is_training': True,
'seq_length': seq_length,
'label_type': label_type,
},
'validation_data': {
'drop_remainder': False,
'global_batch_size': global_batch_size,
'input_path': validation_input_path,
'is_training': False,
'seq_length': seq_length,
'label_type': label_type,
}
})
def override_qa_task_config(
task_cfg: question_answering.QuestionAnsweringConfig,
model_config_file: str, init_checkpoint: str, hub_module_url: str,
global_batch_size: int, train_input_path: str, validation_input_path: str,
seq_length: int, tokenization: str, vocab_file: str, do_lower_case: bool,
version_2_with_negative: bool):
"""Overrides a `QuestionAnsweringConfig` object."""
task_cfg.override({
'init_checkpoint': init_checkpoint,
'model': {
'encoder': load_model_config_file(model_config_file),
},
'hub_module_url': hub_module_url,
'train_data': {
'drop_remainder': True,
'global_batch_size': global_batch_size,
'input_path': train_input_path,
'is_training': True,
'seq_length': seq_length,
},
'validation_data': {
'do_lower_case': do_lower_case,
'drop_remainder': False,
'global_batch_size': global_batch_size,
'input_path': validation_input_path,
'is_training': False,
'seq_length': seq_length,
'tokenization': tokenization,
'version_2_with_negative': version_2_with_negative,
'vocab_file': vocab_file,
}
})
def override_tagging_task_config(task_cfg: tagging.TaggingConfig,
model_config_file: str, init_checkpoint: str,
hub_module_url: str, global_batch_size: int,
train_input_path: str,
validation_input_path: str, seq_length: int,
class_names: List[str]):
"""Overrides a `TaggingConfig` object."""
task_cfg.override({
'init_checkpoint': init_checkpoint,
'model': {
'encoder': load_model_config_file(model_config_file),
},
'hub_module_url': hub_module_url,
'train_data': {
'drop_remainder': True,
'global_batch_size': global_batch_size,
'input_path': train_input_path,
'is_training': True,
'seq_length': seq_length,
},
'validation_data': {
'drop_remainder': False,
'global_batch_size': global_batch_size,
'input_path': validation_input_path,
'is_training': False,
'seq_length': seq_length,
},
'class_names': class_names,
})
def write_glue_classification(task,
model,
input_file,
output_file,
predict_batch_size,
seq_length,
class_names,
label_type='int',
min_float_value=None,
max_float_value=None):
"""Makes classification predictions for glue and writes to output file.
Args:
task: `Task` instance.
model: `keras.Model` instance.
input_file: Input test data file path.
output_file: Output test data file path.
predict_batch_size: Batch size for prediction.
seq_length: Input sequence length.
class_names: List of string class names.
label_type: String denoting label type ('int', 'float'), defaults to 'int'.
min_float_value: If set, predictions will be min-clipped to this value (only
for regression when `label_type` is set to 'float'). Defaults to `None`
(no clipping).
max_float_value: If set, predictions will be max-clipped to this value (only
for regression when `label_type` is set to 'float'). Defaults to `None`
(no clipping).
"""
if label_type not in ('int', 'float'):
raise ValueError('Unsupported `label_type`. Given: %s, expected `int` or '
'`float`.' % label_type)
data_config = sentence_prediction_dataloader.SentencePredictionDataConfig(
input_path=input_file,
global_batch_size=predict_batch_size,
is_training=False,
seq_length=seq_length,
label_type=label_type,
drop_remainder=False,
include_example_id=True)
predictions = sentence_prediction.predict(task, data_config, model)
if label_type == 'float':
min_float_value = (-sys.float_info.max
if min_float_value is None else min_float_value)
max_float_value = (
sys.float_info.max if max_float_value is None else max_float_value)
# Clip predictions to range [min_float_value, max_float_value].
predictions = [
min(max(prediction, min_float_value), max_float_value)
for prediction in predictions
]
with tf.io.gfile.GFile(output_file, 'w') as writer:
writer.write('index\tprediction\n')
for index, prediction in enumerate(predictions):
if label_type == 'float':
# Regression.
writer.write('%d\t%.3f\n' % (index, prediction))
else:
# Classification.
writer.write('%d\t%s\n' % (index, class_names[prediction]))
def write_xtreme_classification(task,
model,
input_file,
output_file,
predict_batch_size,
seq_length,
class_names,
translated_input_file=None,
test_time_aug_wgt=0.3):
"""Makes classification predictions for xtreme and writes to output file."""
data_config = sentence_prediction_dataloader.SentencePredictionDataConfig(
input_path=input_file,
seq_length=seq_length,
is_training=False,
label_type='int',
global_batch_size=predict_batch_size,
drop_remainder=False,
include_example_id=True)
if translated_input_file is not None:
data_config_aug = (
sentence_prediction_dataloader.SentencePredictionDataConfig(
input_path=translated_input_file,
seq_length=seq_length,
is_training=False,
label_type='int',
global_batch_size=predict_batch_size,
drop_remainder=False,
include_example_id=True))
else:
data_config_aug = None
predictions = sentence_prediction.predict(task, data_config, model,
data_config_aug, test_time_aug_wgt)
with tf.io.gfile.GFile(output_file, 'w') as writer:
for prediction in predictions:
writer.write('%s\n' % class_names[prediction])
def write_question_answering(task,
model,
input_file,
output_file,
predict_batch_size,
seq_length,
tokenization,
vocab_file,
do_lower_case,
version_2_with_negative=False):
"""Makes question answering predictions and writes to output file."""
data_config = question_answering_dataloader.QADataConfig(
do_lower_case=do_lower_case,
doc_stride=128,
drop_remainder=False,
global_batch_size=predict_batch_size,
input_path=input_file,
is_training=False,
query_length=64,
seq_length=seq_length,
tokenization=tokenization,
version_2_with_negative=version_2_with_negative,
vocab_file=vocab_file)
all_predictions, _, _ = question_answering.predict(task, data_config, model)
with tf.io.gfile.GFile(output_file, 'w') as writer:
writer.write(json.dumps(all_predictions, indent=4) + '\n')
def write_tagging(task, model, input_file, output_file, predict_batch_size,
seq_length):
"""Makes tagging predictions and writes to output file."""
data_config = tagging_dataloader.TaggingDataConfig(
input_path=input_file,
is_training=False,
seq_length=seq_length,
global_batch_size=predict_batch_size,
drop_remainder=False,
include_sentence_id=True)
results = tagging.predict(task, data_config, model)
class_names = task.task_config.class_names
last_sentence_id = -1
with tf.io.gfile.GFile(output_file, 'w') as writer:
for sentence_id, _, predict_ids in results:
token_labels = [class_names[x] for x in predict_ids]
assert sentence_id == last_sentence_id or (
sentence_id == last_sentence_id + 1)
if sentence_id != last_sentence_id and last_sentence_id != -1:
writer.write('\n')
writer.write('\n'.join(token_labels))
writer.write('\n')
last_sentence_id = sentence_id
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Common flags for GLUE finetuning binary."""
from typing import Callable
from absl import flags
from absl import logging
def define_flags():
"""Defines flags."""
# ===========================================================================
# Glue binary flags.
# ===========================================================================
flags.DEFINE_enum(
'mode', 'train_eval_and_predict',
['train_eval_and_predict', 'train_eval', 'predict'],
      'The mode to run the binary. If `train_eval_and_predict`, '
      'it will (1) train on the training data, (2) evaluate on '
      'the validation data, and (3) generate predictions '
      'on the prediction data; if `train_eval`, it will only '
      'run training and evaluation; if `predict`, it will only '
      'run prediction using the model in `model_dir`.')
flags.DEFINE_enum('task_name', None, [
'AX', 'COLA', 'MNLI', 'MRPC', 'QNLI', 'QQP', 'RTE', 'SST-2', 'STS-B',
'WNLI'
], 'The type of GLUE task.')
flags.DEFINE_string('train_input_path', None,
'The file path to the training data.')
flags.DEFINE_string('validation_input_path', None,
'The file path to the evaluation data.')
flags.DEFINE_string('test_input_path', None,
'The file path to the test input data.')
flags.DEFINE_string('test_output_path', None,
'The file path to the test output data.')
flags.DEFINE_string('model_dir', '', 'The model directory containing '
'subdirectories for each task. Only needed for "predict" '
'mode. For all other modes, if not provided, a unique '
'directory will be created automatically for each run.')
flags.DEFINE_string(
'input_meta_data_path', None, 'Path to file that contains '
'metadata about input file. It is output by the `create_finetuning_data` '
'binary. Required for all modes except "predict".')
flags.DEFINE_string('init_checkpoint', '',
'Initial checkpoint from a pre-trained BERT model.')
flags.DEFINE_string(
'model_config_file', '', 'The config file specifying the architecture '
'of the pre-trained model. The file can be either a bert_config.json '
'file or `encoders.EncoderConfig` in yaml file.')
flags.DEFINE_string(
      'hub_module_url', '', 'TF-Hub path/url to a pretrained model. If '
      'specified, the `init_checkpoint` and `model_config_file` flags should '
      'not be used.')
flags.DEFINE_multi_string('gin_file', None,
'List of paths to the gin config files.')
flags.DEFINE_multi_string('gin_params', None,
'Newline separated list of gin parameter bindings.')
flags.DEFINE_multi_string(
      'config_file', None, 'Advanced usage: specifies the '
      '`ExperimentConfig` directly. When set, FLAGS related to the '
      '`ExperimentConfig`, such as `train_input_path`, '
      '`validation_input_path` and the following hparams, are ignored.')
# ===========================================================================
# Tuning hparams.
# ===========================================================================
flags.DEFINE_integer('global_batch_size', 32,
'Global batch size for train/eval/predict.')
flags.DEFINE_float('learning_rate', 3e-5, 'Initial learning rate.')
flags.DEFINE_integer('num_epoch', 3, 'Number of training epochs.')
flags.DEFINE_float('warmup_ratio', 0.1,
'Proportion of learning rate warmup steps.')
flags.DEFINE_integer('num_eval_per_epoch', 2,
'Number of evaluations to run per epoch.')
def validate_flags(flags_obj: flags.FlagValues,
file_exists_fn: Callable[[str], bool]):
"""Raises ValueError if any flags are misconfigured.
Args:
    flags_obj: A `flags.FlagValues` object, usually from `flags.FLAGS`.
file_exists_fn: A callable to decide if a file path exists or not.
"""
def _check_path_exists(flag_path, flag_name):
if not file_exists_fn(flag_path):
raise ValueError('Flag `%s` at %s does not exist.' %
(flag_name, flag_path))
def _validate_path(flag_path, flag_name):
if not flag_path:
raise ValueError('Flag `%s` must be provided in mode %s.' %
(flag_name, flags_obj.mode))
_check_path_exists(flag_path, flag_name)
if 'train' in flags_obj.mode:
_validate_path(flags_obj.train_input_path, 'train_input_path')
_validate_path(flags_obj.input_meta_data_path, 'input_meta_data_path')
if flags_obj.gin_file:
for gin_file in flags_obj.gin_file:
_check_path_exists(gin_file, 'gin_file')
if flags_obj.config_file:
for config_file in flags_obj.config_file:
_check_path_exists(config_file, 'config_file')
if 'eval' in flags_obj.mode:
_validate_path(flags_obj.validation_input_path, 'validation_input_path')
if flags_obj.mode == 'predict':
# model_dir is only needed strictly in 'predict' mode.
_validate_path(flags_obj.model_dir, 'model_dir')
if 'predict' in flags_obj.mode:
_validate_path(flags_obj.test_input_path, 'test_input_path')
if not flags_obj.config_file and flags_obj.mode != 'predict':
if flags_obj.hub_module_url:
if flags_obj.init_checkpoint or flags_obj.model_config_file:
raise ValueError(
'When `hub_module_url` is specified, `init_checkpoint` and '
'`model_config_file` should be empty.')
      logging.info(
          'Using the pretrained TF-Hub module from %s',
          flags_obj.hub_module_url)
else:
if not (flags_obj.init_checkpoint and flags_obj.model_config_file):
raise ValueError('Both `init_checkpoint` and `model_config_file` '
'should be specified if `config_file` is not '
'specified.')
_validate_path(flags_obj.model_config_file, 'model_config_file')
logging.info(
'Using the pretrained checkpoint from %s and model_config_file from '
'%s.', flags_obj.init_checkpoint, flags_obj.model_config_file)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Runs prediction to generate submission files for GLUE tasks."""
import functools
import json
import os
import pprint
from absl import app
from absl import flags
from absl import logging
import gin
import tensorflow as tf
from official.common import distribute_utils
# Imports registered experiment configs.
from official.common import registry_imports # pylint: disable=unused-import
from official.core import exp_factory
from official.core import task_factory
from official.core import train_lib
from official.core import train_utils
from official.modeling.hyperparams import params_dict
from official.nlp.finetuning import binary_helper
from official.nlp.finetuning.glue import flags as glue_flags
# Device configs.
flags.DEFINE_string('distribution_strategy', 'tpu',
'The Distribution Strategy to use for training.')
flags.DEFINE_string(
'tpu', '',
'The Cloud TPU to use for training. This should be either the name '
    'used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 URL.')
flags.DEFINE_integer('num_gpus', 1, 'The number of GPUs to use at each worker.')
FLAGS = flags.FLAGS
EXPERIMENT_TYPE = 'bert/sentence_prediction'
BEST_CHECKPOINT_EXPORT_SUBDIR = 'best_ckpt'
EVAL_METRIC_MAP = {
'AX': 'matthews_corrcoef',
'COLA': 'matthews_corrcoef',
'MNLI': 'cls_accuracy',
'MRPC': 'cls_accuracy',
'QNLI': 'cls_accuracy',
'QQP': 'cls_accuracy',
'RTE': 'cls_accuracy',
'SST-2': 'cls_accuracy',
'STS-B': 'pearson_spearman_corr',
'WNLI': 'cls_accuracy',
}
AX_CLASS_NAMES = ['contradiction', 'entailment', 'neutral']
COLA_CLASS_NAMES = ['0', '1']
MNLI_CLASS_NAMES = ['contradiction', 'entailment', 'neutral']
MRPC_CLASS_NAMES = ['0', '1']
QNLI_CLASS_NAMES = ['entailment', 'not_entailment']
QQP_CLASS_NAMES = ['0', '1']
RTE_CLASS_NAMES = ['entailment', 'not_entailment']
SST_2_CLASS_NAMES = ['0', '1']
WNLI_CLASS_NAMES = ['0', '1']
def _override_exp_config_by_file(exp_config, exp_config_files):
"""Overrides an `ExperimentConfig` object by files."""
for exp_config_file in exp_config_files:
if not tf.io.gfile.exists(exp_config_file):
raise ValueError('%s does not exist.' % exp_config_file)
params_dict.override_params_dict(
exp_config, exp_config_file, is_strict=True)
return exp_config
def _override_exp_config_by_flags(exp_config, input_meta_data):
"""Overrides an `ExperimentConfig` object by flags."""
if FLAGS.task_name in ('AX', 'COLA',):
override_task_cfg_fn = functools.partial(
binary_helper.override_sentence_prediction_task_config,
num_classes=input_meta_data['num_labels'],
metric_type='matthews_corrcoef')
elif FLAGS.task_name in ('MNLI', 'MRPC', 'QNLI', 'QQP', 'RTE', 'SST-2',
'WNLI'):
override_task_cfg_fn = functools.partial(
binary_helper.override_sentence_prediction_task_config,
num_classes=input_meta_data['num_labels'])
elif FLAGS.task_name in ('STS-B',):
override_task_cfg_fn = functools.partial(
binary_helper.override_sentence_prediction_task_config,
num_classes=1,
metric_type='pearson_spearman_corr',
label_type='float')
else:
raise ValueError('Task %s not supported.' % FLAGS.task_name)
binary_helper.override_trainer_cfg(
exp_config.trainer,
learning_rate=FLAGS.learning_rate,
num_epoch=FLAGS.num_epoch,
global_batch_size=FLAGS.global_batch_size,
warmup_ratio=FLAGS.warmup_ratio,
training_data_size=input_meta_data['train_data_size'],
eval_data_size=input_meta_data['eval_data_size'],
num_eval_per_epoch=FLAGS.num_eval_per_epoch,
best_checkpoint_export_subdir=BEST_CHECKPOINT_EXPORT_SUBDIR,
best_checkpoint_eval_metric=EVAL_METRIC_MAP[FLAGS.task_name],
best_checkpoint_metric_comp='higher')
override_task_cfg_fn(
exp_config.task,
model_config_file=FLAGS.model_config_file,
init_checkpoint=FLAGS.init_checkpoint,
hub_module_url=FLAGS.hub_module_url,
global_batch_size=FLAGS.global_batch_size,
train_input_path=FLAGS.train_input_path,
validation_input_path=FLAGS.validation_input_path,
seq_length=input_meta_data['max_seq_length'])
return exp_config
def _get_exp_config(input_meta_data, exp_config_files):
"""Gets an `ExperimentConfig` object."""
exp_config = exp_factory.get_exp_config(EXPERIMENT_TYPE)
if exp_config_files:
logging.info(
'Loading `ExperimentConfig` from file, and flags will be ignored.')
exp_config = _override_exp_config_by_file(exp_config, exp_config_files)
else:
logging.info('Loading `ExperimentConfig` from flags.')
exp_config = _override_exp_config_by_flags(exp_config, input_meta_data)
exp_config.validate()
exp_config.lock()
pp = pprint.PrettyPrinter()
logging.info('Final experiment parameters: %s',
pp.pformat(exp_config.as_dict()))
return exp_config
def _write_submission_file(task, seq_length):
"""Writes submission files that can be uploaded to the leaderboard."""
tf.io.gfile.makedirs(os.path.dirname(FLAGS.test_output_path))
model = task.build_model()
ckpt_file = tf.train.latest_checkpoint(
os.path.join(FLAGS.model_dir, BEST_CHECKPOINT_EXPORT_SUBDIR))
logging.info('Restoring checkpoints from %s', ckpt_file)
checkpoint = tf.train.Checkpoint(model=model)
checkpoint.read(ckpt_file).expect_partial()
write_fn = binary_helper.write_glue_classification
write_fn_map = {
'AX':
functools.partial(
write_fn, class_names=AX_CLASS_NAMES),
'COLA':
functools.partial(
write_fn, class_names=COLA_CLASS_NAMES),
'MNLI':
functools.partial(
write_fn, class_names=MNLI_CLASS_NAMES),
'MRPC':
functools.partial(
write_fn, class_names=MRPC_CLASS_NAMES),
'QNLI':
functools.partial(
write_fn, class_names=QNLI_CLASS_NAMES),
'QQP':
functools.partial(
write_fn, class_names=QQP_CLASS_NAMES),
'RTE':
functools.partial(
write_fn, class_names=RTE_CLASS_NAMES),
'SST-2':
functools.partial(
write_fn, class_names=SST_2_CLASS_NAMES),
'STS-B':
          # No class_names (regression); clip predictions to [0.0, 5.0] per
          # the GLUE benchmark grader.
functools.partial(
write_fn, class_names=None, label_type='float',
min_float_value=0.0, max_float_value=5.0),
'WNLI':
functools.partial(
write_fn, class_names=WNLI_CLASS_NAMES),
}
logging.info('Predicting %s', FLAGS.test_input_path)
write_fn_map[FLAGS.task_name](
task=task,
model=model,
input_file=FLAGS.test_input_path,
output_file=FLAGS.test_output_path,
predict_batch_size=(
task.task_config.train_data.global_batch_size),
seq_length=seq_length)
def main(argv):
if len(argv) > 1:
raise app.UsageError('Too many command-line arguments.')
glue_flags.validate_flags(FLAGS, file_exists_fn=tf.io.gfile.exists)
gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
distribution_strategy = distribute_utils.get_distribution_strategy(
distribution_strategy=FLAGS.distribution_strategy,
num_gpus=FLAGS.num_gpus,
tpu_address=FLAGS.tpu)
with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
input_meta_data = json.loads(reader.read().decode('utf-8'))
with distribution_strategy.scope():
task = None
if 'train_eval' in FLAGS.mode:
logging.info('Starting training and eval...')
logging.info('Model dir: %s', FLAGS.model_dir)
exp_config = _get_exp_config(
input_meta_data=input_meta_data,
exp_config_files=FLAGS.config_file)
train_utils.serialize_config(exp_config, FLAGS.model_dir)
task = task_factory.get_task(exp_config.task, logging_dir=FLAGS.model_dir)
train_lib.run_experiment(
distribution_strategy=distribution_strategy,
task=task,
mode='train_and_eval',
params=exp_config,
model_dir=FLAGS.model_dir)
if 'predict' in FLAGS.mode:
logging.info('Starting predict...')
# When mode is `predict`, `task` will be None.
if task is None:
exp_config = _get_exp_config(
input_meta_data=input_meta_data,
exp_config_files=[os.path.join(FLAGS.model_dir, 'params.yaml')])
task = task_factory.get_task(
exp_config.task, logging_dir=FLAGS.model_dir)
_write_submission_file(task, input_meta_data['max_seq_length'])
if __name__ == '__main__':
glue_flags.define_flags()
flags.mark_flag_as_required('mode')
flags.mark_flag_as_required('task_name')
app.run(main)
# keras-nlp
## Layers
Layers are the fundamental building blocks for NLP models. They can be used to
assemble new layers, networks, or models.
* [TransformerEncoderBlock](layers/transformer_encoder_block.py) implements
an optionally masked transformer as described in
["Attention Is All You Need"](https://arxiv.org/abs/1706.03762).
* [OnDeviceEmbedding](layers/on_device_embedding.py) implements efficient
embedding lookups designed for TPU-based models.
* [PositionalEmbedding](layers/position_embedding.py) creates a positional
embedding as described in ["BERT: Pre-training of Deep Bidirectional
Transformers for Language Understanding"](https://arxiv.org/abs/1810.04805).
* [SelfAttentionMask](layers/self_attention_mask.py) creates a 3D attention
mask from a 2D tensor mask.
* [MaskedLM](layers/masked_lm.py) implements a masked language model. It
assumes the embedding table variable is passed to it.
## Encoders
Encoders are combinations of layers (and possibly other encoders). They are
sub-units of models that would not be trained alone. An encoder encapsulates
common network structures, such as a classification head or a transformer
encoder, in an easily handled object with a standardized configuration.
* [BertEncoder](encoders/bert_encoder.py) implements a bi-directional
Transformer-based encoder as described in
["BERT: Pre-training of Deep Bidirectional Transformers for Language
Understanding"](https://arxiv.org/abs/1810.04805). It includes the embedding
lookups, transformer layers and pooling layer.
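As a quick sanity check, a minimal sketch of invoking `BertEncoder` end to end
(hyperparameters are illustrative):
```python
import numpy as np
from official.nlp.keras_nlp.encoders import BertEncoder

encoder = BertEncoder(
    vocab_size=100, hidden_size=32, num_attention_heads=2, num_layers=3)
batch_size, seq_length = 2, 16
word_ids = np.random.randint(100, size=(batch_size, seq_length))
mask = np.ones((batch_size, seq_length), dtype=np.int32)
type_ids = np.zeros((batch_size, seq_length), dtype=np.int32)

outputs = encoder([word_ids, mask, type_ids])
# outputs is a dict with 'sequence_output' (batch, seq_length, hidden),
# 'pooled_output' (batch, hidden), and per-layer 'encoder_outputs'.
```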
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Keras-NLP package definition."""
# pylint: disable=wildcard-import
from official.nlp.keras_nlp import encoders
from official.nlp.keras_nlp import layers
## Contributing to KerasNLP
Patches to KerasNLP are welcome!
The source-of-truth repository lives under
[TF Model Garden NLP](https://github.com/tensorflow/models/tree/master/official/nlp/keras_nlp),
and is mirrored as a read-only repository under
[keras-team/keras-nlp](https://github.com/keras-team/keras-nlp).
Contributions should be made as PRs to the TF Model Garden repository.
This is to ensure the codebase is rigorously tested with state-of-the-art models
on different accelerators.
In the long run, we will move development to the `keras-team/keras-nlp`
repository.
## :heavy_check_mark: Contributor checklist
1. Ensure you have signed the [Contributor License Agreement](https://cla.developers.google.com/about/google-individual?csw=1).
* All code contributors are required to sign a Contributor License Agreement.
* Please read this [troubleshooting guide](Contributor-License-Agreements#troubleshooting-clas)
if you encounter an issue.
2. Please review the [contribution guidelines](https://github.com/tensorflow/models/wiki/How-to-contribute).
3. Check if your changes are consistent with the [TensorFlow coding style](https://www.tensorflow.org/community/contribute/code_style).
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Keras-NLP layers package definition."""
from official.nlp.keras_nlp.encoders.bert_encoder import BertEncoder
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Bert encoder network."""
# pylint: disable=g-classes-have-attributes
import collections
from absl import logging
import tensorflow as tf
from official.nlp.keras_nlp import layers
@tf.keras.utils.register_keras_serializable(package='keras_nlp')
class BertEncoder(tf.keras.Model):
"""Bi-directional Transformer-based encoder network.
This network implements a bi-directional Transformer-based encoder as
described in "BERT: Pre-training of Deep Bidirectional Transformers for
Language Understanding" (https://arxiv.org/abs/1810.04805). It includes the
embedding lookups and transformer layers, but not the masked language model
or classification task networks.
The default values for this object are taken from the BERT-Base implementation
in "BERT: Pre-training of Deep Bidirectional Transformers for Language
Understanding".
  *Note* that the network is constructed with the
  [Keras Functional API](https://keras.io/guides/functional_api/).
Args:
vocab_size: The size of the token vocabulary.
hidden_size: The size of the transformer hidden layers.
num_layers: The number of transformer layers.
num_attention_heads: The number of attention heads for each transformer. The
hidden size must be divisible by the number of attention heads.
    max_sequence_length: The maximum sequence length that this encoder can
      consume. If `None`, the input sequence length is used. This determines
      the variable shape for positional embeddings.
type_vocab_size: The number of types that the 'type_ids' input can take.
inner_dim: The output dimension of the first Dense layer in a two-layer
feedforward network for each transformer.
inner_activation: The activation for the first Dense layer in a two-layer
feedforward network for each transformer.
output_dropout: Dropout probability for the post-attention and output
dropout.
attention_dropout: The dropout rate to use for the attention layers
within the transformer layers.
    initializer: The initializer to use for all weights in this encoder.
output_range: The sequence output range, [0, output_range), by slicing the
target sequence of the last transformer layer. `None` means the entire
target sequence will attend to the source sequence, which yields the full
output.
embedding_width: The width of the word embeddings. If the embedding width is
not equal to hidden size, embedding parameters will be factorized into two
matrices in the shape of ['vocab_size', 'embedding_width'] and
['embedding_width', 'hidden_size'] ('embedding_width' is usually much
smaller than 'hidden_size').
embedding_layer: An optional Layer instance which will be called to
generate embeddings for the input word IDs.
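  Example: a minimal usage sketch (hyperparameters are illustrative):
  ```python
  encoder = BertEncoder(vocab_size=100, hidden_size=32,
                        num_attention_heads=2, num_layers=3)
  outputs = encoder([word_ids, input_mask, type_ids])
  sequence_output = outputs['sequence_output']  # (batch, seq_length, hidden).
  pooled_output = outputs['pooled_output']      # (batch, hidden).
  ```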
"""
def __init__(
self,
vocab_size,
hidden_size=768,
num_layers=12,
num_attention_heads=12,
max_sequence_length=512,
type_vocab_size=16,
inner_dim=3072,
inner_activation=lambda x: tf.keras.activations.gelu(x, approximate=True),
output_dropout=0.1,
attention_dropout=0.1,
initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
output_range=None,
embedding_width=None,
embedding_layer=None,
**kwargs):
activation = tf.keras.activations.get(inner_activation)
initializer = tf.keras.initializers.get(initializer)
word_ids = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name='input_word_ids')
mask = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name='input_mask')
type_ids = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name='input_type_ids')
if embedding_width is None:
embedding_width = hidden_size
if embedding_layer is None:
embedding_layer_inst = layers.OnDeviceEmbedding(
vocab_size=vocab_size,
embedding_width=embedding_width,
initializer=initializer,
name='word_embeddings')
else:
embedding_layer_inst = embedding_layer
word_embeddings = embedding_layer_inst(word_ids)
# Always uses dynamic slicing for simplicity.
position_embedding_layer = layers.PositionEmbedding(
initializer=initializer,
max_length=max_sequence_length,
name='position_embedding')
position_embeddings = position_embedding_layer(word_embeddings)
type_embedding_layer = layers.OnDeviceEmbedding(
vocab_size=type_vocab_size,
embedding_width=embedding_width,
initializer=initializer,
use_one_hot=True,
name='type_embeddings')
type_embeddings = type_embedding_layer(type_ids)
embeddings = tf.keras.layers.Add()(
[word_embeddings, position_embeddings, type_embeddings])
embedding_norm_layer = tf.keras.layers.LayerNormalization(
name='embeddings/layer_norm', axis=-1, epsilon=1e-12, dtype=tf.float32)
embeddings = embedding_norm_layer(embeddings)
embeddings = (tf.keras.layers.Dropout(rate=output_dropout)(embeddings))
# We project the 'embedding' output to 'hidden_size' if it is not already
# 'hidden_size'.
if embedding_width != hidden_size:
embedding_projection = tf.keras.layers.experimental.EinsumDense(
'...x,xy->...y',
output_shape=hidden_size,
bias_axes='y',
kernel_initializer=initializer,
name='embedding_projection')
embeddings = embedding_projection(embeddings)
else:
embedding_projection = None
transformer_layers = []
data = embeddings
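    # Broadcast the 2D padding mask to a 3D (batch, seq_length, seq_length)
    # attention mask that each transformer block consumes.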
attention_mask = layers.SelfAttentionMask()(data, mask)
encoder_outputs = []
for i in range(num_layers):
if i == num_layers - 1 and output_range is not None:
transformer_output_range = output_range
else:
transformer_output_range = None
layer = layers.TransformerEncoderBlock(
num_attention_heads=num_attention_heads,
inner_dim=inner_dim,
inner_activation=inner_activation,
output_dropout=output_dropout,
attention_dropout=attention_dropout,
output_range=transformer_output_range,
kernel_initializer=initializer,
name='transformer/layer_%d' % i)
transformer_layers.append(layer)
data = layer([data, attention_mask])
encoder_outputs.append(data)
last_encoder_output = encoder_outputs[-1]
# Applying a tf.slice op (through subscript notation) to a Keras tensor
# like this will create a SliceOpLambda layer. This is better than a Lambda
# layer with Python code, because that is fundamentally less portable.
first_token_tensor = last_encoder_output[:, 0, :]
pooler_layer = tf.keras.layers.Dense(
units=hidden_size,
activation='tanh',
kernel_initializer=initializer,
name='pooler_transform')
cls_output = pooler_layer(first_token_tensor)
outputs = dict(
sequence_output=encoder_outputs[-1],
pooled_output=cls_output,
encoder_outputs=encoder_outputs,
)
# Once we've created the network using the Functional API, we call
# super().__init__ as though we were invoking the Functional API Model
# constructor, resulting in this object having all the properties of a model
# created using the Functional API. Once super().__init__ is called, we
# can assign attributes to `self` - note that all `self` assignments are
# below this line.
super(BertEncoder, self).__init__(
inputs=[word_ids, mask, type_ids], outputs=outputs, **kwargs)
config_dict = {
'vocab_size': vocab_size,
'hidden_size': hidden_size,
'num_layers': num_layers,
'num_attention_heads': num_attention_heads,
'max_sequence_length': max_sequence_length,
'type_vocab_size': type_vocab_size,
'inner_dim': inner_dim,
'inner_activation': tf.keras.activations.serialize(activation),
'output_dropout': output_dropout,
'attention_dropout': attention_dropout,
'initializer': tf.keras.initializers.serialize(initializer),
'output_range': output_range,
'embedding_width': embedding_width,
'embedding_layer': embedding_layer,
}
# We are storing the config dict as a namedtuple here to ensure checkpoint
# compatibility with an earlier version of this model which did not track
# the config dict attribute. TF does not track immutable attrs which
# do not contain Trackables, so by creating a config namedtuple instead of
# a dict we avoid tracking it.
config_cls = collections.namedtuple('Config', config_dict.keys())
self._config = config_cls(**config_dict)
self._pooler_layer = pooler_layer
self._transformer_layers = transformer_layers
self._embedding_norm_layer = embedding_norm_layer
self._embedding_layer = embedding_layer_inst
self._position_embedding_layer = position_embedding_layer
self._type_embedding_layer = type_embedding_layer
if embedding_projection is not None:
self._embedding_projection = embedding_projection
def get_embedding_table(self):
return self._embedding_layer.embeddings
def get_embedding_layer(self):
return self._embedding_layer
def get_config(self):
return dict(self._config._asdict())
@property
def transformer_layers(self):
"""List of Transformer layers in the encoder."""
return self._transformer_layers
@property
def pooler_layer(self):
"""The pooler dense layer after the transformer layers."""
return self._pooler_layer
@classmethod
def from_config(cls, config, custom_objects=None):
if 'embedding_layer' in config and config['embedding_layer'] is not None:
      warn_string = (
          'You are reloading a model that was saved with a '
          'potentially-shared embedding layer object. If you continue to '
          'train this model, the embedding layer will no longer be shared. '
          'To work around this, load the model outside of the Keras API.')
      print('WARNING: ' + warn_string)
      logging.warning(warn_string)
return cls(**config)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for transformer-based bert encoder network."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.keras_nlp.encoders import bert_encoder
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
class BertEncoderTest(keras_parameterized.TestCase):
def tearDown(self):
super(BertEncoderTest, self).tearDown()
tf.keras.mixed_precision.set_global_policy("float32")
def test_network_creation(self):
hidden_size = 32
sequence_length = 21
# Create a small BertEncoder for testing.
test_network = bert_encoder.BertEncoder(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network([word_ids, mask, type_ids])
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
self.assertIsInstance(test_network.transformer_layers, list)
self.assertLen(test_network.transformer_layers, 3)
self.assertIsInstance(test_network.pooler_layer, tf.keras.layers.Dense)
expected_data_shape = [None, sequence_length, hidden_size]
expected_pooled_shape = [None, hidden_size]
self.assertAllEqual(expected_data_shape, data.shape.as_list())
self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
# The default output dtype is float32.
self.assertAllEqual(tf.float32, data.dtype)
self.assertAllEqual(tf.float32, pooled.dtype)
def test_all_encoder_outputs_network_creation(self):
hidden_size = 32
sequence_length = 21
# Create a small BertEncoder for testing.
test_network = bert_encoder.BertEncoder(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network([word_ids, mask, type_ids])
all_encoder_outputs = dict_outputs["encoder_outputs"]
pooled = dict_outputs["pooled_output"]
expected_data_shape = [None, sequence_length, hidden_size]
expected_pooled_shape = [None, hidden_size]
self.assertLen(all_encoder_outputs, 3)
for data in all_encoder_outputs:
self.assertAllEqual(expected_data_shape, data.shape.as_list())
self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
# The default output dtype is float32.
self.assertAllEqual(tf.float32, all_encoder_outputs[-1].dtype)
self.assertAllEqual(tf.float32, pooled.dtype)
def test_network_creation_with_float16_dtype(self):
hidden_size = 32
sequence_length = 21
tf.keras.mixed_precision.set_global_policy("mixed_float16")
# Create a small BertEncoder for testing.
test_network = bert_encoder.BertEncoder(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network([word_ids, mask, type_ids])
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
expected_data_shape = [None, sequence_length, hidden_size]
expected_pooled_shape = [None, hidden_size]
self.assertAllEqual(expected_data_shape, data.shape.as_list())
self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
    # When the policy is mixed_float16, the sequence output is float32 (from
    # the final layer norm) and the pooled output should be float16.
self.assertAllEqual(tf.float32, data.dtype)
self.assertAllEqual(tf.float16, pooled.dtype)
@parameterized.named_parameters(
("all_sequence", None, 21),
("output_range", 1, 1),
)
def test_network_invocation(self, output_range, out_seq_len):
hidden_size = 32
sequence_length = 21
vocab_size = 57
num_types = 7
# Create a small BertEncoder for testing.
test_network = bert_encoder.BertEncoder(
vocab_size=vocab_size,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types,
output_range=output_range)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network([word_ids, mask, type_ids])
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
# Create a model based off of this network:
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
# Invoke the model. We can't validate the output data here (the model is too
# complex) but this will catch structural runtime errors.
batch_size = 3
word_id_data = np.random.randint(
vocab_size, size=(batch_size, sequence_length))
mask_data = np.random.randint(2, size=(batch_size, sequence_length))
type_id_data = np.random.randint(
num_types, size=(batch_size, sequence_length))
outputs = model.predict([word_id_data, mask_data, type_id_data])
self.assertEqual(outputs[0].shape[1], out_seq_len)
# Creates a BertEncoder with max_sequence_length != sequence_length
max_sequence_length = 128
test_network = bert_encoder.BertEncoder(
vocab_size=vocab_size,
hidden_size=hidden_size,
max_sequence_length=max_sequence_length,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types)
dict_outputs = test_network([word_ids, mask, type_ids])
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
outputs = model.predict([word_id_data, mask_data, type_id_data])
self.assertEqual(outputs[0].shape[1], sequence_length)
# Creates a BertEncoder with embedding_width != hidden_size
test_network = bert_encoder.BertEncoder(
vocab_size=vocab_size,
hidden_size=hidden_size,
max_sequence_length=max_sequence_length,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types,
embedding_width=16)
dict_outputs = test_network([word_ids, mask, type_ids])
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
outputs = model.predict([word_id_data, mask_data, type_id_data])
self.assertEqual(outputs[0].shape[-1], hidden_size)
self.assertTrue(hasattr(test_network, "_embedding_projection"))
def test_serialize_deserialize(self):
# Create a network object that sets all of its config options.
kwargs = dict(
vocab_size=100,
hidden_size=32,
num_layers=3,
num_attention_heads=2,
max_sequence_length=21,
type_vocab_size=12,
inner_dim=1223,
inner_activation="relu",
output_dropout=0.05,
attention_dropout=0.22,
initializer="glorot_uniform",
output_range=-1,
embedding_width=16,
embedding_layer=None)
network = bert_encoder.BertEncoder(**kwargs)
expected_config = dict(kwargs)
expected_config["inner_activation"] = tf.keras.activations.serialize(
tf.keras.activations.get(expected_config["inner_activation"]))
expected_config["initializer"] = tf.keras.initializers.serialize(
tf.keras.initializers.get(expected_config["initializer"]))
self.assertEqual(network.get_config(), expected_config)
# Create another network object from the first object's config.
new_network = bert_encoder.BertEncoder.from_config(network.get_config())
# Validate that the config can be forced to JSON.
_ = network.to_json()
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(network.get_config(), new_network.get_config())
# Tests model saving/loading.
model_path = self.get_temp_dir() + "/model"
network.save(model_path)
_ = tf.keras.models.load_model(model_path)
if __name__ == "__main__":
tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Keras-NLP layers package definition."""
from official.nlp.keras_nlp.layers.masked_lm import MaskedLM
from official.nlp.keras_nlp.layers.on_device_embedding import OnDeviceEmbedding
from official.nlp.keras_nlp.layers.position_embedding import PositionEmbedding
from official.nlp.keras_nlp.layers.self_attention_mask import SelfAttentionMask
from official.nlp.keras_nlp.layers.transformer_encoder_block import TransformerEncoderBlock
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Masked language model network."""
# pylint: disable=g-classes-have-attributes
import tensorflow as tf
@tf.keras.utils.register_keras_serializable(package='keras_nlp')
class MaskedLM(tf.keras.layers.Layer):
"""Masked language model network head for BERT modeling.
This layer implements a masked language model based on the provided
transformer based encoder. It assumes that the encoder network being passed
has a "get_embedding_table()" method.
Example:
```python
encoder=keras_nlp.BertEncoder(...)
lm_layer=MaskedLM(embedding_table=encoder.get_embedding_table())
```
Args:
embedding_table: The embedding table from encoder network.
activation: The activation, if any, for the dense layer.
initializer: The initializer for the dense layer. Defaults to a Glorot
uniform initializer.
output: The output style for this layer. Can be either 'logits' or
'predictions'.
"""
def __init__(self,
embedding_table,
activation=None,
initializer='glorot_uniform',
output='logits',
name=None,
**kwargs):
super(MaskedLM, self).__init__(name=name, **kwargs)
self.embedding_table = embedding_table
self.activation = activation
self.initializer = tf.keras.initializers.get(initializer)
if output not in ('predictions', 'logits'):
raise ValueError(
('Unknown `output` value "%s". `output` can be either "logits" or '
'"predictions"') % output)
self._output_type = output
def build(self, input_shape):
self._vocab_size, hidden_size = self.embedding_table.shape
self.dense = tf.keras.layers.Dense(
hidden_size,
activation=self.activation,
kernel_initializer=self.initializer,
name='transform/dense')
self.layer_norm = tf.keras.layers.LayerNormalization(
axis=-1, epsilon=1e-12, name='transform/LayerNorm')
self.bias = self.add_weight(
'output_bias/bias',
shape=(self._vocab_size,),
initializer='zeros',
trainable=True)
super(MaskedLM, self).build(input_shape)
def call(self, sequence_data, masked_positions):
masked_lm_input = self._gather_indexes(sequence_data, masked_positions)
lm_data = self.dense(masked_lm_input)
lm_data = self.layer_norm(lm_data)
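    # Project back to vocabulary logits by reusing the (tied) embedding table
    # as the output projection, then add a per-token output bias.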
lm_data = tf.matmul(lm_data, self.embedding_table, transpose_b=True)
logits = tf.nn.bias_add(lm_data, self.bias)
masked_positions_length = masked_positions.shape.as_list()[1] or tf.shape(
masked_positions)[1]
logits = tf.reshape(logits,
[-1, masked_positions_length, self._vocab_size])
if self._output_type == 'logits':
return logits
return tf.nn.log_softmax(logits)
def get_config(self):
raise NotImplementedError('MaskedLM cannot be directly serialized because '
'it has variable sharing logic.')
def _gather_indexes(self, sequence_tensor, positions):
"""Gathers the vectors at the specific positions, for performance.
Args:
sequence_tensor: Sequence output of shape
(`batch_size`, `seq_length`, num_hidden) where num_hidden is number of
hidden units.
positions: Positions ids of tokens in sequence to mask for pretraining
of with dimension (batch_size, num_predictions) where
`num_predictions` is maximum number of tokens to mask out and predict
per each sequence.
Returns:
Masked out sequence tensor of shape (batch_size * num_predictions,
num_hidden).
"""
sequence_shape = tf.shape(sequence_tensor)
batch_size, seq_length = sequence_shape[0], sequence_shape[1]
width = sequence_tensor.shape.as_list()[2] or sequence_shape[2]
flat_offsets = tf.reshape(
tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
flat_positions = tf.reshape(positions + flat_offsets, [-1])
flat_sequence_tensor = tf.reshape(sequence_tensor,
[batch_size * seq_length, width])
output_tensor = tf.gather(flat_sequence_tensor, flat_positions)
return output_tensor
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Keras-based one-hot embedding layer."""
# pylint: disable=g-classes-have-attributes
import tensorflow as tf
@tf.keras.utils.register_keras_serializable(package="keras_nlp")
class OnDeviceEmbedding(tf.keras.layers.Layer):
"""Performs an embedding lookup suitable for accelerator devices.
This layer uses either tf.gather or tf.one_hot to translate integer indices to
float embeddings.
Args:
vocab_size: Number of elements in the vocabulary.
embedding_width: Output size of the embedding layer.
initializer: The initializer to use for the embedding weights. Defaults to
"glorot_uniform".
    use_one_hot: Whether to use tf.one_hot over tf.gather for the embedding
      lookup. Defaults to False (that is, using tf.gather). Setting this
      option to True may improve performance, especially on small vocabulary
      sizes, but will generally require more memory.
    scale_factor: Whether to scale the output embeddings. Defaults to None
      (that is, no scaling). Setting this option to a float multiplies the
      output embeddings by scale_factor.
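  Example: a minimal usage sketch (shapes are illustrative):
  ```python
  layer = OnDeviceEmbedding(vocab_size=100, embedding_width=16)
  ids = tf.constant([[4, 10, 2]])  # (batch, sequence) integer ids.
  vectors = layer(ids)             # (1, 3, 16) float embeddings.
  ```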
"""
def __init__(self,
vocab_size,
embedding_width,
initializer="glorot_uniform",
use_one_hot=False,
scale_factor=None,
**kwargs):
super(OnDeviceEmbedding, self).__init__(**kwargs)
self._vocab_size = vocab_size
self._embedding_width = embedding_width
self._initializer = initializer
self._use_one_hot = use_one_hot
self._scale_factor = scale_factor
def get_config(self):
config = {
"vocab_size": self._vocab_size,
"embedding_width": self._embedding_width,
"initializer": self._initializer,
"use_one_hot": self._use_one_hot,
"scale_factor": self._scale_factor,
}
base_config = super(OnDeviceEmbedding, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def build(self, input_shape):
self.embeddings = self.add_weight(
"embeddings",
shape=[self._vocab_size, self._embedding_width],
initializer=self._initializer,
dtype=tf.float32)
super(OnDeviceEmbedding, self).build(input_shape)
def call(self, inputs):
flat_inputs = tf.reshape(inputs, [-1])
if self._use_one_hot:
dtype = self._compute_dtype
if not tf.dtypes.as_dtype(dtype).is_floating:
# TensorFlow 1 compatibility. In TF1, self._compute_dtype is int32
# instead of a floating-point dtype, as the dtype is inferred from the
        # dtype of the inputs.
dtype = tf.float32
one_hot_data = tf.one_hot(
flat_inputs, depth=self._vocab_size, dtype=dtype)
embeddings = tf.matmul(one_hot_data, self.embeddings)
else:
embeddings = tf.gather(self.embeddings, flat_inputs)
embeddings = tf.reshape(
embeddings,
# Work around b/142213824: prefer concat to shape over a Python list.
tf.concat([tf.shape(inputs), [self._embedding_width]], axis=0))
embeddings.set_shape(inputs.shape.as_list() + [self._embedding_width])
if self._scale_factor:
embeddings *= self._scale_factor
return embeddings
@property
def vocab_size(self):
return self._vocab_size
@property
def embedding_width(self):
return self._embedding_width
@@ -1,4 +1,4 @@
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,18 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
-"""Tests for Keras-based one-hot embedding layer."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+"""Tests for Keras-based one-hot embedding layer."""
 import numpy as np
 import tensorflow as tf
 from tensorflow.python.keras import keras_parameterized  # pylint: disable=g-direct-tensorflow-import
-from official.nlp.modeling.layers import on_device_embedding
+from official.nlp.keras_nlp.layers import on_device_embedding
 # This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
@@ -49,9 +45,9 @@ class OnDeviceEmbeddingTest(keras_parameterized.TestCase):
   def test_layer_creation_with_mixed_precision(self):
     vocab_size = 31
     embedding_width = 27
-    policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
     test_layer = on_device_embedding.OnDeviceEmbedding(
-        vocab_size=vocab_size, embedding_width=embedding_width, dtype=policy)
+        vocab_size=vocab_size, embedding_width=embedding_width,
+        dtype="mixed_float16")
     # Create a 2-dimensional input (the first dimension is implicit).
     sequence_length = 23
     input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
@@ -87,10 +83,9 @@ class OnDeviceEmbeddingTest(keras_parameterized.TestCase):
   def test_layer_invocation_with_mixed_precision(self):
     vocab_size = 31
     embedding_width = 27
-    policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
     test_layer = on_device_embedding.OnDeviceEmbedding(
         vocab_size=vocab_size, embedding_width=embedding_width,
-        dtype=policy)
+        dtype="mixed_float16")
     # Create a 2-dimensional input (the first dimension is implicit).
     sequence_length = 23
     input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
@@ -128,11 +123,10 @@ class OnDeviceEmbeddingTest(keras_parameterized.TestCase):
   def test_one_hot_layer_creation_with_mixed_precision(self):
     vocab_size = 31
     embedding_width = 27
-    policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
     test_layer = on_device_embedding.OnDeviceEmbedding(
         vocab_size=vocab_size,
         embedding_width=embedding_width,
-        dtype=policy,
+        dtype="mixed_float16",
         use_one_hot=True)
     # Create a 2-dimensional input (the first dimension is implicit).
     sequence_length = 23
@@ -171,11 +165,10 @@ class OnDeviceEmbeddingTest(keras_parameterized.TestCase):
   def test_one_hot_layer_invocation_with_mixed_precision(self):
     vocab_size = 31
     embedding_width = 27
-    policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
     test_layer = on_device_embedding.OnDeviceEmbedding(
         vocab_size=vocab_size,
         embedding_width=embedding_width,
-        dtype=policy,
+        dtype="mixed_float16",
         use_one_hot=True)
     # Create a 2-dimensional input (the first dimension is implicit).
     sequence_length = 23
@@ -193,6 +186,28 @@ class OnDeviceEmbeddingTest(keras_parameterized.TestCase):
     output = model.predict(input_data)
     self.assertEqual(tf.float16, output.dtype)
+  def test_use_scale_layer_invocation(self):
+    vocab_size = 31
+    embedding_width = 27
+    test_layer = on_device_embedding.OnDeviceEmbedding(
+        vocab_size=vocab_size, embedding_width=embedding_width,
+        scale_factor=embedding_width**0.5)
+    # Create a 2-dimensional input (the first dimension is implicit).
+    sequence_length = 23
+    input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
+    output_tensor = test_layer(input_tensor)
+    # Create a model from the test layer.
+    model = tf.keras.Model(input_tensor, output_tensor)
+    # Invoke the model on test data. We can't validate the output data itself
+    # (the NN is too complex) but this will rule out structural runtime errors.
+    batch_size = 3
+    input_data = np.random.randint(
+        vocab_size, size=(batch_size, sequence_length))
+    output = model.predict(input_data)
+    self.assertEqual(tf.float32, output.dtype)
 if __name__ == "__main__":
   tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Keras-based positional embedding layer."""
# pylint: disable=g-classes-have-attributes
import tensorflow as tf
@tf.keras.utils.register_keras_serializable(package="keras_nlp")
class PositionEmbedding(tf.keras.layers.Layer):
"""Creates a positional embedding.
Example:
```python
position_embedding = PositionEmbedding(max_length=100)
inputs = tf.keras.Input((100, 32), dtype=tf.float32)
outputs = position_embedding(inputs)
```
Args:
max_length: The maximum size of the dynamic sequence.
initializer: The initializer to use for the embedding weights. Defaults to
"glorot_uniform".
Reference: This layer creates a positional embedding as described in
[BERT: Pre-training of Deep Bidirectional Transformers for Language
Understanding](https://arxiv.org/abs/1810.04805).
"""
def __init__(self,
max_length,
initializer="glorot_uniform",
**kwargs):
super(PositionEmbedding, self).__init__(**kwargs)
if max_length is None:
      raise ValueError(
          "`max_length` must be an integer, not `None`."
      )
self._max_length = max_length
self._initializer = tf.keras.initializers.get(initializer)
def get_config(self):
config = {
"max_length": self._max_length,
"initializer": tf.keras.initializers.serialize(self._initializer),
}
base_config = super(PositionEmbedding, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def build(self, input_shape):
dimension_list = input_shape.as_list()
if len(dimension_list) != 3:
raise ValueError("PositionEmbedding expects a 3-dimensional input tensor "
"of shape [batch, sequence, width], got "
"{}".format(input_shape))
seq_length = dimension_list[1]
width = dimension_list[2]
if self._max_length is not None:
weight_sequence_length = self._max_length
else:
weight_sequence_length = seq_length
self._position_embeddings = self.add_weight(
"embeddings",
shape=[weight_sequence_length, width],
initializer=self._initializer)
super(PositionEmbedding, self).build(input_shape)
def call(self, inputs):
input_shape = tf.shape(inputs)
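    # Slice the stored (max_length, width) embedding table down to the actual
    # input length, then broadcast it across the batch dimension.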
position_embeddings = self._position_embeddings[:input_shape[1], :]
return tf.broadcast_to(position_embeddings, input_shape)