ModelZoo / ResNet50_tensorflow

Commit 2e9bb539, authored Feb 25, 2021 by stephenwu
Merge branch 'master' of https://github.com/tensorflow/models into RTESuperGLUE
Parents: 7bae5317, 8fba84f8
Showing 20 of 121 changed files on this page, with 349 additions and 41 deletions (+349, -41).
official/nlp/continuous_finetune_lib.py                   +5   -0
official/nlp/continuous_finetune_lib_test.py              +3   -0
official/nlp/data/classifier_data_lib.py                  +7   -5
official/nlp/data/create_finetuning_data.py               +4   -4
official/nlp/data/sentence_prediction_dataloader_test.py  +3   -0
official/nlp/data/wmt_dataloader.py                       +3   -3
official/nlp/docs/tfhub.md                                +292 -0
official/nlp/modeling/layers/attention.py                 +1   -1
official/nlp/modeling/layers/dense_einsum.py              +1   -1
official/nlp/modeling/layers/gated_feedforward.py         +4   -4
official/nlp/modeling/layers/masked_softmax.py            +1   -1
official/nlp/modeling/layers/relative_attention.py        +4   -3
official/nlp/modeling/layers/self_attention_mask.py       +1   -1
official/nlp/modeling/layers/talking_heads_attention.py   +1   -1
official/nlp/modeling/layers/text_layers.py               +15  -13
official/nlp/modeling/layers/transformer.py               +1   -1
official/nlp/modeling/models/bert_pretrainer.py           +1   -1
official/nlp/modeling/models/electra_pretrainer.py        +1   -1
official/nlp/modeling/models/seq2seq_transformer.py       +1   -0
official/nlp/modeling/networks/encoder_scaffold.py        +0   -1
official/nlp/continuous_finetune_lib.py

@@ -145,6 +145,11 @@ def run_continuous_finetune(
          min_interval_secs=10,
          timeout=params.trainer.continuous_eval_timeout,
          timeout_fn=timeout_fn):
+      # If there are checkpoints, they might be the finetune checkpoint of a
+      # different pretrained checkpoint. So we just remove all checkpoints.
+      train_utils.remove_ckpts(model_dir)
       with distribution_strategy.scope():
         global_step = train_utils.read_global_step_from_checkpoint(pretrain_ckpt)
       # Replaces params.task.init_checkpoint to make sure that we load
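The `train_utils.remove_ckpts` helper referenced by the added lines is part of the Model Garden's training utilities; its implementation is not shown in this diff. A minimal sketch of the idea, assuming checkpoints follow the usual `ckpt-*` naming plus a `checkpoint` state file (an illustration, not the actual helper):

```python
import os
import tensorflow as tf


def remove_ckpts(model_dir: str) -> None:
  """Deletes checkpoint artifacts under model_dir (illustrative sketch only)."""
  # Remove the checkpoint state file so tf.train.latest_checkpoint() finds nothing.
  state_file = os.path.join(model_dir, "checkpoint")
  if tf.io.gfile.exists(state_file):
    tf.io.gfile.remove(state_file)
  # Remove the checkpoint index/data shards themselves.
  for path in tf.io.gfile.glob(os.path.join(model_dir, "ckpt-*")):
    tf.io.gfile.remove(path)
```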
official/nlp/continuous_finetune_lib_test.py

@@ -90,6 +90,9 @@ class ContinuousFinetuneTest(tf.test.TestCase, parameterized.TestCase):
         pretrain_steps=pretrain_steps)
     self.assertIn('best_acc', eval_metrics)
+    self.assertFalse(
+        tf.io.gfile.exists(os.path.join(FLAGS.model_dir, 'checkpoint')))

 if __name__ == '__main__':
   tf.test.main()
official/nlp/data/classifier_data_lib.py

@@ -1307,7 +1307,7 @@ class AXgProcessor(DataProcessor):
     """Creates examples for the training/dev/test sets."""
     examples = []
     for line in lines:
-      guid = "%s-%s" % (set_type, self.process_text_fn(str(line['idx'])))
+      guid = "%s-%s" % (set_type, self.process_text_fn(str(line["idx"])))
       text_a = self.process_text_fn(line["premise"])
       text_b = self.process_text_fn(line["hypothesis"])
       label = self.process_text_fn(line["label"])

@@ -1315,7 +1315,8 @@ class AXgProcessor(DataProcessor):
           InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
     return examples

-class RTESuperGLUEProcessor(DataProcessor):
+class SuperGLUERTEProcessor(DataProcessor):
   """Processor for the RTE dataset (SuperGLUE version)."""

   def get_train_examples(self, data_dir):

@@ -1349,16 +1350,17 @@ class RTESuperGLUEProcessor(DataProcessor):
     examples = []
     for i, line in enumerate(lines):
       guid = "%s-%s" % (set_type, i)
-      text_a = self.process_text_fn(line['premise'])
-      text_b = self.process_text_fn(line['hypothesis'])
+      text_a = self.process_text_fn(line["premise"])
+      text_b = self.process_text_fn(line["hypothesis"])
       if set_type == "test":
         label = "entailment"
       else:
-        label = self.process_text_fn(line['label'])
+        label = self.process_text_fn(line["label"])
       examples.append(
           InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
     return examples

 def file_based_convert_examples_to_features(examples,
                                             label_list,
                                             max_seq_length,
official/nlp/data/create_finetuning_data.py

@@ -49,8 +49,8 @@ flags.DEFINE_string(
 flags.DEFINE_enum(
     "classification_task_name", "MNLI", [
         "AX", "COLA", "IMDB", "MNLI", "MRPC", "PAWS-X", "QNLI", "QQP", "RTE",
-        "SST-2", "STS-B", "WNLI", "XNLI", "XTREME-XNLI", "XTREME-PAWS-X", "AX-g",
-        "RTE-SuperGLUE"
+        "SST-2", "STS-B", "WNLI", "XNLI", "XTREME-XNLI", "XTREME-PAWS-X",
+        "AX-g", "SUPERGLUE-RTE"
     ], "The name of the task to train BERT classifier. The "
     "difference between XTREME-XNLI and XNLI is: 1. the format "
     "of input tsv files; 2. the dev set for XTREME is english "

@@ -242,8 +242,8 @@ def generate_classifier_dataset():
           only_use_en_dev=FLAGS.only_use_en_dev),
       "ax-g":
           classifier_data_lib.AXgProcessor,
-      "rte-superglue":
-          classifier_data_lib.RTESuperGLUEProcessor
+      "superglue-rte":
+          classifier_data_lib.SuperGLUERTEProcessor
   }
   task_name = FLAGS.classification_task_name.lower()
   if task_name not in processors:
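After this change, the SuperGLUE RTE task is selected with `--classification_task_name=SUPERGLUE-RTE`, which maps to the renamed `SuperGLUERTEProcessor`. A hedged sketch of using the processor directly (the `process_text_fn` argument, the `tokenization` import, and the data path are assumptions not shown in this diff):

```python
from official.nlp.bert import tokenization
from official.nlp.data import classifier_data_lib

# Hypothetical data_dir containing the SuperGLUE RTE train/val/test JSONL files.
processor = classifier_data_lib.SuperGLUERTEProcessor(
    process_text_fn=tokenization.convert_to_unicode)
train_examples = processor.get_train_examples("/path/to/superglue/RTE")
print(len(train_examples), train_examples[0].text_a)
```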
official/nlp/data/sentence_prediction_dataloader_test.py

@@ -164,6 +164,7 @@ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
         input_path='' if use_tfds else tf_record_path,
         tfds_name='glue/mrpc' if use_tfds else '',
         tfds_split='train' if use_tfds else '',
+        tfds_download=True,
         text_fields=text_fields,
         global_batch_size=batch_size,
         seq_length=seq_length,

@@ -195,6 +196,7 @@ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
         input_path='' if use_tfds else tf_record_path,
         tfds_name='glue/mrpc' if use_tfds else '',
         tfds_split='train' if use_tfds else '',
+        tfds_download=True,
         text_fields=text_fields,
         global_batch_size=batch_size,
         seq_length=seq_length,

@@ -228,6 +230,7 @@ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
         input_path='' if use_tfds else tf_record_path,
         tfds_name='glue/mrpc' if use_tfds else '',
         tfds_split='train' if use_tfds else '',
+        tfds_download=True,
         text_fields=text_fields,
         global_batch_size=batch_size,
         seq_length=seq_length,
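The only change in these three test cases is the new `tfds_download=True` argument, which asks the data loader to fetch the TFDS dataset if it is not already prepared on disk. In plain TFDS terms the effect is roughly the following (a sketch, not the loader's actual code):

```python
import tensorflow_datasets as tfds

# With download=True, TFDS downloads and prepares 'glue/mrpc' on first use
# instead of failing when the prepared dataset is missing.
ds = tfds.load("glue/mrpc", split="train", download=True)
for example in ds.take(1):
  print(example["sentence1"], example["label"])
```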
official/nlp/data/wmt_dataloader.py

@@ -14,7 +14,7 @@
 # ==============================================================================
 """Input pipeline for the transformer model to read, filter, and batch examples.
-1. Batching scheme
+Batching scheme
    Prior to batching, elements in the dataset are grouped by length (max between
    'inputs' and 'targets' length). Each group is then batched such that:

@@ -60,8 +60,8 @@ def _create_min_max_boundaries(max_length,
   For example, when max_length=24, min_boundary=4 and boundary_scale=2, the
   returned values will be:
-    buckets_min = [0, 4, 8, 16, 24]
-    buckets_max = [4, 8, 16, 24, 25]
+    buckets_min = [0, 4, 8, 16]
+    buckets_max = [4, 8, 16, 25]
   Args:
     max_length: The maximum length of example in dataset.
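The corrected docstring values follow from the usual doubling scheme: boundaries grow from `min_boundary` by `boundary_scale` while they stay below `max_length`, and the last bucket is capped at `max_length + 1`. A small sketch of that computation (consistent with the docstring example, not necessarily the function's exact code):

```python
def create_min_max_boundaries(max_length, min_boundary=4, boundary_scale=2.0):
  """Returns bucket bounds so bucket i holds lengths in [buckets_min[i], buckets_max[i])."""
  boundaries = []
  x = min_boundary
  while x < max_length:
    boundaries.append(x)
    x = max(x + 1, int(x * boundary_scale))
  buckets_min = [0] + boundaries
  buckets_max = boundaries + [max_length + 1]
  return buckets_min, buckets_max


# Matches the corrected docstring example:
assert create_min_max_boundaries(24) == ([0, 4, 8, 16], [4, 8, 16, 25])
```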
official/nlp/docs/tfhub.md (new file, mode 100644)

# Exporting a pre-trained Encoder to TF Hub

## Overview

This doc explains how to use TF-NLP's
[export_tfhub](https://github.com/tensorflow/models/blob/master/official/nlp/tools/export_tfhub.py)
tool to export pre-trained Transformer encoders to SavedModels suitable for
publication on TF Hub. (For the steps after that, see TF Hub's
[publisher guide](https://www.tensorflow.org/hub/publish).)
For testing purposes, those SavedModels can also be used from their export
locations on the filesystem.

On TF Hub, Transformer encoders for text come as a pair of SavedModels:

*   The preprocessing model applies a tokenizer with a fixed vocab plus some
    additional logic to turn text into Transformer inputs.
*   The encoder model (or "model" for short) applies the pre-trained Transformer
    encoder.

TF Hub defines
[Common APIs](https://www.tensorflow.org/hub/common_saved_model_apis/text#transformer-encoders)
for all SavedModels of those two respective types, encapsulating the particular
choice of preprocessing logic and Encoder architecture.

## Exporting the Encoder

There is a choice between exporting just the encoder, or the encoder plus the
prediction head for the masked language model (MLM) task from pre-training.
Exporting just the encoder suffices for many straightforward applications.

### Exporting the Encoder alone

To export an encoder-only model, you can set `--export_type=model` and run the
tool like this:

```shell
python official/nlp/tools/export_tfhub.py \
  --encoder_config_file=${BERT_DIR:?}/bert_encoder.yaml \
  --model_checkpoint_path=${BERT_DIR:?}/bert_model.ckpt \
  --vocab_file=${BERT_DIR:?}/vocab.txt \
  --export_type=model \
  --export_path=/tmp/bert_model
```

The flag `--encoder_config_file` refers to a YAML file representing the
[encoders.EncoderConfig](https://github.com/tensorflow/models/search?q=EncoderConfig+path%3Aofficial%2Fnlp%2Fconfigs+filename%3Aencoders.py)
dataclass, which supports multiple encoders (e.g., BERT, ALBERT). Instead of
`--encoder_config_file`, you can set `--bert_config_file` to a legacy
`bert_config.json` file to export a BERT model. If the model definition involves
[GIN](https://github.com/google/gin-config), the flags `--gin_file` and
`--gin_params` must be set accordingly, consistent with pre-training.

The `--model_checkpoint_path` refers to an object-based (TF2) checkpoint written
by
[BertPretrainerV2](https://github.com/tensorflow/models/search?q=BertPretrainerV2+filename%3Abert_pretrainer.py),
or any other checkpoint that can be restored to
`tf.train.Checkpoint(encoder=encoder)` for the encoder defined by the config
flags. Legacy checkpoints with `model=` instead of `encoder=` are also supported
for now.

The exported SavedModel expects dict inputs and outputs as follows, implementing
a specialization of the respective
[Common SavedModel API](https://www.tensorflow.org/hub/common_saved_model_apis/text#transformer-encoders):

```python
encoder = hub.load(...)
encoder_inputs = dict(
    input_word_ids=...,  # Shape [batch, seq_length], dtype=int32
    input_mask=...,      # Shape [batch, seq_length], dtype=int32
    input_type_ids=...,  # Shape [batch, seq_length], dtype=int32
)
encoder_outputs = encoder(encoder_inputs)
assert encoder_outputs.keys() == {
    "pooled_output",    # Shape [batch_size, width], dtype=float32
    "default",          # Alias for "pooled_output" (aligns with other models)
    "sequence_output",  # Shape [batch_size, seq_length, width], dtype=float32
    "encoder_outputs",  # List of Tensors with outputs of all transformer layers
}
```
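For instance, a downstream user could load the exported encoder with `hub.KerasLayer` and put a small classification head on `pooled_output`. The sketch below is illustrative only; the two-class head, the input shapes, and the export path are assumptions, not part of the export tool:

```python
import tensorflow as tf
import tensorflow_hub as hub

seq_length = 128
encoder_inputs = dict(
    input_word_ids=tf.keras.Input([seq_length], dtype=tf.int32),
    input_mask=tf.keras.Input([seq_length], dtype=tf.int32),
    input_type_ids=tf.keras.Input([seq_length], dtype=tf.int32),
)
encoder = hub.KerasLayer("/tmp/bert_model", trainable=True)
outputs = encoder(encoder_inputs)
# Assumed downstream head: a 2-class classifier on the pooled [CLS] output.
logits = tf.keras.layers.Dense(2)(outputs["pooled_output"])
model = tf.keras.Model(inputs=encoder_inputs, outputs=logits)
```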
The encoder's pooler layer is restored from the `--model_checkpoint_path`.
However, unlike classic BERT, `BertPretrainerV2` does not train the pooler layer
of the encoder. You have three options to handle that:

*   Set flag `--copy_pooler_dense_to_encoder` to copy the pooling layer from the
    `ClassificationHead` passed to `BertPretrainerV2` for the next sentence
    prediction task. This mimics classic BERT, but is not recommended for new
    models (see next item).
*   Leave flag `--copy_pooler_dense_to_encoder` unset and export the untrained,
    randomly initialized pooling layer of the encoder. Folklore (as of 2020) has
    it that an untrained pooler gets fine-tuned better than a pre-trained
    pooler, so this is the default.
*   Leave flag `--copy_pooler_dense_to_encoder` unset and perform your own
    initialization of the pooling layer before export. For example, Google's
    [BERT Experts](https://tfhub.dev/google/collections/experts/bert/1)
    published in October 2020 initialize it to the identity map, reporting equal
    gains if fine-tuning, and more predictable behavior if not.

In any case, at this time, the export tool requires the encoder model to *have*
a `pooled_output`, whether trained or not. (This can be revised in the future.)

The encoder model does not include any preprocessing logic, but for the benefit
of users who take preprocessing into their own hands, the relevant information
is attached from flags `--vocab_file` or `--sp_model_file`, resp., and
`--do_lower_case`, which need to be set in exactly the same way as for the
preprocessing model (see below). The exported SavedModel stores the resulting
values as attributes on its root object:

```python
encoder = hub.load(...)
# Gets the filename of the respective tf.saved_model.Asset object.
if hasattr(encoder, "vocab_file"):
  print("Wordpiece vocab at", encoder.vocab_file.asset_path.numpy())
elif hasattr(encoder, "sp_model_file"):
  print("SentencePiece model at", encoder.sp_model_file.asset_path.numpy())
# Gets the value of a scalar bool tf.Variable.
print("...using do_lower_case =", encoder.do_lower_case.numpy())
```

New users are encouraged to ignore these attributes and use the preprocessing
model instead. However, there are legacy users, and advanced users that require
access to the full vocab.

### Exporting the Encoder with a Masked Language Model head

To export an encoder and the masked language model it was trained with, first
read the preceding section about exporting just the encoder. All the
explanations there on setting the right flags apply here as well, up to the
following differences.

The masked language model is added to the export by changing flag
`--export_type` from `model` to `model_with_mlm`, so the export command looks
like this:

```shell
python official/nlp/tools/export_tfhub.py \
  --encoder_config_file=${BERT_DIR:?}/bert_encoder.yaml \
  --model_checkpoint_path=${BERT_DIR:?}/bert_model.ckpt \
  --vocab_file=${BERT_DIR:?}/vocab.txt \
  --export_type=model_with_mlm \
  --export_path=/tmp/bert_model
```

The `--model_checkpoint_path` refers to an object-based (TF2) checkpoint written
by
[BertPretrainerV2](https://github.com/tensorflow/models/search?q=BertPretrainerV2+filename%3Abert_pretrainer.py),
or any other checkpoint that can be restored to
`tf.train.Checkpoint(**BertPretrainerV2(...).checkpoint_items)` with the encoder
defined by the config flags.
This is a more comprehensive requirement on the checkpoint than for
`--export_type=model`; not all Transformer encoders and not all pre-training
techniques can satisfy it. For example,
[ELECTRA](https://arxiv.org/abs/2003.10555) uses the BERT architecture but is
pre-trained without an MLM task.

The root object of the exported SavedModel is called in the same way as above.
In addition, the SavedModel has an `mlm` subobject that can be called as follows
to output an `mlm_logits` tensor as well:

```python
mlm_inputs = dict(
    input_word_ids=...,       # Shape [batch, seq_length], dtype=int32
    input_mask=...,           # Shape [batch, seq_length], dtype=int32
    input_type_ids=...,       # Shape [batch, seq_length], dtype=int32
    masked_lm_positions=...,  # Shape [batch, num_predictions], dtype=int32
)
mlm_outputs = encoder.mlm(mlm_inputs)
assert mlm_outputs.keys() == {
    "pooled_output",    # Shape [batch, width], dtype=float32
    "sequence_output",  # Shape [batch, seq_length, width], dtype=float32
    "encoder_outputs",  # List of Tensors with outputs of all transformer layers
    "mlm_logits"        # Shape [batch, num_predictions, vocab_size], dtype=float32
}
```

The extra subobject imposes a moderate size overhead.

### Exporting from a TF1 BERT checkpoint

A BERT model trained with the
[original BERT implementation for TF1](https://github.com/google-research/bert)
can be exported after converting its checkpoint with the
[tf2_encoder_checkpoint_converter](https://github.com/tensorflow/models/blob/master/official/nlp/bert/tf2_encoder_checkpoint_converter.py)
tool.
After that, run
[export_tfhub](https://github.com/tensorflow/models/blob/master/official/nlp/tools/export_tfhub.py)
per the instructions above on the converted checkpoint. Do not set
`--copy_pooler_dense_to_encoder`, because the pooler layer is part of the
converted encoder. For `--vocab_file` and `--do_lower_case`, the values from TF1
BERT can be used verbatim.

## Exporting the preprocessing model

You can skip this step if TF Hub already has a preprocessing model that does
exactly what your encoder needs (same tokenizer, same vocab, same normalization
settings (`do_lower_case`)). You can inspect its collection of
[Transformer Encoders for Text](https://tfhub.dev/google/collections/transformer_encoders_text/1)
and click through to models with a similar input domain to find their
preprocessing models.

To export the preprocessing model, set `--export_type=preprocessing` and run the
export tool like this:

```shell
python official/nlp/tools/export_tfhub.py \
  --vocab_file=${BERT_DIR:?}/vocab.txt \
  --do_lower_case=True \
  --export_type=preprocessing \
  --export_path=/tmp/bert_preprocessing
```

Note: Set flag `--experimental_disable_assert_in_preprocessing` when exporting
for users of the public TensorFlow releases 2.4.x, to avoid a fatal ops placement
issue when preprocessing is used within Dataset.map() on TPU workers.
This is not an issue with TF2.3 and TF2.5+.

Flag `--vocab_file` specifies the vocab file used with
[BertTokenizer](https://github.com/tensorflow/models/search?q=BertTokenizer+filename%3Atext_layers.py).
For models that use the
[SentencepieceTokenizer](https://github.com/tensorflow/models/search?q=SentencepieceTokenizer+filename%3Atext_layers.py),
set flag `--sp_model_file` instead.

The boolean flag `--do_lower_case` controls text normalization (as in the
respective tokenizer classes, so it's a bit more than just smashing case). If
unset, `do_lower_case` is enabled if 'uncased' appears in `--vocab_file`, or
unconditionally if `--sp_model_file` is set, mimicking the conventions of BERT
and ALBERT, respectively. For programmatic use, or if in doubt, it's best to set
`--do_lower_case` explicitly.

If the definition of preprocessing involved
[GIN](https://github.com/google/gin-config), the flags `--gin_file` and
`--gin_params` would have to be set accordingly, consistent with pre-training.
(At the time of this writing, no such GIN configurables exist in the code.)

The exported SavedModel can be called in the following way for a single-segment
input:

```python
preprocessor = hub.load(...)
text_input = ...  # Shape [batch_size], dtype=tf.string
encoder_inputs = preprocessor(text_input, seq_length=seq_length)
assert encoder_inputs.keys() == {
    "input_word_ids",  # Shape [batch_size, seq_length], dtype=int32
    "input_mask",      # Shape [batch_size, seq_length], dtype=int32
    "input_type_ids"   # Shape [batch_size, seq_length], dtype=int32
}
```

Flag `--default_seq_length` controls the value of `seq_length` if that argument
is omitted in the usage example above. The flag defaults to 128, because
multiples of 128 work best for Cloud TPUs, yet the cost of attention computation
grows quadratically with `seq_length`.

Beyond this example, the exported SavedModel implements the full set of
interfaces from the preprocessor API for text embeddings with preprocessed
inputs and with Transformer encoders from TF Hub's
[Common APIs for text](https://www.tensorflow.org/hub/common_saved_model_apis/text).
Please see
[tfhub.dev/tensorflow/bert_en_uncased_preprocess](https://tfhub.dev/tensorflow/bert_en_uncased_preprocess)
for the full documentation of one preprocessing model exported with this tool,
especially how custom trimming of inputs can happen between `.tokenize` and
`.bert_pack_inputs`.
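For two-segment inputs (e.g., sentence pairs), the pattern documented for such preprocessing models is to call `.tokenize` per segment and then `.bert_pack_inputs` on the list of tokenized segments; roughly like this (a hedged sketch following that documentation, with placeholder paths and example sentences):

```python
preprocessor = hub.load("/tmp/bert_preprocessing")
tokenize = hub.KerasLayer(preprocessor.tokenize)
bert_pack_inputs = hub.KerasLayer(
    preprocessor.bert_pack_inputs, arguments=dict(seq_length=128))

premises = tf.constant(["The cat sat on the mat."])
hypotheses = tf.constant(["There is a cat on the mat."])
# Custom trimming of the tokenized segments could happen here, before packing.
encoder_inputs = bert_pack_inputs([tokenize(premises), tokenize(hypotheses)])
```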
Using the `encoder.mlm()` interface requires masking of tokenized inputs by user
code. The necessary information on the vocabulary encapsulated in the
preprocessing model can be obtained like this (uniformly across tokenizers):

```python
special_tokens_dict = preprocess.tokenize.get_special_tokens_dict()
vocab_size = int(special_tokens_dict["vocab_size"])
padding_id = int(special_tokens_dict["padding_id"])  # [PAD] or <pad>
start_of_sequence_id = int(special_tokens_dict["start_of_sequence_id"])  # [CLS]
end_of_segment_id = int(special_tokens_dict["end_of_segment_id"])  # [SEP]
mask_id = int(special_tokens_dict["mask_id"])  # [MASK]
```
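As a small illustration of that masking step, reusing the preprocessing model and the `mask_id` obtained above (an assumption-laden sketch that masks one fixed position; a real pipeline would sample positions at random):

```python
packed = preprocessor(tf.constant(["The capital of France is Paris."]),
                      seq_length=128)
input_word_ids = packed["input_word_ids"]

# Mask position 5 of example 0 and ask the MLM head to predict it.
masked_lm_positions = tf.constant([[5]], dtype=tf.int32)
input_word_ids = tf.tensor_scatter_nd_update(
    input_word_ids, indices=[[0, 5]], updates=[mask_id])

mlm_outputs = encoder.mlm(dict(
    input_word_ids=input_word_ids,
    input_mask=packed["input_mask"],
    input_type_ids=packed["input_type_ids"],
    masked_lm_positions=masked_lm_positions))
predicted_ids = tf.argmax(mlm_outputs["mlm_logits"], axis=-1)  # [1, 1] token id
```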
## Testing the exported models

Please test your SavedModels before publication by fine-tuning them on a
suitable task and comparing performance and accuracy to a baseline experiment
built from equivalent Python code.
The [trainer doc](https://github.com/tensorflow/models/blob/master/official/nlp/docs/train.md)
has instructions on how to run BERT on MNLI and other tasks from the GLUE
benchmark.
official/nlp/modeling/layers/attention.py

@@ -27,7 +27,7 @@ MultiHeadAttention = tf.keras.layers.MultiHeadAttention
 class CachedAttention(tf.keras.layers.MultiHeadAttention):
   """Attention layer with cache used for auto-agressive decoding.

-  Arguments are the same as `MultiHeadAttention` layer.
+  Arguments are the same as `tf.keras.layers.MultiHeadAttention` layer.
   """

   def _update_cache(self, key, value, cache, decode_loop_step):
official/nlp/modeling/layers/dense_einsum.py

@@ -24,7 +24,7 @@ _CHR_IDX = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m"]
 @tf.keras.utils.register_keras_serializable(package="Text")
 class DenseEinsum(tf.keras.layers.Layer):
-  """A densely connected layer that uses tf.einsum as the backing computation.
+  """A densely connected layer that uses `tf.einsum` as the backing computation.

   This layer can perform einsum calculations of arbitrary dimensionality.
official/nlp/modeling/layers/gated_feedforward.py

@@ -33,8 +33,8 @@ class GatedFeedforward(tf.keras.layers.Layer):
     intermediate_activation: Activation for the intermediate layer.
     dropout: Dropout probability for the output dropout.
     use_gate: Whether to use gated linear units. If True, assuming `GELU` as the
-      activation and omitting bias, will apply `GEGLU(x, W, V, W_2) = (GEGLU(xW)
-      * xV)W2`; if False, will follow
+      activation and omitting bias, will apply
+      `GEGLU(x, W, V, W_2) = (GEGLU(xW) * xV)W2`; if False, will follow
       "Attention Is All You Need" (https://arxiv.org/abs/1706.03762) paper and
       apply `FFN(x, W, W_2) = GELU(xW_1)W_2.`
     num_blocks: The number of feedforward blocks to stack. Each block contains a

@@ -43,8 +43,8 @@ class GatedFeedforward(tf.keras.layers.Layer):
     dropout_position: Where to apply the dropout, the value can be either
       `before_residual` or `after_residual`. If `before_residual`, will apply
       `layer_output = layer_norm(dropout(layer_output) + layer_input)`; if
-      `after residual`, will apply `layer_output =
-      dropout(layer_norm(layer_output + layer_input))`.
+      `after residual`, will apply
+      `layer_output = dropout(layer_norm(layer_output + layer_input))`.
     kernel_initializer: Initializer for dense layer kernels.
     bias_initializer: Initializer for dense layer biases.
     kernel_regularizer: Regularizer for dense layer kernels.
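In code, the gated variant described by that docstring amounts to something like the following sketch of the formula only (ignoring bias, dropout, and layer norm; following the GLU-variants convention, the inner activation is GELU):

```python
import tensorflow as tf


def geglu_ffn(x, w, v, w_2):
  """(GELU(x @ W) * (x @ V)) @ W_2: the gated feedforward with a GELU gate."""
  return tf.matmul(tf.nn.gelu(tf.matmul(x, w)) * tf.matmul(x, v), w_2)


def plain_ffn(x, w_1, w_2):
  """GELU(x @ W_1) @ W_2: the ungated "Attention Is All You Need"-style FFN."""
  return tf.matmul(tf.nn.gelu(tf.matmul(x, w_1)), w_2)
```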
official/nlp/modeling/layers/masked_softmax.py

@@ -22,7 +22,7 @@ def _large_compatible_negative(tensor_type):
   """Large negative number as Tensor.

   This function is necessary because the standard value for epsilon
-  in this module (-1e9) cannot be represented using tf.float16
+  in this module (-1e9) cannot be represented using `tf.float16`.

   Args:
     tensor_type: a dtype to determine the type.
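The underlying issue is that float16 cannot represent -1e9 (its largest finite magnitude is about 65504), so a helper of this kind typically falls back to the dtype's own minimum. A sketch of the idea (an illustration, not necessarily this module's exact code):

```python
import tensorflow as tf


def large_compatible_negative(tensor_type):
  """Returns a very negative value that is still finite in `tensor_type`."""
  if tensor_type == tf.float16:
    return tf.float16.min  # About -65504, the most negative finite float16.
  return -1e9
```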
official/nlp/modeling/layers/relative_attention.py

@@ -75,7 +75,7 @@ class MultiHeadRelativeAttention(tf.keras.layers.MultiHeadAttention):
   """A multi-head attention layer with relative attention + position encoding.

   This layer shares the same input/output projections as the common
-  MultiHeadAttention layer.
+  `tf.keras.layers.MultiHeadAttention` layer.

   When it calculates attention logits, position encoding is projected to form
   relative keys. The logits are composed by shifted relative logits and content

@@ -333,8 +333,9 @@ class TwoStreamRelativeAttention(MultiHeadRelativeAttention):
   The query stream only has access to contextual information and the position,
   but not the content.

-  This layer shares the same build signature as `MultiHeadRelativeAttention` but
-  has different input/output projections.
+  This layer shares the same build signature as
+  `tf.keras.layers.MultiHeadAttention` but has different input/output
+  projections.

   **Note: This layer is currently experimental.
official/nlp/modeling/layers/self_attention_mask.py

@@ -23,7 +23,7 @@ from official.nlp.keras_nlp import layers
 class SelfAttentionMask(layers.SelfAttentionMask):
   """Create 3D attention mask from a 2D tensor mask.

-  **Warning: Please use the keras_nlp.layers.SelfAttentionMask.**
+  **Warning: Please use the `keras_nlp.layers.SelfAttentionMask`.**

     inputs[0]: from_tensor: 2D or 3D Tensor of shape
       [batch_size, from_seq_length, ...].
     inputs[1]: to_mask: int32 Tensor of shape [batch_size, to_seq_length].
official/nlp/modeling/layers/talking_heads_attention.py

@@ -33,7 +33,7 @@ class TalkingHeadsAttention(tf.keras.layers.MultiHeadAttention):
   multi-head attention by including linearprojections across the attention-heads
   dimension, immediately before and after the softmax operation.

-  See the base class `MultiHeadAttention` for more details.
+  See the base class `tf.keras.layers.MultiHeadAttention` for more details.

   Args:
     num_heads: Number of attention heads.
official/nlp/modeling/layers/text_layers.py

@@ -97,8 +97,9 @@ class BertTokenizer(tf.keras.layers.Layer):
   """Wraps BertTokenizer with pre-defined vocab as a Keras Layer.

   Attributes:
-    tokenize_with_offsets: If true, calls BertTokenizer.tokenize_with_offsets()
-      instead of plain .tokenize() and outputs a triple of
+    tokenize_with_offsets: If true, calls
+      `text.BertTokenizer.tokenize_with_offsets()` instead of plain
+      `text.BertTokenizer.tokenize()` and outputs a triple of
       (tokens, start_offsets, limit_offsets).
     raw_table_access: An object with methods .lookup(keys) and .size()
       that operate on the raw lookup table of tokens. It can be used to

@@ -110,25 +111,26 @@ class BertTokenizer(tf.keras.layers.Layer):
                lower_case: bool,
                tokenize_with_offsets: bool = False,
                **kwargs):
-    """Initialize a BertTokenizer layer.
+    """Initialize a `BertTokenizer` layer.

     Args:
       vocab_file: A Python string with the path of the vocabulary file.
         This is a text file with newline-separated wordpiece tokens.
        This layer initializes a lookup table from it that gets used with
-        text.BertTokenizer.
-      lower_case: A Python boolean forwarded to text.BertTokenizer.
+        `text.BertTokenizer`.
+      lower_case: A Python boolean forwarded to `text.BertTokenizer`.
         If true, input text is converted to lower case (where applicable)
         before tokenization. This must be set to match the way in which
         the vocab_file was created.
       tokenize_with_offsets: A Python boolean. If true, this layer calls
-        BertTokenizer.tokenize_with_offsets() instead of plain .tokenize()
-        and outputs a triple of (tokens, start_offsets, limit_offsets)
+        `text.BertTokenizer.tokenize_with_offsets()` instead of plain
+        `text.BertTokenizer.tokenize()` and outputs a triple of
+        (tokens, start_offsets, limit_offsets)
         insead of just tokens.
       **kwargs: standard arguments to Layer().

     Raises:
-      ImportError: if importing tensorflow_text failed.
+      ImportError: if importing `tensorflow_text` failed.
     """
     _check_if_tf_text_installed()

@@ -162,18 +164,18 @@ class BertTokenizer(tf.keras.layers.Layer):
     return vocab_table, vocab_initializer

   def call(self, inputs: tf.Tensor):
-    """Calls text.BertTokenizer on inputs.
+    """Calls `text.BertTokenizer` on inputs.

     Args:
       inputs: A string Tensor of shape [batch_size].

     Returns:
-      One or three of RaggedTensors if tokenize_with_offsets is False or True,
-      respectively. These are
-      tokens: A RaggedTensor of shape [batch_size, (words), (pieces_per_word)]
+      One or three of `RaggedTensors` if `tokenize_with_offsets` is False or
+      True, respectively. These are
+      tokens: A `RaggedTensor` of shape [batch_size, (words), (pieces_per_word)]
         and type int32. tokens[i,j,k] contains the k-th wordpiece of the
         j-th word in the i-th input.
-      start_offsets, limit_offsets: If tokenize_with_offsets is True,
+      start_offsets, limit_offsets: If `tokenize_with_offsets` is True,
         RaggedTensors of type int64 with the same indices as tokens.
         Element [i,j,k] contains the byte offset at the start, or past the
         end, resp., for the k-th wordpiece of the j-th word in the i-th input.
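Putting the documented arguments together, the layer is used roughly as follows (a sketch based on the docstrings above; the vocab path and example strings are placeholders):

```python
import tensorflow as tf
from official.nlp.modeling.layers import text_layers

tokenizer = text_layers.BertTokenizer(
    vocab_file="/path/to/vocab.txt",  # newline-separated wordpiece tokens
    lower_case=True,
    tokenize_with_offsets=True)

tokens, start_offsets, limit_offsets = tokenizer(
    tf.constant(["hello world", "tensorflow text"]))
# tokens[i, j, k] is the k-th wordpiece of the j-th word of the i-th input.
```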
official/nlp/modeling/layers/transformer.py

@@ -202,7 +202,7 @@ class TransformerDecoderBlock(tf.keras.layers.Layer):
       raise ValueError(
           "The hidden size (%d) is not a multiple of the number of attention "
           "heads (%d)" % (hidden_size, self.num_attention_heads))
-    self.attention_head_size = int(hidden_size / self.num_attention_heads)
+    self.attention_head_size = int(hidden_size) // self.num_attention_heads
     common_kwargs = dict(
         bias_initializer=self._bias_initializer,
         kernel_regularizer=self._kernel_regularizer,
official/nlp/modeling/models/bert_pretrainer.py

@@ -30,7 +30,7 @@ from official.nlp.modeling import networks
 class BertPretrainer(tf.keras.Model):
   """BERT pretraining model.

-  [Note] Please use the new BertPretrainerV2 for your projects.
+  [Note] Please use the new `BertPretrainerV2` for your projects.

   The BertPretrainer allows a user to pass in a transformer stack, and
   instantiates the masked language model and classification networks that are
official/nlp/modeling/models/electra_pretrainer.py

@@ -37,7 +37,7 @@ class ElectraPretrainer(tf.keras.Model):
   that are used to create the training objectives.

   *Note* that the model is constructed by Keras Subclass API, where layers are
-  defined inside __init__ and call() implements the computation.
+  defined inside `__init__` and `call()` implements the computation.

   Args:
     generator_network: A transformer network for generator, this network should
official/nlp/modeling/models/seq2seq_transformer.py

@@ -591,5 +591,6 @@ class TransformerDecoder(tf.keras.layers.Layer):
 def attention_initializer(hidden_size):
   """Initializer for attention layers in Seq2SeqTransformer."""
+  hidden_size = int(hidden_size)
   limit = math.sqrt(6.0 / (hidden_size + hidden_size))
   return tf.keras.initializers.RandomUniform(minval=-limit, maxval=limit)
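The added cast presumably guards against `hidden_size` arriving as a non-integer (e.g., a float from a config); the limit itself is the Glorot-style bound sqrt(6 / (fan_in + fan_out)) with both fans equal to `hidden_size`. A quick worked check with an assumed hidden size:

```python
import math

hidden_size = int(512.0)  # The cast mirrors the added line.
limit = math.sqrt(6.0 / (hidden_size + hidden_size))
print(round(limit, 4))    # 0.0765 for hidden_size=512
```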
official/nlp/modeling/networks/encoder_scaffold.py

@@ -243,7 +243,6 @@ class EncoderScaffold(tf.keras.Model):
     self._position_embedding_layer = position_embedding_layer
     self._type_embedding_layer = type_embedding_layer
     self._embedding_norm_layer = embedding_norm_layer
-    self._embedding_network = embedding_network
     self._hidden_layers = hidden_layers
     if self._layer_norm_before_pooling:
       self._output_layer_norm = output_layer_norm
...
Prev
1
2
3
4
5
6
7
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment