ModelZoo / ResNet50_tensorflow · Commits

Unverified commit 5ffcc5b6, authored Jul 21, 2021 by Anirudh Vegesana, committed by GitHub on Jul 21, 2021

Merge branch 'purdue-yolo' into detection_generator_pr

Parents: 0b81a843, 76e0c014
Showing 20 changed files with 601 additions and 662 deletions.
official/modeling/optimization/optimizer_factory.py         +7   -5
official/modeling/performance.py                            +11  -41
official/modeling/tf_utils.py                               +1   -0
official/nlp/configs/encoders.py                            +12  -1
official/nlp/data/classifier_data_lib.py                    +345 -327
official/nlp/data/classifier_data_lib_test.py               +95  -0
official/nlp/data/create_finetuning_data.py                 +23  -3
official/nlp/data/sentence_prediction_dataloader.py         +28  -19
official/nlp/data/sentence_prediction_dataloader_test.py    +18  -9
official/nlp/keras_nlp/encoders/bert_encoder.py             +6   -0
official/nlp/keras_nlp/encoders/bert_encoder_test.py        +2   -1
official/nlp/keras_nlp/layers/position_embedding_test.py    +2   -2
official/nlp/keras_nlp/layers/transformer_encoder_block.py  +1   -1
official/nlp/modeling/layers/kernel_attention.py            +36  -36
official/nlp/modeling/layers/kernel_attention_test.py       +1   -1
official/nlp/modeling/layers/text_layers.py                 +2   -117
official/nlp/modeling/layers/text_layers_test.py            +0   -96
official/nlp/modeling/models/seq2seq_transformer.py         +3   -1
official/nlp/modeling/networks/bert_encoder.py              +6   -1
official/nlp/modeling/networks/bert_encoder_test.py         +2   -1
official/modeling/optimization/optimizer_factory.py

@@ -20,6 +20,7 @@ import tensorflow as tf
 import tensorflow_addons.optimizers as tfa_optimizers

 from official.modeling.optimization import slide_optimizer
+from official.modeling.optimization import adafactor_optimizer
 from official.modeling.optimization import ema_optimizer
 from official.modeling.optimization import lars_optimizer
 from official.modeling.optimization import lr_schedule
@@ -34,14 +35,15 @@ OPTIMIZERS_CLS = {
     'rmsprop': tf.keras.optimizers.RMSprop,
     'lars': lars_optimizer.LARS,
     'adagrad': tf.keras.optimizers.Adagrad,
-    'slide': slide_optimizer.SLIDE
+    'slide': slide_optimizer.SLIDE,
+    'adafactor': adafactor_optimizer.Adafactor,
 }

 LR_CLS = {
-    'stepwise': tf.keras.optimizers.schedules.PiecewiseConstantDecay,
-    'polynomial': tf.keras.optimizers.schedules.PolynomialDecay,
-    'exponential': tf.keras.optimizers.schedules.ExponentialDecay,
-    'cosine': tf.keras.experimental.CosineDecay,
+    'stepwise': lr_schedule.PiecewiseConstantDecayWithOffset,
+    'polynomial': lr_schedule.PolynomialDecayWithOffset,
+    'exponential': lr_schedule.ExponentialDecayWithOffset,
+    'cosine': lr_schedule.CosineDecayWithOffset,
     'power': lr_schedule.DirectPowerDecay,
     'power_linear': lr_schedule.PowerAndLinearDecay,
     'power_with_offset': lr_schedule.PowerDecayWithOffset,
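Note: the two dictionaries above act as name-to-class registries consumed by the factory. A minimal sketch of that lookup pattern, assuming a caller that supplies type names and keyword-argument dicts (these argument names are illustrative, not the factory's actual API):

def build_optimizer_sketch(optimizer_type, optimizer_kwargs, lr_type, lr_kwargs):
  """Illustrative only: resolve registry entries and instantiate them."""
  # After this diff, 'polynomial' resolves to
  # lr_schedule.PolynomialDecayWithOffset rather than the stock Keras decay,
  # and 'adafactor' resolves to adafactor_optimizer.Adafactor.
  learning_rate = LR_CLS[lr_type](**lr_kwargs)
  return OPTIMIZERS_CLS[optimizer_type](learning_rate=learning_rate,
                                        **optimizer_kwargs)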
official/modeling/performance.py

@@ -14,29 +14,16 @@
 """Functions and classes related to training performance."""

-from absl import logging
 import tensorflow as tf


 def configure_optimizer(optimizer,
                         use_float16=False,
                         use_graph_rewrite=False,
-                        loss_scale='dynamic',
-                        use_experimental_api=False):
+                        loss_scale=None):
   """Configures optimizer object with performance options."""
-  if use_experimental_api:
-    logging.warning('Passing use_experimental_api=True is deprecated. The '
-                    'argument will be removed in the future.')
   if use_float16:
-    # TODO(b/171936854): Move all methods to non-experimental api.
-    if use_experimental_api:
-      # Wraps optimizer with a LossScaleOptimizer. This is done automatically
-      # in compile() with the "mixed_float16" policy, but since we do not call
-      # compile(), we must wrap the optimizer manually.
-      optimizer = (
-          tf.keras.mixed_precision.experimental.LossScaleOptimizer(
-              optimizer, loss_scale=loss_scale))
-    elif loss_scale == 'dynamic':
+    if loss_scale in (None, 'dynamic'):
       optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)
     else:
       # loss_scale is a number. We interpret that as a fixed loss scale.
@@ -52,34 +39,17 @@ def configure_optimizer(optimizer,
   return optimizer


-def set_mixed_precision_policy(dtype, loss_scale=None,
-                               use_experimental_api=False):
-  """Sets mix precision policy."""
-  if use_experimental_api:
-    logging.warning('Passing use_experimental_api=True is deprecated. The '
-                    'argument will be removed in the future.')
-  assert use_experimental_api or loss_scale is None, (
-      'loss_scale cannot be specified if use_experimental_api is False. If the '
-      'non-experimental API is used, specify the loss scaling configuration '
-      'when creating the LossScaleOptimizer instead.')
+def set_mixed_precision_policy(dtype, loss_scale=None):
+  """Sets the global `tf.keras.mixed_precision.Policy`."""
+  # TODO(b/191894773): Remove loss_scale argument
+  assert loss_scale is None, (
+      'The loss_scale argument must be None. The argument exists for '
+      'historical reasons and will be removed soon.')
   if dtype == tf.float16:
-    # TODO(b/171936854): Move all methods to non-experimental api.
-    if use_experimental_api:
-      policy = tf.keras.mixed_precision.experimental.Policy(
-          'mixed_float16', loss_scale=loss_scale)
-      tf.keras.mixed_precision.experimental.set_policy(policy)
-    else:
-      tf.keras.mixed_precision.set_global_policy('mixed_float16')
+    tf.keras.mixed_precision.set_global_policy('mixed_float16')
   elif dtype == tf.bfloat16:
-    if use_experimental_api:
-      tf.keras.mixed_precision.experimental.set_policy('mixed_bfloat16')
-    else:
-      tf.keras.mixed_precision.set_global_policy('mixed_bfloat16')
+    tf.keras.mixed_precision.set_global_policy('mixed_bfloat16')
   elif dtype == tf.float32:
-    if use_experimental_api:
-      tf.keras.mixed_precision.experimental.set_policy('float32')
-    else:
-      tf.keras.mixed_precision.set_global_policy('float32')
+    tf.keras.mixed_precision.set_global_policy('float32')
   else:
     raise ValueError('Unexpected dtype: %s' % dtype)
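Note: after this change the module relies solely on the stable (non-experimental) mixed-precision API. A minimal usage sketch with those calls, values illustrative:

import tensorflow as tf

# Set the global policy first; layers built afterwards compute in float16
# but keep float32 variables.
tf.keras.mixed_precision.set_global_policy('mixed_float16')

optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
# Dynamic loss scaling, matching what configure_optimizer() now does for
# loss_scale in (None, 'dynamic').
optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)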
official/modeling/tf_utils.py

@@ -108,6 +108,7 @@ def get_activation(identifier, use_keras_layer=False):
       "linear": "linear",
       "identity": "linear",
       "swish": "swish",
+      "sigmoid": "sigmoid",
       "relu6": tf.nn.relu6,
   }
   if identifier in keras_layer_allowlist:
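Note: a hedged usage sketch of the path this hunk touches; with use_keras_layer=True, allowlisted names, now including "sigmoid", are served from keras_layer_allowlist (the import path is taken from this file's location):

from official.modeling import tf_utils

# "sigmoid" is resolvable via the Keras-layer allowlist after this change.
activation = tf_utils.get_activation("sigmoid", use_keras_layer=True)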
official/nlp/configs/encoders.py

@@ -46,6 +46,8 @@ class BertEncoderConfig(hyperparams.Config):
   embedding_size: Optional[int] = None
   output_range: Optional[int] = None
   return_all_encoder_outputs: bool = False
+  # Pre/Post-LN Transformer
+  norm_first: bool = False


 @dataclasses.dataclass
@@ -132,6 +134,8 @@ class BigBirdEncoderConfig(hyperparams.Config):
   intermediate_size: int = 3072
   dropout_rate: float = 0.1
   attention_dropout_rate: float = 0.1
+  # Pre/Post-LN Transformer
+  norm_first: bool = False
   max_position_embeddings: int = 4096
   num_rand_blocks: int = 3
   block_size: int = 64
@@ -152,6 +156,8 @@ class KernelEncoderConfig(hyperparams.Config):
   intermediate_size: int = 3072
   dropout_rate: float = 0.1
   attention_dropout_rate: float = 0.1
+  # Pre/Post-LN Transformer
+  norm_first: bool = False
   max_position_embeddings: int = 512
   type_vocab_size: int = 2
   initializer_range: float = 0.02
@@ -161,6 +167,7 @@ class KernelEncoderConfig(hyperparams.Config):
   redraw: bool = False
   is_short_seq: bool = False
   begin_kernel: int = 0
+  scale: Optional[float] = None


 @dataclasses.dataclass
@@ -339,6 +346,7 @@ def build_encoder(config: EncoderConfig,
             encoder_cfg.hidden_activation),
         dropout_rate=encoder_cfg.dropout_rate,
         attention_dropout_rate=encoder_cfg.attention_dropout_rate,
+        norm_first=encoder_cfg.norm_first,
         kernel_initializer=tf.keras.initializers.TruncatedNormal(
             stddev=encoder_cfg.initializer_range),
         attention_cls=layers.BigBirdAttention,
@@ -377,6 +385,7 @@ def build_encoder(config: EncoderConfig,
         redraw=encoder_cfg.redraw,
         is_short_seq=encoder_cfg.is_short_seq,
         begin_kernel=encoder_cfg.begin_kernel,
+        scale=encoder_cfg.scale,
     )
     hidden_cfg = dict(
         num_attention_heads=encoder_cfg.num_attention_heads,
@@ -385,6 +394,7 @@ def build_encoder(config: EncoderConfig,
             encoder_cfg.hidden_activation),
         dropout_rate=encoder_cfg.dropout_rate,
         attention_dropout_rate=encoder_cfg.attention_dropout_rate,
+        norm_first=encoder_cfg.norm_first,
         kernel_initializer=tf.keras.initializers.TruncatedNormal(
             stddev=encoder_cfg.initializer_range),
         attention_cls=layers.KernelAttention,
@@ -445,4 +455,5 @@ def build_encoder(config: EncoderConfig,
       embedding_width=encoder_cfg.embedding_size,
       embedding_layer=embedding_layer,
       return_all_encoder_outputs=encoder_cfg.return_all_encoder_outputs,
-      dict_outputs=True)
+      dict_outputs=True,
+      norm_first=encoder_cfg.norm_first)
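Note: these additions thread one switch, norm_first, from the configs into the encoders. Schematically, the two Transformer block layouts differ as below (pseudocode comments, not the layers' actual implementation):

# Post-LN (norm_first=False, the default): normalization follows each
# residual add.
#   x = layer_norm(x + attention(x))
#   x = layer_norm(x + feed_forward(x))
#
# Pre-LN (norm_first=True): normalization precedes each sub-layer.
#   x = x + attention(layer_norm(x))
#   x = x + feed_forward(layer_norm(x))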
official/nlp/data/classifier_data_lib.py

This diff is collapsed (+345 -327).
official/nlp/data/classifier_data_lib_test.py  (new file, mode 100644)

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for third_party.tensorflow_models.official.nlp.data.classifier_data_lib."""

import os
import tempfile

from absl.testing import parameterized
import tensorflow as tf
import tensorflow_datasets as tfds

from official.nlp.bert import tokenization
from official.nlp.data import classifier_data_lib


def decode_record(record, name_to_features):
  """Decodes a record to a TensorFlow example."""
  return tf.io.parse_single_example(record, name_to_features)


class BertClassifierLibTest(tf.test.TestCase, parameterized.TestCase):

  def setUp(self):
    super(BertClassifierLibTest, self).setUp()
    self.model_dir = self.get_temp_dir()
    self.processors = {
        "CB": classifier_data_lib.CBProcessor,
        "SUPERGLUE-RTE": classifier_data_lib.SuperGLUERTEProcessor,
        "BOOLQ": classifier_data_lib.BoolQProcessor,
        "WIC": classifier_data_lib.WiCProcessor,
    }

    vocab_tokens = [
        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
        "runn", "##ing", ","
    ]
    with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
      vocab_writer.write("".join([x + "\n" for x in vocab_tokens
                                 ]).encode("utf-8"))
      vocab_file = vocab_writer.name
    self.tokenizer = tokenization.FullTokenizer(vocab_file)

  @parameterized.parameters(
      {"task_type": "CB"},
      {"task_type": "BOOLQ"},
      {"task_type": "SUPERGLUE-RTE"},
      {"task_type": "WIC"},
  )
  def test_generate_dataset_from_tfds_processor(self, task_type):
    with tfds.testing.mock_data(num_examples=5):
      output_path = os.path.join(self.model_dir, task_type)

      processor = self.processors[task_type]()

      classifier_data_lib.generate_tf_record_from_data_file(
          processor,
          None,
          self.tokenizer,
          train_data_output_path=output_path,
          eval_data_output_path=output_path,
          test_data_output_path=output_path)
      files = tf.io.gfile.glob(output_path)
      self.assertNotEmpty(files)

      train_dataset = tf.data.TFRecordDataset(output_path)
      seq_length = 128
      label_type = tf.int64
      name_to_features = {
          "input_ids": tf.io.FixedLenFeature([seq_length], tf.int64),
          "input_mask": tf.io.FixedLenFeature([seq_length], tf.int64),
          "segment_ids": tf.io.FixedLenFeature([seq_length], tf.int64),
          "label_ids": tf.io.FixedLenFeature([], label_type),
      }
      train_dataset = train_dataset.map(
          lambda record: decode_record(record, name_to_features))

      # If data is retrieved without error, then all requirements
      # including data type/shapes are met.
      _ = next(iter(train_dataset))


if __name__ == "__main__":
  tf.test.main()
official/nlp/data/create_finetuning_data.py

@@ -50,7 +50,7 @@ flags.DEFINE_enum(
     "classification_task_name", "MNLI", [
         "AX", "COLA", "IMDB", "MNLI", "MRPC", "PAWS-X", "QNLI", "QQP", "RTE",
         "SST-2", "STS-B", "WNLI", "XNLI", "XTREME-XNLI", "XTREME-PAWS-X",
-        "AX-g", "SUPERGLUE-RTE", "CB", "BoolQ"
+        "AX-g", "SUPERGLUE-RTE", "CB", "BoolQ", "WIC"
     ], "The name of the task to train BERT classifier. The "
     "difference between XTREME-XNLI and XNLI is: 1. the format "
     "of input tsv files; 2. the dev set for XTREME is english "
@@ -173,8 +173,26 @@ flags.DEFINE_string(

 def generate_classifier_dataset():
   """Generates classifier dataset and returns input meta data."""
-  assert (FLAGS.input_data_dir and FLAGS.classification_task_name or
-          FLAGS.tfds_params)
+  if FLAGS.classification_task_name in [
+      "COLA",
+      "WNLI",
+      "SST-2",
+      "MRPC",
+      "QQP",
+      "STS-B",
+      "MNLI",
+      "QNLI",
+      "RTE",
+      "AX",
+      "SUPERGLUE-RTE",
+      "CB",
+      "BoolQ",
+      "WIC",
+  ]:
+    assert not FLAGS.input_data_dir or FLAGS.tfds_params
+  else:
+    assert (FLAGS.input_data_dir and FLAGS.classification_task_name or
+            FLAGS.tfds_params)

   if FLAGS.tokenization == "WordPiece":
     tokenizer = tokenization.FullTokenizer(
@@ -248,6 +266,8 @@ def generate_classifier_dataset():
       "cb": classifier_data_lib.CBProcessor,
       "boolq": classifier_data_lib.BoolQProcessor,
+      "wic": classifier_data_lib.WnliProcessor,
   }
   task_name = FLAGS.classification_task_name.lower()
   if task_name not in processors:
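Note: with the relaxed assertion, TFDS-backed tasks no longer require --input_data_dir. A hedged invocation sketch using only flags that appear in this diff; the --tfds_params value is a placeholder, not a verified format:

python create_finetuning_data.py \
  --classification_task_name=WIC \
  --tfds_params="<dataset and split options>" \
  --tokenization=WordPiece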
official/nlp/data/sentence_prediction_dataloader.py

@@ -60,8 +60,8 @@ class SentencePredictionDataLoader(data_loader.DataLoader):
     else:
       self._label_name_mapping = dict()

-  def _decode(self, record: tf.Tensor):
-    """Decodes a serialized tf.Example."""
+  def name_to_features_spec(self):
+    """Defines features to decode. Subclass may override to append features."""
     label_type = LABEL_TYPES_MAP[self._params.label_type]
     name_to_features = {
         'input_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
@@ -72,7 +72,11 @@ class SentencePredictionDataLoader(data_loader.DataLoader):
     if self._include_example_id:
       name_to_features['example_id'] = tf.io.FixedLenFeature([], tf.int64)
-    example = tf.io.parse_single_example(record, name_to_features)
+    return name_to_features
+
+  def _decode(self, record: tf.Tensor):
+    """Decodes a serialized tf.Example."""
+    example = tf.io.parse_single_example(record, self.name_to_features_spec())
     # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
     # So cast all int64 to int32.
@@ -86,20 +90,23 @@ class SentencePredictionDataLoader(data_loader.DataLoader):

   def _parse(self, record: Mapping[str, tf.Tensor]):
     """Parses raw tensors into a dict of tensors to be consumed by the model."""
-    x = {
-        'input_word_ids': record['input_ids'],
-        'input_mask': record['input_mask'],
-        'input_type_ids': record['segment_ids']
+    key_mapping = {
+        'input_ids': 'input_word_ids',
+        'input_mask': 'input_mask',
+        'segment_ids': 'input_type_ids'
     }
-    if self._include_example_id:
-      x['example_id'] = record['example_id']
-    x[self._label_field] = record[self._label_field]
+    ret = {}
+    for record_key in record:
+      if record_key in key_mapping:
+        ret[key_mapping[record_key]] = record[record_key]
+      else:
+        ret[record_key] = record[record_key]

     if self._label_field in self._label_name_mapping:
-      x[self._label_name_mapping[self._label_field]] = record[self._label_field]
+      ret[self._label_name_mapping[self._label_field]] = record[
+          self._label_field]

-    return x
+    return ret

   def load(self, input_context: Optional[tf.distribute.InputContext] = None):
     """Returns a tf.dataset.Dataset."""
@@ -215,13 +222,12 @@ class SentencePredictionTextDataLoader(data_loader.DataLoader):
     """Berts preprocess."""
     segments = [record[x] for x in self._text_fields]
     model_inputs = self._text_processor(segments)
-    if self._include_example_id:
-      model_inputs['example_id'] = record['example_id']
-    model_inputs[self._label_field] = record[self._label_field]
+    for key in record:
+      if key not in self._text_fields:
+        model_inputs[key] = record[key]
     return model_inputs

-  def _decode(self, record: tf.Tensor):
-    """Decodes a serialized tf.Example."""
+  def name_to_features_spec(self):
     name_to_features = {}
     for text_field in self._text_fields:
       name_to_features[text_field] = tf.io.FixedLenFeature([], tf.string)
@@ -230,8 +236,11 @@ class SentencePredictionTextDataLoader(data_loader.DataLoader):
     name_to_features[self._label_field] = tf.io.FixedLenFeature([], label_type)
     if self._include_example_id:
       name_to_features['example_id'] = tf.io.FixedLenFeature([], tf.int64)
-    example = tf.io.parse_single_example(record, name_to_features)
+    return name_to_features
+
+  def _decode(self, record: tf.Tensor):
+    """Decodes a serialized tf.Example."""
+    example = tf.io.parse_single_example(record, self.name_to_features_spec())
     # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
     # So cast all int64 to int32.
     for name in example:
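Note: the point of factoring name_to_features_spec() out of _decode() is stated in its new docstring: subclasses can append features. A minimal sketch of such an override (the extra 'guid' field is hypothetical):

import tensorflow as tf

class MySentencePredictionDataLoader(SentencePredictionDataLoader):

  def name_to_features_spec(self):
    # Start from the parent's spec, then append a hypothetical extra feature.
    name_to_features = dict(super().name_to_features_spec())
    name_to_features['guid'] = tf.io.FixedLenFeature([], tf.int64)
    return name_to_features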
official/nlp/data/sentence_prediction_dataloader_test.py

@@ -198,9 +198,12 @@ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
     dataset = loader.SentencePredictionTextDataLoader(data_config).load()
     features = next(iter(dataset))
     label_field = data_config.label_field
-    self.assertCountEqual(
-        ['input_word_ids', 'input_type_ids', 'input_mask', label_field],
-        features.keys())
+    expected_keys = [
+        'input_word_ids', 'input_type_ids', 'input_mask', label_field
+    ]
+    if use_tfds:
+      expected_keys += ['idx']
+    self.assertCountEqual(expected_keys, features.keys())
     self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
     self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
     self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
@@ -233,9 +236,12 @@ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
     dataset = loader.SentencePredictionTextDataLoader(data_config).load()
     features = next(iter(dataset))
     label_field = data_config.label_field
-    self.assertCountEqual(
-        ['input_word_ids', 'input_type_ids', 'input_mask', label_field],
-        features.keys())
+    expected_keys = [
+        'input_word_ids', 'input_type_ids', 'input_mask', label_field
+    ]
+    if use_tfds:
+      expected_keys += ['idx']
+    self.assertCountEqual(expected_keys, features.keys())
     self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
     self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
     self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
@@ -268,9 +274,12 @@ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
     dataset = loader.SentencePredictionTextDataLoader(data_config).load()
     features = next(iter(dataset))
     label_field = data_config.label_field
-    self.assertCountEqual(
-        ['input_word_ids', 'input_type_ids', 'input_mask', label_field],
-        features.keys())
+    expected_keys = [
+        'input_word_ids', 'input_type_ids', 'input_mask', label_field
+    ]
+    if use_tfds:
+      expected_keys += ['idx']
+    self.assertCountEqual(expected_keys, features.keys())
     self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
     self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
     self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
official/nlp/keras_nlp/encoders/bert_encoder.py

@@ -69,6 +69,9 @@ class BertEncoder(tf.keras.Model):
       smaller than 'hidden_size').
     embedding_layer: An optional Layer instance which will be called to
       generate embeddings for the input word IDs.
+    norm_first: Whether to normalize inputs to attention and intermediate
+      dense layers. If set False, output of attention and intermediate dense
+      layers is normalized.
   """

   def __init__(
@@ -87,6 +90,7 @@ class BertEncoder(tf.keras.Model):
       output_range=None,
       embedding_width=None,
       embedding_layer=None,
+      norm_first=False,
       **kwargs):
     activation = tf.keras.activations.get(inner_activation)
     initializer = tf.keras.initializers.get(initializer)
@@ -162,6 +166,7 @@ class BertEncoder(tf.keras.Model):
           inner_activation=inner_activation,
           output_dropout=output_dropout,
           attention_dropout=attention_dropout,
+          norm_first=norm_first,
           output_range=transformer_output_range,
           kernel_initializer=initializer,
           name='transformer/layer_%d' % i)
@@ -211,6 +216,7 @@ class BertEncoder(tf.keras.Model):
         'output_range': output_range,
         'embedding_width': embedding_width,
         'embedding_layer': embedding_layer,
+        'norm_first': norm_first,
     }
     # We are storing the config dict as a namedtuple here to ensure checkpoint
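Note: a hedged construction sketch showing the new argument on this encoder (vocab_size is an illustrative value; all other arguments keep their defaults):

from official.nlp.keras_nlp.encoders import bert_encoder

# Builds the pre-LN variant of the encoder.
encoder = bert_encoder.BertEncoder(vocab_size=30522, norm_first=True)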
official/nlp/keras_nlp/encoders/bert_encoder_test.py

@@ -205,7 +205,8 @@ class BertEncoderTest(keras_parameterized.TestCase):
         initializer="glorot_uniform",
         output_range=-1,
         embedding_width=16,
-        embedding_layer=None)
+        embedding_layer=None,
+        norm_first=False)
     network = bert_encoder.BertEncoder(**kwargs)
     expected_config = dict(kwargs)
     expected_config["inner_activation"] = tf.keras.activations.serialize(
official/nlp/keras_nlp/layers/position_embedding_test.py

@@ -48,12 +48,12 @@ class PositionEmbeddingLayerTest(keras_parameterized.TestCase):
     test_layer = position_embedding.PositionEmbedding(
         max_length=sequence_length, seq_axis=2)
     width = 30
-    input_tensor = tf.keras.Input(shape=(sequence_length, width, width))
+    input_tensor = tf.keras.Input(shape=(width, sequence_length, width))
     output_tensor = test_layer(input_tensor)

     # When using static positional embedding shapes, the output is expected
     # to be the same as the input shape in all dimensions save batch.
-    expected_output_shape = [None, sequence_length, width, width]
+    expected_output_shape = [None, width, sequence_length, width]
     self.assertEqual(expected_output_shape, output_tensor.shape.as_list())
     # The default output dtype for this layer should be tf.float32.
     self.assertEqual(tf.float32, output_tensor.dtype)
official/nlp/keras_nlp/layers/transformer_encoder_block.py

@@ -249,7 +249,7 @@ class TransformerEncoderBlock(tf.keras.layers.Layer):
         attention.

     Returns:
-      An ouput tensor with the same dimensions as input/query tensor.
+      An output tensor with the same dimensions as input/query tensor.
     """
     if isinstance(inputs, (list, tuple)):
       if len(inputs) == 2:
official/nlp/modeling/layers/kernel_attention.py

@@ -85,30 +85,20 @@ def create_projection_matrix(m, d, seed=None):
   return tf.linalg.matmul(tf.linalg.diag(multiplier), final_matrix)


-def _generalized_kernel(x, projection_matrix, is_query, f, h,
-                        data_normalizer_fn=None):
+def _generalized_kernel(x, projection_matrix, f, h):
   """Generalized kernel in RETHINKING ATTENTION WITH PERFORMERS.

   Args:
     x: The feature being transformed with shape [B, T, N ,H].
     projection_matrix: The matrix with shape [M, H] that we projecct x to,
       where M is the number of projections.
-    is_query: Whether the transform is a query or key. This transform is
-      symmetric is the argument is not used.
     f: A non-linear function applied on x or projected x.
     h: A muliplier which is a function of x applied after projected and
       transformed. Only applied if projection_matrix is not None.
-    data_normalizer_fn: A function which takes x and returns a scalar that
-      normalize data.

   Returns:
     Transformed feature.
   """
-  # No asymmetric operations.
-  del is_query
-  if data_normalizer_fn is not None:
-    x = data_normalizer_fn(x)
-
   if projection_matrix is None:
     return h(x) * f(x)
@@ -139,26 +129,18 @@ _TRANSFORM_MAP = {
             x - tf.math.reduce_max(x, axis=[1, 2, 3], keepdims=True)),
         h=lambda x: tf.math.exp(-0.5 * tf.math.reduce_sum(
             tf.math.square(x), axis=-1, keepdims=True)),
-        data_normalizer_fn=lambda x: x / (tf.math.sqrt(tf.math.sqrt(
-            tf.cast(tf.shape(x)[-1], tf.float32))))),
-    "expmod": functools.partial(
-        _generalized_kernel,
-        # Avoid exp explosion by shifting.
-        f=lambda x: tf.math.exp(
-            x - tf.math.reduce_max(x, axis=[1, 2, 3], keepdims=True)),
-        h=lambda x: tf.math.exp(-0.5 * tf.math.sqrt(
-            tf.cast(tf.shape(x)[-1], tf.float32))),
-        data_normalizer_fn=lambda x: x / (tf.math.sqrt(tf.math.sqrt(
-            tf.cast(tf.shape(x)[-1], tf.float32))))),
-    "l2": functools.partial(
-        _generalized_kernel,
-        f=lambda x: x,
-        h=lambda x: tf.math.sqrt(tf.cast(tf.shape(x)[-1], tf.float32)),
-        data_normalizer_fn=lambda x: x),
-    "identity": lambda x, projection_matrix, is_query: x
+    ),
+    "expmod": functools.partial(
+        _generalized_kernel,
+        # Avoid exp explosion by shifting.
+        f=lambda x: tf.math.exp(
+            x - tf.math.reduce_max(x, axis=[1, 2, 3], keepdims=True)),
+        h=lambda x: tf.math.exp(-0.5 * tf.math.sqrt(
+            tf.cast(tf.shape(x)[-1], tf.float32))),
+    ),
+    "identity": functools.partial(_generalized_kernel, f=lambda x: x,
+                                  h=lambda x: 1)
 }
 # pylint: enable=g-long-lambda
@@ -170,7 +152,7 @@ class KernelAttention(tf.keras.layers.MultiHeadAttention):
   Rethinking Attention with Performers
   (https://arxiv.org/abs/2009.14794)
-    - exp (Lemma 1, positive), relu, l2
+    - exp (Lemma 1, positive), relu
     - random/deterministic projection

   Transformers are RNNs: Fast Autoregressive Transformers with Linear Attention
@@ -195,14 +177,14 @@ class KernelAttention(tf.keras.layers.MultiHeadAttention):
                redraw=False,
                is_short_seq=False,
                begin_kernel=0,
+               scale=None,
                **kwargs):
     r"""Constructor of KernelAttention.

     Args:
       feature_transform: A non-linear transform of the keys and quries.
-        Possible transforms are "elu", "relu", "square", "exp", "expmod",
-        "l2", "identity". If <is_short_seq> = True, it is recommended to choose
-        feature_transform as "l2".
+        Possible transforms are "elu", "relu", "square", "exp", "expmod",
+        "identity".
       num_random_features: Number of random features to be used for projection.
         if num_random_features <= 0, no production is used before transform.
       seed: The seed to begin drawing random features. Once the seed is set, the
@@ -216,6 +198,8 @@ class KernelAttention(tf.keras.layers.MultiHeadAttention):
         (default option).
       begin_kernel: Apply kernel_attention after this sequence id and apply
         softmax attention before this.
+      scale: The value to scale the dot product as described in `Attention Is
+        All You Need`. If None, we use 1/sqrt(dk) as described in the paper.
       **kwargs: The same arguments `MultiHeadAttention` layer.
     """
     if feature_transform not in _TRANSFORM_MAP:
@@ -234,8 +218,11 @@ class KernelAttention(tf.keras.layers.MultiHeadAttention):
     # 1. inference
     # 2. no redraw
     self._seed = seed
     super().__init__(**kwargs)
+    if scale is None:
+      self._scale = 1.0 / math.sqrt(float(self._key_dim))
+    else:
+      self._scale = scale
     self._projection_matrix = None
     if num_random_features > 0:
       self._projection_matrix = create_projection_matrix(
@@ -275,7 +262,6 @@ class KernelAttention(tf.keras.layers.MultiHeadAttention):
     Returns:
       attention_output: Multi-headed outputs of attention computation.
     """
-
     projection_matrix = None
     if self._num_random_features > 0:
       if self._redraw and training:
@@ -284,8 +270,20 @@ class KernelAttention(tf.keras.layers.MultiHeadAttention):
       else:
         projection_matrix = self._projection_matrix

-    key = _TRANSFORM_MAP[feature_transform](key, projection_matrix, False)
-    query = _TRANSFORM_MAP[feature_transform](query, projection_matrix, True)
+    if is_short_seq:
+      # Note: Applying scalar multiply at the smaller end of einsum improves
+      # XLA performance, but may introduce slight numeric differences in
+      # the Transformer attention head.
+      query = query * self._scale
+    else:
+      # Note: we suspect spliting the scale to key, query yields smaller
+      # approximation variance when random projection is used.
+      # For simplicity, we also split when there's no random projection.
+      key *= math.sqrt(self._scale)
+      query *= math.sqrt(self._scale)
+
+    key = _TRANSFORM_MAP[feature_transform](key, projection_matrix)
+    query = _TRANSFORM_MAP[feature_transform](query, projection_matrix)

     if attention_mask is not None:
       key = tf.einsum("BSNH,BS->BSNH", key, attention_mask)
@@ -294,13 +292,14 @@ class KernelAttention(tf.keras.layers.MultiHeadAttention):
       attention_scores = tf.einsum("BTNH,BSNH->BTSN", query, key)
       attention_scores = tf.nn.softmax(attention_scores, axis=2)
       attention_output = tf.einsum("BTSN,BSNH->BTNH", attention_scores, value)
       return attention_output
     else:
       kv = tf.einsum("BSNH,BSND->BNDH", key, value)
       denominator = 1.0 / (
           tf.einsum("BTNH,BNH->BTN", query, tf.reduce_sum(key, axis=1)) +
           _NUMERIC_STABLER)
-      return tf.einsum("BTNH,BNDH,BTN->BTND", query, kv, denominator)
+      attention_output = tf.einsum("BTNH,BNDH,BTN->BTND", query, kv,
+                                   denominator)
+      return attention_output

   def _build_from_signature(self, query, value, key=None):
     super()._build_from_signature(query=query, value=value, key=key)
@@ -391,6 +390,7 @@ class KernelAttention(tf.keras.layers.MultiHeadAttention):
         "redraw": self._redraw,
         "is_short_seq": self._is_short_seq,
         "begin_kernel": self._begin_kernel,
+        "scale": self._scale,
     }
     base_config = super().get_config()
     return dict(list(base_config.items()) + list(config.items()))
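Note: the two return paths above are the crux of the layer. The short-sequence branch materializes a T x S score matrix (quadratic in sequence length), while the kernel branch contracts key with value first, keeping cost linear in T. A standalone sketch of the linear path, reusing the einsum equations from the diff (shapes illustrative; 1e-6 stands in for _NUMERIC_STABLER):

import tensorflow as tf

# B=batch, T/S=sequence, N=heads, H=key feature dim, D=value dim.
B, T, N, H, D = 2, 8, 4, 16, 16
query = tf.random.uniform([B, T, N, H])   # assumed already feature-transformed
key = tf.random.uniform([B, T, N, H])     # and therefore non-negative
value = tf.random.uniform([B, T, N, D])

kv = tf.einsum("BSNH,BSND->BNDH", key, value)  # no T x S matrix is formed
denominator = 1.0 / (
    tf.einsum("BTNH,BNH->BTN", query, tf.reduce_sum(key, axis=1)) + 1e-6)
attention_output = tf.einsum("BTNH,BNDH,BTN->BTND", query, kv, denominator)
print(attention_output.shape)  # (2, 8, 4, 16)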
official/nlp/modeling/layers/kernel_attention_test.py

@@ -21,7 +21,7 @@ import tensorflow as tf
 from official.nlp.modeling.layers import kernel_attention as attention


-_FEATURE_TRANSFORM = ['relu', 'elu', 'exp', 'l2']
+_FEATURE_TRANSFORM = ['relu', 'elu', 'exp']
 _REDRAW = [True, False]
 _TRAINING = [True, False]
 _IS_SHORT_SEQ = [True, False]
official/nlp/modeling/layers/text_layers.py

@@ -33,121 +33,6 @@ def _check_if_tf_text_installed():
         "'tensorflow-text-nightly'.")


-def _iterative_vectorized_fair_share(capacity: tf.Tensor,
-                                     limit: Union[int, tf.Tensor]):
-  """Iterative algorithm for max min fairness algorithm.
-
-  Reference: https://en.wikipedia.org/wiki/Max-min_fairness
-  The idea is for each example with some number of segments and a limit of
-  total segment length allowed, we grant each segment a fair share of the
-  limit. For example, if every segment has the same length, no work to do.
-  If one segment has below average length, its share will be spilt to others
-  fairly. In this way, the longest segment will be the shortest among all
-  potential capacity assignments.
-
-  Args:
-    capacity: A rank-2 Tensor of #Segments x Batch.
-    limit: The largest permissible number of tokens in total across one
-      example.
-
-  Returns:
-    A rank-2 Tensor with new segment capacity assignment such that
-    the total number of tokens in each example does not exceed the `limit`.
-  """
-  # Firstly, we calculate the lower bound of the capacity assignment.
-  per_seg_limit = limit // capacity.shape[0]
-  limit_mask = tf.ones(capacity.shape, dtype=tf.int64) * per_seg_limit
-  lower_bound = tf.minimum(capacity, limit_mask)
-  # This step makes up the capacity that already statisfy the capacity limit.
-  remaining_cap_sum = limit - tf.math.reduce_sum(lower_bound, axis=0)
-  remaining_cap_mat = capacity - lower_bound
-  new_cap = lower_bound + remaining_cap_mat * tf.cast(
-      tf.math.reduce_sum(remaining_cap_mat, axis=0) <= remaining_cap_sum,
-      tf.int64)
-  # Process iteratively. This step is O(#segments), see analysis below.
-  while True:
-    remaining_limit = limit - tf.math.reduce_sum(new_cap, axis=0)
-    remaining_cap = capacity - new_cap
-    masked_remaining_slots = tf.cast(remaining_cap > 0, tf.int64)
-    remaining_cap_col_slots = tf.reduce_sum(masked_remaining_slots, axis=0)
-    masked_remaining_limit = tf.cast(remaining_cap_col_slots > 0,
-                                     tf.int64) * remaining_limit
-    # Total remaining segment limit is different for each example.
-    per_seg_limit = masked_remaining_limit // (
-        tf.cast(remaining_cap_col_slots <= 0, tf.int64) +
-        remaining_cap_col_slots)  # +1 to make sure 0/0 = 0
-    # Note that for each step, there is at least one more segment being
-    # fulfilled or the loop is finished.
-    # The idea is, if remaining per example limit > smallest among segments,
-    # the smallest segment ask is fullfilled. Otherwise, all remaining
-    # segments are truncated, the assignment is finished.
-    if tf.math.reduce_sum(per_seg_limit) > 0:
-      remaining_slots_mat = tf.cast(remaining_cap > 0, tf.int64)
-      new_cap = new_cap + remaining_slots_mat * per_seg_limit
-    else:
-      # Leftover assignment of limit that is smaller than #slots.
-      new_remained_assignment_mask = tf.cast(
-          (tf.cumsum(masked_remaining_slots, axis=0)
-           <= masked_remaining_limit) & (masked_remaining_slots > 0),
-          tf.int64)
-      new_cap = new_cap + new_remained_assignment_mask
-      break
-  return new_cap
-
-
-def round_robin_truncate_inputs(
-    inputs: Union[tf.RaggedTensor, List[tf.RaggedTensor]],
-    limit: Union[int, tf.Tensor],
-) -> Union[tf.RaggedTensor, List[tf.RaggedTensor]]:
-  """Truncates a list of batched segments to fit a per-example length limit.
-
-  Available space is assigned one token at a time in a round-robin fashion
-  to the inputs that still need some, until the limit is reached.
-  (Or equivalently: the longest input is truncated by one token until the
-  total length of inputs fits the limit.) Examples that fit the limit as
-  passed in remain unchanged.
-
-  Args:
-    inputs: A list of rank-2 RaggedTensors. The i-th example is given by
-      the i-th row in each list element, that is, `inputs[:][i, :]`.
-    limit: The largest permissible number of tokens in total across one
-      example.
-
-  Returns:
-    A list of rank-2 RaggedTensors at corresponding indices with the inputs,
-    in which the rows of each RaggedTensor have been truncated such that
-    the total number of tokens in each example does not exceed the `limit`.
-  """
-  if not isinstance(inputs, (list, tuple)):
-    return round_robin_truncate_inputs([inputs], limit)[0]
-  limit = tf.cast(limit, tf.int64)
-  if not all(rt.shape.rank == 2 for rt in inputs):
-    raise ValueError("All inputs must have shape [batch_size, (items)]")
-  if len(inputs) == 1:
-    return [_truncate_row_lengths(inputs[0], limit)]
-  elif len(inputs) == 2:
-    size_a, size_b = [rt.row_lengths() for rt in inputs]
-    # Here's a brain-twister: This does round-robin assignment of quota
-    # to both inputs until the limit is reached. Hint: consider separately
-    # the cases of zero, one, or two inputs exceeding half the limit.
-    floor_half = limit // 2
-    ceil_half = limit - floor_half
-    quota_a = tf.minimum(size_a, ceil_half + tf.nn.relu(floor_half - size_b))
-    quota_b = tf.minimum(size_b, floor_half + tf.nn.relu(ceil_half - size_a))
-    return [_truncate_row_lengths(inputs[0], quota_a),
-            _truncate_row_lengths(inputs[1], quota_b)]
-  else:
-    # Note that we don't merge with the 2 input case because the full
-    # algorithm is more expensive.
-    capacity = tf.stack([rt.row_lengths() for rt in inputs])  # #Segments x B
-    new_capacity = _iterative_vectorized_fair_share(capacity, limit)
-    return [
-        _truncate_row_lengths(inputs[i], new_capacity[i])
-        for i in range(capacity.shape[0])
-    ]
-
-
 def _truncate_row_lengths(ragged_tensor: tf.RaggedTensor,
                           new_lengths: tf.Tensor) -> tf.RaggedTensor:
   """Truncates the rows of `ragged_tensor` to the given row lengths."""
@@ -675,8 +560,8 @@ class BertPackInputs(tf.keras.layers.Layer):
       # fall back to some ad-hoc truncation.
       num_special_tokens = len(inputs) + 1
       if truncator == "round_robin":
-        trimmed_segments = round_robin_truncate_inputs(
-            inputs, seq_length - num_special_tokens)
+        trimmed_segments = text.RoundRobinTrimmer(
+            seq_length - num_special_tokens).trim(inputs)
       elif truncator == "waterfall":
         trimmed_segments = text.WaterfallTrimmer(
             seq_length - num_special_tokens).trim(inputs)
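Note: the removed in-repo truncation helpers are superseded by the equivalent trimmer in tensorflow_text, as used above. A small usage sketch (inputs illustrative; outputs shown as expected under round-robin allocation):

import tensorflow as tf
import tensorflow_text as text

segment_a = tf.ragged.constant([[1, 2, 3, 4, 5], [11, 12]])
segment_b = tf.ragged.constant([[6, 7, 8], [13, 14, 15, 16]])

# Budget of 4 tokens per example, granted one at a time across segments.
trimmed = text.RoundRobinTrimmer(4).trim([segment_a, segment_b])
print(trimmed[0].to_list())  # expected: [[1, 2], [11, 12]]
print(trimmed[1].to_list())  # expected: [[6, 7], [13, 14]]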
official/nlp/modeling/layers/text_layers_test.py

@@ -24,102 +24,6 @@ from sentencepiece import SentencePieceTrainer
 from official.nlp.modeling.layers import text_layers


-class RoundRobinTruncatorTest(tf.test.TestCase):
-
-  def _test_input(self, start, lengths):
-    return tf.ragged.constant([[start + 10 * j + i for i in range(length)]
-                               for j, length in enumerate(lengths)],
-                              dtype=tf.int32)
-
-  def test_single_segment(self):
-    # Single segment.
-    single_input = self._test_input(11, [4, 5, 6])
-    expected_single_output = tf.ragged.constant(
-        [[11, 12, 13, 14],
-         [21, 22, 23, 24, 25],
-         [31, 32, 33, 34, 35],  # Truncated.
-        ])
-    self.assertAllEqual(
-        expected_single_output,
-        text_layers.round_robin_truncate_inputs(single_input, limit=5))
-    # Test wrapping in a singleton list.
-    actual_single_list_output = text_layers.round_robin_truncate_inputs(
-        [single_input], limit=5)
-    self.assertIsInstance(actual_single_list_output, list)
-    self.assertAllEqual(expected_single_output, actual_single_list_output[0])
-
-  def test_two_segments(self):
-    input_a = self._test_input(111, [1, 2, 2, 3, 4, 5])
-    input_b = self._test_input(211, [1, 3, 4, 2, 2, 5])
-    expected_a = tf.ragged.constant(
-        [[111],
-         [121, 122],
-         [131, 132],
-         [141, 142, 143],
-         [151, 152, 153],  # Truncated.
-         [161, 162, 163],  # Truncated.
-        ])
-    expected_b = tf.ragged.constant(
-        [[211],
-         [221, 222, 223],
-         [231, 232, 233],  # Truncated.
-         [241, 242],
-         [251, 252],
-         [261, 262],  # Truncated.
-        ])
-    actual_a, actual_b = text_layers.round_robin_truncate_inputs(
-        [input_a, input_b], limit=5)
-    self.assertAllEqual(expected_a, actual_a)
-    self.assertAllEqual(expected_b, actual_b)
-
-  def test_three_segments(self):
-    input_a = self._test_input(111, [1, 2, 2, 3, 4, 5, 1])
-    input_b = self._test_input(211, [1, 3, 4, 2, 2, 5, 8])
-    input_c = self._test_input(311, [1, 3, 4, 2, 2, 5, 10])
-    seg_limit = 8
-    expected_a = tf.ragged.constant([
-        [111],
-        [121, 122],
-        [131, 132],
-        [141, 142, 143],
-        [151, 152, 153, 154],
-        [161, 162, 163],  # Truncated
-        [171]
-    ])
-    expected_b = tf.ragged.constant([
-        [211],
-        [221, 222, 223],
-        [231, 232, 233],  # Truncated
-        [241, 242],
-        [251, 252],
-        [261, 262, 263],  # Truncated
-        [271, 272, 273, 274]  # Truncated
-    ])
-    expected_c = tf.ragged.constant([
-        [311],
-        [321, 322, 323],
-        [331, 332, 333],  # Truncated
-        [341, 342],
-        [351, 352],
-        [361, 362],  # Truncated
-        [371, 372, 373]  # Truncated
-    ])
-    actual_a, actual_b, actual_c = text_layers.round_robin_truncate_inputs(
-        [input_a, input_b, input_c], limit=seg_limit)
-    self.assertAllEqual(expected_a, actual_a)
-    self.assertAllEqual(expected_b, actual_b)
-    self.assertAllEqual(expected_c, actual_c)
-    input_cap = tf.math.reduce_sum(
-        tf.stack([rt.row_lengths() for rt in [input_a, input_b, input_c]]),
-        axis=0)
-    per_example_usage = tf.math.reduce_sum(
-        tf.stack([rt.row_lengths() for rt in [actual_a, actual_b, actual_c]]),
-        axis=0)
-    self.assertTrue(all(per_example_usage <= tf.minimum(seg_limit, input_cap)))
-
-
 # This test covers the in-process behavior of a BertTokenizer layer.
 # For saving, restoring, and the restored behavior (incl. shape inference),
 # see nlp/tools/export_tfhub_lib_test.py.
official/nlp/modeling/models/seq2seq_transformer.py

@@ -111,13 +111,15 @@ class Seq2SeqTransformer(tf.keras.Model):

   def _embedding_linear(self, embedding_matrix, x):
     """Uses embeddings as linear transformation weights."""
+    embedding_matrix = tf.cast(embedding_matrix, dtype=self.compute_dtype)
+    x = tf.cast(x, dtype=self.compute_dtype)
     batch_size = tf.shape(x)[0]
     length = tf.shape(x)[1]
     hidden_size = tf.shape(x)[2]
     vocab_size = tf.shape(embedding_matrix)[0]
     x = tf.reshape(x, [-1, hidden_size])
-    logits = tf.matmul(x, tf.cast(embedding_matrix, x.dtype), transpose_b=True)
+    logits = tf.matmul(x, embedding_matrix, transpose_b=True)

     return tf.reshape(logits, [batch_size, length, vocab_size])
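Note: casting both operands up front matters under mixed precision, where the model's compute_dtype is float16 while the embedding variable stays float32, and tf.matmul requires matching dtypes. A minimal standalone illustration of the mismatch (shapes illustrative):

import tensorflow as tf

embedding_matrix = tf.random.uniform([100, 64])    # float32, like a variable
x = tf.random.uniform([2, 64], dtype=tf.float16)   # float16 activations
# tf.matmul(x, embedding_matrix, transpose_b=True) would fail here:
# the operand dtypes (float16 vs float32) must match.
logits = tf.matmul(x, tf.cast(embedding_matrix, x.dtype), transpose_b=True)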
official/nlp/modeling/networks/bert_encoder.py

@@ -77,6 +77,9 @@ class BertEncoder(keras_nlp.encoders.BertEncoder):
       parameter is originally added for ELECTRA model which needs to tie the
       generator embeddings with the discriminator embeddings.
     dict_outputs: Whether to use a dictionary as the model outputs.
+    norm_first: Whether to normalize inputs to attention and intermediate
+      dense layers. If set False, output of attention and intermediate dense
+      layers is normalized.
   """

   def __init__(
       self,
@@ -97,6 +100,7 @@ class BertEncoder(keras_nlp.encoders.BertEncoder):
       embedding_width=None,
       embedding_layer=None,
       dict_outputs=False,
+      norm_first=False,
       **kwargs):
     # b/164516224
@@ -120,7 +124,8 @@ class BertEncoder(keras_nlp.encoders.BertEncoder):
         initializer=initializer,
         output_range=output_range,
         embedding_width=embedding_width,
-        embedding_layer=embedding_layer)
+        embedding_layer=embedding_layer,
+        norm_first=norm_first)
     self._embedding_layer_instance = embedding_layer
official/nlp/modeling/networks/bert_encoder_test.py

@@ -226,7 +226,8 @@ class BertEncoderTest(keras_parameterized.TestCase):
         output_range=-1,
         embedding_width=16,
         dict_outputs=True,
-        embedding_layer=None)
+        embedding_layer=None,
+        norm_first=False)
     network = bert_encoder.BertEncoder(**kwargs)
     expected_config = dict(kwargs)
     expected_config["activation"] = tf.keras.activations.serialize(