ModelZoo / ResNet50_tensorflow · Commits · ca552843

Unverified commit ca552843, authored Sep 16, 2021 by Srihari Humbarwadi, committed via GitHub on Sep 16, 2021.

    Merge branch 'panoptic-segmentation' into panoptic-segmentation

Parents: 7e2f7a35, 6b90e134

Changes: 283 changed files in total; this page (1 of 15) shows 20 changed files with 776 additions and 101 deletions (+776, -101).
Changed files on this page:

  official/modeling/hyperparams/base_config.py                +5    -7
  official/modeling/multitask/base_model.py                   +0    -15
  official/modeling/multitask/base_trainer.py                 +0    -15
  official/modeling/multitask/evaluator.py                    +8    -1
  official/modeling/multitask/task_sampler.py                 +4    -1
  official/nlp/bert/model_training_utils.py                   +1    -1
  official/nlp/configs/experiment_configs.py                  +1    -0
  official/nlp/data/pretrain_dynamic_dataloader_test.py       +7    -4
  official/nlp/data/squad_lib_sp.py                           +1    -1
  official/nlp/keras_nlp/layers/position_embedding.py         +1    -7
  official/nlp/modeling/layers/cls_head.py                    +35   -6
  official/nlp/modeling/layers/cls_head_test.py               +5    -0
  official/nlp/modeling/layers/mobile_bert_layers.py          +0    -19
  official/nlp/modeling/layers/mobile_bert_layers_test.py     +0    -16
  official/nlp/modeling/models/seq2seq_transformer.py         +13   -2
  official/nlp/modeling/networks/classification.py            +3    -0
  official/nlp/modeling/networks/encoder_scaffold.py          +29   -6
  official/nlp/modeling/networks/encoder_scaffold_test.py     +92   -0
  official/nlp/modeling/networks/funnel_transformer.py        +311  -0   (new file)
  official/nlp/modeling/networks/funnel_transformer_test.py   +260  -0   (new file)
official/modeling/hyperparams/base_config.py

@@ -49,6 +49,11 @@ class Config(params_dict.ParamsDict):
   default_params: dataclasses.InitVar[Optional[Mapping[str, Any]]] = None
   restrictions: dataclasses.InitVar[Optional[List[str]]] = None
 
+  def __post_init__(self, default_params, restrictions):
+    super().__init__(default_params=default_params, restrictions=restrictions)
+
   @classmethod
   def _isvalidsequence(cls, v):
     """Check if the input values are valid sequences.

@@ -140,13 +145,6 @@ class Config(params_dict.ParamsDict):
         else subconfig_type)
     return subconfig_type
 
-  def __post_init__(self, default_params, restrictions, *args, **kwargs):
-    super().__init__(
-        default_params=default_params,
-        restrictions=restrictions,
-        *args,
-        **kwargs)
-
   def _set(self, k, v):
     """Overrides same method in ParamsDict.
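Note on the hunk above: `default_params` and `restrictions` are `dataclasses.InitVar` fields, so they are passed into `__post_init__` rather than stored as attributes. A minimal, self-contained sketch of that pattern (illustrative only, not code from this commit):

import dataclasses
from typing import Any, Mapping, Optional

@dataclasses.dataclass
class ConfigLike:
  name: str = 'default'
  # Init-only field: accepted by the constructor, forwarded to __post_init__.
  default_params: dataclasses.InitVar[Optional[Mapping[str, Any]]] = None

  def __post_init__(self, default_params):
    # `default_params` never becomes an attribute of the dataclass.
    self.overrides = dict(default_params or {})

cfg = ConfigLike(default_params={'learning_rate': 0.1})
print(cfg.name, cfg.overrides)  # default {'learning_rate': 0.1}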
official/modeling/multitask/base_model.py

@@ -12,21 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Lint as: python3
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
 """Abstraction of multi-task model."""
 from typing import Text, Dict
official/modeling/multitask/base_trainer.py

@@ -12,21 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Lint as: python3
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
 """Multitask base trainer implementation.
 
 The trainer derives from the Orbit `StandardTrainer` class.
official/modeling/multitask/evaluator.py

@@ -54,8 +54,15 @@ class MultiTaskEvaluator(orbit.AbstractEvaluator):
     self._model = model
     self._global_step = global_step or orbit.utils.create_global_step()
     self._checkpoint_exporter = checkpoint_exporter
+    if hasattr(self.model, "checkpoint_items"):
+      checkpoint_items = self.model.checkpoint_items
+    else:
+      checkpoint_items = {}
     self._checkpoint = tf.train.Checkpoint(
-        global_step=self.global_step,
-        model=self.model)
+        model=self.model,
+        global_step=self.global_step,
+        **checkpoint_items)
 
     self._validation_losses = None
     self._validation_metrics = None
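The change above forwards any model-provided `checkpoint_items` into `tf.train.Checkpoint` as extra keyword arguments, so those objects are saved and restored together with the model and global step. A hedged sketch of the mechanism (the names below are illustrative, not from the repo):

import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(2)])
global_step = tf.Variable(0, dtype=tf.int64)
# Hypothetical extra trackables a model might expose as `checkpoint_items`.
checkpoint_items = {'auxiliary_head': tf.keras.layers.Dense(3)}

ckpt = tf.train.Checkpoint(model=model,
                           global_step=global_step,
                           **checkpoint_items)
# `auxiliary_head` is now tracked and written alongside `model` on ckpt.save().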
official/modeling/multitask/task_sampler.py

@@ -78,7 +78,10 @@ class ProportionalTaskSampler(TaskSampler):
 
 class AnnealingTaskSampler(TaskSampler):
-  """Sample tasks according to task weights as well as training progress."""
+  """Sample tasks according to task weights as well as training progress.
+
+  See http://proceedings.mlr.press/v97/stickland19a/stickland19a.pdf
+  """
 
   def __init__(self,
                task_weights: Dict[Text, Union[float, int]],
official/nlp/bert/model_training_utils.py

@@ -22,7 +22,7 @@ from absl import logging
 import tensorflow as tf
 from tensorflow.python.util import deprecation
 from official.common import distribute_utils
-from official.staging.training import grad_utils
+from official.modeling import grad_utils
 
 _SUMMARY_TXT = 'training_summary.txt'
 _MIN_SUMMARY_STEPS = 10
official/nlp/configs/experiment_configs.py

@@ -17,3 +17,4 @@
 from official.nlp.configs import finetuning_experiments
 from official.nlp.configs import pretraining_experiments
 from official.nlp.configs import wmt_transformer_experiments
+from official.nlp.projects.teams import teams_experiments
official/nlp/data/pretrain_dynamic_dataloader_test.py

@@ -43,10 +43,11 @@ def _create_fake_dataset(output_path, seq_length, num_masked_tokens,
     f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
     return f
 
+  rng = np.random.default_rng(37)
   for _ in range(num_examples):
     features = {}
     padding = np.zeros(shape=(max_seq_length - seq_length), dtype=np.int32)
-    input_ids = np.random.randint(low=1, high=100, size=(seq_length))
+    input_ids = rng.integers(low=1, high=100, size=(seq_length))
     features['input_ids'] = create_int_feature(
         np.concatenate((input_ids, padding)))
     features['input_mask'] = create_int_feature(

@@ -56,9 +57,9 @@ def _create_fake_dataset(output_path, seq_length, num_masked_tokens,
     features['position_ids'] = create_int_feature(
         np.concatenate((np.ones_like(input_ids), padding)))
     features['masked_lm_positions'] = create_int_feature(
-        np.random.randint(60, size=(num_masked_tokens), dtype=np.int64))
+        rng.integers(60, size=(num_masked_tokens), dtype=np.int64))
     features['masked_lm_ids'] = create_int_feature(
-        np.random.randint(100, size=(num_masked_tokens), dtype=np.int64))
+        rng.integers(100, size=(num_masked_tokens), dtype=np.int64))
     features['masked_lm_weights'] = create_float_feature(
         np.ones((num_masked_tokens,), dtype=np.float32))
     features['next_sentence_labels'] = create_int_feature(np.array([0]))

@@ -156,6 +157,7 @@ class PretrainDynamicDataLoaderTest(tf.test.TestCase, parameterized.TestCase):
       self.assertEqual(dynamic_metrics[key], static_metrics[key])
 
   def test_load_dataset(self):
+    tf.random.set_seed(0)
     max_seq_length = 128
     batch_size = 2
     input_path_1 = os.path.join(self.get_temp_dir(), 'train_1.tf_record')

@@ -178,7 +180,8 @@ class PretrainDynamicDataLoaderTest(tf.test.TestCase, parameterized.TestCase):
         input_path=input_paths,
         seq_bucket_lengths=[64, 128],
         use_position_id=True,
-        global_batch_size=batch_size)
+        global_batch_size=batch_size,
+        deterministic=True)
     dataset = pretrain_dynamic_dataloader.PretrainingDynamicDataLoader(
         data_config).load()
     dataset_it = iter(dataset)
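The test changes above replace module-level `np.random.randint` calls with a locally seeded `np.random.default_rng(37)` generator (plus `tf.random.set_seed(0)` and `deterministic=True`), which makes the fake pretraining data reproducible independently of other random calls in the process. A small sketch of the NumPy part:

import numpy as np

rng = np.random.default_rng(37)
first = rng.integers(low=1, high=100, size=(5,))

rng = np.random.default_rng(37)   # re-seeding yields the same stream
second = rng.integers(low=1, high=100, size=(5,))

assert (first == second).all()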
official/nlp/data/squad_lib_sp.py

@@ -175,7 +175,7 @@ def _convert_index(index, pos, m=None, is_start=True):
       front -= 1
   assert index[front] is not None or index[rear] is not None
   if index[front] is None:
-    if index[rear] >= 1:
+    if index[rear] >= 1:  # pytype: disable=unsupported-operands
       if is_start:
         return 0
       else:
official/nlp/keras_nlp/layers/position_embedding.py

@@ -66,14 +66,8 @@ class PositionEmbedding(tf.keras.layers.Layer):
   def build(self, input_shape):
     dimension_list = input_shape.as_list()
-    seq_length = dimension_list[self._seq_axis]
     width = dimension_list[-1]
 
-    if self._max_length is not None:
-      weight_sequence_length = self._max_length
-    else:
-      weight_sequence_length = seq_length
-
+    weight_sequence_length = self._max_length
 
     self._position_embeddings = self.add_weight(
         "embeddings",
official/nlp/modeling/layers/cls_head.py

@@ -59,19 +59,33 @@ class ClassificationHead(tf.keras.layers.Layer):
         activation=self.activation,
         kernel_initializer=self.initializer,
         name="pooler_dense")
     self.dropout = tf.keras.layers.Dropout(rate=self.dropout_rate)
     self.out_proj = tf.keras.layers.Dense(
         units=num_classes, kernel_initializer=self.initializer, name="logits")
 
-  def call(self, features):
+  def call(self, features: tf.Tensor, only_project: bool = False):
+    """Implements call().
+
+    Args:
+      features: a rank-3 Tensor when self.inner_dim is specified, otherwise
+        it is a rank-2 Tensor.
+      only_project: a boolean. If True, we return the intermediate Tensor
+        before projecting to class logits.
+
+    Returns:
+      a Tensor, if only_project is True, shape= [batch size, hidden size].
+      If only_project is False, shape= [batch size, num classes].
+    """
     if not self.inner_dim:
       x = features
     else:
       x = features[:, self.cls_token_idx, :]  # take <CLS> token.
       x = self.dense(x)
-      x = self.dropout(x)
 
+    if only_project:
+      return x
+    x = self.dropout(x)
     x = self.out_proj(x)
     return x

@@ -134,7 +148,7 @@ class MultiClsHeads(tf.keras.layers.Layer):
         activation=self.activation,
         kernel_initializer=self.initializer,
         name="pooler_dense")
     self.dropout = tf.keras.layers.Dropout(rate=self.dropout_rate)
     self.out_projs = []
     for name, num_classes in cls_list:
       self.out_projs.append(

@@ -142,13 +156,28 @@ class MultiClsHeads(tf.keras.layers.Layer):
             units=num_classes,
             kernel_initializer=self.initializer,
             name=name))
 
-  def call(self, features):
+  def call(self, features: tf.Tensor, only_project: bool = False):
+    """Implements call().
+
+    Args:
+      features: a rank-3 Tensor when self.inner_dim is specified, otherwise
+        it is a rank-2 Tensor.
+      only_project: a boolean. If True, we return the intermediate Tensor
+        before projecting to class logits.
+
+    Returns:
+      If only_project is True, a Tensor with shape= [batch size, hidden size].
+      If only_project is False, a dictionary of Tensors.
+    """
     if not self.inner_dim:
       x = features
     else:
       x = features[:, self.cls_token_idx, :]  # take <CLS> token.
       x = self.dense(x)
-      x = self.dropout(x)
 
+    if only_project:
+      return x
+    x = self.dropout(x)
 
     outputs = {}
     for proj_layer in self.out_projs:
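A hedged usage sketch of the new `only_project` flag (assumes `tf-models-official` is installed so `official.nlp.modeling.layers.cls_head` is importable; the shapes mirror the test below):

import tensorflow as tf
from official.nlp.modeling.layers import cls_head

head = cls_head.ClassificationHead(inner_dim=5, num_classes=2)
features = tf.zeros([2, 10, 5])             # rank-3 input since inner_dim is set

logits = head(features)                     # shape (2, 2): class logits
pooled = head(features, only_project=True)  # shape (2, 5): features before the logits layer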
official/nlp/modeling/layers/cls_head_test.py

@@ -39,6 +39,8 @@ class ClassificationHeadTest(tf.test.TestCase, parameterized.TestCase):
     self.assertAllClose(output, [[0., 0.], [0., 0.]])
     self.assertSameElements(test_layer.checkpoint_items.keys(), ["pooler_dense"])
+    outputs = test_layer(features, only_project=True)
+    self.assertEqual(outputs.shape, (2, 5))
 
   def test_layer_serialization(self):
     layer = cls_head.ClassificationHead(10, 2)

@@ -71,6 +73,9 @@ class MultiClsHeadsTest(tf.test.TestCase, parameterized.TestCase):
     self.assertSameElements(test_layer.checkpoint_items.keys(),
                             ["pooler_dense", "foo", "bar"])
+    outputs = test_layer(features, only_project=True)
+    self.assertEqual(outputs.shape, (2, 5))
 
   def test_layer_serialization(self):
     cls_list = [("foo", 2), ("bar", 3)]
     test_layer = cls_head.MultiClsHeads(inner_dim=5, cls_list=cls_list)
official/nlp/modeling/layers/mobile_bert_layers.py

@@ -39,23 +39,6 @@ class NoNorm(tf.keras.layers.Layer):
     return output
 
 
-@tf.keras.utils.register_keras_serializable(package='Text')
-class NoNormClipped(NoNorm):
-  """Quantization friendly implementation for the NoNorm.
-
-  The output of NoNorm layer is clipped to [-6.0, 6.0] to make it quantization
-  friendly.
-  """
-
-  def __init__(self, name=None):
-    super(NoNormClipped, self).__init__(name=name)
-
-  def call(self, feature):
-    output = feature * self.scale + self.bias
-    clipped_output = tf.clip_by_value(output, -6.0, 6.0)
-    return clipped_output
-
-
 def _get_norm_layer(normalization_type='no_norm', name=None):
   """Get normlization layer.

@@ -69,8 +52,6 @@ def _get_norm_layer(normalization_type='no_norm', name=None):
   """
   if normalization_type == 'no_norm':
     layer = NoNorm(name=name)
-  elif normalization_type == 'no_norm_clipped':
-    layer = NoNormClipped(name=name)
   elif normalization_type == 'layer_norm':
     layer = tf.keras.layers.LayerNormalization(
         name=name,
official/nlp/modeling/layers/mobile_bert_layers_test.py

@@ -33,22 +33,6 @@ def generate_fake_input(batch_size=1, seq_len=5, vocab_size=10000, seed=0):
   return fake_input
 
 
-class EdgeTPUNoNormTest(tf.test.TestCase):
-
-  def test_no_norm(self):
-    layer = mobile_bert_layers.NoNormClipped()
-    feature = tf.random.uniform([2, 3, 4], minval=-8, maxval=8, dtype=tf.float32)
-    output = layer(feature)
-    output_shape = output.shape.as_list()
-    expected_shape = [2, 3, 4]
-    self.assertListEqual(output_shape, expected_shape, msg=None)
-    output_min = tf.reduce_min(output)
-    output_max = tf.reduce_max(output)
-    self.assertGreaterEqual(6.0, output_max)
-    self.assertLessEqual(-6.0, output_min)
-
-
 class MobileBertEncoderTest(parameterized.TestCase, tf.test.TestCase):
 
   def test_embedding_layer_with_token_type(self):
official/nlp/modeling/models/seq2seq_transformer.py

@@ -544,7 +544,8 @@ class TransformerDecoder(tf.keras.layers.Layer):
                self_attention_mask=None,
                cross_attention_mask=None,
                cache=None,
-               decode_loop_step=None):
+               decode_loop_step=None,
+               return_all_decoder_outputs=False):
     """Return the output of the decoder layer stacks.
 
     Args:

@@ -561,6 +562,9 @@ class TransformerDecoder(tf.keras.layers.Layer):
         ...}
       decode_loop_step: An integer, the step number of the decoding loop. Used
         only for autoregressive inference on TPU.
+      return_all_decoder_outputs: Return all decoder layer outputs.
+        Note that the outputs are layer normed.
+        This is useful when introducing per layer auxiliary loss.
 
     Returns:
       Output of decoder.

@@ -568,6 +572,7 @@ class TransformerDecoder(tf.keras.layers.Layer):
     """
     output_tensor = target
+    decoder_outputs = []
     for layer_idx in range(self.num_layers):
       transformer_inputs = [
           output_tensor, memory, cross_attention_mask, self_attention_mask

@@ -581,7 +586,13 @@ class TransformerDecoder(tf.keras.layers.Layer):
             transformer_inputs,
             cache=cache[cache_layer_idx],
             decode_loop_step=decode_loop_step)
-    return self.output_normalization(output_tensor)
+      if return_all_decoder_outputs:
+        decoder_outputs.append(self.output_normalization(output_tensor))
+
+    if return_all_decoder_outputs:
+      return decoder_outputs
+    else:
+      return self.output_normalization(output_tensor)
 
 
 def attention_initializer(hidden_size):
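The new `return_all_decoder_outputs` flag makes the decoder return a list with one layer-normalized tensor per layer, which is the input a per-layer auxiliary loss needs. A self-contained sketch of that usage pattern (illustrative only; it does not construct the repo's `TransformerDecoder`, and `combined_loss` is a hypothetical helper):

import tensorflow as tf

def combined_loss(decoder_outputs, labels, projection, aux_weight=0.1):
  """decoder_outputs: list of [batch, seq, hidden]; labels: one-hot [batch, seq, vocab]."""
  per_layer = [
      tf.reduce_mean(
          tf.keras.losses.categorical_crossentropy(
              labels, tf.nn.softmax(projection(out))))
      for out in decoder_outputs
  ]
  # Main loss on the last layer plus a down-weighted auxiliary term per earlier layer.
  return per_layer[-1] + aux_weight * tf.add_n(per_layer[:-1])

projection = tf.keras.layers.Dense(7)                     # hidden -> toy vocab of 7
outputs = [tf.random.normal([2, 4, 8]) for _ in range(3)]  # stand-ins for 3 decoder layers
labels = tf.one_hot(tf.random.uniform([2, 4], maxval=7, dtype=tf.int32), depth=7)
print(combined_loss(outputs, labels, projection).numpy())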
official/nlp/modeling/networks/classification.py

@@ -16,6 +16,7 @@
 # pylint: disable=g-classes-have-attributes
 import collections
 import tensorflow as tf
+from tensorflow.python.util import deprecation
 
 
 @tf.keras.utils.register_keras_serializable(package='Text')

@@ -39,6 +40,8 @@ class Classification(tf.keras.Model):
       `predictions`.
   """
 
+  @deprecation.deprecated(None, 'Classification as a network is deprecated. '
+                          'Please use the layers.ClassificationHead instead.')
   def __init__(self,
                input_width,
                num_classes,
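For context, `deprecation.deprecated(date, instructions)` from `tensorflow.python.util` simply logs a warning when the decorated callable is used; a minimal sketch (not from the repo):

from tensorflow.python.util import deprecation

@deprecation.deprecated(None, 'Use layers.ClassificationHead instead.')
def old_classifier_factory():
  return 'still works'

old_classifier_factory()  # logs a deprecation warning, then runs normally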
official/nlp/modeling/networks/encoder_scaffold.py

@@ -74,9 +74,12 @@ class EncoderScaffold(tf.keras.Model):
       standard pretraining.
     num_hidden_instances: The number of times to instantiate and/or invoke the
       hidden_cls.
-    hidden_cls: The class or instance to encode the input data. If `hidden_cls`
-      is not set, a KerasBERT transformer layer will be used as the encoder
-      class.
+    hidden_cls: Three types of input are supported: (1) class (2) instance
+      (3) list of classes or instances, to encode the input data. If
+      `hidden_cls` is not set, a KerasBERT transformer layer will be used as
+      the encoder class. If `hidden_cls` is a list of classes or instances,
+      these classes (instances) are sequentially instantiated (invoked) on top
+      of embedding layer. Mixing classes and instances in the list is allowed.
     hidden_cfg: A dict of kwargs to pass to the hidden_cls, if it needs to be
       instantiated. If hidden_cls is not set, a config dict must be passed to
       `hidden_cfg` with the following values:

@@ -192,15 +195,26 @@ class EncoderScaffold(tf.keras.Model):
     layer_output_data = []
     hidden_layers = []
     hidden_cfg = hidden_cfg if hidden_cfg else {}
+
+    if isinstance(hidden_cls, list) and len(hidden_cls) != num_hidden_instances:
+      raise RuntimeError(
+          ('When input hidden_cls to EncoderScaffold %s is a list, it must '
+           'contain classes or instances with size specified by '
+           'num_hidden_instances, got %d vs %d.') % self.name,
+          len(hidden_cls), num_hidden_instances)
+
     for i in range(num_hidden_instances):
-      if inspect.isclass(hidden_cls):
+      if isinstance(hidden_cls, list):
+        cur_hidden_cls = hidden_cls[i]
+      else:
+        cur_hidden_cls = hidden_cls
+      if inspect.isclass(cur_hidden_cls):
         if hidden_cfg and 'attention_cfg' in hidden_cfg and (
             layer_idx_as_attention_seed):
           hidden_cfg = copy.deepcopy(hidden_cfg)
           hidden_cfg['attention_cfg']['seed'] = i
-        layer = hidden_cls(**hidden_cfg)
+        layer = cur_hidden_cls(**hidden_cfg)
       else:
-        layer = hidden_cls
+        layer = cur_hidden_cls
       data = layer([data, attention_mask])
       layer_output_data.append(data)
       hidden_layers.append(layer)

@@ -347,6 +361,15 @@ class EncoderScaffold(tf.keras.Model):
     else:
       return self._embedding_data
 
+  @property
+  def embedding_network(self):
+    if self._embedding_network is None:
+      raise RuntimeError(
+          ('The EncoderScaffold %s does not have a reference '
+           'to the embedding network. This is required when you '
+           'pass a custom embedding network to the scaffold.') % self.name)
+    return self._embedding_network
+
   @property
   def hidden_layers(self):
     """List of hidden layers in the encoder."""
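The loop above resolves each `hidden_cls` entry to either a freshly instantiated class or an already-built instance. A self-contained sketch of that class-or-instance dispatch (generic Keras layers, not the scaffold itself):

import inspect
import tensorflow as tf

# A list mixing a class (instantiated per slot) and an instance (reused as-is).
hidden_cls = [tf.keras.layers.Dense, tf.keras.layers.Dense(4)]
hidden_cfg = {'units': 4}

layers = []
for entry in hidden_cls:
  layer = entry(**hidden_cfg) if inspect.isclass(entry) else entry
  layers.append(layer)
# layers[0] is a new Dense(4); layers[1] is the shared, pre-built instance.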
official/nlp/modeling/networks/encoder_scaffold_test.py

@@ -605,6 +605,98 @@ class EncoderScaffoldHiddenInstanceTest(keras_parameterized.TestCase):
The hunk adds a new test_hidden_cls_list test after the existing assertions (all lines below the first two context lines are additions):

    self.assertNotEmpty(call_list)
    self.assertTrue(call_list[0], "The passed layer class wasn't instantiated.")

  def test_hidden_cls_list(self):
    hidden_size = 32
    sequence_length = 10
    vocab_size = 57

    embedding_network = Embeddings(vocab_size, hidden_size)

    call_list = []
    hidden_cfg = {
        "num_attention_heads": 2,
        "intermediate_size": 3072,
        "intermediate_activation": activations.gelu,
        "dropout_rate": 0.1,
        "attention_dropout_rate": 0.1,
        "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02),
        "call_list": call_list
    }
    mask_call_list = []
    mask_cfg = {"call_list": mask_call_list}
    # Create a small EncoderScaffold for testing. This time, we pass an already-
    # instantiated layer object.
    xformer = ValidatedTransformerLayer(**hidden_cfg)
    xmask = ValidatedMaskLayer(**mask_cfg)

    test_network_a = encoder_scaffold.EncoderScaffold(
        num_hidden_instances=3,
        pooled_output_dim=hidden_size,
        pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
        hidden_cls=xformer,
        mask_cls=xmask,
        embedding_cls=embedding_network)
    # Create a network b with same embedding and hidden layers as network a.
    test_network_b = encoder_scaffold.EncoderScaffold(
        num_hidden_instances=3,
        pooled_output_dim=hidden_size,
        pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
        mask_cls=xmask,
        embedding_cls=test_network_a.embedding_network,
        hidden_cls=test_network_a.hidden_layers)
    # Create a network c with same embedding but fewer hidden layers compared to
    # network a and b.
    hidden_layers = test_network_a.hidden_layers
    hidden_layers.pop()
    test_network_c = encoder_scaffold.EncoderScaffold(
        num_hidden_instances=2,
        pooled_output_dim=hidden_size,
        pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
        mask_cls=xmask,
        embedding_cls=test_network_a.embedding_network,
        hidden_cls=hidden_layers)

    # Create the inputs (note that the first dimension is implicit).
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    # Create model based off of network a:
    data_a, pooled_a = test_network_a([word_ids, mask])
    model_a = tf.keras.Model([word_ids, mask], [data_a, pooled_a])
    # Create model based off of network b:
    data_b, pooled_b = test_network_b([word_ids, mask])
    model_b = tf.keras.Model([word_ids, mask], [data_b, pooled_b])
    # Create model based off of network c:
    data_c, pooled_c = test_network_c([word_ids, mask])
    model_c = tf.keras.Model([word_ids, mask], [data_c, pooled_c])

    batch_size = 3
    word_id_data = np.random.randint(vocab_size, size=(batch_size, sequence_length))
    mask_data = np.random.randint(2, size=(batch_size, sequence_length))
    output_a, _ = model_a.predict([word_id_data, mask_data])
    output_b, _ = model_b.predict([word_id_data, mask_data])
    output_c, _ = model_c.predict([word_id_data, mask_data])

    # Outputs from model a and b should be the same since they share the same
    # embedding and hidden layers.
    self.assertAllEqual(output_a, output_b)
    # Outputs from model a and c shouldn't be the same since they share the same
    # embedding layer but different number of hidden layers.
    self.assertNotAllEqual(output_a, output_c)

  @parameterized.parameters(True, False)
  def test_serialize_deserialize(self, use_hidden_cls_instance):
    hidden_size = 32
official/nlp/modeling/networks/funnel_transformer.py (new file, mode 100644)

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Funnel Transformer network."""
# pylint: disable=g-classes-have-attributes

from typing import Union, Collection

from absl import logging
import tensorflow as tf

from official.nlp import keras_nlp


def _pool_and_concat(data, unpool_length: int, stride: int,
                     axes: Union[Collection[int], int]):
  """Pools the data along a given axis with stride.

  It also skips first unpool_length elements.

  Args:
    data: Tensor to be pooled.
    unpool_length: Leading elements to be skipped.
    stride: Stride for the given axis.
    axes: Axes to pool the Tensor.

  Returns:
    Pooled and concatenated Tensor.
  """
  # Wraps the axes as a list.
  if isinstance(axes, int):
    axes = [axes]
  for axis in axes:
    # Skips first `unpool_length` tokens.
    unpool_tensor_shape = [slice(None)] * axis + [slice(None, unpool_length)]
    unpool_tensor = data[unpool_tensor_shape]
    # Pools the second half.
    pool_tensor_shape = [slice(None)] * axis + [
        slice(unpool_length, None, stride)
    ]
    pool_tensor = data[pool_tensor_shape]
    data = tf.concat((unpool_tensor, pool_tensor), axis=axis)
  return data


@tf.keras.utils.register_keras_serializable(package='Text')
class FunnelTransformerEncoder(tf.keras.layers.Layer):
  """Funnel Transformer-based encoder network.

  Funnel Transformer Implementation of https://arxiv.org/abs/2006.03236.
  This implementation utilizes the base framework with Bert
  (https://arxiv.org/abs/1810.04805).
  Its output is compatible with `BertEncoder`.

  Args:
    vocab_size: The size of the token vocabulary.
    hidden_size: The size of the transformer hidden layers.
    num_layers: The number of transformer layers.
    num_attention_heads: The number of attention heads for each transformer.
      The hidden size must be divisible by the number of attention heads.
    max_sequence_length: The maximum sequence length that this encoder can
      consume. If None, max_sequence_length uses the value from sequence
      length. This determines the variable shape for positional embeddings.
    type_vocab_size: The number of types that the 'type_ids' input can take.
    inner_dim: The output dimension of the first Dense layer in a two-layer
      feedforward network for each transformer.
    inner_activation: The activation for the first Dense layer in a two-layer
      feedforward network for each transformer.
    output_dropout: Dropout probability for the post-attention and output
      dropout.
    attention_dropout: The dropout rate to use for the attention layers within
      the transformer layers.
    pool_stride: Pooling stride to compress the sequence length.
    unpool_length: Leading n tokens to be skipped from pooling.
    initializer: The initialzer to use for all weights in this encoder.
    output_range: The sequence output range, [0, output_range), by slicing the
      target sequence of the last transformer layer. `None` means the entire
      target sequence will attend to the source sequence, which yields the
      full output.
    embedding_width: The width of the word embeddings. If the embedding width
      is not equal to hidden size, embedding parameters will be factorized
      into two matrices in the shape of ['vocab_size', 'embedding_width'] and
      ['embedding_width', 'hidden_size'] ('embedding_width' is usually much
      smaller than 'hidden_size').
    embedding_layer: An optional Layer instance which will be called to
      generate embeddings for the input word IDs.
    norm_first: Whether to normalize inputs to attention and intermediate
      dense layers. If set False, output of attention and intermediate dense
      layers is normalized.
  """

  def __init__(
      self,
      vocab_size,
      hidden_size=768,
      num_layers=12,
      num_attention_heads=12,
      max_sequence_length=512,
      type_vocab_size=16,
      inner_dim=3072,
      inner_activation=lambda x: tf.keras.activations.gelu(x, approximate=True),
      output_dropout=0.1,
      attention_dropout=0.1,
      pool_stride=2,
      unpool_length=0,
      initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
      output_range=None,
      embedding_width=None,
      embedding_layer=None,
      norm_first=False,
      **kwargs):
    super().__init__(**kwargs)

    activation = tf.keras.activations.get(inner_activation)
    initializer = tf.keras.initializers.get(initializer)

    if embedding_width is None:
      embedding_width = hidden_size

    if embedding_layer is None:
      self._embedding_layer = keras_nlp.layers.OnDeviceEmbedding(
          vocab_size=vocab_size,
          embedding_width=embedding_width,
          initializer=initializer,
          name='word_embeddings')
    else:
      self._embedding_layer = embedding_layer

    self._position_embedding_layer = keras_nlp.layers.PositionEmbedding(
        initializer=initializer,
        max_length=max_sequence_length,
        name='position_embedding')

    self._type_embedding_layer = keras_nlp.layers.OnDeviceEmbedding(
        vocab_size=type_vocab_size,
        embedding_width=embedding_width,
        initializer=initializer,
        use_one_hot=True,
        name='type_embeddings')

    self._embedding_norm_layer = tf.keras.layers.LayerNormalization(
        name='embeddings/layer_norm', axis=-1, epsilon=1e-12, dtype=tf.float32)

    self._embedding_dropout = tf.keras.layers.Dropout(
        rate=output_dropout, name='embedding_dropout')

    # We project the 'embedding' output to 'hidden_size' if it is not already
    # 'hidden_size'.
    self._embedding_projection = None
    if embedding_width != hidden_size:
      self._embedding_projection = tf.keras.layers.experimental.EinsumDense(
          '...x,xy->...y',
          output_shape=hidden_size,
          bias_axes='y',
          kernel_initializer=initializer,
          name='embedding_projection')

    self._transformer_layers = []
    self._attention_mask_layer = keras_nlp.layers.SelfAttentionMask(
        name='self_attention_mask')
    for i in range(num_layers):
      layer = keras_nlp.layers.TransformerEncoderBlock(
          num_attention_heads=num_attention_heads,
          inner_dim=inner_dim,
          inner_activation=inner_activation,
          output_dropout=output_dropout,
          attention_dropout=attention_dropout,
          norm_first=norm_first,
          output_range=output_range if i == num_layers - 1 else None,
          kernel_initializer=initializer,
          name='transformer/layer_%d' % i)
      self._transformer_layers.append(layer)

    self._pooler_layer = tf.keras.layers.Dense(
        units=hidden_size,
        activation='tanh',
        kernel_initializer=initializer,
        name='pooler_transform')
    self._att_input_pool_layer = tf.keras.layers.MaxPooling1D(
        pool_size=pool_stride,
        strides=pool_stride,
        padding='same',
        name='att_input_pool_layer')

    self._pool_stride = pool_stride
    self._unpool_length = unpool_length

    self._config = {
        'vocab_size': vocab_size,
        'hidden_size': hidden_size,
        'num_layers': num_layers,
        'num_attention_heads': num_attention_heads,
        'max_sequence_length': max_sequence_length,
        'type_vocab_size': type_vocab_size,
        'inner_dim': inner_dim,
        'inner_activation': tf.keras.activations.serialize(activation),
        'output_dropout': output_dropout,
        'attention_dropout': attention_dropout,
        'initializer': tf.keras.initializers.serialize(initializer),
        'output_range': output_range,
        'embedding_width': embedding_width,
        'embedding_layer': embedding_layer,
        'norm_first': norm_first,
        'pool_stride': pool_stride,
        'unpool_length': unpool_length,
    }

  def call(self, inputs):
    # inputs are [word_ids, mask, type_ids]
    if isinstance(inputs, (list, tuple)):
      logging.warning('List inputs to %s are discouraged.', self.__class__)
      if len(inputs) == 3:
        word_ids, mask, type_ids = inputs
      else:
        raise ValueError('Unexpected inputs to %s with length at %d.' %
                         (self.__class__, len(inputs)))
    elif isinstance(inputs, dict):
      word_ids = inputs.get('input_word_ids')
      mask = inputs.get('input_mask')
      type_ids = inputs.get('input_type_ids')
    else:
      raise ValueError('Unexpected inputs type to %s.' % self.__class__)

    word_embeddings = self._embedding_layer(word_ids)
    # absolute position embeddings
    position_embeddings = self._position_embedding_layer(word_embeddings)
    type_embeddings = self._type_embedding_layer(type_ids)

    embeddings = tf.keras.layers.add(
        [word_embeddings, position_embeddings, type_embeddings])
    embeddings = self._embedding_norm_layer(embeddings)
    embeddings = self._embedding_dropout(embeddings)

    if self._embedding_projection is not None:
      embeddings = self._embedding_projection(embeddings)

    attention_mask = self._attention_mask_layer(embeddings, mask)
    encoder_outputs = []
    x = embeddings
    # TODO(b/195972228): attention_mask can be co-generated with pooling.
    attention_mask = _pool_and_concat(
        attention_mask,
        unpool_length=self._unpool_length,
        stride=self._pool_stride,
        axes=[1])
    for layer in self._transformer_layers:
      # Pools layer for compressing the query length.
      pooled_inputs = self._att_input_pool_layer(x[:, self._unpool_length:, :])
      query_inputs = tf.concat(
          values=(tf.cast(x[:, :self._unpool_length, :],
                          dtype=pooled_inputs.dtype), pooled_inputs),
          axis=1)
      x = layer([query_inputs, x, attention_mask])
      # Pools the corresponding attention_mask.
      attention_mask = _pool_and_concat(
          attention_mask,
          unpool_length=self._unpool_length,
          stride=self._pool_stride,
          axes=[1, 2])
      encoder_outputs.append(x)

    last_encoder_output = encoder_outputs[-1]
    first_token_tensor = last_encoder_output[:, 0, :]
    pooled_output = self._pooler_layer(first_token_tensor)

    return dict(
        sequence_output=encoder_outputs[-1],
        pooled_output=pooled_output,
        encoder_outputs=encoder_outputs)

  def get_embedding_table(self):
    return self._embedding_layer.embeddings

  def get_embedding_layer(self):
    return self._embedding_layer

  def get_config(self):
    return dict(self._config)

  @property
  def transformer_layers(self):
    """List of Transformer layers in the encoder."""
    return self._transformer_layers

  @property
  def pooler_layer(self):
    """The pooler dense layer after the transformer layers."""
    return self._pooler_layer

  @classmethod
  def from_config(cls, config, custom_objects=None):
    if 'embedding_layer' in config and config['embedding_layer'] is not None:
      warn_string = (
          'You are reloading a model that was saved with a '
          'potentially-shared embedding layer object. If you contine to '
          'train this model, the embedding layer will no longer be shared. '
          'To work around this, load the model outside of the Keras API.')
      print('WARNING: ' + warn_string)
      logging.warn(warn_string)
    return cls(**config)
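The per-layer sequence compression performed by `_pool_and_concat` and the `MaxPooling1D` query pooling works out to new_len = unpool_length + ceil((old_len - unpool_length) / pool_stride). A quick sketch reproducing the 21 -> 11 -> 6 -> 3 progression the tests below expect for pool_stride=2 (the helper name is illustrative):

def pooled_length(seq_len, pool_stride, unpool_length=0):
  # Mirrors the shape arithmetic used in funnel_transformer_test.py.
  return unpool_length + (seq_len + pool_stride - 1 - unpool_length) // pool_stride

lengths = [21]
for _ in range(3):
  lengths.append(pooled_length(lengths[-1], pool_stride=2))
print(lengths)  # [21, 11, 6, 3]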
official/nlp/modeling/networks/funnel_transformer_test.py (new file, mode 100644)

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for transformer-based bert encoder network."""

from absl.testing import parameterized
import numpy as np
import tensorflow as tf

from official.nlp.modeling.networks import funnel_transformer


class SingleLayerModel(tf.keras.Model):

  def __init__(self, layer):
    super().__init__()
    self.layer = layer

  def call(self, inputs):
    return self.layer(inputs)


class FunnelTransformerEncoderTest(parameterized.TestCase, tf.test.TestCase):

  def tearDown(self):
    super(FunnelTransformerEncoderTest, self).tearDown()
    tf.keras.mixed_precision.set_global_policy("float32")

  @parameterized.named_parameters(("mix", "mixed_float16", tf.float16),
                                  ("float32", "float32", tf.float32))
  def test_network_creation(self, policy, pooled_dtype):
    tf.keras.mixed_precision.set_global_policy(policy)
    hidden_size = 32
    sequence_length = 21
    pool_stride = 2
    num_layers = 3
    # Create a small FunnelTransformerEncoder for testing.
    test_network = funnel_transformer.FunnelTransformerEncoder(
        vocab_size=100,
        hidden_size=hidden_size,
        num_attention_heads=2,
        num_layers=num_layers,
        pool_stride=pool_stride,
        unpool_length=0)
    # Create the inputs (note that the first dimension is implicit).
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    dict_outputs = test_network([word_ids, mask, type_ids])
    data = dict_outputs["sequence_output"]
    pooled = dict_outputs["pooled_output"]

    self.assertIsInstance(test_network.transformer_layers, list)
    self.assertLen(test_network.transformer_layers, num_layers)
    self.assertIsInstance(test_network.pooler_layer, tf.keras.layers.Dense)

    # Stride=2 compresses sequence length to half the size at each layer.
    # This configuration gives each layer of seq length: 21->11->6->3.
    expected_data_shape = [None, 3, hidden_size]
    expected_pooled_shape = [None, hidden_size]
    self.assertAllEqual(expected_data_shape, data.shape.as_list())
    self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())

    # The default output dtype is float32.
    # If float_dtype is set to float16, the data output is float32 (from a layer
    # norm) and pool output should be float16.
    self.assertAllEqual(tf.float32, data.dtype)
    self.assertAllEqual(pooled_dtype, pooled.dtype)

  @parameterized.named_parameters(
      ("no_stride_no_unpool", 1, 0),
      ("large_stride_with_unpool", 3, 1),
      ("large_stride_with_large_unpool", 5, 10),
      ("no_stride_with_unpool", 1, 1),
  )
  def test_all_encoder_outputs_network_creation(self, pool_stride,
                                                unpool_length):
    hidden_size = 32
    sequence_length = 21
    num_layers = 3
    # Create a small FunnelTransformerEncoder for testing.
    test_network = funnel_transformer.FunnelTransformerEncoder(
        vocab_size=100,
        hidden_size=hidden_size,
        num_attention_heads=2,
        num_layers=num_layers,
        pool_stride=pool_stride,
        unpool_length=unpool_length)
    # Create the inputs (note that the first dimension is implicit).
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    dict_outputs = test_network([word_ids, mask, type_ids])
    all_encoder_outputs = dict_outputs["encoder_outputs"]
    pooled = dict_outputs["pooled_output"]

    expected_data_shape = [None, sequence_length, hidden_size]
    expected_pooled_shape = [None, hidden_size]
    self.assertLen(all_encoder_outputs, num_layers)
    for data in all_encoder_outputs:
      expected_data_shape[1] = unpool_length + (
          expected_data_shape[1] + pool_stride - 1 - unpool_length) // pool_stride
      print("shapes:", expected_data_shape, data.shape.as_list())
      self.assertAllEqual(expected_data_shape, data.shape.as_list())
    self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())

    # The default output dtype is float32.
    self.assertAllEqual(tf.float32, all_encoder_outputs[-1].dtype)
    self.assertAllEqual(tf.float32, pooled.dtype)

  @parameterized.named_parameters(
      ("all_sequence", None, 3, 0),
      ("output_range", 1, 1, 0),
      ("all_sequence_wit_unpool", None, 4, 1),
      ("output_range_with_unpool", 1, 1, 1),
      ("output_range_with_large_unpool", 1, 1, 2),
  )
  def test_network_invocation(self, output_range, out_seq_len, unpool_length):
    hidden_size = 32
    sequence_length = 21
    vocab_size = 57
    num_types = 7
    pool_stride = 2
    # Create a small FunnelTransformerEncoder for testing.
    test_network = funnel_transformer.FunnelTransformerEncoder(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        num_attention_heads=2,
        num_layers=3,
        type_vocab_size=num_types,
        output_range=output_range,
        pool_stride=pool_stride,
        unpool_length=unpool_length)
    # Create the inputs (note that the first dimension is implicit).
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    dict_outputs = test_network([word_ids, mask, type_ids])
    data = dict_outputs["sequence_output"]
    pooled = dict_outputs["pooled_output"]

    # Create a model based off of this network:
    model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])

    # Invoke the model. We can't validate the output data here (the model is too
    # complex) but this will catch structural runtime errors.
    batch_size = 3
    word_id_data = np.random.randint(vocab_size, size=(batch_size, sequence_length))
    mask_data = np.random.randint(2, size=(batch_size, sequence_length))
    type_id_data = np.random.randint(num_types, size=(batch_size, sequence_length))
    outputs = model.predict([word_id_data, mask_data, type_id_data])
    self.assertEqual(outputs[0].shape[1], out_seq_len)  # output_range

    # Creates a FunnelTransformerEncoder with max_sequence_length !=
    # sequence_length
    max_sequence_length = 128
    test_network = funnel_transformer.FunnelTransformerEncoder(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        max_sequence_length=max_sequence_length,
        num_attention_heads=2,
        num_layers=3,
        type_vocab_size=num_types,
        pool_stride=pool_stride)
    dict_outputs = test_network([word_ids, mask, type_ids])
    data = dict_outputs["sequence_output"]
    pooled = dict_outputs["pooled_output"]
    model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
    outputs = model.predict([word_id_data, mask_data, type_id_data])
    self.assertEqual(outputs[0].shape[1], 3)

    # Creates a FunnelTransformerEncoder with embedding_width != hidden_size
    test_network = funnel_transformer.FunnelTransformerEncoder(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        max_sequence_length=max_sequence_length,
        num_attention_heads=2,
        num_layers=3,
        type_vocab_size=num_types,
        embedding_width=16,
        pool_stride=pool_stride)
    dict_outputs = test_network([word_ids, mask, type_ids])
    data = dict_outputs["sequence_output"]
    pooled = dict_outputs["pooled_output"]
    model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
    outputs = model.predict([word_id_data, mask_data, type_id_data])
    self.assertEqual(outputs[0].shape[-1], hidden_size)
    self.assertTrue(hasattr(test_network, "_embedding_projection"))

  def test_serialize_deserialize(self):
    # Create a network object that sets all of its config options.
    kwargs = dict(
        vocab_size=100,
        hidden_size=32,
        num_layers=3,
        num_attention_heads=2,
        max_sequence_length=21,
        type_vocab_size=12,
        inner_dim=1223,
        inner_activation="relu",
        output_dropout=0.05,
        attention_dropout=0.22,
        initializer="glorot_uniform",
        output_range=-1,
        embedding_width=16,
        embedding_layer=None,
        norm_first=False,
        pool_stride=2,
        unpool_length=0)
    network = funnel_transformer.FunnelTransformerEncoder(**kwargs)

    expected_config = dict(kwargs)
    expected_config["inner_activation"] = tf.keras.activations.serialize(
        tf.keras.activations.get(expected_config["inner_activation"]))
    expected_config["initializer"] = tf.keras.initializers.serialize(
        tf.keras.initializers.get(expected_config["initializer"]))
    self.assertEqual(network.get_config(), expected_config)

    # Create another network object from the first object's config.
    new_network = funnel_transformer.FunnelTransformerEncoder.from_config(
        network.get_config())

    # If the serialization was successful, the new config should match the old.
    self.assertAllEqual(network.get_config(), new_network.get_config())

    # Tests model saving/loading.
    model_path = self.get_temp_dir() + "/model"
    network_wrapper = SingleLayerModel(network)
    # One forward-path to ensure input_shape.
    batch_size = 3
    sequence_length = 21
    vocab_size = 100
    num_types = 12
    word_id_data = np.random.randint(vocab_size, size=(batch_size, sequence_length))
    mask_data = np.random.randint(2, size=(batch_size, sequence_length))
    type_id_data = np.random.randint(num_types, size=(batch_size, sequence_length))
    _ = network_wrapper.predict([word_id_data, mask_data, type_id_data])
    network_wrapper.save(model_path)
    _ = tf.keras.models.load_model(model_path)


if __name__ == "__main__":
  tf.test.main()