ModelZoo / ResNet50_tensorflow · Commits · 96cbd362

Commit 96cbd362, authored Jul 14, 2022 by A. Unique TensorFlower
Parent: 2c4ea3d8

    Internal change

    PiperOrigin-RevId: 461063108

Showing 5 changed files with 189 additions and 11 deletions (+189 / -11):
official/nlp/modeling/layers/__init__.py                       +2   -0
official/nlp/modeling/layers/pack_optimization.py              +14  -4
official/nlp/modeling/layers/per_dim_scale_attention.py        +98  -0
official/nlp/modeling/layers/per_dim_scale_attention_test.py   +52  -0
official/nlp/modeling/layers/transformer_scaffold.py           +23  -7
official/nlp/modeling/layers/__init__.py

@@ -36,6 +36,8 @@ from official.nlp.modeling.layers.multi_channel_attention import *
 from official.nlp.modeling.layers.on_device_embedding import OnDeviceEmbedding
 from official.nlp.modeling.layers.pack_optimization import PackBertEmbeddings
 from official.nlp.modeling.layers.pack_optimization import StridedTransformerEncoderBlock
+from official.nlp.modeling.layers.pack_optimization import StridedTransformerScaffold
+from official.nlp.modeling.layers.per_dim_scale_attention import PerDimScaleAttention
 from official.nlp.modeling.layers.position_embedding import PositionEmbedding
 from official.nlp.modeling.layers.position_embedding import RelativePositionBias
 from official.nlp.modeling.layers.position_embedding import RelativePositionEmbedding
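As an aside (not part of the commit): once these two exports land, both classes resolve through the top-level layers package as well as their submodules. A minimal sketch, assuming the Model Garden `official` package is on the Python path:

import tensorflow as tf
from official.nlp.modeling import layers

# Both names now resolve via the package namespace.
attn = layers.PerDimScaleAttention(num_heads=4, key_dim=32)
print(isinstance(attn, tf.keras.layers.MultiHeadAttention))  # True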
official/nlp/modeling/layers/pack_optimization.py

@@ -202,10 +202,20 @@ class StridedTransformerScaffold(transformer_scaffold.TransformerScaffold):
   """TransformerScaffold for packing optimization to stride over inputs."""
 
   def call(self, inputs, stride: tf.Tensor, training=None):
-    if isinstance(inputs, (list, tuple)) and len(inputs) == 2:
-      input_tensor, attention_mask = inputs
+    if isinstance(inputs, (list, tuple)):
+      if len(inputs) == 2:
+        input_tensor, attention_mask = inputs
+        key_value = None
+      elif len(inputs) == 3:
+        input_tensor, key_value, attention_mask = inputs
+      else:
+        raise ValueError('Unexpected inputs to %s with length at %d' %
+                         (self.__class__, len(inputs)))
     else:
-      input_tensor, attention_mask = (inputs, None)
+      input_tensor, key_value, attention_mask = (inputs, None, None)
+
+    if key_value is None:
+      key_value = input_tensor
 
     if self._norm_first:
       source_tensor = input_tensor[:, ::stride, :]

@@ -215,7 +225,7 @@ class StridedTransformerScaffold(transformer_scaffold.TransformerScaffold):
       target_tensor = input_tensor[:, ::stride, :]
 
     attention_output = self._attention_layer(
-        query=target_tensor, value=input_tensor, attention_mask=attention_mask,
+        query=target_tensor, value=key_value, attention_mask=attention_mask,
        training=training)
     attention_output = self._attention_dropout(attention_output,
                                                training=training)
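As an aside (not part of the commit): the `[:, ::stride, :]` slice is what makes the scaffold "strided": queries are drawn from every `stride`-th position of the packed sequence, while the key/value tensor still covers the full sequence. A standalone sketch of that slicing in plain TensorFlow, with made-up shapes:

import tensorflow as tf

x = tf.reshape(tf.range(2 * 8 * 4, dtype=tf.float32), [2, 8, 4])  # [batch, seq, hidden]
stride = 2

# Queries come from every `stride`-th position; keys/values keep the full sequence.
target = x[:, ::stride, :]
print(x.shape, target.shape)  # (2, 8, 4) (2, 4, 4)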
official/nlp/modeling/layers/per_dim_scale_attention.py (new file, 0 → 100644)

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Keras-based attention layer with learnable per dim scaling."""
import numpy as np
import tensorflow as tf


@tf.keras.utils.register_keras_serializable(package='Text')
class PerDimScaleAttention(tf.keras.layers.MultiHeadAttention):
  """Learn scales for individual dims.

  It can improve quality but might hurt training stability.
  """

  def _build_from_signature(self, query, value, key=None):
    super()._build_from_signature(query=query, value=value, key=key)
    # pytype: disable=attribute-error
    self._scale_dim = self._key_dim
    self.per_dim_scale = self.add_weight(
        name='per_dim_scale',
        shape=(self._scale_dim,),
        initializer='zeros',
        dtype=self.dtype,
        trainable=True)

  def _scale_query(self, query):
    # 1.0/tf.nn.softplus(0.0) = 1.442695041. Hard code this number so that we
    # can avoid unnecessary XLA op fusion mess on TPU.
    r_softplus_0 = 1.442695041
    scale = tf.constant(
        r_softplus_0 / np.sqrt(float(self._scale_dim)), dtype=query.dtype)
    scale *= tf.nn.softplus(self.per_dim_scale)
    return query * scale

  def _compute_attention(self,
                         query,
                         key,
                         value,
                         attention_mask=None,
                         training=None):
    query = self._scale_query(query)

    attention_scores = tf.einsum(self._dot_product_equation, key, query)
    attention_scores = self._masked_softmax(attention_scores, attention_mask)
    attention_scores_dropout = self._dropout_layer(
        attention_scores, training=training)

    # `context_layer` = [B, T, N, H]
    attention_output = tf.einsum(self._combine_equation,
                                 attention_scores_dropout, value)
    return attention_output, attention_scores

  def call(self,
           query,
           value,
           key=None,
           attention_mask=None,
           return_attention_scores=False,
           training=None):
    if not self._built_from_signature:
      self._build_from_signature(query=query, value=value, key=key)
    if key is None:
      key = value

    # N = `num_attention_heads`
    # H = `size_per_head`
    # `query` = [B, T, N ,H]
    query = self._query_dense(query)

    # `key` = [B, S, N, H]
    key = self._key_dense(key)

    # `value` = [B, S, N, H]
    value = self._value_dense(value)

    attention_output, attention_scores = self._compute_attention(
        query, key, value, attention_mask, training)
    attention_output = self._output_dense(attention_output)

    if return_attention_scores:
      return attention_output, attention_scores
    return attention_output
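As an aside (not part of the commit): because `per_dim_scale` is zero-initialized, `softplus(per_dim_scale)` starts at `softplus(0) = ln 2`, and the hard-coded `r_softplus_0 = 1/softplus(0)` cancels it, so the layer begins as ordinary `1/sqrt(key_dim)` scaled dot-product attention and only learns per-dimension deviations during training. A quick standalone check of that identity:

import numpy as np
import tensorflow as tf

key_dim = 64
r_softplus_0 = 1.442695041
per_dim_scale = tf.zeros([key_dim])  # the layer's initial 'per_dim_scale' weight

# At initialization the per-dim scale collapses to the usual 1/sqrt(key_dim).
scale = r_softplus_0 / np.sqrt(float(key_dim)) * tf.nn.softplus(per_dim_scale)
print(np.allclose(scale.numpy(), 1.0 / np.sqrt(key_dim)))  # True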
official/nlp/modeling/layers/per_dim_scale_attention_test.py (new file, 0 → 100644)

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for PerDimScaleAttention."""
import tensorflow as tf

from official.nlp.modeling.layers import per_dim_scale_attention as attention


class PerDimScaleAttentionTest(tf.test.TestCase):

  def test_attention(self):
    num_heads = 12
    key_dim = 64
    seq_length = 1024
    batch_size = 2
    test_layer = attention.PerDimScaleAttention(
        num_heads=num_heads, key_dim=key_dim)
    query = tf.random.normal(
        shape=(batch_size, seq_length, key_dim * num_heads))
    value = query
    output = test_layer(query=query, value=value)
    self.assertEqual(output.shape,
                     [batch_size, seq_length, key_dim * num_heads])

  def test_config(self):
    num_heads = 12
    key_dim = 64
    test_layer = attention.PerDimScaleAttention(
        num_heads=num_heads, key_dim=key_dim)
    print(test_layer.get_config())
    new_layer = attention.PerDimScaleAttention.from_config(
        test_layer.get_config())
    # If the serialization was successful, the new config should match the old.
    self.assertAllEqual(test_layer.get_config(), new_layer.get_config())


if __name__ == '__main__':
  tf.test.main()
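As an aside (not part of the commit): the tests above only cover unmasked self-attention and config round-tripping. A hedged sketch of calling the layer with an attention mask and returned scores, using the standard Keras MultiHeadAttention shape conventions:

import tensorflow as tf
from official.nlp.modeling.layers import per_dim_scale_attention as attention

layer = attention.PerDimScaleAttention(num_heads=2, key_dim=8)
query = tf.random.normal([1, 4, 16])
mask = tf.ones([1, 4, 4])  # [batch, target_len, source_len]; 1 = may attend

output, scores = layer(
    query=query, value=query, attention_mask=mask,
    return_attention_scores=True)
print(output.shape)  # (1, 4, 16)
print(scores.shape)  # (1, 2, 4, 4): [batch, heads, target_len, source_len]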
official/nlp/modeling/layers/transformer_scaffold.py

@@ -119,9 +119,15 @@ class TransformerScaffold(tf.keras.layers.Layer):
     self._bias_constraint = tf.keras.constraints.get(bias_constraint)
 
   def build(self, input_shape):
-    input_tensor_shape = input_shape[0] if (
-        len(input_shape) == 2) else input_shape
-    input_tensor_shape = tf.TensorShape(input_tensor_shape)
+    if isinstance(input_shape, tf.TensorShape):
+      input_tensor_shape = input_shape
+    elif isinstance(input_shape, (list, tuple)):
+      input_tensor_shape = tf.TensorShape(input_shape[0])
+    else:
+      raise ValueError(
+          "The type of input shape argument is not supported, got: %s" %
+          type(input_shape))
     if len(input_tensor_shape.as_list()) != 3:
       raise ValueError(
           "TransformerScaffold expects a three-dimensional input of "

@@ -271,17 +277,27 @@ class TransformerScaffold(tf.keras.layers.Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
   def call(self, inputs, training=None):
-    if isinstance(inputs, (list, tuple)) and len(inputs) == 2:
-      input_tensor, attention_mask = inputs
+    if isinstance(inputs, (list, tuple)):
+      if len(inputs) == 2:
+        input_tensor, attention_mask = inputs
+        key_value = None
+      elif len(inputs) == 3:
+        input_tensor, key_value, attention_mask = inputs
+      else:
+        raise ValueError("Unexpected inputs to %s with length at %d" %
+                         (self.__class__, len(inputs)))
     else:
-      input_tensor, attention_mask = (inputs, None)
+      input_tensor, key_value, attention_mask = (inputs, None, None)
+
+    if key_value is None:
+      key_value = input_tensor
 
     if self._norm_first:
       source_tensor = input_tensor
       input_tensor = self._attention_layer_norm(input_tensor,
                                                 training=training)
 
     attention_output = self._attention_layer(
-        query=input_tensor, value=input_tensor, attention_mask=attention_mask,
+        query=input_tensor, value=key_value, attention_mask=attention_mask,
         training=training)
     attention_output = self._attention_dropout(attention_output,
                                                training=training)
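As an aside (not part of the commit): with `key_value` threaded through `call`, the scaffold can now run cross-attention by passing a three-element input `[input_tensor, key_value, attention_mask]`, and it falls back to self-attention when no memory tensor is given. A rough sketch of that calling convention; the constructor arguments below (`num_attention_heads`, `inner_dim`, `inner_activation`, `attention_cls`) are assumptions based on the surrounding Model Garden code and should be checked against the actual TransformerScaffold signature:

import tensorflow as tf
from official.nlp.modeling import layers

# Assumed constructor arguments; verify against TransformerScaffold.__init__.
block = layers.TransformerScaffold(
    num_attention_heads=2,
    inner_dim=64,
    inner_activation='relu',
    attention_cls=layers.PerDimScaleAttention)

inputs = tf.random.normal([1, 8, 32])   # queries: [batch, target_len, hidden]
memory = tf.random.normal([1, 16, 32])  # key/value memory: [batch, source_len, hidden]
mask = tf.ones([1, 8, 16])              # [batch, target_len, source_len]

# New three-element input form: (input_tensor, key_value, attention_mask).
output = block([inputs, memory, mask])
print(output.shape)  # (1, 8, 32)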