ModelZoo / ResNet50_tensorflow · Commit 23db25e9

Authored May 17, 2021 by Frederick Liu
Committed May 17, 2021 by A. Unique TensorFlower

[efficient] Promote bigbird to modeling/layers.

PiperOrigin-RevId: 374267447

Parent: 856622d3
Showing 6 changed files with 23 additions and 15 deletions (+23 -15)
official/nlp/configs/encoders.py                        +2  -3
official/nlp/modeling/layers/README.md                  +11 -7
official/nlp/modeling/layers/__init__.py                +2  -0
official/nlp/modeling/layers/bigbird_attention.py       +0  -0
official/nlp/modeling/layers/bigbird_attention_test.py  +1  -1
official/nlp/projects/bigbird/encoder.py                +7  -4
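The practical effect of this commit for downstream code is an import-path change: the BigBird attention classes move out of official/nlp/projects/bigbird and become part of the public modeling layers API. A minimal migration sketch, assuming the TensorFlow Model Garden `official` package is on the Python path; the variable names below are illustrative, not part of the commit:

# Before this commit, BigBird attention was pulled from the projects area:
#   from official.nlp.projects.bigbird import attention as bigbird_attention
#   attention_cls = bigbird_attention.BigBirdAttention
#   mask_cls = bigbird_attention.BigBirdMasks

# After this commit, the same classes are exposed through the layers namespace:
from official.nlp.modeling import layers

attention_cls = layers.BigBirdAttention
mask_cls = layers.BigBirdMasks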
official/nlp/configs/encoders.py

@@ -26,7 +26,6 @@ from official.modeling import hyperparams
 from official.modeling import tf_utils
 from official.nlp.modeling import layers
 from official.nlp.modeling import networks
-from official.nlp.projects.bigbird import attention as bigbird_attention


 @dataclasses.dataclass
@@ -301,14 +300,14 @@ def build_encoder(config: EncoderConfig,
         attention_dropout_rate=encoder_cfg.attention_dropout_rate,
         kernel_initializer=tf.keras.initializers.TruncatedNormal(
             stddev=encoder_cfg.initializer_range),
-        attention_cls=bigbird_attention.BigBirdAttention,
+        attention_cls=layers.BigBirdAttention,
         attention_cfg=attention_cfg)
     kwargs = dict(
         embedding_cfg=embedding_cfg,
         hidden_cls=layers.TransformerScaffold,
         hidden_cfg=hidden_cfg,
         num_hidden_instances=encoder_cfg.num_layers,
-        mask_cls=bigbird_attention.BigBirdMasks,
+        mask_cls=layers.BigBirdMasks,
         mask_cfg=dict(block_size=encoder_cfg.block_size),
         pooled_output_dim=encoder_cfg.hidden_size,
         pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(
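For readers skimming the hunk above, a condensed sketch of the scaffold configuration that build_encoder assembles after this change. Only keys visible in the hunk are shown, the literal values are placeholders rather than values from a real EncoderConfig, and the num_heads/key_dim form of attention_cfg is borrowed from encoder.py further below:

import tensorflow as tf

from official.nlp.modeling import layers

initializer = tf.keras.initializers.TruncatedNormal(stddev=0.02)  # placeholder stddev

hidden_cfg = dict(
    attention_dropout_rate=0.1,  # placeholder
    kernel_initializer=initializer,
    attention_cls=layers.BigBirdAttention,  # was bigbird_attention.BigBirdAttention
    attention_cfg=dict(num_heads=12, key_dim=64),
)
kwargs = dict(
    hidden_cls=layers.TransformerScaffold,
    hidden_cfg=hidden_cfg,
    num_hidden_instances=12,
    mask_cls=layers.BigBirdMasks,  # was bigbird_attention.BigBirdMasks
    mask_cfg=dict(block_size=64),
)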
official/nlp/modeling/layers/README.md

@@ -8,6 +8,10 @@ assemble new `tf.keras` layers or models.
     ["Attention Is All You Need"](https://arxiv.org/abs/1706.03762). If
     `from_tensor` and `to_tensor` are the same, then this is self-attention.

+*   [BigBirdAttention](bigbird_attention.py) implements a sparse attention
+    mechanism that reduces this quadratic dependency to linear described in
+    ["Big Bird: Transformers for Longer Sequences"](https://arxiv.org/abs/2007.14062).
+
 *   [CachedAttention](attention.py) implements an attention layer with cache
     used for auto-regressive decoding.

@@ -80,20 +84,20 @@ assemble new `tf.keras` layers or models.
 *   [MultiHeadRelativeAttention](relative_attention.py) implements a variant
     of multi-head attention with support for relative position encodings as
-    described in "Transformer-XL: Attentive Language Models Beyond a
-    Fixed-Length Context"(https://arxiv.org/abs/1901.02860). This also has
+    described in ["Transformer-XL: Attentive Language Models Beyond a
+    Fixed-Length Context"](https://arxiv.org/abs/1901.02860). This also has
     extended support for segment-based attention, a re-parameterization
-    introduced in "XLNet: Generalized Autoregressive Pretraining for Language
-    Understanding" (https://arxiv.org/abs/1906.08237).
+    introduced in ["XLNet: Generalized Autoregressive Pretraining for Language
+    Understanding"](https://arxiv.org/abs/1906.08237).

 *   [TwoStreamRelativeAttention](relative_attention.py) implements a variant
-    of multi-head relative attention as described in "XLNet: Generalized
-    Autoregressive Pretraining for Language Understanding"
+    of multi-head relative attention as described in ["XLNet: Generalized
+    Autoregressive Pretraining for Language Understanding"]
     (https://arxiv.org/abs/1906.08237). This takes in a query and content
     stream and applies self attention.

 *   [TransformerXL](transformer_xl.py) implements Transformer XL introduced in
-    "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"
+    ["Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"]
     (https://arxiv.org/abs/1901.02860). This contains `TransformerXLBlock`, a
     block containing either one or two stream relative self-attention as well as
     subsequent feedforward networks. It also contains `TransformerXL`, which
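Illustrative only: the sparse-attention layer described in the new README entry can now be constructed from the public layers namespace. The num_heads and key_dim arguments mirror the attention_cfg used in encoder.py below; BigBird-specific constructor arguments (block sizes, number of random blocks) are assumed to keep their defaults and are not shown here:

from official.nlp.modeling import layers

# Construction-only sketch; 12 heads of size 64 mirrors the BERT-base-like
# defaults that appear elsewhere in this commit.
sparse_attention = layers.BigBirdAttention(num_heads=12, key_dim=64)
block_masks = layers.BigBirdMasks(block_size=64)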
official/nlp/modeling/layers/__init__.py

@@ -18,6 +18,8 @@ They can be used to assemble new `tf.keras` layers or models.
 """
 # pylint: disable=wildcard-import
 from official.nlp.modeling.layers.attention import *
+from official.nlp.modeling.layers.bigbird_attention import BigBirdAttention
+from official.nlp.modeling.layers.bigbird_attention import BigBirdMasks
 from official.nlp.modeling.layers.cls_head import *
 from official.nlp.modeling.layers.dense_einsum import DenseEinsum
 from official.nlp.modeling.layers.gated_feedforward import GatedFeedforward
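A quick check of what the two added exports provide, assuming the package imports cleanly: the explicit names and the layers namespace resolve to the same classes.

from official.nlp.modeling import layers
from official.nlp.modeling.layers import BigBirdAttention, BigBirdMasks

assert layers.BigBirdAttention is BigBirdAttention
assert layers.BigBirdMasks is BigBirdMasks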
official/nlp/projects/bigbird/attention.py → official/nlp/modeling/layers/bigbird_attention.py

File moved (no content changes)
official/nlp/projects/bigbird/attention_test.py → official/nlp/modeling/layers/bigbird_attention_test.py

@@ -16,7 +16,7 @@
 import tensorflow as tf

-from official.nlp.projects.bigbird import attention
+from official.nlp.modeling.layers import bigbird_attention as attention


 class BigbirdAttentionTest(tf.test.TestCase):
official/nlp/projects/bigbird/encoder.py

@@ -20,11 +20,13 @@ import tensorflow as tf
 from official.modeling import activations
 from official.nlp import keras_nlp
 from official.nlp.modeling import layers
-from official.nlp.projects.bigbird import attention
 from official.nlp.projects.bigbird import recompute_grad
 from official.nlp.projects.bigbird import recomputing_dropout

+_MAX_SEQ_LEN = 4096
+

 class RecomputeTransformerLayer(layers.TransformerScaffold):
   """Transformer layer that recomputes the forward pass during backpropagation."""
@@ -86,7 +88,7 @@ class BigBirdEncoder(tf.keras.Model):
                hidden_size=768,
                num_layers=12,
                num_attention_heads=12,
-               max_position_embeddings=attention.MAX_SEQ_LEN,
+               max_position_embeddings=_MAX_SEQ_LEN,
                type_vocab_size=16,
                intermediate_size=3072,
                block_size=64,
@@ -177,7 +179,8 @@ class BigBirdEncoder(tf.keras.Model):
     self._transformer_layers = []
     data = embeddings
-    masks = attention.BigBirdMasks(block_size=block_size)(data, mask)
+    masks = layers.BigBirdMasks(block_size=block_size)(data, mask)
     encoder_outputs = []
+    attn_head_dim = hidden_size // num_attention_heads
     for i in range(num_layers):
@@ -185,7 +188,7 @@ class BigBirdEncoder(tf.keras.Model):
           num_attention_heads,
           intermediate_size,
           activation,
-          attention_cls=attention.BigBirdAttention,
+          attention_cls=layers.BigBirdAttention,
           attention_cfg=dict(num_heads=num_attention_heads,
                              key_dim=attn_head_dim,
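To make the mask wiring above concrete, a minimal sketch of the pattern BigBirdEncoder now uses with the promoted layers.* names. The tensors and sizes are illustrative stand-ins for the encoder's embedding output and padding mask, not values taken from the real model:

import tensorflow as tf

from official.nlp.modeling import layers

block_size = 64
batch_size, seq_len, hidden_size, num_attention_heads = 2, 4096, 768, 12

embeddings = tf.random.uniform((batch_size, seq_len, hidden_size))  # stand-in for `data`
mask = tf.ones((batch_size, seq_len))  # stand-in padding mask

# BigBirdMasks turns the ordinary padding mask into the block-sparse masks
# consumed by BigBirdAttention (same call shape as in the hunk above).
masks = layers.BigBirdMasks(block_size=block_size)(embeddings, mask)

# Per-head dimension, computed as in the added attn_head_dim line above.
attn_head_dim = hidden_size // num_attention_heads  # 64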