Project: ModelZoo / ResNet50_tensorflow

Commit 32e4ca51
Authored Nov 28, 2023 by qianyj
Parents: 9485aa1d, 71060f67

Update code to v2.11.0
Changes

The full commit touches 775+ files; this page of the diff shows 20 changed files, with 453 additions and 87 deletions (+453, -87):
  official/nlp/modeling/models/bert_pretrainer.py              +5   -3
  official/nlp/modeling/models/bert_pretrainer_test.py          +1   -1
  official/nlp/modeling/models/bert_span_labeler.py             +1   -1
  official/nlp/modeling/models/bert_span_labeler_test.py        +1   -1
  official/nlp/modeling/models/bert_token_classifier.py         +1   -1
  official/nlp/modeling/models/bert_token_classifier_test.py    +1   -1
  official/nlp/modeling/models/dual_encoder.py                  +1   -1
  official/nlp/modeling/models/dual_encoder_test.py             +1   -1
  official/nlp/modeling/models/electra_pretrainer.py            +6   -5
  official/nlp/modeling/models/electra_pretrainer_test.py       +1   -1
  official/nlp/modeling/models/seq2seq_transformer.py           +1   -1
  official/nlp/modeling/models/seq2seq_transformer_test.py      +1   -1
  official/nlp/modeling/models/t5.py                            +197 -53
  official/nlp/modeling/models/t5_test.py                       +215 -3
  official/nlp/modeling/models/xlnet.py                         +1   -1
  official/nlp/modeling/models/xlnet_test.py                    +1   -1
  official/nlp/modeling/networks/README.md                      +5   -0
  official/nlp/modeling/networks/__init__.py                    +2   -1
  official/nlp/modeling/networks/albert_encoder.py              +10  -9
  official/nlp/modeling/networks/albert_encoder_test.py         +1   -1
official/nlp/modeling/models/bert_pretrainer.py

-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

@@ -22,6 +22,7 @@ from absl import logging
 import gin
 import tensorflow as tf
+from official.modeling import tf_utils
 from official.nlp.modeling import layers
 from official.nlp.modeling import networks

@@ -102,7 +103,7 @@ class BertPretrainer(tf.keras.Model):
     masked_lm = layers.MaskedLM(
         embedding_table=embedding_table,
         activation=activation,
-        initializer=initializer,
+        initializer=tf_utils.clone_initializer(initializer),
         output=output,
         name='cls/predictions')
     lm_outputs = masked_lm(

@@ -111,7 +112,7 @@ class BertPretrainer(tf.keras.Model):
     classification = networks.Classification(
         input_width=cls_output.shape[-1],
         num_classes=num_classes,
-        initializer=initializer,
+        initializer=tf_utils.clone_initializer(initializer),
         output=output,
         name='classification')
     sentence_outputs = classification(cls_output)

@@ -199,6 +200,7 @@ class BertPretrainerV2(tf.keras.Model):
     self._config = {
         'encoder_network': encoder_network,
         'mlm_initializer': mlm_initializer,
         'mlm_activation': mlm_activation,
         'classification_heads': classification_heads,
         'name': name,
     }
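Note: most of the per-file changes in this commit replace a shared initializer argument with tf_utils.clone_initializer(initializer), so that every layer receives its own initializer instance built from the same configuration rather than all layers sharing one object. A minimal sketch of the pattern, assuming clone_initializer simply rebuilds the initializer from its Keras config (the helper below is an illustration, not the upstream implementation):

import tensorflow as tf

def clone_initializer(initializer):
  # Rebuild a fresh initializer of the same class and config, so each layer
  # gets its own instance instead of sharing one object.
  if isinstance(initializer, tf.keras.initializers.Initializer):
    return initializer.__class__.from_config(initializer.get_config())
  # Strings such as 'glorot_uniform' are stateless and can be shared as-is.
  return initializer

initializer = tf.keras.initializers.TruncatedNormal(stddev=0.02)
dense_a = tf.keras.layers.Dense(8, kernel_initializer=clone_initializer(initializer))
dense_b = tf.keras.layers.Dense(8, kernel_initializer=clone_initializer(initializer))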
official/nlp/modeling/models/bert_pretrainer_test.py

Only change: the license header year is updated from 2021 to 2022.
official/nlp/modeling/models/bert_span_labeler.py

Only change: the license header year is updated from 2021 to 2022.
official/nlp/modeling/models/bert_span_labeler_test.py

Only change: the license header year is updated from 2021 to 2022.
official/nlp/modeling/models/bert_token_classifier.py

Only change: the license header year is updated from 2021 to 2022.
official/nlp/modeling/models/bert_token_classifier_test.py

Only change: the license header year is updated from 2021 to 2022.
official/nlp/modeling/models/dual_encoder.py

Only change: the license header year is updated from 2021 to 2022.
official/nlp/modeling/models/dual_encoder_test.py

Only change: the license header year is updated from 2021 to 2022.
official/nlp/modeling/models/electra_pretrainer.py

-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

@@ -96,21 +96,22 @@ class ElectraPretrainer(tf.keras.Model):
     self.masked_lm = layers.MaskedLM(
         embedding_table=generator_network.get_embedding_table(),
         activation=mlm_activation,
-        initializer=mlm_initializer,
+        initializer=tf_utils.clone_initializer(mlm_initializer),
         output=output_type,
         name='generator_masked_lm')
     self.classification = layers.ClassificationHead(
         inner_dim=generator_network.get_config()['hidden_size'],
         num_classes=num_classes,
-        initializer=mlm_initializer,
+        initializer=tf_utils.clone_initializer(mlm_initializer),
         name='generator_classification_head')
     self.discriminator_projection = tf.keras.layers.Dense(
         units=discriminator_network.get_config()['hidden_size'],
         activation=mlm_activation,
-        kernel_initializer=mlm_initializer,
+        kernel_initializer=tf_utils.clone_initializer(mlm_initializer),
         name='discriminator_projection_head')
     self.discriminator_head = tf.keras.layers.Dense(
-        units=1, kernel_initializer=mlm_initializer)
+        units=1,
+        kernel_initializer=tf_utils.clone_initializer(mlm_initializer))

   def call(self, inputs):
     """ELECTRA forward pass.
official/nlp/modeling/models/electra_pretrainer_test.py

Only change: the license header year is updated from 2021 to 2022.
official/nlp/modeling/models/seq2seq_transformer.py

Only change: the license header year is updated from 2021 to 2022.
official/nlp/modeling/models/seq2seq_transformer_test.py

Only change: the license header year is updated from 2021 to 2022.
official/nlp/modeling/models/t5.py

-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

@@ -55,6 +55,7 @@ class Module(tf.Module):
                       initializer: Initializer,
                       dtype: tf.DType = tf.float32,
                       **kwargs):
+    initializer = tf_utils.clone_initializer(initializer)
     return tf.Variable(
         initializer(shape, dtype=dtype, **kwargs), name=name)

   def read_variable(self,
@@ -588,7 +589,8 @@ class MultiHeadAttention(Module):
     init_std_rescaling = tf.math.sqrt(tf.cast(self.d_kv, dtype=self.dtype))
     query_w_init = (
         lambda *args, **kwargs: (  # pylint: disable=g-long-lambda
-            weight_initializer(*args, **kwargs) / init_std_rescaling))
+            tf_utils.clone_initializer(weight_initializer)(*args, **kwargs) /
+            init_std_rescaling))
     self.q = Linear3D(
         self.d_model,
         self.d_kv,
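Note: the change above keeps the existing trick of dividing the query initializer's output by sqrt(d_kv); only the initializer is now cloned first. A standalone sketch of that wrapper pattern (the values are illustrative and unrelated to the model configuration):

import tensorflow as tf

d_kv = 4
weight_initializer = tf.keras.initializers.TruncatedNormal(stddev=1.0)
init_std_rescaling = tf.math.sqrt(tf.cast(d_kv, tf.float32))

# Wrap the initializer so every tensor it produces is divided by sqrt(d_kv).
query_w_init = (
    lambda *args, **kwargs: weight_initializer(*args, **kwargs) / init_std_rescaling)

weights = query_w_init((8, d_kv))  # shape (8, 4), stddev roughly 1 / sqrt(4)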
@@ -1004,6 +1006,7 @@ class T5TransformerParams:
   num_heads: int
   d_ff: int
   vocab_size: int
+  target_vocab_size: Optional[int] = None
   dropout_rate: float = 0.0
   layer_norm_epsilon: float = 1e-6
   shared_embedding: bool = False

@@ -1020,6 +1023,9 @@ class T5TransformerParams:
   num_decoder_layers: Optional[int] = None
   one_hot_embedding: bool = True
   layer_sharing: bool = False
+  # If true, uses one relative embedding for all encoder layers and one for all
+  # decoder layers. Otherwise, have relative embedding for each layer.
+  use_shared_relative_position_bias: bool = True


 class Encoder(Module):
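Note: the two new T5TransformerParams fields can simply be passed when the config is built. A small illustrative example (all values here are arbitrary; target_vocab_size defaults to None, in which case the decoder falls back to vocab_size):

from official.nlp.modeling.models import t5

config = t5.T5TransformerParams(
    num_layers=2,
    d_model=8,
    d_kv=4,
    num_heads=4,
    d_ff=32,
    vocab_size=100,
    target_vocab_size=80,                      # new: decoder-side vocabulary size
    use_shared_relative_position_bias=False)   # new: one relative embedding per layer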
@@ -1048,6 +1054,7 @@ class Encoder(Module):
       self.input_embed = shared_embedding
+    # Creates an alias to the input embed for encoder-only models.
+    self.word_embed = self.input_embed
+    if config.use_shared_relative_position_bias:
       self.relative_embedding = RelativePositionEmbedding(
           num_heads=self.config.num_heads,
           relative_attention_num_buckets=self.config.relative_attention_num_buckets,

@@ -1059,6 +1066,22 @@ class Encoder(Module):
           dtype=self.dtype,
           compute_dtype=self.compute_dtype,
           name="relative_posemb")
+    else:
+      self.relative_embeddings = []
+      for layer_idx in range(self.config.num_layers):
+        relative_embedding = RelativePositionEmbedding(
+            num_heads=self.config.num_heads,
+            relative_attention_num_buckets=self.config.relative_attention_num_buckets,
+            relative_attention_max_distance=self.config.relative_attention_max_distance,
+            bidirectional=self.config.bidirectional,
+            embeddings_initializer=self.config.relative_embeddings_initializer,
+            dtype=self.dtype,
+            compute_dtype=self.compute_dtype,
+            name=f"relative_posemb_{layer_idx}")
+        self.relative_embeddings.append(relative_embedding)
     self.input_dropout = Dropout(self.config.dropout_rate,)
     self.encoder_layers = []
     for layer_idx in range(self.config.num_layers):
@@ -1086,12 +1109,38 @@ class Encoder(Module):
     self.output_dropout = Dropout(self.config.dropout_rate,)

   @tf.Module.with_name_scope
-  def __call__(self, inputs, encoder_mask=None, training=False):
+  def get_relpos_bias(self,
+                      input_length: int,
+                      dense_inputs: tf.Tensor,
+                      layer_idx: Optional[int] = None) -> tf.Tensor:
+    if self.config.use_shared_relative_position_bias:
+      position_bias = self.relative_embedding(input_length, input_length)
+    else:
+      position_bias = self.relative_embeddings[layer_idx](input_length,
+                                                          input_length)
+    if dense_inputs is not None:
+      # Here we ignore relative position bias for dense embeddings.
+      # TODO(yejiayu): If we proceed to video use cases, rework this part.
+      dense_input_length = tf_utils.get_shape_list(dense_inputs)[1]
+      # Position bias shape: [batch, 1, len, len]
+      paddings = tf.constant([[0, 0], [0, 0], [0, dense_input_length],
+                              [0, dense_input_length]])
+      position_bias = tf.pad(position_bias, paddings, "CONSTANT")
+    return position_bias
+
+  @tf.Module.with_name_scope
+  def __call__(self,
+               inputs=None,
+               encoder_mask=None,
+               dense_inputs=None,
+               training=False):
     """Applies Transformer model on the inputs.

     Args:
-      inputs: input data
+      inputs: input word ids. Optional if dense data are provided.
       encoder_mask: the encoder self-attention mask.
+      dense_inputs: dense input data. Concat after the embedding if word ids
+        are provided.
       training: whether it is training pass, affecting dropouts.

     Returns:
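Note: for dense inputs the relative position bias is not learned; it is simply zero-padded so that its last two axes cover the concatenated sequence. A standalone shape check of that padding step (shapes are illustrative):

import tensorflow as tf

input_length = 8        # token positions with a learned relative bias
dense_input_length = 2  # dense positions receive zero bias

position_bias = tf.zeros([1, 1, input_length, input_length])  # [batch, 1, len, len]
paddings = tf.constant([[0, 0], [0, 0],
                        [0, dense_input_length], [0, dense_input_length]])
padded = tf.pad(position_bias, paddings, "CONSTANT")
print(padded.shape)  # (1, 1, 10, 10); rows/columns for dense positions are zero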
@@ -1101,14 +1150,26 @@ class Encoder(Module):
     if encoder_mask is not None:
       encoder_mask = tf.cast(encoder_mask, self.compute_dtype)
     cfg = self.config
-    x = self.input_embed(inputs, one_hot=cfg.one_hot_embedding)
+    inputs_array = []
+    if inputs is not None:
+      inputs_array.append(
+          self.input_embed(inputs, one_hot=cfg.one_hot_embedding))
+    if dense_inputs is not None:
+      inputs_array.append(dense_inputs)
+    if not inputs_array:
+      raise ValueError("At least one of inputs and dense_inputs must not be "
+                       "None.")
+    x = tf.concat(inputs_array, axis=1)
     tensor_shape = tf_utils.get_shape_list(x)
     tensor_shape[-2] = 1
     x = self.input_dropout(x, noise_shape=tensor_shape, training=training)
-    input_length = tf_utils.get_shape_list(inputs)[1]
-    position_bias = self.relative_embedding(input_length, input_length)
+    if inputs is not None:
+      input_length = tf_utils.get_shape_list(inputs)[1]
+    else:
+      input_length = 0
     for i in range(cfg.num_layers):
+      position_bias = self.get_relpos_bias(input_length, dense_inputs, i)
       x = self.encoder_layers[i](
           x,
           attention_mask=encoder_mask,
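Note: with the new signature the encoder can be driven by word ids, by dense vectors, or by both, which is exactly what the tests added to t5_test.py below exercise. A minimal usage sketch mirroring those tests (configuration values are illustrative):

import tensorflow as tf
from official.nlp.modeling.models import t5

config = t5.T5TransformerParams(
    num_layers=2,
    d_model=4,
    d_kv=3,
    num_heads=4,
    d_ff=16,
    vocab_size=10,
    vocab_embeddings_initializer=tf.keras.initializers.Ones(),
    relative_embeddings_initializer=tf.keras.initializers.Ones())
encoder = t5.Encoder(config, compute_dtype=tf.float32)

# Token ids plus dense vectors: the dense features are concatenated after the
# token embeddings, so the output length is 8 + 2 = 10.
encoded = encoder(tf.zeros((4, 8), dtype=tf.int32),
                  dense_inputs=tf.ones((4, 2, 4), dtype=tf.float32))

# Dense-only input is now also allowed.
dense_only = encoder(dense_inputs=tf.ones((4, 2, 4), dtype=tf.float32))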
@@ -1133,11 +1194,15 @@ class Decoder(Module):
     self.compute_dtype = compute_dtype
     if self.config.num_decoder_layers is None:
       self.config.num_decoder_layers = self.config.num_layers
+    if not hasattr(self.config,
+                   "target_vocab_size") or self.config.target_vocab_size is None:
+      self.config.target_vocab_size = self.config.vocab_size
     with self.name_scope:
       # Target Embedding.
       if shared_embedding is None:
         self.target_embed = Embed(
-            vocab_size=self.config.vocab_size,
+            vocab_size=self.config.target_vocab_size,
             features=self.config.d_model,
             embeddings_initializer=self.config.vocab_embeddings_initializer,
             dtype=self.dtype,

@@ -1147,6 +1212,7 @@ class Decoder(Module):
         self.target_embed = shared_embedding
     self.target_dropout = Dropout(self.config.dropout_rate,)
+    # Position bias for the target self attention.
     if config.use_shared_relative_position_bias:
       self.relative_embedding = RelativePositionEmbedding(
           num_heads=self.config.num_heads,
           relative_attention_num_buckets=self.config.relative_attention_num_buckets,
@@ -1158,6 +1224,22 @@ class Decoder(Module):
           dtype=self.dtype,
           compute_dtype=self.compute_dtype,
           name="relative_posemb")
+    else:
+      self.relative_embeddings = []
+      for layer_idx in range(self.config.num_decoder_layers):
+        relative_embedding = RelativePositionEmbedding(
+            num_heads=self.config.num_heads,
+            relative_attention_num_buckets=self.config.relative_attention_num_buckets,
+            relative_attention_max_distance=self.config.relative_attention_max_distance,
+            bidirectional=self.config.bidirectional,
+            embeddings_initializer=self.config.relative_embeddings_initializer,
+            dtype=self.dtype,
+            compute_dtype=self.compute_dtype,
+            name=f"relative_posemb_{layer_idx}")
+        self.relative_embeddings.append(relative_embedding)
     self.decoder_layers = []
     for layer_idx in range(self.config.num_decoder_layers):
       if self.config.layer_sharing and layer_idx > 0:
@@ -1185,11 +1267,18 @@ class Decoder(Module):
     if not self.config.logits_via_embedding:
       self.logits_dense = Linear(
           in_features=self.config.d_model,
-          out_features=self.config.vocab_size,
+          out_features=self.config.target_vocab_size,
           use_bias=False,
           dtype=self.dtype,
           name="logits")

+  @tf.Module.with_name_scope
+  def get_relpos_bias(self, input_length: int, layer_idx: int) -> tf.Tensor:
+    if self.config.use_shared_relative_position_bias:
+      return self.relative_embedding(input_length, input_length)
+    else:
+      return self.relative_embeddings[layer_idx](input_length, input_length)
+
   @tf.Module.with_name_scope
   def __call__(self,
                decoder_input_tokens,
@@ -1208,7 +1297,7 @@ class Decoder(Module):
       encoded: the encoder outputs.
       decoder_mask: the decoder self-attention mask.
       encoder_decoder_mask: the cross-attention mask.
-      decode: Whether to perform autoaggressive decoding.
+      decode: Whether to perform autoregressive decoding.
       decode_position: integer, the position to decode.
       cache: The cache dictionary of key, value tensors.
       max_decode_len: An optional integer specifying the maximum decoding

@@ -1217,7 +1306,10 @@ class Decoder(Module):
       training: Whether it is training pass, affecting dropouts.

     Returns:
-      output of a transformer encoder.
+      output of a transformer encoder including
+      1. logits: Logits for each word in the vocab.
+      2. raw_logits: Logits along the moded dimension.
+      3. cache: Used for decoding in inference mode.
     """
     cfg = self.config
     # Casts inputs to the dtype.
@@ -1230,12 +1322,14 @@ class Decoder(Module):
     tensor_shape = tf_utils.get_shape_list(x)
     tensor_shape[-2] = 1
     x = self.target_dropout(x, noise_shape=tensor_shape, training=training)
-    if cache is not None:
-      position_bias = self.relative_embedding(max_decode_len, max_decode_len)
-    else:
-      input_length = tf_utils.get_shape_list(decoder_input_tokens)[1]
-      position_bias = self.relative_embedding(input_length, input_length)
-    for i in range(cfg.num_decoder_layers):
+    for i in range(cfg.num_decoder_layers):
+      if cache is not None:
+        position_bias = self.get_relpos_bias(max_decode_len, i)
+      else:
+        input_length = tf_utils.get_shape_list(decoder_input_tokens)[1]
+        position_bias = self.get_relpos_bias(input_length, i)
       if cache is None:
         x, _ = self.decoder_layers[i](
             x,
@@ -1265,7 +1359,7 @@ class Decoder(Module):
       logits = logits / math.sqrt(cfg.d_model)
     else:
       logits = self.logits_dense(output)
-    return logits, cache
+    return dict(logits=logits, cache=cache, raw_logits=output)


 class T5Transformer(Module):
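Note: the decoder no longer returns a (logits, cache) tuple; call sites index into the returned dictionary, as the updated tests below do. A sketch of the call-site change (the decoder itself is assumed to be built from a config as in t5_test.py's test_decoder, whose construction is not fully shown in this diff):

import tensorflow as tf

targets = tf.zeros((4, 8), dtype=tf.int32)
encoded = tf.zeros((4, 8, config.d_model), dtype=tf.float32)

# Before this commit:
#   logits, cache = decoder(targets, encoded)
# After this commit:
outputs = decoder(targets, encoded)
logits = outputs["logits"]          # (batch, length, vocab_size)
cache = outputs["cache"]
raw_logits = outputs["raw_logits"]  # new key: decoder output along the model dimension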
@@ -1306,33 +1400,72 @@ class T5Transformer(Module):
         compute_dtype=self.compute_dtype)

   def encode(self,
-             encoder_input_tokens,
+             encoder_input_tokens=None,
              encoder_segment_ids=None,
+             encoder_dense_inputs=None,
+             encoder_dense_segment_ids=None,
              training=False):
-    eligible_positions = tf.cast(
-        tf.not_equal(encoder_input_tokens, 0), self.compute_dtype)
+    eligible_position_array = []
+    if encoder_input_tokens is not None:
+      eligible_position_array.append(
+          tf.cast(tf.not_equal(encoder_input_tokens, 0), self.compute_dtype))
+    if encoder_dense_inputs is not None:
+      eligible_dense_positions = tf.cast(
+          tf.reduce_any(tf.not_equal(encoder_dense_inputs, 0), axis=-1),
+          self.compute_dtype)
+      eligible_position_array.append(eligible_dense_positions)
+    if not eligible_position_array:
+      raise ValueError("At least one of encoder_input_tokens and"
+                       " encoder_dense_inputs must be provided.")
+    eligible_positions = tf.concat(eligible_position_array, axis=1)
     encoder_mask = make_attention_mask(
         eligible_positions, eligible_positions, dtype=tf.bool)
+
+    encoder_segment_id_array = []
     if encoder_segment_ids is not None:
+      encoder_segment_id_array.append(encoder_segment_ids)
+    if encoder_dense_segment_ids is not None:
+      encoder_segment_id_array.append(encoder_dense_segment_ids)
+    if encoder_segment_id_array:
+      encoder_segment_ids = tf.concat(encoder_segment_id_array, axis=1)
       segment_mask = make_attention_mask(
           encoder_segment_ids, encoder_segment_ids, tf.equal, dtype=tf.bool)
       encoder_mask = tf.math.logical_and(encoder_mask, segment_mask)
     encoder_mask = (1.0 - tf.cast(encoder_mask, self.compute_dtype)) * -1e9
-    return self.encoder(encoder_input_tokens, encoder_mask, training=training)
+    return self.encoder(
+        encoder_input_tokens,
+        encoder_mask,
+        encoder_dense_inputs,
+        training=training)

   def decode(self,
              encoded,
              decoder_target_tokens,
-             encoder_input_tokens,  # only used for masks
+             encoder_input_tokens=None,  # only used for masks
+             encoder_dense_inputs=None,
              decoder_input_tokens=None,
              encoder_segment_ids=None,
+             encoder_dense_segment_ids=None,
              decoder_segment_ids=None,
              decode_position=None,
              cache=None,
              max_decode_len=None,
              decode=False,
-             training=False):
+             training=False) -> Dict[str, tf.Tensor]:
+    eligible_inputs_array = []
+    if encoder_input_tokens is not None:
+      eligible_inputs = tf.cast(
+          tf.not_equal(encoder_input_tokens, 0), self.compute_dtype)
+      eligible_inputs_array.append(eligible_inputs)
+    if encoder_dense_inputs is not None:
+      eligible_dense_inputs = tf.cast(
+          tf.reduce_any(tf.not_equal(encoder_dense_inputs, 0), axis=-1),
+          self.compute_dtype)
+      eligible_inputs_array.append(eligible_dense_inputs)
+    eligible_inputs = tf.concat(eligible_inputs_array, axis=1)
     if decode:
       # For decoding, the decoder_input_tokens is the decoder_target_tokens.
       decoder_input_tokens = decoder_target_tokens
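Note: the encoder mask is still built the same way; only the set of eligible positions now comes from tokens and/or dense inputs. A boolean attention mask is derived from the non-padding positions (optionally combined with a segment mask) and then converted into an additive bias. A standalone sketch of that conversion, not of the make_attention_mask helper itself:

import tensorflow as tf

# One batch element, sequence length 4, last position is padding.
eligible = tf.constant([[1.0, 1.0, 1.0, 0.0]])

# Boolean [batch, 1, len, len] mask: position i may attend to position j only
# if both carry real content.
bool_mask = tf.cast(
    tf.einsum("bq,bk->bqk", eligible, eligible)[:, tf.newaxis, :, :], tf.bool)

# Additive form used by the attention logits: 0 where allowed, -1e9 where not.
additive_mask = (1.0 - tf.cast(bool_mask, tf.float32)) * -1e9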
@@ -1342,14 +1475,12 @@ class T5Transformer(Module):
           tf.cast(
               tf.not_equal(tf.ones_like(decoder_target_tokens), 0),
               self.compute_dtype),
-          tf.cast(tf.not_equal(encoder_input_tokens, 0), self.compute_dtype),
+          eligible_inputs,
           dtype=tf.bool)
     else:
       # Note that, masks should be created using decoder_target_tokens.
       eligible_targets = tf.cast(
           tf.not_equal(decoder_target_tokens, 0), self.compute_dtype)
-      eligible_inputs = tf.cast(
-          tf.not_equal(encoder_input_tokens, 0), self.compute_dtype)
       decoder_mask = tf.math.logical_and(
           make_attention_mask(eligible_targets, eligible_targets, dtype=tf.bool),

@@ -1365,6 +1496,9 @@ class T5Transformer(Module):
           decoder_segment_ids,
           tf.equal,
           dtype=tf.bool))
+    if encoder_dense_segment_ids is not None:
+      encoder_segment_ids = tf.concat(
+          [encoder_segment_ids, encoder_dense_segment_ids], axis=1)
     encoder_decoder_mask = tf.math.logical_and(
         encoder_decoder_mask,
         make_attention_mask(

@@ -1376,7 +1510,7 @@ class T5Transformer(Module):
     decoder_mask = (1.0 - tf.cast(decoder_mask, self.compute_dtype)) * -1e9
     encoder_decoder_mask = (
         1.0 - tf.cast(encoder_decoder_mask, self.compute_dtype)) * -1e9
-    logits, cache = self.decoder(
+    outputs = self.decoder(
         decoder_input_tokens,
         encoded,
         decode_position=decode_position,
@@ -1386,12 +1520,15 @@ class T5Transformer(Module):
         max_decode_len=max_decode_len,
         decode=decode,
         training=training)
-    return dict(logits=logits, encoded=encoded, cache=cache)
+    outputs["encoded"] = encoded
+    return outputs

   @tf.Module.with_name_scope
   def __call__(self,
-               encoder_input_tokens,
-               decoder_target_tokens,
+               encoder_input_tokens=None,
+               decoder_target_tokens=None,
+               encoder_dense_inputs=None,
+               encoder_dense_segment_ids=None,
                decoder_input_tokens=None,
                encoder_segment_ids=None,
                decoder_segment_ids=None,

@@ -1401,9 +1538,12 @@ class T5Transformer(Module):
     Args:
       encoder_input_tokens: input tokens to the encoder.
       decoder_target_tokens: target tokens to the decoder.
+      encoder_dense_inputs: input dense vectors to the encoder.
+      encoder_dense_segment_ids: dense input segmentation info for packed
+        examples.
       decoder_input_tokens: input tokens to the decoder, only required for
         training.
       encoder_segment_ids: input segmentation info for packed examples.
       decoder_segment_ids: target segmentation info for packed examples.
       training: whether it is training pass, affecting dropouts.
@@ -1411,15 +1551,19 @@ class T5Transformer(Module):
       a dictionary of logits/cache.
     """
     encoded = self.encode(
-        encoder_input_tokens,
+        encoder_input_tokens=encoder_input_tokens,
         encoder_segment_ids=encoder_segment_ids,
+        encoder_dense_inputs=encoder_dense_inputs,
+        encoder_dense_segment_ids=encoder_dense_segment_ids,
         training=training)
     outputs = self.decode(
         encoded=encoded,
         decoder_target_tokens=decoder_target_tokens,
         encoder_input_tokens=encoder_input_tokens,  # only used for masks.
+        encoder_dense_inputs=encoder_dense_inputs,  # only used for masks.
         decoder_input_tokens=decoder_input_tokens,
         encoder_segment_ids=encoder_segment_ids,
+        encoder_dense_segment_ids=encoder_dense_segment_ids,
         decoder_segment_ids=decoder_segment_ids,
         training=training)
     outputs["encoded"] = encoded
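Note: together, encode(), decode(), and the per-layer cache support step-by-step autoregressive inference. The rough sketch below shows one way the pieces could be wired for greedy decoding; it is an illustration only (the model garden's own sampling utilities are the supported path), it assumes transformer, inputs, config and _create_cache set up as in the new t5_test.py tests, and it glosses over details such as the start token and the first decode position:

import tensorflow as tf

encoded = transformer.encode(encoder_input_tokens=inputs)
batch_size = int(inputs.shape[0])
max_decode_len = 10
# One cache entry per decoder layer, pre-allocated for the full decode length.
cache = {
    i: _create_cache(batch_size, max_decode_len, config.num_heads, config.d_kv)
    for i in range(config.num_decoder_layers)
}

token = tf.zeros((batch_size, 1), dtype=tf.int32)  # assumed start token id 0
decoded_tokens = []
for position in range(max_decode_len):
  outputs = transformer.decode(
      encoded=encoded,
      encoder_input_tokens=inputs,  # only used for masks
      decoder_target_tokens=token,
      decode_position=position,
      decode=True,
      max_decode_len=max_decode_len,
      cache=cache)
  cache = outputs["cache"]
  token = tf.argmax(outputs["logits"], axis=-1, output_type=tf.int32)
  decoded_tokens.append(token)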
official/nlp/modeling/models/t5_test.py

-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -354,6 +354,40 @@ class T5Test(tf.test.TestCase, parameterized.TestCase):
     encoded = encoder(tf.zeros((4, 8), dtype=tf.int32))
     self.assertEqual(encoded.shape, (4, 8, config.d_model))

+  @parameterized.named_parameters(("bfloat16", tf.bfloat16),
+                                  ("float32", tf.float32))
+  def test_encoder_with_dense(self, dtype):
+    config = t5.T5TransformerParams(
+        num_layers=2,
+        d_model=4,
+        d_kv=3,
+        num_heads=4,
+        d_ff=16,
+        vocab_size=10,
+        vocab_embeddings_initializer=tf.keras.initializers.Ones(),
+        relative_embeddings_initializer=tf.keras.initializers.Ones())
+    encoder = t5.Encoder(config, compute_dtype=dtype)
+    encoded = encoder(
+        tf.zeros((4, 8), dtype=tf.int32),
+        dense_inputs=tf.ones((4, 2, 4), dtype=dtype))
+    self.assertEqual(encoded.shape, (4, 10, config.d_model))
+
+  @parameterized.named_parameters(("bfloat16", tf.bfloat16),
+                                  ("float32", tf.float32))
+  def test_encoder_only_dense(self, dtype):
+    config = t5.T5TransformerParams(
+        num_layers=2,
+        d_model=4,
+        d_kv=3,
+        num_heads=4,
+        d_ff=16,
+        vocab_size=10,
+        vocab_embeddings_initializer=tf.keras.initializers.Ones(),
+        relative_embeddings_initializer=tf.keras.initializers.Ones())
+    encoder = t5.Encoder(config, compute_dtype=dtype)
+    encoded = encoder(dense_inputs=tf.ones((4, 2, 4), dtype=dtype))
+    self.assertEqual(encoded.shape, (4, 2, config.d_model))
+
   def test_decoder(self):
     max_decode_len = 10
     config = t5.T5TransformerParams(
@@ -369,7 +403,9 @@ class T5Test(tf.test.TestCase, parameterized.TestCase):
     batch_size = 4
     targets = tf.zeros((4, 8), dtype=tf.int32)
     encoded = tf.zeros((4, 8, config.d_model), dtype=tf.float32)
-    logits, cache = decoder(targets, encoded)
+    outputs = decoder(targets, encoded)
+    logits = outputs["logits"]
+    cache = outputs["cache"]
     self.assertEqual(logits.shape, (4, 8, config.vocab_size))
     cache = {}
@@ -378,13 +414,15 @@ class T5Test(tf.test.TestCase, parameterized.TestCase):
     cache[1] = _create_cache(batch_size, max_decode_len, config.num_heads,
                              config.d_kv)
     targets = tf.zeros((4, 1), dtype=tf.int32)
-    logits, cache = decoder(
+    outputs = decoder(
         targets,
         encoded,
         decode_position=2,
         cache=cache,
         decode=True,
         max_decode_len=max_decode_len)
+    logits = outputs["logits"]
+    cache = outputs["cache"]
     self.assertEqual(logits.shape, (batch_size, 1, config.vocab_size))
     for entry in cache.values():
       for tensor in entry.values():
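Note: these tests rely on a _create_cache helper defined earlier in t5_test.py, which is not part of this diff. Judging from how it is called here, a plausible sketch is a dictionary of zeroed key/value tensors per decoder layer; the exact field names below are an assumption:

import tensorflow as tf

def _create_cache(batch_size, init_decode_length, num_heads, head_size,
                  dtype=tf.float32):
  # Assumed layout: pre-allocated "key" and "value" tensors covering the full
  # decode length for one decoder layer.
  return {
      "key": tf.zeros(
          [batch_size, init_decode_length, num_heads, head_size], dtype=dtype),
      "value": tf.zeros(
          [batch_size, init_decode_length, num_heads, head_size], dtype=dtype),
  }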
@@ -445,6 +483,180 @@ class T5Test(tf.test.TestCase, parameterized.TestCase):
       print(v.name, v.shape)
       self.assertEqual(v.dtype, tf.float32)

+  @parameterized.named_parameters(
+      ("t5_10_dense", ("relu",), True, 26, False, tf.float32),)
+  def test_transformer_with_dense(self, ffn_activations, logits_via_embedding,
+                                  expect_num_variables, layer_sharing, dtype):
+    max_decode_len = 10
+    config = t5.T5TransformerParams(
+        num_layers=1,
+        d_model=8,
+        d_kv=4,
+        num_heads=4,
+        d_ff=32,
+        vocab_size=10,
+        shared_embedding=True,
+        layer_sharing=layer_sharing,
+        ffn_activations=ffn_activations,
+        logits_via_embedding=logits_via_embedding)
+    transformer = t5.T5Transformer(config, compute_dtype=dtype)
+    self.assertLen(transformer.trainable_variables, expect_num_variables)
+    inputs = tf.convert_to_tensor(
+        np.array([[2, 2, 1, 3, 1, 0], [3, 3, 1, 2, 2, 1]]))
+    segments = tf.convert_to_tensor(
+        np.array([[1, 1, 1, 2, 2, 0], [1, 1, 1, 2, 2, 2]]))
+    dense_inputs = tf.convert_to_tensor(np.random.randn(2, 2, 8), dtype=dtype)
+    dense_segments = tf.convert_to_tensor(np.array([[1, 2], [1, 2]]))
+    outputs = transformer(
+        encoder_input_tokens=inputs,
+        encoder_dense_inputs=dense_inputs,
+        decoder_input_tokens=inputs,
+        decoder_target_tokens=inputs,
+        encoder_segment_ids=segments,
+        encoder_dense_segment_ids=dense_segments,
+        decoder_segment_ids=segments)
+    cache = {}
+    batch_size = 2
+    cache[0] = _create_cache(
+        batch_size, max_decode_len, config.num_heads, config.d_kv, dtype=dtype)
+    outputs = transformer.decode(
+        encoder_input_tokens=inputs,
+        encoder_dense_inputs=dense_inputs,
+        encoded=outputs["encoded"],
+        decoder_target_tokens=tf.ones((batch_size, 1), dtype=tf.int32),
+        decode_position=1,
+        decode=True,
+        max_decode_len=max_decode_len,
+        cache=cache)
+    self.assertEqual(outputs["logits"].shape,
+                     (batch_size, 1, config.vocab_size))
+    for v in transformer.trainable_variables:
+      print(v.name, v.shape)
+      self.assertEqual(v.dtype, tf.float32)
+
+  @parameterized.named_parameters(
+      ("t5_10_dense_layerwise_relpos", ("relu",), True, 26, False, tf.float32,
+       False, 1),
+      ("t5_10_dense_shared_relpos_d2", ("relu",), True, 39, False, tf.float32,
+       True, 2),
+      ("t5_10_dense_layerwise_relpos_d2", ("relu",), True, 40, False,
+       tf.float32, False, 2),
+  )
+  def test_transformer_with_lw_relpos(self, ffn_activations,
+                                      logits_via_embedding,
+                                      expect_num_variables, layer_sharing,
+                                      dtype, use_shared_relpos,
+                                      num_decoder_layers):
+    max_decode_len = 10
+    config = t5.T5TransformerParams(
+        num_layers=1,
+        num_decoder_layers=num_decoder_layers,
+        d_model=8,
+        d_kv=4,
+        num_heads=4,
+        d_ff=32,
+        vocab_size=10,
+        shared_embedding=True,
+        layer_sharing=layer_sharing,
+        ffn_activations=ffn_activations,
+        logits_via_embedding=logits_via_embedding,
+        use_shared_relative_position_bias=use_shared_relpos)
+    transformer = t5.T5Transformer(config, compute_dtype=dtype)
+    self.assertLen(transformer.trainable_variables, expect_num_variables)
+    inputs = tf.convert_to_tensor(
+        np.array([[2, 2, 1, 3, 1, 0], [3, 3, 1, 2, 2, 1]]))
+    segments = tf.convert_to_tensor(
+        np.array([[1, 1, 1, 2, 2, 0], [1, 1, 1, 2, 2, 2]]))
+    dense_inputs = tf.convert_to_tensor(np.random.randn(2, 2, 8), dtype=dtype)
+    dense_segments = tf.convert_to_tensor(np.array([[1, 2], [1, 2]]))
+    outputs = transformer(
+        encoder_input_tokens=inputs,
+        encoder_dense_inputs=dense_inputs,
+        decoder_input_tokens=inputs,
+        decoder_target_tokens=inputs,
+        encoder_segment_ids=segments,
+        encoder_dense_segment_ids=dense_segments,
+        decoder_segment_ids=segments)
+    cache = {}
+    batch_size = 2
+    for i in range(num_decoder_layers):
+      cache[i] = _create_cache(
+          batch_size, max_decode_len, config.num_heads, config.d_kv,
+          dtype=dtype)
+    outputs = transformer.decode(
+        encoder_input_tokens=inputs,
+        encoder_dense_inputs=dense_inputs,
+        encoded=outputs["encoded"],
+        decoder_target_tokens=tf.ones((batch_size, 1), dtype=tf.int32),
+        decode_position=1,
+        decode=True,
+        max_decode_len=max_decode_len,
+        cache=cache)
+    self.assertEqual(outputs["logits"].shape,
+                     (batch_size, 1, config.vocab_size))
+    for v in transformer.trainable_variables:
+      print(v.name, v.shape)
+      self.assertEqual(v.dtype, tf.float32)
+
+  @parameterized.named_parameters(
+      ("t5_10", ("relu",), True, 26, False, tf.float32),)
+  def test_transformer_with_dense_only(self, ffn_activations,
+                                       logits_via_embedding,
+                                       expect_num_variables, layer_sharing,
+                                       dtype):
+    max_decode_len = 10
+    config = t5.T5TransformerParams(
+        num_layers=1,
+        d_model=8,
+        d_kv=4,
+        num_heads=4,
+        d_ff=32,
+        vocab_size=10,
+        shared_embedding=True,
+        layer_sharing=layer_sharing,
+        ffn_activations=ffn_activations,
+        logits_via_embedding=logits_via_embedding)
+    transformer = t5.T5Transformer(config, compute_dtype=dtype)
+    self.assertLen(transformer.trainable_variables, expect_num_variables)
+    decoder_inputs = tf.convert_to_tensor(
+        np.array([[2, 2, 1, 3, 1, 0], [3, 3, 1, 2, 2, 1]]))
+    decoder_segments = tf.convert_to_tensor(
+        np.array([[1, 1, 1, 2, 2, 0], [1, 1, 1, 2, 2, 2]]))
+    dense_inputs = tf.convert_to_tensor(np.random.randn(2, 2, 8), dtype=dtype)
+    dense_segments = tf.convert_to_tensor(np.array([[1, 2], [1, 2]]))
+    outputs = transformer(
+        encoder_dense_inputs=dense_inputs,
+        encoder_dense_segment_ids=dense_segments,
+        decoder_input_tokens=decoder_inputs,
+        decoder_target_tokens=decoder_inputs,
+        decoder_segment_ids=decoder_segments)
+    cache = {}
+    batch_size = 2
+    cache[0] = _create_cache(
+        batch_size, max_decode_len, config.num_heads, config.d_kv, dtype=dtype)
+    outputs = transformer.decode(
+        encoder_dense_inputs=dense_inputs,
+        encoded=outputs["encoded"],
+        decoder_target_tokens=tf.ones((batch_size, 1), dtype=tf.int32),
+        decode_position=1,
+        decode=True,
+        max_decode_len=max_decode_len,
+        cache=cache)
+    self.assertEqual(outputs["logits"].shape,
+                     (batch_size, 1, config.vocab_size))
+    for v in transformer.trainable_variables:
+      print(v.name, v.shape)
+      self.assertEqual(v.dtype, tf.float32)
+
   @parameterized.named_parameters(
       ("t5_10", ("relu",), True, 39, tf.float32, 2),
       ("t5_10_bfloat16", ("relu",), True, 39, tf.bfloat16, 2))
official/nlp/modeling/models/xlnet.py

Only change: the license header year is updated from 2021 to 2022.
official/nlp/modeling/models/xlnet_test.py

Only change: the license header year is updated from 2021 to 2022.
official/nlp/modeling/networks/README.md

@@ -37,3 +37,8 @@ Generalized Autoregressive Pretraining for Language Understanding"
 (https://arxiv.org/abs/1906.08237). It includes embedding lookups,
 relative position encodings, mask computations, segment matrix computations and
 Transformer XL layers using one or two stream relative self-attention.
+
+*   [`FNet`](fnet.py) implements the encoder model from ["FNet: Mixing Tokens
+    with Fourier Transforms"](https://aclanthology.org/2022.naacl-main.319/).
+    FNet has the same structure as a Transformer encoder, except that all or
+    most of the self-attention sublayers are replaced with Fourier sublayers.
official/nlp/modeling/networks/__init__.py

-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

@@ -23,6 +23,7 @@ from official.nlp.modeling.networks.bert_encoder import BertEncoder
 from official.nlp.modeling.networks.bert_encoder import BertEncoderV2
 from official.nlp.modeling.networks.classification import Classification
 from official.nlp.modeling.networks.encoder_scaffold import EncoderScaffold
+from official.nlp.modeling.networks.fnet import FNet
 from official.nlp.modeling.networks.funnel_transformer import FunnelTransformerEncoder
 from official.nlp.modeling.networks.mobile_bert_encoder import MobileBERTEncoder
 from official.nlp.modeling.networks.packed_sequence_embedding import PackedSequenceEmbedding
official/nlp/modeling/networks/albert_encoder.py

-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

@@ -18,6 +18,7 @@ import collections
 import tensorflow as tf

 from official.modeling import activations
+from official.modeling import tf_utils
 from official.nlp.modeling import layers

@@ -92,13 +93,13 @@ class AlbertEncoder(tf.keras.Model):
     embedding_layer = layers.OnDeviceEmbedding(
         vocab_size=vocab_size,
         embedding_width=embedding_width,
-        initializer=initializer,
+        initializer=tf_utils.clone_initializer(initializer),
         name='word_embeddings')
     word_embeddings = embedding_layer(word_ids)

     # Always uses dynamic slicing for simplicity.
     position_embedding_layer = layers.PositionEmbedding(
-        initializer=initializer,
+        initializer=tf_utils.clone_initializer(initializer),
         max_length=max_sequence_length,
         name='position_embedding')
     position_embeddings = position_embedding_layer(word_embeddings)

@@ -107,7 +108,7 @@ class AlbertEncoder(tf.keras.Model):
         layers.OnDeviceEmbedding(
             vocab_size=type_vocab_size,
             embedding_width=embedding_width,
-            initializer=initializer,
+            initializer=tf_utils.clone_initializer(initializer),
             use_one_hot=True,
             name='type_embeddings')(type_ids))

@@ -123,11 +124,11 @@ class AlbertEncoder(tf.keras.Model):
     # We project the 'embedding' output to 'hidden_size' if it is not already
     # 'hidden_size'.
     if embedding_width != hidden_size:
-      embeddings = tf.keras.layers.experimental.EinsumDense(
+      embeddings = tf.keras.layers.EinsumDense(
           '...x,xy->...y',
          output_shape=hidden_size,
          bias_axes='y',
-          kernel_initializer=initializer,
+          kernel_initializer=tf_utils.clone_initializer(initializer),
          name='embedding_projection')(embeddings)

@@ -139,7 +140,7 @@ class AlbertEncoder(tf.keras.Model):
         inner_activation=activation,
         output_dropout=dropout_rate,
         attention_dropout=attention_dropout_rate,
-        kernel_initializer=initializer,
+        kernel_initializer=tf_utils.clone_initializer(initializer),
         name='transformer')
     encoder_outputs = []
     for _ in range(num_layers):

@@ -153,7 +154,7 @@ class AlbertEncoder(tf.keras.Model):
     cls_output = tf.keras.layers.Dense(
         units=hidden_size,
         activation='tanh',
-        kernel_initializer=initializer,
+        kernel_initializer=tf_utils.clone_initializer(initializer),
         name='pooler_transform')(first_token_tensor)
     if dict_outputs:

@@ -172,7 +173,7 @@ class AlbertEncoder(tf.keras.Model):
     # created using the Functional API. Once super().__init__ is called, we
     # can assign attributes to `self` - note that all `self` assignments are
    # below this line.
-    super(AlbertEncoder, self).__init__(
+    super().__init__(
         inputs=[word_ids, mask, type_ids], outputs=outputs, **kwargs)
     config_dict = {
         'vocab_size': vocab_size,
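Note: besides cloning the initializers, this file moves from the experimental alias tf.keras.layers.experimental.EinsumDense to the stable tf.keras.layers.EinsumDense; the layer's behaviour is unchanged. A standalone example of the projection it performs here (dimensions are illustrative):

import tensorflow as tf

embedding_width, hidden_size = 16, 64
project = tf.keras.layers.EinsumDense(
    '...x,xy->...y',  # project the last axis from embedding_width to hidden_size
    output_shape=hidden_size,
    bias_axes='y',
    kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))

embeddings = tf.zeros([2, 8, embedding_width])  # [batch, seq_length, embedding_width]
projected = project(embeddings)                 # shape (2, 8, 64)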
official/nlp/modeling/networks/albert_encoder_test.py

Only change: the license header year is updated from 2021 to 2022.