ModelZoo / ResNet50_tensorflow / Commits / ee222211

Commit ee222211, authored Oct 20, 2021 by Hongkun Yu, committed by A. Unique TensorFlower on Oct 20, 2021

Internal change

PiperOrigin-RevId: 404558260
parent 311099c1
Showing 4 changed files with 1946 additions and 0 deletions (+1946, -0)
official/nlp/modeling/models/README.md      +9     -0
official/nlp/modeling/models/__init__.py    +2     -0
official/nlp/modeling/models/t5.py          +1430  -0
official/nlp/modeling/models/t5_test.py     +505   -0
official/nlp/modeling/models/README.md
@@ -23,3 +23,12 @@ respectively.
*   [`DualEncoder`](dual_encoder.py) implements a dual encoder model, suitable
    for retrieval tasks.
*   [`Seq2SeqTransformer`](seq2seq_transformer.py) implements the original
    Transformer model for seq-to-seq tasks.
*   [`T5Transformer`](t5.py) implements a standalone T5 model for seq-to-seq
    tasks. The models are compatible with the released T5 architecture and
    converted checkpoints. The modules are implemented as `tf.Module`. To use
    them with Keras, wrap them in custom Keras layers, i.e. define the modules
    inside a Keras layer's `__init__` and call them in `call`.
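To make the wrapping pattern described in the last bullet concrete, here is a minimal sketch (not part of this commit) of a Keras layer that builds a T5 module in `__init__` and invokes it in `call`. The layer name `T5EncoderLayer` and the tiny config values are illustrative assumptions only; the `t5.Encoder`/`t5.T5TransformerParams` calls mirror the unit tests added below.

import tensorflow as tf
from official.nlp.modeling.models import t5

class T5EncoderLayer(tf.keras.layers.Layer):
  """Hypothetical wrapper: builds the tf.Module in __init__, calls it in call()."""

  def __init__(self, config, **kwargs):
    super().__init__(**kwargs)
    # The tf.Module-based encoder is constructed once, inside __init__.
    self.encoder = t5.Encoder(config)

  def call(self, input_tokens):
    # Delegation to the module happens in call(), as the README suggests.
    return self.encoder(input_tokens)

# Illustrative usage with a test-sized config (values taken from t5_test.py below).
config = t5.T5TransformerParams(
    num_layers=2, d_model=4, d_kv=3, num_heads=4, d_ff=16, vocab_size=10)
layer = T5EncoderLayer(config)
encoded = layer(tf.zeros((4, 8), dtype=tf.int32))  # shape: (4, 8, d_model)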
official/nlp/modeling/models/__init__.py
@@ -24,6 +24,8 @@ from official.nlp.modeling.models.bert_token_classifier import BertTokenClassifi
from official.nlp.modeling.models.dual_encoder import DualEncoder
from official.nlp.modeling.models.electra_pretrainer import ElectraPretrainer
from official.nlp.modeling.models.seq2seq_transformer import *
from official.nlp.modeling.models.t5 import T5Transformer
from official.nlp.modeling.models.t5 import T5TransformerParams
from official.nlp.modeling.models.xlnet import XLNetClassifier
from official.nlp.modeling.models.xlnet import XLNetPretrainer
from official.nlp.modeling.models.xlnet import XLNetSpanLabeler
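With these exports in place, the new classes are reachable from the `models` package. A short usage sketch, assuming the public API added in this commit; hyperparameter values are borrowed from the unit tests below and are illustrative only.

import numpy as np
import tensorflow as tf
from official.nlp.modeling import models

# Tiny, test-sized configuration; real T5 checkpoints use much larger values.
config = models.T5TransformerParams(
    num_layers=1, d_model=8, d_kv=4, num_heads=4, d_ff=32,
    vocab_size=10, shared_embedding=True)
transformer = models.T5Transformer(config)

tokens = tf.convert_to_tensor(np.array([[2, 2, 1, 3, 1, 0]], dtype=np.int32))
outputs = transformer(
    encoder_input_tokens=tokens,
    decoder_input_tokens=tokens,
    decoder_target_tokens=tokens)
# outputs["encoded"] holds the encoder representation, per t5_test.py below.
print(outputs["encoded"].shape)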
official/nlp/modeling/models/t5.py
0 → 100644
This diff is collapsed.
official/nlp/modeling/models/t5_test.py
0 → 100644
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for t5."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf

from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.nlp.modeling.models import t5


def _create_cache(batch_size,
                  init_decode_length,
                  num_heads,
                  head_size,
                  dtype=tf.float32):
  if num_heads is None:
    kv_shape = [batch_size, init_decode_length, head_size]
  else:
    kv_shape = [batch_size, init_decode_length, num_heads, head_size]
  return {
      "key": tf.zeros(kv_shape, dtype=dtype),
      "value": tf.zeros(kv_shape, dtype=dtype)
  }


class ModulesTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.named_parameters(("bfloat16", tf.bfloat16),
                                  ("float32", tf.float32))
  def test_embed(self, dtype):
    l = t5.Embed(vocab_size=5, features=4, compute_dtype=dtype, name="foo")
    inputs = np.array([[2, 3], [1, 2]], dtype=np.int32)
    inputs = tf.convert_to_tensor(inputs)
    one_hot_outputs = l(inputs, one_hot=True)
    gather_outputs = l(inputs, one_hot=False)
    self.assertEqual(one_hot_outputs.shape, (2, 2, 4))
    self.assertLen(l.trainable_variables, 1)
    self.assertAllClose(one_hot_outputs, gather_outputs)
    outputs = l.attend(query=tf.zeros((2, 2, 4), dtype))
    self.assertEqual(outputs.shape, (2, 2, 5))
    # Test initializers.
    l = t5.Embed(
        vocab_size=5,
        features=4,
        compute_dtype=dtype,
        name="foo",
        embeddings_initializer=tf.keras.initializers.Zeros())
    self.assertAllClose(l(inputs), tf.zeros((2, 2, 4), dtype))

  @parameterized.named_parameters(("bfloat16", tf.bfloat16),
                                  ("float32", tf.float32))
  def test_rms_norm(self, dtype):
    l = t5.RMSNorm(hidden_size=4, epsilon=0.0, name="foo")
    inputs = tf.ones((2, 4), dtype=dtype)
    outputs = l(inputs)
    self.assertAllEqual(l(inputs), inputs)
    self.assertEqual(outputs.dtype, dtype)
    self.assertLen(l.trainable_variables, 1)
    self.assertIn("foo/scale", l.trainable_variables[0].name)

  @parameterized.named_parameters(("bfloat16", tf.bfloat16),
                                  ("float32", tf.float32))
  def test_linear(self, dtype):
    l = t5.Linear(
        in_features=4,
        out_features=4,
        w_init=tf.keras.initializers.Ones(),
        name="foo")
    inputs = tf.ones((2, 4), dtype=dtype)
    outputs = l(inputs)
    self.assertEqual(outputs.shape, inputs.shape)
    self.assertEqual(outputs.dtype, dtype)
    self.assertLen(l.trainable_variables, 2)

  def test_linear3d(self):
    batch_size = 2
    l = t5.Linear3D(
        in_features=4,
        out_features=4,
        num_heads=2,
        to_3d=True,
        w_init=tf.keras.initializers.Ones(),
        name="foo")
    inputs = np.ones((batch_size, 2, 4), dtype=np.float32)
    self.assertEqual(l(inputs).shape, (batch_size, 2, 2, 4))
    l = t5.Linear3D(
        in_features=2,
        out_features=4,
        num_heads=2,
        to_3d=False,
        w_init=tf.keras.initializers.Ones(),
        name="foo")
    inputs = np.ones((batch_size, 2, 2, 2), dtype=np.float32)
    self.assertEqual(l(inputs).shape, (batch_size, 2, 4))

  def test_ffn(self):
    inputs = np.ones((2, 4), dtype=np.float32)
    for activation in ["relu", "linear", "gelu", "swish"]:
      l = t5.FFN(
          d_model=4,
          d_ff=8,
          use_bias=True,
          dropout_rate=0.1,
          activations=[activation],
          name="foo")
      self.assertEqual(l(inputs).shape, inputs.shape)
      self.assertLen(l.trainable_variables, 4)
    l = t5.FFN(
        d_model=4,
        d_ff=8,
        dropout_rate=0.1,
        activations=["linear", "gelu"],
        name="bar")
    self.assertLen(l.trainable_variables, 3)
    self.assertEqual(l(inputs).shape, inputs.shape)

  @parameterized.named_parameters(("bfloat16", tf.bfloat16),
                                  ("float32", tf.float32))
  def test_relative_position(self, dtype):
    l = t5.RelativePositionEmbedding(
        num_heads=4,
        bidirectional=False,
        embeddings_initializer=tf.keras.initializers.Ones(),
        compute_dtype=dtype,
        name="foo")
    self.assertEqual(l(4, 2).shape, (1, 4, 4, 2))
    l = t5.RelativePositionEmbedding(
        num_heads=4,
        bidirectional=True,
        embeddings_initializer=tf.keras.initializers.Ones(),
        compute_dtype=dtype,
        name="bar")
    outputs = l(4, 2)
    self.assertEqual(outputs.shape, (1, 4, 4, 2))
    self.assertEqual(outputs.dtype, dtype)

  def test_masks(self):
    causal_mask = t5.make_causal_mask(np.zeros((2, 5)))
    self.assertEqual(causal_mask.shape, (2, 1, 5, 5))

  @combinations.generate(
      combinations.combine(
          distribution=[
              strategy_combinations.default_strategy,
              strategy_combinations.cloud_tpu_strategy,
          ],
          mode="eager"))
  def test_attention(self, distribution):
    num_heads, head_size = 2, 4
    from_seq_length, to_seq_length = 4, 6
    batch_size = 2
    pos_embed = t5.RelativePositionEmbedding(
        num_heads=4,
        bidirectional=False,
        embeddings_initializer=tf.keras.initializers.Ones(),
        name="pos_embed")
    position_bias = pos_embed(from_seq_length, from_seq_length)
    l = t5.MultiHeadAttention(d_model=4, d_kv=2, num_heads=4, dropout_rate=0.1)
    query = tf.convert_to_tensor(
        np.ones((batch_size, from_seq_length, 4), dtype=np.float32))
    self.assertEqual(
        l(query, position_bias=position_bias)["context"].shape, query.shape)
    kv = tf.convert_to_tensor(
        np.ones((batch_size, to_seq_length, 4), dtype=np.float32))
    position_bias = pos_embed(from_seq_length, to_seq_length)
    outputs = l(query, kv=kv, position_bias=position_bias)
    self.assertEqual(outputs["context"].shape, query.shape)

    with distribution.scope():
      l = t5.MultiHeadAttention(
          d_model=4, d_kv=head_size, num_heads=num_heads, dropout_rate=0.1)

      @tf.function
      def step(inputs):

        def _step_fn(inputs):
          cache = _create_cache(batch_size, from_seq_length, num_heads,
                                head_size)
          mask = t5.make_causal_mask(tf.ones((batch_size, 1)))
          return l(
              query=inputs,
              mask=mask,
              cache=cache,
              decode_position=decode_position)

        outputs = distribution.run(_step_fn, args=(inputs,))
        return tf.nest.map_structure(distribution.experimental_local_results,
                                     outputs)

      decode_position = 2
      query = tf.convert_to_tensor(np.ones((2, 1, 4), dtype=np.float32))
      local_outputs = step(query)
      self.assertEqual(local_outputs["context"][0].shape, (2, 1, 4))
      self.assertNotEqual(
          np.sum(local_outputs["cache"]["key"][0][:, decode_position,
                                                  ...].numpy()), 0.0)


class T5Test(tf.test.TestCase, parameterized.TestCase):

  @combinations.generate(
      combinations.combine(
          distribution=[
              strategy_combinations.default_strategy,
              strategy_combinations.cloud_tpu_strategy,
          ],
          mode="eager"))
  def test_attention_layers(self, distribution):
    num_heads, head_size = 2, 2
    from_seq_length = 4
    # TPU decoding should pre-allocate the entire sequence.
    batch_size = 2
    with distribution.scope():
      pos_embed = t5.RelativePositionEmbedding(
          num_heads=head_size,
          bidirectional=False,
          embeddings_initializer=tf.keras.initializers.Ones(),
          name="pos_embed")
      l = t5.SelfAttention(
          d_model=4, d_kv=head_size, num_heads=num_heads, dropout_rate=0.1)
      decode_position = 2

      @tf.function
      def step(inputs):

        def _step_fn(inputs):
          cache = _create_cache(batch_size, from_seq_length, num_heads,
                                head_size)
          mask = t5.make_causal_mask(tf.ones((batch_size, 1)))
          position_bias = pos_embed(from_seq_length, from_seq_length)
          return l(
              hidden_states=inputs,
              cache=cache,
              attention_mask=mask,
              decode_position=decode_position,
              position_bias=position_bias)

        outputs = distribution.run(_step_fn, args=(inputs,))
        return tf.nest.map_structure(distribution.experimental_local_results,
                                     outputs)

      query = tf.convert_to_tensor(np.ones((2, 1, 4), dtype=np.float32))
      local_outputs = step(query)
      self.assertEqual(local_outputs["layer_output"][0].shape, (2, 1, 4))
      self.assertNotEqual(
          np.sum(local_outputs["cache"]["key"][0][:, decode_position, :,
                                                  :].numpy()), 0.0)

      l = t5.CrossAttention(
          d_model=4, d_kv=head_size, num_heads=num_heads, dropout_rate=0.1)
      to_seq_length = 6
      query = tf.convert_to_tensor(
          np.ones((2, from_seq_length, 4), dtype=np.float32))
      kv = tf.convert_to_tensor(
          np.ones((2, to_seq_length, 4), dtype=np.float32))

      @tf.function
      def step_cross_attn(inputs):

        def _step_fn(inputs):
          query, kv = inputs
          mask = t5.make_attention_mask(
              tf.ones((batch_size, from_seq_length)),
              tf.ones((batch_size, to_seq_length)))
          return l(hidden_states=query, kv=kv, attention_mask=mask)

        outputs = distribution.run(_step_fn, args=(inputs,))
        return tf.nest.map_structure(distribution.experimental_local_results,
                                     outputs)

      local_outputs = step_cross_attn((query, kv))
      self.assertEqual(local_outputs["layer_output"][0].shape,
                       (2, from_seq_length, 4))

  def test_encoder_block(self):
    batch_size = 2
    from_seq_length = 5
    d_model = 4
    l = t5.EncoderBlock(d_model=4, d_kv=3, num_heads=2, d_ff=8, name="foo")
    pos_embed = t5.RelativePositionEmbedding(
        num_heads=2,
        bidirectional=True,
        embeddings_initializer=tf.keras.initializers.Ones(),
        name="bar")
    attention_mask = t5.make_attention_mask(
        tf.ones((batch_size, from_seq_length)),
        tf.ones((batch_size, from_seq_length)))
    position_bias = pos_embed(from_seq_length, from_seq_length)
    inputs = tf.ones((batch_size, from_seq_length, d_model), dtype=tf.float32)
    outputs = l(
        inputs, attention_mask=attention_mask, position_bias=position_bias)
    self.assertEqual(outputs.shape, (batch_size, from_seq_length, d_model))

  def test_encdec_block(self):
    batch_size = 2
    from_seq_length = 5
    to_seq_length = 3
    d_model = 4
    l = t5.EncDecoderBlock(d_model=4, d_kv=3, num_heads=2, d_ff=8, name="foo")
    pos_embed = t5.RelativePositionEmbedding(
        num_heads=2,
        bidirectional=True,
        embeddings_initializer=tf.keras.initializers.Ones(),
        name="bar")
    encoder_decoder_mask = t5.make_attention_mask(
        tf.ones((batch_size, from_seq_length)),
        tf.ones((batch_size, to_seq_length)))
    position_bias = pos_embed(from_seq_length, from_seq_length)
    inputs = tf.ones((batch_size, from_seq_length, d_model), dtype=tf.float32)
    encoder_hidden_states = tf.ones((batch_size, to_seq_length, d_model),
                                    dtype=tf.float32)
    outputs = l(
        inputs,
        encoder_hidden_states,
        encoder_decoder_mask=encoder_decoder_mask,
        position_bias=position_bias)
    self.assertEqual(outputs[0].shape, (batch_size, from_seq_length, d_model))

  @parameterized.named_parameters(("bfloat16", tf.bfloat16),
                                  ("float32", tf.float32))
  def test_encoder(self, dtype):
    config = t5.T5TransformerParams(
        num_layers=2,
        d_model=4,
        d_kv=3,
        num_heads=4,
        d_ff=16,
        vocab_size=10,
        vocab_embeddings_initializer=tf.keras.initializers.Ones(),
        relative_embeddings_initializer=tf.keras.initializers.Ones())
    encoder = t5.Encoder(config, compute_dtype=dtype)
    encoded = encoder(tf.zeros((4, 8), dtype=tf.int32))
    self.assertEqual(encoded.shape, (4, 8, config.d_model))

  def test_decoder(self):
    max_decode_len = 10
    config = t5.T5TransformerParams(
        num_layers=2,
        d_model=4,
        d_kv=3,
        num_heads=4,
        d_ff=16,
        vocab_size=10,
        vocab_embeddings_initializer=tf.keras.initializers.Ones(),
        relative_embeddings_initializer=tf.keras.initializers.Ones())
    decoder = t5.Decoder(config)
    batch_size = 4
    targets = tf.zeros((4, 8), dtype=tf.int32)
    encoded = tf.zeros((4, 8, config.d_model), dtype=tf.float32)
    logits, cache = decoder(targets, encoded)
    self.assertEqual(logits.shape, (4, 8, config.vocab_size))

    cache = {}
    cache[0] = _create_cache(batch_size, max_decode_len, config.num_heads,
                             config.d_kv)
    cache[1] = _create_cache(batch_size, max_decode_len, config.num_heads,
                             config.d_kv)
    targets = tf.zeros((4, 1), dtype=tf.int32)
    logits, cache = decoder(
        targets,
        encoded,
        decode_position=2,
        cache=cache,
        decode=True,
        max_decode_len=max_decode_len)
    self.assertEqual(logits.shape, (batch_size, 1, config.vocab_size))
    for entry in cache.values():
      for tensor in entry.values():
        self.assertNotAllEqual(tensor.numpy()[:, 2, :, :], 0.0)

  @parameterized.named_parameters(
      ("t5_10", ("relu",), True, 26, False, tf.float32),
      ("t5_11", ("gelu", "linear"), False, 29, False, tf.float32),
      ("t5_10_bfloat16", ("relu",), True, 26, False, tf.bfloat16),
      ("t5_11_bfloat16", ("gelu", "linear"), False, 29, False, tf.bfloat16),
      ("t5_10_layer_sharing", ("relu",), True, 26, True, tf.float32),
      ("t5_11_layer_sharing", ("gelu", "linear"), False, 29, True, tf.float32),
      ("t5_10_bfloat16_layer_sharing", ("relu",), True, 26, True, tf.bfloat16),
      ("t5_11_bfloat16_layer_sharing", ("gelu", "linear"), False, 29, True,
       tf.bfloat16))
  def test_transformer(self, ffn_activations, logits_via_embedding,
                       expect_num_variables, layer_sharing, dtype):
    max_decode_len = 10
    config = t5.T5TransformerParams(
        num_layers=1,
        d_model=8,
        d_kv=4,
        num_heads=4,
        d_ff=32,
        vocab_size=10,
        shared_embedding=True,
        layer_sharing=layer_sharing,
        ffn_activations=ffn_activations,
        logits_via_embedding=logits_via_embedding)
    transformer = t5.T5Transformer(config, compute_dtype=dtype)
    self.assertLen(transformer.trainable_variables, expect_num_variables)
    inputs = tf.convert_to_tensor(
        np.array([[2, 2, 1, 3, 1, 0], [3, 3, 1, 2, 2, 1]]))
    segments = tf.convert_to_tensor(
        np.array([[1, 1, 1, 2, 2, 0], [1, 1, 1, 2, 2, 2]]))
    outputs = transformer(
        encoder_input_tokens=inputs,
        decoder_input_tokens=inputs,
        decoder_target_tokens=inputs,
        encoder_segment_ids=segments,
        decoder_segment_ids=segments)
    cache = {}
    batch_size = 2
    cache[0] = _create_cache(
        batch_size, max_decode_len, config.num_heads, config.d_kv, dtype=dtype)
    outputs = transformer.decode(
        encoder_input_tokens=inputs,
        encoded=outputs["encoded"],
        decoder_target_tokens=tf.ones((batch_size, 1), dtype=tf.int32),
        decode_position=1,
        decode=True,
        max_decode_len=max_decode_len,
        cache=cache)
    self.assertEqual(outputs["logits"].shape,
                     (batch_size, 1, config.vocab_size))
    for v in transformer.trainable_variables:
      print(v.name, v.shape)
      self.assertEqual(v.dtype, tf.float32)

  @parameterized.named_parameters(
      ("t5_10", ("relu",), True, 39, tf.float32, 2),
      ("t5_10_bfloat16", ("relu",), True, 39, tf.bfloat16, 2))
  def test_transformer_different_num_decoder_layers(self, ffn_activations,
                                                     logits_via_embedding,
                                                     expect_num_variables,
                                                     dtype, num_decoder_layers):
    max_decode_len = 10
    config = t5.T5TransformerParams(
        num_decoder_layers=num_decoder_layers,
        num_layers=1,
        d_model=8,
        d_kv=4,
        num_heads=4,
        d_ff=32,
        vocab_size=10,
        shared_embedding=True,
        ffn_activations=ffn_activations,
        logits_via_embedding=logits_via_embedding)
    transformer = t5.T5Transformer(config, compute_dtype=dtype)
    self.assertLen(transformer.trainable_variables, expect_num_variables)
    inputs = tf.convert_to_tensor(
        np.array([[2, 2, 1, 3, 1, 0], [3, 3, 1, 2, 2, 1]]))
    segments = tf.convert_to_tensor(
        np.array([[1, 1, 1, 2, 2, 0], [1, 1, 1, 2, 2, 2]]))
    outputs = transformer(
        encoder_input_tokens=inputs,
        decoder_input_tokens=inputs,
        decoder_target_tokens=inputs,
        encoder_segment_ids=segments,
        decoder_segment_ids=segments)
    cache = {}
    batch_size = 2
    for i in range(num_decoder_layers):
      cache[i] = _create_cache(
          batch_size, max_decode_len, config.num_heads, config.d_kv,
          dtype=dtype)
    outputs = transformer.decode(
        encoder_input_tokens=inputs,
        encoded=outputs["encoded"],
        decoder_target_tokens=tf.ones((batch_size, 1), dtype=tf.int32),
        decode_position=1,
        decode=True,
        max_decode_len=max_decode_len,
        cache=cache)
    self.assertEqual(outputs["logits"].shape,
                     (batch_size, 1, config.vocab_size))
    for v in transformer.trainable_variables:
      print(v.name, v.shape)
      self.assertEqual(v.dtype, tf.float32)


if __name__ == "__main__":
  tf.test.main()