ModelZoo / ResNet50_tensorflow · Commit 47bc1813

Merge remote-tracking branch 'upstream/master' into add_multilevel_crop_and_resize

Authored Jul 01, 2020 by syiming
Parents: d8611151, b035a227
Changes: 329
Showing 20 changed files with 1590 additions and 333 deletions (+1590 −333):
  official/nlp/configs/bert_test.py                                              +4   −4
  official/nlp/configs/encoders.py                                               +28  −2
  official/nlp/data/classifier_data_lib.py                                       +266 −51
  official/nlp/data/create_finetuning_data.py                                    +96  −7
  official/nlp/data/sentence_retrieval_lib.py                                    +168 −0
  official/nlp/data/tagging_data_lib.py                                          +346 −0
  official/nlp/data/tagging_data_loader.py                                       +64  −0
  official/nlp/modeling/layers/README.md                                         +12  −1
  official/nlp/modeling/layers/__init__.py                                       +3   −1
  official/nlp/modeling/layers/masked_lm.py                                      +124 −0
  official/nlp/modeling/layers/masked_lm_test.py                                 +162 −0
  official/nlp/modeling/layers/multi_channel_attention.py                        +34  −15
  official/nlp/modeling/layers/multi_channel_attention_test.py                   +3   −2
  official/nlp/modeling/layers/transformer.py                                    +199 −0
  official/nlp/modeling/layers/transformer_test.py                               +34  −0
  official/nlp/modeling/losses/README.md                                         +0   −3
  official/nlp/modeling/losses/__init__.py                                       +0   −1
  official/nlp/modeling/losses/weighted_sparse_categorical_crossentropy.py       +9   −39
  official/nlp/modeling/losses/weighted_sparse_categorical_crossentropy_test.py  +16  −189
  official/nlp/modeling/models/bert_pretrainer.py                                +22  −18
official/nlp/configs/bert_test.py

@@ -26,7 +26,7 @@ class BertModelsTest(tf.test.TestCase):
   def test_network_invocation(self):
     config = bert.BertPretrainerConfig(
         encoder=encoders.TransformerEncoderConfig(vocab_size=10, num_layers=1))
-    _ = bert.instantiate_from_cfg(config)
+    _ = bert.instantiate_bertpretrainer_from_cfg(config)

     # Invokes with classification heads.
     config = bert.BertPretrainerConfig(
@@ -35,7 +35,7 @@ class BertModelsTest(tf.test.TestCase):
             bert.ClsHeadConfig(
                 inner_dim=10, num_classes=2, name="next_sentence")
         ])
-    _ = bert.instantiate_from_cfg(config)
+    _ = bert.instantiate_bertpretrainer_from_cfg(config)

     with self.assertRaises(ValueError):
       config = bert.BertPretrainerConfig(
@@ -47,7 +47,7 @@ class BertModelsTest(tf.test.TestCase):
             bert.ClsHeadConfig(
                 inner_dim=10, num_classes=2, name="next_sentence")
         ])
-      _ = bert.instantiate_from_cfg(config)
+      _ = bert.instantiate_bertpretrainer_from_cfg(config)

   def test_checkpoint_items(self):
     config = bert.BertPretrainerConfig(
@@ -56,7 +56,7 @@ class BertModelsTest(tf.test.TestCase):
             bert.ClsHeadConfig(
                 inner_dim=10, num_classes=2, name="next_sentence")
         ])
-    encoder = bert.instantiate_from_cfg(config)
+    encoder = bert.instantiate_bertpretrainer_from_cfg(config)
     self.assertSameElements(encoder.checkpoint_items.keys(),
                             ["encoder", "next_sentence.pooler_dense"])
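For orientation, a minimal sketch of what the renamed factory exercised by this test does. The import paths and names come from the diff above; the snippet itself is illustrative and not part of the commit:

# Illustrative sketch (assumption, not part of this commit): build a BERT
# pretrainer model from a BertPretrainerConfig via the renamed factory.
from official.nlp.configs import bert
from official.nlp.configs import encoders

config = bert.BertPretrainerConfig(
    encoder=encoders.TransformerEncoderConfig(vocab_size=10, num_layers=1))
pretrainer_model = bert.instantiate_bertpretrainer_from_cfg(config)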
official/nlp/configs/encoders.py

@@ -13,11 +13,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Configurations for Encoders."""
+"""Transformer Encoders.
+
+Includes configurations and instantiation methods.
+"""
 import dataclasses
+import tensorflow as tf

+from official.modeling import tf_utils
 from official.modeling.hyperparams import base_config
+from official.nlp.modeling import networks


 @dataclasses.dataclass
@@ -28,9 +34,29 @@ class TransformerEncoderConfig(base_config.Config):
   num_layers: int = 12
   num_attention_heads: int = 12
   hidden_activation: str = "gelu"
-  intermediate_size: int = 3076
+  intermediate_size: int = 3072
   dropout_rate: float = 0.1
   attention_dropout_rate: float = 0.1
   max_position_embeddings: int = 512
   type_vocab_size: int = 2
   initializer_range: float = 0.02
+
+
+def instantiate_encoder_from_cfg(
+    config: TransformerEncoderConfig) -> networks.TransformerEncoder:
+  """Instantiate a Transformer encoder network from TransformerEncoderConfig."""
+  encoder_network = networks.TransformerEncoder(
+      vocab_size=config.vocab_size,
+      hidden_size=config.hidden_size,
+      num_layers=config.num_layers,
+      num_attention_heads=config.num_attention_heads,
+      intermediate_size=config.intermediate_size,
+      activation=tf_utils.get_activation(config.hidden_activation),
+      dropout_rate=config.dropout_rate,
+      attention_dropout_rate=config.attention_dropout_rate,
+      sequence_length=None,
+      max_sequence_length=config.max_position_embeddings,
+      type_vocab_size=config.type_vocab_size,
+      initializer=tf.keras.initializers.TruncatedNormal(
+          stddev=config.initializer_range))
+  return encoder_network
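A minimal usage sketch for the config-plus-factory pair added above. Only names visible in this diff are used; field values are placeholders, so treat the snippet as an assumption rather than project documentation:

# Illustrative sketch (assumption): construct a small encoder from the config.
from official.nlp.configs import encoders

small_config = encoders.TransformerEncoderConfig(vocab_size=30522, num_layers=2)
encoder_network = encoders.instantiate_encoder_from_cfg(small_config)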
official/nlp/data/classifier_data_lib.py

@@ -33,7 +33,13 @@ from official.nlp.bert import tokenization
 class InputExample(object):
   """A single training/test example for simple sequence classification."""

-  def __init__(self, guid, text_a, text_b=None, label=None, weight=None):
+  def __init__(self,
+               guid,
+               text_a,
+               text_b=None,
+               label=None,
+               weight=None,
+               int_iden=None):
     """Constructs a InputExample.

     Args:
@@ -46,12 +52,15 @@ class InputExample(object):
         specified for train and dev examples, but not for test examples.
       weight: (Optional) float. The weight of the example to be used during
         training.
+      int_iden: (Optional) int. The int identification number of example in the
+        corpus.
     """
     self.guid = guid
     self.text_a = text_a
     self.text_b = text_b
     self.label = label
     self.weight = weight
+    self.int_iden = int_iden


 class InputFeatures(object):
@@ -63,13 +72,15 @@ class InputFeatures(object):
                segment_ids,
                label_id,
                is_real_example=True,
-               weight=None):
+               weight=None,
+               int_iden=None):
     self.input_ids = input_ids
     self.input_mask = input_mask
     self.segment_ids = segment_ids
     self.label_id = label_id
     self.is_real_example = is_real_example
     self.weight = weight
+    self.int_iden = int_iden


 class DataProcessor(object):
@@ -191,12 +202,68 @@ class XnliProcessor(DataProcessor):
     return "XNLI"


-class PawsxProcessor(DataProcessor):
-  """Processor for the PAWS-X data set."""
-  supported_languages = ["de", "en", "es", "fr", "ja", "ko", "zh"]
+class XtremeXnliProcessor(DataProcessor):
+  """Processor for the XTREME XNLI data set."""
+  supported_languages = [
+      "ar", "bg", "de", "el", "en", "es", "fr", "hi", "ru", "sw", "th", "tr",
+      "ur", "vi", "zh"
+  ]
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    lines = self._read_tsv(os.path.join(data_dir, "train-en.tsv"))
+
+    examples = []
+    for (i, line) in enumerate(lines):
+      guid = "train-%d" % i
+      text_a = self.process_text_fn(line[0])
+      text_b = self.process_text_fn(line[1])
+      label = self.process_text_fn(line[2])
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+  def get_dev_examples(self, data_dir):
+    """See base class."""
+    lines = self._read_tsv(os.path.join(data_dir, "dev-en.tsv"))
+    examples = []
+    for (i, line) in enumerate(lines):
+      guid = "dev-%d" % i
+      text_a = self.process_text_fn(line[0])
+      text_b = self.process_text_fn(line[1])
+      label = self.process_text_fn(line[2])
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    examples_by_lang = {k: [] for k in self.supported_languages}
+    for lang in self.supported_languages:
+      lines = self._read_tsv(os.path.join(data_dir, f"test-{lang}.tsv"))
+      for (i, line) in enumerate(lines):
+        guid = f"test-{i}"
+        text_a = self.process_text_fn(line[0])
+        text_b = self.process_text_fn(line[1])
+        label = "contradiction"
+        examples_by_lang[lang].append(
+            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples_by_lang
+
+  def get_labels(self):
+    """See base class."""
+    return ["contradiction", "entailment", "neutral"]
+
+  @staticmethod
+  def get_processor_name():
+    """See base class."""
+    return "XTREME-XNLI"
+
+
+class PawsxProcessor(DataProcessor):
+  """Processor for the PAWS-X data set."""
+  supported_languages = ["de", "en", "es", "fr", "ja", "ko", "zh"]

   def __init__(self,
                language="en",
                process_text_fn=tokenization.convert_to_unicode):
@@ -219,8 +286,7 @@ class PawsxProcessor(DataProcessor):
       train_tsv = "translated_train.tsv"
     # Skips the header.
     lines.extend(
         self._read_tsv(os.path.join(data_dir, language, train_tsv))[1:])
     examples = []
     for (i, line) in enumerate(lines):
@@ -235,10 +301,9 @@ class PawsxProcessor(DataProcessor):
   def get_dev_examples(self, data_dir):
     """See base class."""
     lines = []
-    for language in PawsxProcessor.supported_languages:
-      # Skips the header.
+    for lang in PawsxProcessor.supported_languages:
       lines.extend(
-          self._read_tsv(os.path.join(data_dir, language, "dev_2k.tsv"))[1:])
+          self._read_tsv(os.path.join(data_dir, lang, "dev_2k.tsv"))[1:])
     examples = []
     for (i, line) in enumerate(lines):
@@ -252,17 +317,15 @@ class PawsxProcessor(DataProcessor):
   def get_test_examples(self, data_dir):
     """See base class."""
-    examples_by_lang = {k: [] for k in PawsxProcessor.supported_languages}
-    for language in PawsxProcessor.supported_languages:
-      lines = self._read_tsv(os.path.join(data_dir, language, "test_2k.tsv"))
+    examples_by_lang = {k: [] for k in self.supported_languages}
+    for lang in self.supported_languages:
+      lines = self._read_tsv(os.path.join(data_dir, lang, "test_2k.tsv"))[1:]
       for (i, line) in enumerate(lines):
-        if i == 0:
-          continue
         guid = "test-%d" % i
         text_a = self.process_text_fn(line[1])
         text_b = self.process_text_fn(line[2])
         label = self.process_text_fn(line[3])
-        examples_by_lang[language].append(
+        examples_by_lang[lang].append(
             InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
     return examples_by_lang
@@ -273,7 +336,62 @@ class PawsxProcessor(DataProcessor):
   @staticmethod
   def get_processor_name():
     """See base class."""
-    return "PAWS-X"
+    return "XTREME-PAWS-X"
+
+
+class XtremePawsxProcessor(DataProcessor):
+  """Processor for the XTREME PAWS-X data set."""
+  supported_languages = ["de", "en", "es", "fr", "ja", "ko", "zh"]
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    lines = self._read_tsv(os.path.join(data_dir, "train-en.tsv"))
+    examples = []
+    for (i, line) in enumerate(lines):
+      guid = "train-%d" % i
+      text_a = self.process_text_fn(line[0])
+      text_b = self.process_text_fn(line[1])
+      label = self.process_text_fn(line[2])
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+  def get_dev_examples(self, data_dir):
+    """See base class."""
+    lines = self._read_tsv(os.path.join(data_dir, "dev-en.tsv"))
+    examples = []
+    for (i, line) in enumerate(lines):
+      guid = "dev-%d" % i
+      text_a = self.process_text_fn(line[0])
+      text_b = self.process_text_fn(line[1])
+      label = self.process_text_fn(line[2])
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    examples_by_lang = {k: [] for k in self.supported_languages}
+    for lang in self.supported_languages:
+      lines = self._read_tsv(os.path.join(data_dir, f"test-{lang}.tsv"))
+      for (i, line) in enumerate(lines):
+        guid = "test-%d" % i
+        text_a = self.process_text_fn(line[0])
+        text_b = self.process_text_fn(line[1])
+        label = "0"
+        examples_by_lang[lang].append(
+            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples_by_lang
+
+  def get_labels(self):
+    """See base class."""
+    return ["0", "1"]
+
+  @staticmethod
+  def get_processor_name():
+    """See base class."""
+    return "XTREME-PAWS-X"


 class MnliProcessor(DataProcessor):
@@ -407,8 +525,8 @@ class QqpProcessor(DataProcessor):
         label = line[5]
       except IndexError:
         continue
       examples.append(
           InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
     return examples
@@ -458,6 +576,53 @@ class ColaProcessor(DataProcessor):
     return examples


+class RteProcessor(DataProcessor):
+  """Processor for the RTE data set (GLUE version)."""
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+  def get_dev_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
+
+  def get_labels(self):
+    """See base class."""
+    # All datasets are converted to 2-class split, where for 3-class datasets we
+    # collapse neutral and contradiction into not_entailment.
+    return ["entailment", "not_entailment"]
+
+  @staticmethod
+  def get_processor_name():
+    """See base class."""
+    return "RTE"
+
+  def _create_examples(self, lines, set_type):
+    """Creates examples for the training and dev sets."""
+    examples = []
+    for i, line in enumerate(lines):
+      if i == 0:
+        continue
+      guid = "%s-%s" % (set_type, i)
+      text_a = tokenization.convert_to_unicode(line[1])
+      text_b = tokenization.convert_to_unicode(line[2])
+      if set_type == "test":
+        label = "entailment"
+      else:
+        label = tokenization.convert_to_unicode(line[3])
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+
 class SstProcessor(DataProcessor):
   """Processor for the SST-2 data set (GLUE version)."""
@@ -583,15 +748,16 @@ class TfdsProcessor(DataProcessor):
     is_regression: Whether the task is a regression problem (defaults to False).
   """

   def __init__(self,
                tfds_params,
                process_text_fn=tokenization.convert_to_unicode):
     super(TfdsProcessor, self).__init__(process_text_fn)
     self._process_tfds_params_str(tfds_params)
     if self.module_import:
       importlib.import_module(self.module_import)

     self.dataset, info = tfds.load(
         self.dataset_name, data_dir=self.data_dir, with_info=True)
     if self.is_regression:
       self._labels = None
     else:
@@ -660,8 +826,57 @@ class TfdsProcessor(DataProcessor):
       if self.weight_key:
         weight = float(example[self.weight_key])
       examples.append(
           InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label,
                        weight=weight))
     return examples


+class WnliProcessor(DataProcessor):
+  """Processor for the WNLI data set (GLUE version)."""
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+  def get_dev_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
+
+  def get_labels(self):
+    """See base class."""
+    return ["0", "1"]
+
+  @staticmethod
+  def get_processor_name():
+    """See base class."""
+    return "WNLI"
+
+  def _create_examples(self, lines, set_type):
+    """Creates examples for the training and dev sets."""
+    examples = []
+    for i, line in enumerate(lines):
+      if i == 0:
+        continue
+      guid = "%s-%s" % (set_type, i)
+      text_a = tokenization.convert_to_unicode(line[1])
+      text_b = tokenization.convert_to_unicode(line[2])
+      if set_type == "test":
+        label = "0"
+      else:
+        label = tokenization.convert_to_unicode(line[3])
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
@@ -748,8 +963,9 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,
     logging.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
     logging.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
     logging.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
-    logging.info("label: %s (id = %d)", example.label, label_id)
+    logging.info("label: %s (id = %s)", example.label, str(label_id))
     logging.info("weight: %s", example.weight)
+    logging.info("int_iden: %s", str(example.int_iden))

   feature = InputFeatures(
       input_ids=input_ids,
@@ -757,13 +973,18 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,
       segment_ids=segment_ids,
       label_id=label_id,
       is_real_example=True,
-      weight=example.weight)
+      weight=example.weight,
+      int_iden=example.int_iden)

   return feature


-def file_based_convert_examples_to_features(examples, label_list,
-                                            max_seq_length, tokenizer,
-                                            output_file, label_type=None):
+def file_based_convert_examples_to_features(examples,
+                                            label_list,
+                                            max_seq_length,
+                                            tokenizer,
+                                            output_file,
+                                            label_type=None):
   """Convert a set of `InputExample`s to a TFRecord file."""
   tf.io.gfile.makedirs(os.path.dirname(output_file))
@@ -779,6 +1000,7 @@ def file_based_convert_examples_to_features(examples, label_list,
   def create_int_feature(values):
     f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
     return f

   def create_float_feature(values):
     f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
     return f
@@ -789,12 +1011,14 @@ def file_based_convert_examples_to_features(examples, label_list,
     features["segment_ids"] = create_int_feature(feature.segment_ids)
     if label_type is not None and label_type == float:
       features["label_ids"] = create_float_feature([feature.label_id])
-    else:
+    elif feature.label_id is not None:
       features["label_ids"] = create_int_feature([feature.label_id])
     features["is_real_example"] = create_int_feature(
         [int(feature.is_real_example)])
     if feature.weight is not None:
       features["weight"] = create_float_feature([feature.weight])
+    if feature.int_iden is not None:
+      features["int_iden"] = create_int_feature([feature.int_iden])

     tf_example = tf.train.Example(features=tf.train.Features(feature=features))
     writer.write(tf_example.SerializeToString())
@@ -830,8 +1054,7 @@ def generate_tf_record_from_data_file(processor,
   Arguments:
     processor: Input processor object to be used for generating data. Subclass
       of `DataProcessor`.
-    data_dir: Directory that contains train/eval data to process. Data files
-      should be in from "dev.tsv", "test.tsv", or "train.tsv".
+    data_dir: Directory that contains train/eval/test data to process.
     tokenizer: The tokenizer to be applied on the data.
     train_data_output_path: Output to which processed tf record for training
       will be saved.
@@ -857,8 +1080,7 @@ def generate_tf_record_from_data_file(processor,
   train_input_data_examples = processor.get_train_examples(data_dir)
   file_based_convert_examples_to_features(train_input_data_examples, label_list,
                                           max_seq_length, tokenizer,
-                                          train_data_output_path,
-                                          label_type)
+                                          train_data_output_path, label_type)
   num_training_data = len(train_input_data_examples)

   if eval_data_output_path:
@@ -868,26 +1090,27 @@ def generate_tf_record_from_data_file(processor,
                                             tokenizer, eval_data_output_path,
                                             label_type)

+  meta_data = {
+      "processor_type": processor.get_processor_name(),
+      "train_data_size": num_training_data,
+      "max_seq_length": max_seq_length,
+  }
+
   if test_data_output_path:
     test_input_data_examples = processor.get_test_examples(data_dir)
     if isinstance(test_input_data_examples, dict):
       for language, examples in test_input_data_examples.items():
         file_based_convert_examples_to_features(
             examples, label_list, max_seq_length, tokenizer,
             test_data_output_path.format(language), label_type)
+        meta_data["test_{}_data_size".format(language)] = len(examples)
     else:
       file_based_convert_examples_to_features(test_input_data_examples,
                                               label_list, max_seq_length,
                                               tokenizer, test_data_output_path,
                                               label_type)
+      meta_data["test_data_size"] = len(test_input_data_examples)

-  meta_data = {
-      "processor_type": processor.get_processor_name(),
-      "train_data_size": num_training_data,
-      "max_seq_length": max_seq_length,
-  }
   if is_regression:
     meta_data["task_type"] = "bert_regression"
     meta_data["label_type"] = {int: "int", float: "float"}[label_type]
@@ -900,12 +1123,4 @@ def generate_tf_record_from_data_file(processor,
   if eval_data_output_path:
     meta_data["eval_data_size"] = len(eval_input_data_examples)
-  if test_data_output_path:
-    test_input_data_examples = processor.get_test_examples(data_dir)
-    if isinstance(test_input_data_examples, dict):
-      for language, examples in test_input_data_examples.items():
-        meta_data["test_{}_data_size".format(language)] = len(examples)
-    else:
-      meta_data["test_data_size"] = len(test_input_data_examples)

   return meta_data
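A sketch of how the new XTREME processors plug into the existing record-generation helper. The processor and function names come from the diff above; the exact keyword arguments and the file paths are assumptions for illustration only:

# Hypothetical usage sketch (assumed keyword names and placeholder paths):
# generate classification TFRecords with the new XTREME-XNLI processor.
from official.nlp.bert import tokenization
from official.nlp.data import classifier_data_lib

tokenizer = tokenization.FullTokenizer(
    vocab_file="/tmp/vocab.txt", do_lower_case=True)
processor = classifier_data_lib.XtremeXnliProcessor()
meta_data = classifier_data_lib.generate_tf_record_from_data_file(
    processor,
    data_dir="/tmp/xtreme_xnli",
    tokenizer=tokenizer,
    train_data_output_path="/tmp/xnli_train.tf_record",
    eval_data_output_path="/tmp/xnli_eval.tf_record",
    # Per-language test files: the "{}" is filled with each language code.
    test_data_output_path="/tmp/xnli_test_{}.tf_record",
    max_seq_length=128)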
official/nlp/data/create_finetuning_data.py

@@ -27,18 +27,21 @@ from absl import flags
 import tensorflow as tf
 from official.nlp.bert import tokenization
 from official.nlp.data import classifier_data_lib
+from official.nlp.data import sentence_retrieval_lib
 # word-piece tokenizer based squad_lib
 from official.nlp.data import squad_lib as squad_lib_wp
 # sentence-piece tokenizer based squad_lib
 from official.nlp.data import squad_lib_sp
+from official.nlp.data import tagging_data_lib

 FLAGS = flags.FLAGS

+# TODO(chendouble): consider moving each task to its own binary.
 flags.DEFINE_enum(
     "fine_tuning_task_type", "classification",
-    ["classification", "regression", "squad"],
-    "The name of the BERT fine tuning task for which data "
-    "will be generated. .")
+    ["classification", "regression", "squad", "retrieval", "tagging"],
+    "The name of the BERT fine tuning task for which data "
+    "will be generated.")

 # BERT classification specific flags.
 flags.DEFINE_string(
@@ -48,8 +51,12 @@ flags.DEFINE_string(
 flags.DEFINE_enum(
     "classification_task_name", "MNLI",
-    ["COLA", "MNLI", "MRPC", "QNLI", "QQP", "SST-2", "XNLI", "PAWS-X"],
-    "The name of the task to train BERT classifier.")
+    ["COLA", "MNLI", "MRPC", "QNLI", "QQP", "SST-2", "XNLI", "PAWS-X",
+     "XTREME-XNLI", "XTREME-PAWS-X"],
+    "The name of the task to train BERT classifier. The "
+    "difference between XTREME-XNLI and XNLI is: 1. the format "
+    "of input tsv files; 2. the dev set for XTREME is english "
+    "only and for XNLI is all languages combined. Same for "
+    "PAWS-X.")

 # XNLI task specific flag.
 flags.DEFINE_string(
@@ -63,6 +70,14 @@ flags.DEFINE_string(
     "Language of trainig data for PAWS-X task. If the value is 'all', the data "
     "of all languages will be used for training.")

+# Retrieva task specific flags
+flags.DEFINE_enum("retrieval_task_name", "bucc", ["bucc", "tatoeba"],
+                  "The name of sentence retrieval task for scoring")
+
+# Tagging task specific flags
+flags.DEFINE_enum("tagging_task_name", "panx", ["panx", "udpos"],
+                  "The name of BERT tagging (token classification) task.")
+
 # BERT Squad task specific flags.
 flags.DEFINE_string(
     "squad_data_file", None,
@@ -169,6 +184,7 @@ def generate_classifier_dataset():
       "qnli": classifier_data_lib.QnliProcessor,
       "qqp": classifier_data_lib.QqpProcessor,
+      "rte": classifier_data_lib.RteProcessor,
       "sst-2": classifier_data_lib.SstProcessor,
       "xnli":
@@ -176,7 +192,12 @@ def generate_classifier_dataset():
               language=FLAGS.xnli_language),
       "paws-x":
           functools.partial(
-              classifier_data_lib.PawsxProcessor, language=FLAGS.pawsx_language)
+              classifier_data_lib.PawsxProcessor,
+              language=FLAGS.pawsx_language),
+      "wnli": classifier_data_lib.WnliProcessor,
+      "xtreme-xnli": functools.partial(classifier_data_lib.XtremeXnliProcessor),
+      "xtreme-paws-x":
+          functools.partial(classifier_data_lib.XtremePawsxProcessor)
   }
   task_name = FLAGS.classification_task_name.lower()
   if task_name not in processors:
@@ -237,6 +258,67 @@ def generate_squad_dataset():
         FLAGS.max_query_length, FLAGS.doc_stride,
         FLAGS.version_2_with_negative)


+def generate_retrieval_dataset():
+  """Generate retrieval test and dev dataset and returns input meta data."""
+  assert (FLAGS.input_data_dir and FLAGS.retrieval_task_name)
+  if FLAGS.tokenizer_impl == "word_piece":
+    tokenizer = tokenization.FullTokenizer(
+        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
+    processor_text_fn = tokenization.convert_to_unicode
+  else:
+    assert FLAGS.tokenizer_impl == "sentence_piece"
+    tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
+    processor_text_fn = functools.partial(
+        tokenization.preprocess_text, lower=FLAGS.do_lower_case)
+
+  processors = {
+      "bucc": sentence_retrieval_lib.BuccProcessor,
+      "tatoeba": sentence_retrieval_lib.TatoebaProcessor,
+  }
+
+  task_name = FLAGS.retrieval_task_name.lower()
+  if task_name not in processors:
+    raise ValueError("Task not found: %s" % task_name)
+
+  processor = processors[task_name](process_text_fn=processor_text_fn)
+
+  return sentence_retrieval_lib.generate_sentence_retrevial_tf_record(
+      processor, FLAGS.input_data_dir, tokenizer, FLAGS.eval_data_output_path,
+      FLAGS.test_data_output_path, FLAGS.max_seq_length)
+
+
+def generate_tagging_dataset():
+  """Generates tagging dataset."""
+  processors = {
+      "panx": tagging_data_lib.PanxProcessor,
+      "udpos": tagging_data_lib.UdposProcessor,
+  }
+  task_name = FLAGS.tagging_task_name.lower()
+  if task_name not in processors:
+    raise ValueError("Task not found: %s" % task_name)
+
+  if FLAGS.tokenizer_impl == "word_piece":
+    tokenizer = tokenization.FullTokenizer(
+        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
+    processor_text_fn = tokenization.convert_to_unicode
+  elif FLAGS.tokenizer_impl == "sentence_piece":
+    tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
+    processor_text_fn = functools.partial(
+        tokenization.preprocess_text, lower=FLAGS.do_lower_case)
+  else:
+    raise ValueError("Unsupported tokenizer_impl: %s" % FLAGS.tokenizer_impl)
+
+  processor = processors[task_name]()
+  return tagging_data_lib.generate_tf_record_from_data_file(
+      processor,
+      FLAGS.input_data_dir,
+      tokenizer,
+      FLAGS.max_seq_length,
+      FLAGS.train_data_output_path,
+      FLAGS.eval_data_output_path,
+      FLAGS.test_data_output_path,
+      processor_text_fn)
+
+
 def main(_):
   if FLAGS.tokenizer_impl == "word_piece":
     if not FLAGS.vocab_file:
@@ -248,12 +330,20 @@ def main(_):
       raise ValueError(
           "FLAG sp_model_file for sentence-piece tokenizer is not specified.")

+  if FLAGS.fine_tuning_task_type != "retrieval":
+    flags.mark_flag_as_required("train_data_output_path")
+
   if FLAGS.fine_tuning_task_type == "classification":
     input_meta_data = generate_classifier_dataset()
   elif FLAGS.fine_tuning_task_type == "regression":
     input_meta_data = generate_regression_dataset()
-  else:
+  elif FLAGS.fine_tuning_task_type == "retrieval":
+    input_meta_data = generate_retrieval_dataset()
+  elif FLAGS.fine_tuning_task_type == "squad":
     input_meta_data = generate_squad_dataset()
+  else:
+    assert FLAGS.fine_tuning_task_type == "tagging"
+    input_meta_data = generate_tagging_dataset()

   tf.io.gfile.makedirs(os.path.dirname(FLAGS.meta_data_file_path))
   with tf.io.gfile.GFile(FLAGS.meta_data_file_path, "w") as writer:
@@ -261,6 +351,5 @@ def main(_):
 if __name__ == "__main__":
-  flags.mark_flag_as_required("train_data_output_path")
   flags.mark_flag_as_required("meta_data_file_path")
   app.run(main)
official/nlp/data/sentence_retrieval_lib.py  (new file, 0 → 100644)

# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""BERT library to process data for cross lingual sentence retrieval task."""

import os

from absl import logging
from official.nlp.bert import tokenization
from official.nlp.data import classifier_data_lib


class BuccProcessor(classifier_data_lib.DataProcessor):
  """Procssor for Xtreme BUCC data set."""
  supported_languages = ["de", "fr", "ru", "zh"]

  def __init__(self, process_text_fn=tokenization.convert_to_unicode):
    super(BuccProcessor, self).__init__(process_text_fn)
    self.languages = BuccProcessor.supported_languages

  def get_dev_examples(self, data_dir, file_pattern):
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, file_pattern.format("dev"))),
        "sample")

  def get_test_examples(self, data_dir, file_pattern):
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, file_pattern.format("test"))),
        "test")

  @staticmethod
  def get_processor_name():
    """See base class."""
    return "BUCC"

  def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
      guid = "%s-%s" % (set_type, i)
      int_iden = int(line[0].split("-")[1])
      text_a = self.process_text_fn(line[1])
      examples.append(
          classifier_data_lib.InputExample(
              guid=guid, text_a=text_a, int_iden=int_iden))
    return examples


class TatoebaProcessor(classifier_data_lib.DataProcessor):
  """Procssor for Xtreme Tatoeba data set."""
  supported_languages = [
      "af", "ar", "bg", "bn", "de", "el", "es", "et", "eu", "fa", "fi", "fr",
      "he", "hi", "hu", "id", "it", "ja", "jv", "ka", "kk", "ko", "ml", "mr",
      "nl", "pt", "ru", "sw", "ta", "te", "th", "tl", "tr", "ur", "vi", "zh"
  ]

  def __init__(self, process_text_fn=tokenization.convert_to_unicode):
    super(TatoebaProcessor, self).__init__(process_text_fn)
    self.languages = TatoebaProcessor.supported_languages

  def get_test_examples(self, data_dir, file_path):
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, file_path)), "test")

  @staticmethod
  def get_processor_name():
    """See base class."""
    return "TATOEBA"

  def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
      guid = "%s-%s" % (set_type, i)
      text_a = self.process_text_fn(line[0])
      examples.append(
          classifier_data_lib.InputExample(
              guid=guid, text_a=text_a, int_iden=i))
    return examples


def generate_sentence_retrevial_tf_record(processor,
                                          data_dir,
                                          tokenizer,
                                          eval_data_output_path=None,
                                          test_data_output_path=None,
                                          max_seq_length=128):
  """Generates the tf records for retrieval tasks.

  Args:
    processor: Input processor object to be used for generating data. Subclass
      of `DataProcessor`.
    data_dir: Directory that contains train/eval data to process. Data files
      should be in from.
    tokenizer: The tokenizer to be applied on the data.
    eval_data_output_path: Output to which processed tf record for evaluation
      will be saved.
    test_data_output_path: Output to which processed tf record for testing
      will be saved. Must be a pattern template with {} if processor has
      language specific test data.
    max_seq_length: Maximum sequence length of the to be generated
      training/eval data.

  Returns:
    A dictionary containing input meta data.
  """
  assert eval_data_output_path or test_data_output_path

  if processor.get_processor_name() == "BUCC":
    path_pattern = "{}-en.{{}}.{}"

  if processor.get_processor_name() == "TATOEBA":
    path_pattern = "{}-en.{}"

  meta_data = {
      "processor_type": processor.get_processor_name(),
      "max_seq_length": max_seq_length,
      "number_eval_data": {},
      "number_test_data": {},
  }
  logging.info("Start to process %s task data", processor.get_processor_name())

  for lang_a in processor.languages:
    for lang_b in [lang_a, "en"]:
      if eval_data_output_path:
        eval_input_data_examples = processor.get_dev_examples(
            data_dir, os.path.join(path_pattern.format(lang_a, lang_b)))

        num_eval_data = len(eval_input_data_examples)
        logging.info("Processing %d dev examples of %s-en.%s", num_eval_data,
                     lang_a, lang_b)
        output_file = os.path.join(
            eval_data_output_path,
            "{}-en-{}.{}.tfrecords".format(lang_a, lang_b, "dev"))
        classifier_data_lib.file_based_convert_examples_to_features(
            eval_input_data_examples, None, max_seq_length, tokenizer,
            output_file, None)
        meta_data["number_eval_data"][f"{lang_a}-en.{lang_b}"] = num_eval_data

      if test_data_output_path:
        test_input_data_examples = processor.get_test_examples(
            data_dir, os.path.join(path_pattern.format(lang_a, lang_b)))

        num_test_data = len(test_input_data_examples)
        logging.info("Processing %d test examples of %s-en.%s", num_test_data,
                     lang_a, lang_b)
        output_file = os.path.join(
            test_data_output_path,
            "{}-en-{}.{}.tfrecords".format(lang_a, lang_b, "test"))
        classifier_data_lib.file_based_convert_examples_to_features(
            test_input_data_examples, None, max_seq_length, tokenizer,
            output_file, None)
        meta_data["number_test_data"][f"{lang_a}-en.{lang_b}"] = num_test_data

  return meta_data
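A short usage sketch for the new retrieval library. The function signature is taken from the new file above; the data directory and output paths are placeholder assumptions:

# Hypothetical usage sketch: write BUCC dev/test retrieval TFRecords.
from official.nlp.bert import tokenization
from official.nlp.data import sentence_retrieval_lib

tokenizer = tokenization.FullTokenizer(
    vocab_file="/tmp/vocab.txt", do_lower_case=True)
processor = sentence_retrieval_lib.BuccProcessor()
meta_data = sentence_retrieval_lib.generate_sentence_retrevial_tf_record(
    processor,
    data_dir="/tmp/bucc",                       # assumed layout: {lang}-en.{split}.{lang} files
    tokenizer=tokenizer,
    eval_data_output_path="/tmp/bucc_eval",     # one tfrecords file per language pair
    test_data_output_path="/tmp/bucc_test",
    max_seq_length=64)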
official/nlp/data/tagging_data_lib.py  (new file, 0 → 100644)

# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
# (Apache License 2.0 header, identical to the other new files in this commit.)
# ==============================================================================
"""Library to process data for tagging task such as NER/POS."""
import collections
import os

from absl import logging
import tensorflow as tf

from official.nlp.data import classifier_data_lib

# A negative label id for the padding label, which will not contribute
# to loss/metrics in training.
_PADDING_LABEL_ID = -1

# The special unknown token, used to substitute a word which has too many
# subwords after tokenization.
_UNK_TOKEN = "[UNK]"


class InputExample(object):
  """A single training/test example for token classification."""

  def __init__(self, sentence_id, words=None, label_ids=None):
    """Constructs an InputExample."""
    self.sentence_id = sentence_id
    self.words = words if words else []
    self.label_ids = label_ids if label_ids else []

  def add_word_and_label_id(self, word, label_id):
    """Adds word and label_id pair in the example."""
    self.words.append(word)
    self.label_ids.append(label_id)


def _read_one_file(file_name, label_list):
  """Reads one file and returns a list of `InputExample` instances."""
  lines = tf.io.gfile.GFile(file_name, "r").readlines()
  examples = []
  label_id_map = {label: i for i, label in enumerate(label_list)}
  sentence_id = 0
  example = InputExample(sentence_id=0)
  for line in lines:
    line = line.strip("\n")
    if line:
      # The format is: <token>\t<label> for train/dev set and <token> for test.
      items = line.split("\t")
      assert len(items) == 2 or len(items) == 1
      token = items[0].strip()

      # Assign a dummy label_id for test set
      label_id = label_id_map[items[1].strip()] if len(items) == 2 else 0
      example.add_word_and_label_id(token, label_id)
    else:
      # Empty line indicates a new sentence.
      if example.words:
        examples.append(example)
        sentence_id += 1
        example = InputExample(sentence_id=sentence_id)

  if example.words:
    examples.append(example)
  return examples


class PanxProcessor(classifier_data_lib.DataProcessor):
  """Processor for the Panx data set."""
  supported_languages = [
      "ar", "he", "vi", "id", "jv", "ms", "tl", "eu", "ml", "ta", "te", "af",
      "nl", "en", "de", "el", "bn", "hi", "mr", "ur", "fa", "fr", "it", "pt",
      "es", "bg", "ru", "ja", "ka", "ko", "th", "sw", "yo", "my", "zh", "kk",
      "tr", "et", "fi", "hu"
  ]

  def get_train_examples(self, data_dir):
    return _read_one_file(
        os.path.join(data_dir, "train-en.tsv"), self.get_labels())

  def get_dev_examples(self, data_dir):
    return _read_one_file(
        os.path.join(data_dir, "dev-en.tsv"), self.get_labels())

  def get_test_examples(self, data_dir):
    examples_dict = {}
    for language in self.supported_languages:
      examples_dict[language] = _read_one_file(
          os.path.join(data_dir, "test-%s.tsv" % language), self.get_labels())
    return examples_dict

  def get_labels(self):
    return ["O", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-ORG", "I-ORG"]

  @staticmethod
  def get_processor_name():
    return "panx"


class UdposProcessor(classifier_data_lib.DataProcessor):
  """Processor for the Udpos data set."""
  supported_languages = [
      "af", "ar", "bg", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr",
      "he", "hi", "hu", "id", "it", "ja", "kk", "ko", "mr", "nl", "pt", "ru",
      "ta", "te", "th", "tl", "tr", "ur", "vi", "yo", "zh"
  ]

  def get_train_examples(self, data_dir):
    return _read_one_file(
        os.path.join(data_dir, "train-en.tsv"), self.get_labels())

  def get_dev_examples(self, data_dir):
    return _read_one_file(
        os.path.join(data_dir, "dev-en.tsv"), self.get_labels())

  def get_test_examples(self, data_dir):
    examples_dict = {}
    for language in self.supported_languages:
      examples_dict[language] = _read_one_file(
          os.path.join(data_dir, "test-%s.tsv" % language), self.get_labels())
    return examples_dict

  def get_labels(self):
    return [
        "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM",
        "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X"
    ]

  @staticmethod
  def get_processor_name():
    return "udpos"


def _tokenize_example(example, max_length, tokenizer, text_preprocessing=None):
  """Tokenizes words and breaks long example into short ones."""
  # Needs additional [CLS] and [SEP] tokens.
  max_length = max_length - 2
  new_examples = []
  new_example = InputExample(sentence_id=example.sentence_id)
  for i, word in enumerate(example.words):
    if any([x < 0 for x in example.label_ids]):
      raise ValueError("Unexpected negative label_id: %s" % example.label_ids)

    if text_preprocessing:
      word = text_preprocessing(word)
    subwords = tokenizer.tokenize(word)
    if (not subwords or len(subwords) > max_length) and word:
      subwords = [_UNK_TOKEN]

    if len(subwords) + len(new_example.words) > max_length:
      # Start a new example.
      new_examples.append(new_example)
      new_example = InputExample(sentence_id=example.sentence_id)

    for j, subword in enumerate(subwords):
      # Use the real label for the first subword, and pad label for
      # the remainings.
      subword_label = example.label_ids[i] if j == 0 else _PADDING_LABEL_ID
      new_example.add_word_and_label_id(subword, subword_label)

  if new_example.words:
    new_examples.append(new_example)
  return new_examples


def _convert_single_example(example, max_seq_length, tokenizer):
  """Converts an `InputExample` instance to a `tf.train.Example` instance."""
  tokens = ["[CLS]"]
  tokens.extend(example.words)
  tokens.append("[SEP]")
  input_ids = tokenizer.convert_tokens_to_ids(tokens)
  label_ids = [_PADDING_LABEL_ID]
  label_ids.extend(example.label_ids)
  label_ids.append(_PADDING_LABEL_ID)

  segment_ids = [0] * len(input_ids)
  input_mask = [1] * len(input_ids)

  # Pad up to the sequence length.
  while len(input_ids) < max_seq_length:
    input_ids.append(0)
    input_mask.append(0)
    segment_ids.append(0)
    label_ids.append(_PADDING_LABEL_ID)

  def create_int_feature(values):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

  features = collections.OrderedDict()
  features["input_ids"] = create_int_feature(input_ids)
  features["input_mask"] = create_int_feature(input_mask)
  features["segment_ids"] = create_int_feature(segment_ids)
  features["label_ids"] = create_int_feature(label_ids)
  features["sentence_id"] = create_int_feature([example.sentence_id])

  tf_example = tf.train.Example(features=tf.train.Features(feature=features))
  return tf_example


def write_example_to_file(examples,
                          tokenizer,
                          max_seq_length,
                          output_file,
                          text_preprocessing=None):
  """Writes `InputExample`s into a tfrecord file with `tf.train.Example` protos.

  Note that the words inside each example will be tokenized and be applied by
  `text_preprocessing` if available. Also, if the length of sentence (plus
  special [CLS] and [SEP] tokens) exceeds `max_seq_length`, the long sentence
  will be broken into multiple short examples. For example:

  Example (text_preprocessing=lowercase, max_seq_length=5)
    words:        ["What", "a", "great", "weekend"]
    labels:       [     7,   5,       9,        10]
    sentence_id:  0
    preprocessed: ["what", "a", "great", "weekend"]
    tokenized:    ["what", "a", "great", "week", "##end"]

  will result in two tf.example protos:

    tokens:      ["[CLS]", "what", "a", "great", "[SEP]"]
    label_ids:   [-1,          7,   5,       9,      -1]
    input_mask:  [ 1,          1,   1,       1,       1]
    segment_ids: [ 0,          0,   0,       0,       0]
    input_ids:   [ tokenizer.convert_tokens_to_ids(tokens) ]
    sentence_id: 0

    tokens:      ["[CLS]", "week", "##end", "[SEP]", "[PAD]"]
    label_ids:   [-1,          10,      -1,      -1,      -1]
    input_mask:  [ 1,           1,       1,       0,       0]
    segment_ids: [ 0,           0,       0,       0,       0]
    input_ids:   [ tokenizer.convert_tokens_to_ids(tokens) ]
    sentence_id: 0

  Note the use of -1 in `label_ids` to indicate that a token should not be
  considered for classification (e.g., trailing ## wordpieces or special
  token). Token classification models should accordingly ignore these when
  calculating loss, metrics, etc...

  Args:
    examples: A list of `InputExample` instances.
    tokenizer: The tokenizer to be applied on the data.
    max_seq_length: Maximum length of generated sequences.
    output_file: The name of the output tfrecord file.
    text_preprocessing: optional preprocessing run on each word prior to
      tokenization.

  Returns:
    The total number of tf.train.Example proto written to file.
  """
  tf.io.gfile.makedirs(os.path.dirname(output_file))
  writer = tf.io.TFRecordWriter(output_file)
  num_tokenized_examples = 0
  for (ex_index, example) in enumerate(examples):
    if ex_index % 10000 == 0:
      logging.info("Writing example %d of %d to %s", ex_index, len(examples),
                   output_file)

    tokenized_examples = _tokenize_example(example, max_seq_length, tokenizer,
                                           text_preprocessing)
    num_tokenized_examples += len(tokenized_examples)
    for per_tokenized_example in tokenized_examples:
      tf_example = _convert_single_example(per_tokenized_example,
                                           max_seq_length, tokenizer)
      writer.write(tf_example.SerializeToString())

  writer.close()
  return num_tokenized_examples


def token_classification_meta_data(train_data_size,
                                   max_seq_length,
                                   num_labels,
                                   eval_data_size=None,
                                   test_data_size=None,
                                   label_list=None,
                                   processor_type=None):
  """Creates metadata for tagging (token classification) datasets."""
  meta_data = {
      "train_data_size": train_data_size,
      "max_seq_length": max_seq_length,
      "num_labels": num_labels,
      "task_type": "tagging",
      "label_type": "int",
      "label_shape": [max_seq_length],
  }
  if eval_data_size:
    meta_data["eval_data_size"] = eval_data_size
  if test_data_size:
    meta_data["test_data_size"] = test_data_size
  if label_list:
    meta_data["label_list"] = label_list
  if processor_type:
    meta_data["processor_type"] = processor_type
  return meta_data


def generate_tf_record_from_data_file(processor, data_dir, tokenizer,
                                      max_seq_length, train_data_output_path,
                                      eval_data_output_path,
                                      test_data_output_path,
                                      text_preprocessing):
  """Generates tfrecord files from the raw data."""
  common_kwargs = dict(
      tokenizer=tokenizer,
      max_seq_length=max_seq_length,
      text_preprocessing=text_preprocessing)

  train_examples = processor.get_train_examples(data_dir)
  train_data_size = write_example_to_file(
      train_examples, output_file=train_data_output_path, **common_kwargs)

  eval_examples = processor.get_dev_examples(data_dir)
  eval_data_size = write_example_to_file(
      eval_examples, output_file=eval_data_output_path, **common_kwargs)

  test_input_data_examples = processor.get_test_examples(data_dir)
  test_data_size = {}
  for language, examples in test_input_data_examples.items():
    test_data_size[language] = write_example_to_file(
        examples,
        output_file=test_data_output_path.format(language),
        **common_kwargs)

  labels = processor.get_labels()
  meta_data = token_classification_meta_data(
      train_data_size,
      max_seq_length,
      len(labels),
      eval_data_size,
      test_data_size,
      label_list=labels,
      processor_type=processor.get_processor_name())
  return meta_data
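A short usage sketch for the tagging library. The call follows the signature of generate_tf_record_from_data_file in the new file above; paths and the expected directory layout (train-en.tsv, dev-en.tsv, test-<lang>.tsv) are placeholder assumptions:

# Hypothetical usage sketch: write PAN-X (NER) tagging TFRecords.
from official.nlp.bert import tokenization
from official.nlp.data import tagging_data_lib

tokenizer = tokenization.FullTokenizer(
    vocab_file="/tmp/vocab.txt", do_lower_case=True)
processor = tagging_data_lib.PanxProcessor()
meta_data = tagging_data_lib.generate_tf_record_from_data_file(
    processor,
    data_dir="/tmp/panx",
    tokenizer=tokenizer,
    max_seq_length=128,
    train_data_output_path="/tmp/panx_train.tf_record",
    eval_data_output_path="/tmp/panx_eval.tf_record",
    test_data_output_path="/tmp/panx_test_{}.tf_record",  # filled per language
    text_preprocessing=tokenization.convert_to_unicode)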
official/nlp/data/tagging_data_loader.py  (new file, 0 → 100644)

# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
# (Apache License 2.0 header, identical to the other new files in this commit.)
# ==============================================================================
"""Loads dataset for the tagging (e.g., NER/POS) task."""
from typing import Mapping, Optional

import tensorflow as tf

from official.core import input_reader


class TaggingDataLoader:
  """A class to load dataset for tagging (e.g., NER and POS) task."""

  def __init__(self, params):
    self._params = params
    self._seq_length = params.seq_length

  def _decode(self, record: tf.Tensor):
    """Decodes a serialized tf.Example."""
    name_to_features = {
        'input_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
        'input_mask': tf.io.FixedLenFeature([self._seq_length], tf.int64),
        'segment_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
        'label_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
    }
    example = tf.io.parse_single_example(record, name_to_features)

    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
    # So cast all int64 to int32.
    for name in example:
      t = example[name]
      if t.dtype == tf.int64:
        t = tf.cast(t, tf.int32)
      example[name] = t

    return example

  def _parse(self, record: Mapping[str, tf.Tensor]):
    """Parses raw tensors into a dict of tensors to be consumed by the model."""
    x = {
        'input_word_ids': record['input_ids'],
        'input_mask': record['input_mask'],
        'input_type_ids': record['segment_ids']
    }
    y = record['label_ids']
    return (x, y)

  def load(self, input_context: Optional[tf.distribute.InputContext] = None):
    """Returns a tf.dataset.Dataset."""
    reader = input_reader.InputReader(
        params=self._params, decoder_fn=self._decode, parser_fn=self._parse)
    return reader.read(input_context)
official/nlp/modeling/layers/README.md

@@ -9,13 +9,17 @@ assemble new layers, networks, or models.
     initialization parameters.

 *   [MultiHeadAttention](attention.py) implements an optionally masked attention
-    between two tensors, from_tensor and to_tensor, as described in
+    between query, key, value tensors as described in
     ["Attention Is All You Need"](https://arxiv.org/abs/1706.03762). If
     `from_tensor` and `to_tensor` are the same, then this is self-attention.

 *   [CachedAttention](attention.py) implements an attention layer with cache
     used for auto-agressive decoding.

+*   [MultiChannelAttention](multi_channel_attention.py) implements an variant of
+    multi-head attention which can be used to merge multiple streams for
+    cross-attentions.
+
 *   [TalkingHeadsAttention](talking_heads_attention.py) implements the talking
     heads attention, as decribed in
     ["Talking-Heads Attention"](https://arxiv.org/abs/2003.02436).
@@ -24,6 +28,10 @@ assemble new layers, networks, or models.
     described in ["Attention Is All You Need"](https://arxiv.org/abs/1706.03762).

+*   [TransformerDecoderLayer](transformer.py) TransformerDecoderLayer is made up
+    of self multi-head attention, cross multi-head attention and
+    feedforward network.
+
 *   [ReZeroTransformer](rezero_transformer.py) implements Transformer with
     ReZero described in
     ["ReZero is All You Need: Fast Convergence at Large Depth"](https://arxiv.org/abs/2003.04887).
@@ -45,6 +53,9 @@ assemble new layers, networks, or models.
     should be masked), the output will have masked positions set to
     approximately zero.

+*   [`MaskedLM`](masked_lm.py) implements a masked language model. It assumes
+    the embedding table variable is passed to it.
+
 *   [ClassificationHead](cls_head.py) A pooling head over a sequence of
     embeddings, commonly used by classification tasks.
official/nlp/modeling/layers/__init__.py

@@ -18,11 +18,13 @@ from official.nlp.modeling.layers.attention import *
 from official.nlp.modeling.layers.cls_head import *
 from official.nlp.modeling.layers.dense_einsum import DenseEinsum
 from official.nlp.modeling.layers.gated_feedforward import GatedFeedforward
+from official.nlp.modeling.layers.masked_lm import MaskedLM
 from official.nlp.modeling.layers.masked_softmax import MaskedSoftmax
+from official.nlp.modeling.layers.multi_channel_attention import *
 from official.nlp.modeling.layers.on_device_embedding import OnDeviceEmbedding
 from official.nlp.modeling.layers.position_embedding import PositionEmbedding
 from official.nlp.modeling.layers.rezero_transformer import ReZeroTransformer
 from official.nlp.modeling.layers.self_attention_mask import SelfAttentionMask
 from official.nlp.modeling.layers.talking_heads_attention import TalkingHeadsAttention
-from official.nlp.modeling.layers.transformer import Transformer
+from official.nlp.modeling.layers.transformer import *
 from official.nlp.modeling.layers.transformer_scaffold import TransformerScaffold
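For illustration, a small sketch of the reworked MaskedLM layer exported above. It is an assumption based on the class shown in the next file (a Layer that takes an embedding table directly), not an official example; shapes and values are placeholders:

# Hypothetical usage sketch of layers.MaskedLM after this commit.
import tensorflow as tf
from official.nlp.modeling import layers

# [vocab_size, hidden_size] embedding table; normally shared with the encoder.
embedding_table = tf.Variable(tf.random.normal([100, 64]))
masked_lm = layers.MaskedLM(embedding_table=embedding_table)

sequence_output = tf.random.normal([2, 16, 64])              # [batch, seq_len, hidden]
masked_positions = tf.constant([[1, 3], [0, 2]], tf.int32)   # positions to predict
logits = masked_lm(sequence_output, masked_positions)        # [batch, num_predictions, vocab_size]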
official/nlp/modeling/
network
s/masked_lm.py
→
official/nlp/modeling/
layer
s/masked_lm.py
View file @
47bc1813
...
...
@@ -25,91 +25,74 @@ from official.modeling import tf_utils
@
tf
.
keras
.
utils
.
register_keras_serializable
(
package
=
'Text'
)
class
MaskedLM
(
tf
.
keras
.
Model
):
class
MaskedLM
(
tf
.
keras
.
layers
.
Layer
):
"""Masked language model network head for BERT modeling.
This network implements a masked language model based on the provided network.
It assumes that the network being passed has a "get_embedding_table()" method.
Arguments:
input_width: The innermost dimension of the input tensor to this network.
num_predictions: The number of predictions to make per sequence.
source_network: The network with the embedding layer to use for the
embedding layer.
embedding_table: The embedding table of a source network, If None, the
`source_network.get_embedding_table()` method is used.
activation: The activation, if any, for the dense layer in this network.
initializer: The intializer for the dense layer in this network. Defaults to
a Glorot uniform initializer.
embedding_table: The embedding table of the targets.
activation: The activation, if any, for the dense layer.
initializer: The intializer for the dense layer. Defaults to a Glorot
uniform initializer.
output: The output style for this network. Can be either 'logits' or
'predictions'.
"""
def
__init__
(
self
,
input_width
,
num_predictions
,
source_network
,
embedding_table
=
None
,
embedding_table
,
activation
=
None
,
initializer
=
'glorot_uniform'
,
output
=
'logits'
,
name
=
'cls/predictions'
,
**
kwargs
):
super
(
MaskedLM
,
self
).
__init__
(
name
=
name
,
**
kwargs
)
self
.
embedding_table
=
embedding_table
self
.
activation
=
activation
self
.
initializer
=
tf
.
keras
.
initializers
.
get
(
initializer
)
    if embedding_table is None:
      embedding_table = source_network.get_embedding_table()
    vocab_size, hidden_size = embedding_table.shape
    sequence_data = tf.keras.layers.Input(
        shape=(None, input_width), name='sequence_data', dtype=tf.float32)
    masked_lm_positions = tf.keras.layers.Input(
        shape=(num_predictions,), name='masked_lm_positions', dtype=tf.int32)
    masked_lm_input = tf.keras.layers.Lambda(
        lambda x: self._gather_indexes(x[0], x[1]))(
            [sequence_data, masked_lm_positions])
    lm_data = (
        tf.keras.layers.Dense(
            hidden_size,
            activation=activation,
            kernel_initializer=initializer,
            name='cls/predictions/transform/dense')(masked_lm_input))
    lm_data = tf.keras.layers.LayerNormalization(
        axis=-1, epsilon=1e-12,
        name='cls/predictions/transform/LayerNorm')(lm_data)
    lm_data = tf.keras.layers.Lambda(
        lambda x: tf.matmul(x, embedding_table, transpose_b=True))(lm_data)
    logits = Bias(
        initializer=tf.keras.initializers.Zeros(),
        name='cls/predictions/output_bias')(lm_data)
    # We can't use the standard Keras reshape layer here, since it expects
    # the input and output batch size to be the same.
    reshape_layer = tf.keras.layers.Lambda(
        lambda x: tf.reshape(x, [-1, num_predictions, vocab_size]))
    self.logits = reshape_layer(logits)
    predictions = tf.keras.layers.Activation(tf.nn.log_softmax)(self.logits)
    if output == 'logits':
      output_tensors = self.logits
    elif output == 'predictions':
      output_tensors = predictions
    else:
    if output not in ('predictions', 'logits'):
      raise ValueError(
          ('Unknown `output` value "%s". `output` can be either "logits" or '
           '"predictions"') % output)
    self._output_type = output
    super(MaskedLM, self).__init__(
        inputs=[sequence_data, masked_lm_positions],
        outputs=output_tensors,
        **kwargs)
  def build(self, input_shape):
    self._vocab_size, hidden_size = self.embedding_table.shape
    self.dense = tf.keras.layers.Dense(
        hidden_size,
        activation=self.activation,
        kernel_initializer=self.initializer,
        name='transform/dense')
    self.layer_norm = tf.keras.layers.LayerNormalization(
        axis=-1, epsilon=1e-12, name='transform/LayerNorm')
    self.bias = self.add_weight(
        'output_bias/bias',
        shape=(self._vocab_size,),
        initializer='zeros',
        trainable=True)

    super(MaskedLM, self).build(input_shape)

  def call(self, sequence_data, masked_positions):
    masked_lm_input = self._gather_indexes(sequence_data, masked_positions)
    lm_data = self.dense(masked_lm_input)
    lm_data = self.layer_norm(lm_data)
    lm_data = tf.matmul(lm_data, self.embedding_table, transpose_b=True)
    logits = tf.nn.bias_add(lm_data, self.bias)
    masked_positions_shape = tf_utils.get_shape_list(
        masked_positions, name='masked_positions_tensor')
    logits = tf.reshape(logits,
                        [-1, masked_positions_shape[1], self._vocab_size])
    if self._output_type == 'logits':
      return logits
    return tf.nn.log_softmax(logits)

  def get_config(self):
    raise NotImplementedError('MaskedLM cannot be directly serialized at this '
                              'time. Please use it only in Layers or '
                              'functionally subclassed Models/Networks.')
    raise NotImplementedError('MaskedLM cannot be directly serialized because '
                              'it has variable sharing logic.')
  def _gather_indexes(self, sequence_tensor, positions):
    """Gathers the vectors at the specific positions.
...
...
@@ -139,51 +122,3 @@ class MaskedLM(tf.keras.Model):
    output_tensor = tf.gather(flat_sequence_tensor, flat_positions)
    return output_tensor
@tf.keras.utils.register_keras_serializable(package='Text')
# Temporary until we can create a Dense layer that ties the embedding.
class Bias(tf.keras.layers.Layer):
  """Adds a bias term to an input."""

  def __init__(self,
               initializer='zeros',
               regularizer=None,
               constraint=None,
               activation=None,
               **kwargs):
    super(Bias, self).__init__(**kwargs)
    self._initializer = tf.keras.initializers.get(initializer)
    self._regularizer = tf.keras.regularizers.get(regularizer)
    self._constraint = tf.keras.constraints.get(constraint)
    self._activation = tf.keras.activations.get(activation)

  def build(self, input_shape):
    input_shape = tf.TensorShape(input_shape)
    self._bias = self.add_weight(
        'bias',
        shape=input_shape[1:],
        initializer=self._initializer,
        regularizer=self._regularizer,
        constraint=self._constraint,
        dtype=self._dtype,
        trainable=True)
    super(Bias, self).build(input_shape)

  def get_config(self):
    config = {
        'activation': tf.keras.activations.serialize(self._activation),
        'initializer': tf.keras.initializers.serialize(self._initializer),
        'regularizer': tf.keras.regularizers.serialize(self._regularizer),
        'constraint': tf.keras.constraints.serialize(self._constraint)
    }
    base_config = super(Bias, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, inputs):
    outputs = tf.nn.bias_add(inputs, self._bias)
    if self._activation is not None:
      return self._activation(outputs)  # pylint: disable=not-callable
    else:
      return outputs
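Since the head is now an ordinary layer, callers pass an embedding table instead of a source network and invoke it on a sequence tensor plus masked positions. A minimal sketch of the new wiring; the stand-in embedding table and all sizes are illustrative only — in practice the table comes from encoder.get_embedding_table():

import tensorflow as tf
from official.nlp.modeling.layers import masked_lm

# Stand-in embedding table; normally obtained from a TransformerEncoder via
# get_embedding_table().
vocab_size, hidden_size = 100, 64
embedding_table = tf.Variable(tf.random.normal([vocab_size, hidden_size]))

mlm_head = masked_lm.MaskedLM(embedding_table=embedding_table,
                              output='predictions')
sequence_data = tf.random.normal([2, 32, hidden_size])   # [batch, seq, hidden]
masked_positions = tf.constant([[1, 5, 7], [0, 2, 9]])   # [batch, num_predictions]
log_probs = mlm_head(sequence_data, masked_positions=masked_positions)
print(log_probs.shape)  # (2, 3, 100): one distribution per masked position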
official/nlp/modeling/networks/masked_lm_test.py → official/nlp/modeling/layers/masked_lm_test.py
View file @ 47bc1813
...
...
@@ -23,7 +23,7 @@ import tensorflow as tf
from tensorflow.python.keras import keras_parameterized  # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.networks import masked_lm
from official.nlp.modeling.layers import masked_lm
from official.nlp.modeling.networks import transformer_encoder
...
...
@@ -32,13 +32,12 @@ from official.nlp.modeling.networks import transformer_encoder
@keras_parameterized.run_all_keras_modes
class MaskedLMTest(keras_parameterized.TestCase):

  def create_network(self,
                     vocab_size,
                     sequence_length,
                     hidden_size,
                     num_predictions,
                     output='predictions',
                     xformer_stack=None):
  def create_layer(self,
                   vocab_size,
                   sequence_length,
                   hidden_size,
                   output='predictions',
                   xformer_stack=None):
    # First, create a transformer stack that we can use to get the LM's
    # vocabulary weight.
    if xformer_stack is None:
...
...
@@ -49,82 +48,32 @@ class MaskedLMTest(keras_parameterized.TestCase):
          hidden_size=hidden_size,
          num_attention_heads=4,
      )

    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    lm_outputs, _ = xformer_stack([word_ids, mask, type_ids])

    # Create a maskedLM from the transformer stack.
    test_network = masked_lm.MaskedLM(
        num_predictions=num_predictions,
        input_width=lm_outputs.shape[-1],
        source_network=xformer_stack,
    test_layer = masked_lm.MaskedLM(
        embedding_table=xformer_stack.get_embedding_table(),
        output=output)
    return test_network
    return test_layer

  def test_network_creation(self):
  def test_layer_creation(self):
    vocab_size = 100
    sequence_length = 32
    hidden_size = 64
    num_predictions = 21
    test_network = self.create_network(
    test_layer = self.create_layer(
        vocab_size=vocab_size,
        sequence_length=sequence_length,
        hidden_size=hidden_size,
        num_predictions=num_predictions)
        hidden_size=hidden_size)

    # Make sure that the output tensor of the masked LM is the right shape.
    lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
    masked_lm_positions = tf.keras.Input(
        shape=(num_predictions,), dtype=tf.int32)
    output = test_network([lm_input_tensor, masked_lm_positions])
    masked_positions = tf.keras.Input(shape=(num_predictions,), dtype=tf.int32)
    output = test_layer(lm_input_tensor, masked_positions=masked_positions)
    expected_output_shape = [None, num_predictions, vocab_size]
    self.assertEqual(expected_output_shape, output.shape.as_list())
  def test_network_invocation_with_internal_logits(self):
    vocab_size = 100
    sequence_length = 32
    hidden_size = 64
    num_predictions = 21
    test_network = self.create_network(
        vocab_size=vocab_size,
        sequence_length=sequence_length,
        hidden_size=hidden_size,
        num_predictions=num_predictions)

    # Create a model from the masked LM layer.
    lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
    masked_lm_positions = tf.keras.Input(
        shape=(num_predictions,), dtype=tf.int32)
    output = test_network([lm_input_tensor, masked_lm_positions])
    model = tf.keras.Model([lm_input_tensor, masked_lm_positions], output)
    logits_model = tf.keras.Model(test_network.inputs, test_network.logits)

    # Invoke the masked LM on some fake data to make sure there are no runtime
    # errors in the code.
    batch_size = 3
    lm_input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, hidden_size))
    masked_position_data = np.random.randint(
        2, size=(batch_size, num_predictions))
    outputs = model.predict([lm_input_data, masked_position_data])
    logits = logits_model.predict([lm_input_data, masked_position_data])

    # Ensure that the tensor shapes are correct.
    expected_output_shape = (batch_size, num_predictions, vocab_size)
    self.assertEqual(expected_output_shape, outputs.shape)
    self.assertEqual(expected_output_shape, logits.shape)

    # Ensure that the logits, when softmaxed, create the outputs.
    input_tensor = tf.keras.Input(expected_output_shape[1:])
    output_tensor = tf.keras.layers.Activation(tf.nn.log_softmax)(input_tensor)
    softmax_model = tf.keras.Model(input_tensor, output_tensor)
    calculated_softmax = softmax_model.predict(logits)
    self.assertAllClose(outputs, calculated_softmax)
  def test_network_invocation_with_external_logits(self):
  def test_layer_invocation_with_external_logits(self):
    vocab_size = 100
    sequence_length = 32
    hidden_size = 64
...
...
@@ -136,31 +85,28 @@ class MaskedLMTest(keras_parameterized.TestCase):
        hidden_size=hidden_size,
        num_attention_heads=4,
    )
    test_network = self.create_network(
    test_layer = self.create_layer(
        vocab_size=vocab_size,
        sequence_length=sequence_length,
        hidden_size=hidden_size,
        num_predictions=num_predictions,
        xformer_stack=xformer_stack,
        output='predictions')
    logit_network = self.create_network(
    logit_layer = self.create_layer(
        vocab_size=vocab_size,
        sequence_length=sequence_length,
        hidden_size=hidden_size,
        num_predictions=num_predictions,
        xformer_stack=xformer_stack,
        output='logits')
    logit_network.set_weights(test_network.get_weights())

    # Create a model from the masked LM layer.
    lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
    masked_lm_positions = tf.keras.Input(
        shape=(num_predictions,), dtype=tf.int32)
    output = test_network([lm_input_tensor, masked_lm_positions])
    logit_output = logit_network([lm_input_tensor, masked_lm_positions])
    model = tf.keras.Model([lm_input_tensor, masked_lm_positions], output)
    logits_model = tf.keras.Model(([lm_input_tensor, masked_lm_positions]),
    masked_positions = tf.keras.Input(shape=(num_predictions,), dtype=tf.int32)
    output = test_layer(lm_input_tensor, masked_positions)
    logit_output = logit_layer(lm_input_tensor, masked_positions)
    logit_output = tf.keras.layers.Activation(tf.nn.log_softmax)(logit_output)
    logit_layer.set_weights(test_layer.get_weights())
    model = tf.keras.Model([lm_input_tensor, masked_positions], output)
    logits_model = tf.keras.Model(([lm_input_tensor, masked_positions]),
                                  logit_output)
# Invoke the masked LM on some fake data to make sure there are no runtime
...
...
@@ -169,40 +115,33 @@ class MaskedLMTest(keras_parameterized.TestCase):
    lm_input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, hidden_size))
    masked_position_data = np.random.randint(
        2, size=(batch_size, num_predictions))
    outputs = model.predict([lm_input_data, masked_position_data])
    logits = logits_model.predict([lm_input_data, masked_position_data])
        sequence_length, size=(batch_size, num_predictions))
    # ref_outputs = model.predict([lm_input_data, masked_position_data])
    # outputs = logits_model.predict([lm_input_data, masked_position_data])
    ref_outputs = model([lm_input_data, masked_position_data])
    outputs = logits_model([lm_input_data, masked_position_data])

    # Ensure that the tensor shapes are correct.
    expected_output_shape = (batch_size, num_predictions, vocab_size)
    self.assertEqual(expected_output_shape, ref_outputs.shape)
    self.assertEqual(expected_output_shape, outputs.shape)
    self.assertEqual(expected_output_shape, logits.shape)
    self.assertAllClose(ref_outputs, outputs)

    # Ensure that the logits, when softmaxed, create the outputs.
    input_tensor = tf.keras.Input(expected_output_shape[1:])
    output_tensor = tf.keras.layers.Activation(tf.nn.log_softmax)(input_tensor)
    softmax_model = tf.keras.Model(input_tensor, output_tensor)
    calculated_softmax = softmax_model.predict(logits)
    self.assertAllClose(outputs, calculated_softmax)
  def test_network_invocation(self):
  def test_layer_invocation(self):
    vocab_size = 100
    sequence_length = 32
    hidden_size = 64
    num_predictions = 21
    test_network = self.create_network(
    test_layer = self.create_layer(
        vocab_size=vocab_size,
        sequence_length=sequence_length,
        hidden_size=hidden_size,
        num_predictions=num_predictions)
        hidden_size=hidden_size)

    # Create a model from the masked LM layer.
    lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
    masked_lm_positions = tf.keras.Input(
        shape=(num_predictions,), dtype=tf.int32)
    output = test_network([lm_input_tensor, masked_lm_positions])
    model = tf.keras.Model([lm_input_tensor, masked_lm_positions], output)
    masked_positions = tf.keras.Input(shape=(num_predictions,), dtype=tf.int32)
    output = test_layer(lm_input_tensor, masked_positions)
    model = tf.keras.Model([lm_input_tensor, masked_positions], output)
# Invoke the masked LM on some fake data to make sure there are no runtime
# errors in the code.
...
...
@@ -215,12 +154,8 @@ class MaskedLMTest(keras_parameterized.TestCase):
  def test_unknown_output_type_fails(self):
    with self.assertRaisesRegex(ValueError, 'Unknown `output` value "bad".*'):
      _ = self.create_network(
          vocab_size=8,
          sequence_length=8,
          hidden_size=8,
          num_predictions=8,
          output='bad')
      _ = self.create_layer(
          vocab_size=8, sequence_length=8, hidden_size=8, output='bad')


if __name__ == '__main__':
...
...
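The rewritten tests above rely on one invariant: a head built with output='logits' and one built with output='predictions' share the same weights and differ only by a final log-softmax. A small sketch of that check outside the test harness (sizes illustrative, eager execution assumed):

import numpy as np
import tensorflow as tf
from official.nlp.modeling.layers import masked_lm

table = tf.Variable(tf.random.normal([50, 16]))  # stand-in embedding table
logits_head = masked_lm.MaskedLM(embedding_table=table, output='logits')
preds_head = masked_lm.MaskedLM(embedding_table=table, output='predictions')

seq = tf.random.normal([2, 10, 16])
positions = tf.constant([[0, 3], [1, 4]])
logits = logits_head(seq, masked_positions=positions)
preds_head(seq, masked_positions=positions)        # build the second head
preds_head.set_weights(logits_head.get_weights())  # tie the two heads together
log_probs = preds_head(seq, masked_positions=positions)
np.testing.assert_allclose(log_probs.numpy(),
                           tf.nn.log_softmax(logits).numpy(), rtol=1e-5)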
official/nlp/nhnet/multi_channel_attention.py → official/nlp/modeling/layers/multi_channel_attention.py
View file @ 47bc1813
...
...
@@ -13,7 +13,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Multi-channel decoder."""
"""Multi-channel Attention."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
...
...
@@ -24,11 +25,25 @@ import math
import tensorflow as tf

from official.modeling import tf_utils
from official.nlp.modeling import layers
class DocAttention(tf.keras.layers.Layer):
  """Documents Attention layer."""
from official.nlp.modeling.layers import attention
from official.nlp.modeling.layers import dense_einsum
from official.nlp.modeling.layers import masked_softmax


class VotingAttention(tf.keras.layers.Layer):
  """Voting Attention layer.

  Arguments:
    num_heads: the number of attention heads.
    head_size: per-head hidden size.
    kernel_initializer: Initializer for dense layer kernels.
    bias_initializer: Initializer for dense layer biases.
    kernel_regularizer: Regularizer for dense layer kernels.
    bias_regularizer: Regularizer for dense layer biases.
    activity_regularizer: Regularizer for dense layer activity.
    kernel_constraint: Constraint for dense layer kernels.
    bias_constraint: Constraint for dense layer biases.
  """
  def __init__(self,
               num_heads,
...
...
@@ -41,7 +56,7 @@ class DocAttention(tf.keras.layers.Layer):
               kernel_constraint=None,
               bias_constraint=None,
               **kwargs):
    super(DocAttention, self).__init__(**kwargs)
    super(VotingAttention, self).__init__(**kwargs)
    self._num_heads = num_heads
    self._head_size = head_size
    self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
...
...
@@ -52,7 +67,7 @@ class DocAttention(tf.keras.layers.Layer):
    self._bias_constraint = tf.keras.constraints.get(bias_constraint)

  def build(self, unused_input_shapes):
    self._query_dense = layers.DenseEinsum(
    self._query_dense = dense_einsum.DenseEinsum(
        output_shape=(self._num_heads, self._head_size),
        kernel_initializer=self._kernel_initializer,
        bias_initializer=self._bias_initializer,
...
...
@@ -63,7 +78,7 @@ class DocAttention(tf.keras.layers.Layer):
        bias_constraint=self._bias_constraint,
        dtype=self.dtype,
        name="encdocatt_query")
    self._key_dense = layers.DenseEinsum(
    self._key_dense = dense_einsum.DenseEinsum(
        output_shape=(self._num_heads, self._head_size),
        kernel_initializer=self._kernel_initializer,
        bias_initializer=self._bias_initializer,
...
...
@@ -74,7 +89,7 @@ class DocAttention(tf.keras.layers.Layer):
        bias_constraint=self._bias_constraint,
        dtype=self.dtype,
        name="encdocatt_key")
    super(DocAttention, self).build(unused_input_shapes)
    super(VotingAttention, self).build(unused_input_shapes)

  def call(self, encoder_outputs, doc_attention_mask):
    num_docs = tf_utils.get_shape_list(encoder_outputs, expected_rank=[4])[1]
...
...
@@ -95,12 +110,16 @@ class DocAttention(tf.keras.layers.Layer):
    return tf.nn.softmax(doc_attention_probs + infadder)


class MultiChannelAttention(layers.MultiHeadAttention):
  """Multi-channel Attention layer."""
class MultiChannelAttention(attention.MultiHeadAttention):
  """Multi-channel Attention layer.

  Introduced in: https://arxiv.org/abs/2001.09386. Expects multiple
  cross-attention target sequences.
  """

  def build(self, input_shape):
    super(MultiChannelAttention, self).build(input_shape)
    self._masked_softmax = layers.MaskedSoftmax(mask_expansion_axes=[2])
  def _build_attention(self, qkv_rank):
    super(MultiChannelAttention, self)._build_attention(qkv_rank)
    self._masked_softmax = masked_softmax.MaskedSoftmax(mask_expansion_axes=[2])

  def call(self, inputs, attention_mask=None):
    from_tensor = inputs[0]
...
...
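For orientation, a sketch of the shapes VotingAttention consumes in the multi-document (NHNet) setting — a [batch, num_docs, doc_length, hidden] stack of encoder outputs plus a [batch, num_docs] document mask, producing one attention weight per document. All sizes below are illustrative:

import numpy as np
from official.nlp.modeling.layers import multi_channel_attention

batch, num_docs, doc_len, hidden = 2, 3, 10, 16
voting = multi_channel_attention.VotingAttention(num_heads=2, head_size=8)

encoder_outputs = np.zeros((batch, num_docs, doc_len, hidden), dtype=np.float32)
doc_mask = np.ones((batch, num_docs), dtype=np.float32)
doc_probs = voting(encoder_outputs, doc_mask)  # expected shape: (2, 3)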
official/nlp/nhnet/multi_channel_attention_test.py → official/nlp/modeling/layers/multi_channel_attention_test.py
View file @ 47bc1813
...
...
@@ -22,14 +22,15 @@ from __future__ import print_function
import numpy as np
import tensorflow as tf

from official.nlp.nhnet import multi_channel_attention
from official.nlp.modeling.layers import multi_channel_attention


class MultiChannelAttentionTest(tf.test.TestCase):

  def test_doc_attention(self):
    num_heads = 2
    doc_attention = multi_channel_attention.DocAttention(num_heads, head_size=8)
    doc_attention = multi_channel_attention.VotingAttention(
        num_heads, head_size=8)
    num_docs = 3
    inputs = np.zeros((2, num_docs, 10, 16), dtype=np.float32)
    doc_mask = np.zeros((2, num_docs), dtype=np.float32)
...
...
official/nlp/modeling/layers/transformer.py
View file @ 47bc1813
...
...
@@ -24,6 +24,7 @@ import tensorflow as tf
from official.nlp.modeling.layers import attention
from official.nlp.modeling.layers import dense_einsum
from official.nlp.modeling.layers import multi_channel_attention
from official.nlp.modeling.layers.util import tf_function_if_eager
...
...
@@ -78,6 +79,7 @@ class Transformer(tf.keras.layers.Layer):
    self._bias_initializer = tf.keras.initializers.get(bias_initializer)
    self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
    self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
    self._activity_regularizer = tf.keras.regularizers.get(activity_regularizer)
    self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
    self._bias_constraint = tf.keras.constraints.get(bias_constraint)
...
...
@@ -236,3 +238,200 @@ class CompiledTransformer(Transformer):
  @tf_function_if_eager(experimental_compile=True)
  def call(self, inputs):
    return super(CompiledTransformer, self).call(inputs)


@tf.keras.utils.register_keras_serializable(package="Text")
class TransformerDecoderLayer(tf.keras.layers.Layer):
  """Single transformer layer for decoder.

  It has three sub-layers:
    (1) a multi-head self-attention mechanism.
    (2) an encoder-decoder attention.
    (3) a positionwise fully connected feed-forward network.

  Arguments:
    num_attention_heads: Number of attention heads.
    intermediate_size: Size of the intermediate layer.
    intermediate_activation: Activation for the intermediate layer.
    dropout_rate: Dropout probability for the post-attention and output
      dropout.
    attention_dropout_rate: Dropout probability for within the attention layer.
    multi_channel_cross_attention: Whether to use `MultiChannelAttention` for
      cross-attention between target sequences and source sequences.
    kernel_initializer: Initializer for dense layer kernels.
    bias_initializer: Initializer for dense layer biases.
    kernel_regularizer: Regularizer for dense layer kernels.
    bias_regularizer: Regularizer for dense layer biases.
    activity_regularizer: Regularizer for dense layer activity.
    kernel_constraint: Constraint for dense layer kernels.
    bias_constraint: Constraint for dense layer biases.
  """
  def __init__(self,
               num_attention_heads,
               intermediate_size,
               intermediate_activation,
               dropout_rate=0.0,
               attention_dropout_rate=0.0,
               multi_channel_cross_attention=False,
               kernel_initializer="glorot_uniform",
               bias_initializer="zeros",
               kernel_regularizer=None,
               bias_regularizer=None,
               activity_regularizer=None,
               kernel_constraint=None,
               bias_constraint=None,
               **kwargs):
    super(TransformerDecoderLayer, self).__init__(**kwargs)
    self.num_attention_heads = num_attention_heads
    self.intermediate_size = intermediate_size
    self.intermediate_activation = tf.keras.activations.get(
        intermediate_activation)
    self.dropout_rate = dropout_rate
    self.attention_dropout_rate = attention_dropout_rate
    self.multi_channel_cross_attention = multi_channel_cross_attention
    self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
    self._bias_initializer = tf.keras.initializers.get(bias_initializer)
    self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
    self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
    self._activity_regularizer = tf.keras.regularizers.get(activity_regularizer)
    self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
    self._bias_constraint = tf.keras.constraints.get(bias_constraint)
    if self.multi_channel_cross_attention:
      self._cross_attention_cls = multi_channel_attention.MultiChannelAttention
    else:
      self._cross_attention_cls = attention.MultiHeadAttention

  def build(self, input_shape):
    target_tensor_shape = tf.TensorShape(input_shape[0])
    if len(target_tensor_shape) != 3:
      raise ValueError("TransformerLayer expects a three-dimensional input of "
                       "shape [batch, sequence, width].")
    hidden_size = target_tensor_shape[2]
    if hidden_size % self.num_attention_heads != 0:
      raise ValueError(
          "The hidden size (%d) is not a multiple of the number of attention "
          "heads (%d)" % (hidden_size, self.num_attention_heads))
    self.attention_head_size = int(hidden_size / self.num_attention_heads)
    # Self attention.
    self.self_attention = attention.CachedAttention(
        num_heads=self.num_attention_heads,
        key_size=self.attention_head_size,
        dropout=self.attention_dropout_rate,
        kernel_initializer=self._kernel_initializer,
        bias_initializer=self._bias_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer,
        activity_regularizer=self._activity_regularizer,
        kernel_constraint=self._kernel_constraint,
        bias_constraint=self._bias_constraint,
        name="self_attention")
    self.self_attention_output_dense = dense_einsum.DenseEinsum(
        output_shape=hidden_size,
        num_summed_dimensions=2,
        kernel_initializer=self._kernel_initializer,
        bias_initializer=self._bias_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer,
        activity_regularizer=self._activity_regularizer,
        kernel_constraint=self._kernel_constraint,
        bias_constraint=self._bias_constraint,
        name="self_attention_output")
    self.self_attention_dropout = tf.keras.layers.Dropout(rate=self.dropout_rate)
    self.self_attention_layer_norm = (
        tf.keras.layers.LayerNormalization(
            name="self_attention_layer_norm", axis=-1, epsilon=1e-12))
    # Encoder-decoder attention.
    self.encdec_attention = self._cross_attention_cls(
        num_heads=self.num_attention_heads,
        key_size=self.attention_head_size,
        dropout=self.attention_dropout_rate,
        output_shape=hidden_size,
        kernel_initializer=self._kernel_initializer,
        bias_initializer=self._bias_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer,
        activity_regularizer=self._activity_regularizer,
        kernel_constraint=self._kernel_constraint,
        bias_constraint=self._bias_constraint,
        name="attention/encdec")
    self.encdec_attention_dropout = tf.keras.layers.Dropout(rate=self.dropout_rate)
    self.encdec_attention_layer_norm = (
        tf.keras.layers.LayerNormalization(
            name="attention/encdec_output_layer_norm", axis=-1, epsilon=1e-12))
    # Feed-forward projection.
    self.intermediate_dense = dense_einsum.DenseEinsum(
        output_shape=self.intermediate_size,
        activation=None,
        kernel_initializer=self._kernel_initializer,
        bias_initializer=self._bias_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer,
        activity_regularizer=self._activity_regularizer,
        kernel_constraint=self._kernel_constraint,
        bias_constraint=self._bias_constraint,
        name="intermediate")
    self.intermediate_activation_layer = tf.keras.layers.Activation(
        self.intermediate_activation)
    self.output_dense = dense_einsum.DenseEinsum(
        output_shape=hidden_size,
        kernel_initializer=self._kernel_initializer,
        bias_initializer=self._bias_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer,
        activity_regularizer=self._activity_regularizer,
        kernel_constraint=self._kernel_constraint,
        bias_constraint=self._bias_constraint,
        name="output")
    self.output_dropout = tf.keras.layers.Dropout(rate=self.dropout_rate)
    self.output_layer_norm = tf.keras.layers.LayerNormalization(
        name="output_layer_norm", axis=-1, epsilon=1e-12)
    super(TransformerDecoderLayer, self).build(input_shape)
  def common_layers_with_encoder(self):
    """Gets layer objects that can make a Transformer encoder block."""
    return [
        self.self_attention, self.self_attention_layer_norm,
        self.intermediate_dense, self.output_dense, self.output_layer_norm
    ]
  def call(self, inputs, cache=None, decode_loop_step=None):
    if self.multi_channel_cross_attention:
      if len(inputs) != 5:
        raise ValueError(
            "TransformerDecoderLayer must have 5 inputs, when it uses "
            "multi_channel_cross_attention. But it got: %d" % len(inputs))
    elif len(inputs) != 4:
      raise ValueError(
          "TransformerDecoderLayer must have 4 inputs, but it got: %d" %
          len(inputs))
    input_tensor, memory, attention_mask, self_attention_mask = inputs[:4]
    self_attention_inputs = [input_tensor, input_tensor]
    self_attention_output, cache = self.self_attention(
        self_attention_inputs,
        attention_mask=self_attention_mask,
        cache=cache,
        decode_loop_step=decode_loop_step)
    self_attention_output = self.self_attention_dropout(self_attention_output)
    self_attention_output = self.self_attention_layer_norm(
        input_tensor + self_attention_output)

    cross_attn_inputs = [self_attention_output, memory]
    if self.multi_channel_cross_attention:
      # Accesses the 5-th input tensor for the doc-attention probabilities.
      cross_attn_inputs.append(inputs[-1])
    attention_output = self.encdec_attention(cross_attn_inputs, attention_mask)
    attention_output = self.encdec_attention_dropout(attention_output)
    attention_output = self.encdec_attention_layer_norm(
        self_attention_output + attention_output)
    intermediate_output = self.intermediate_dense(attention_output)
    intermediate_output = self.intermediate_activation_layer(
        intermediate_output)
    layer_output = self.output_dense(intermediate_output)
    layer_output = self.output_dropout(layer_output)
    layer_output = self.output_layer_norm(layer_output + attention_output)
    return layer_output, cache
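Usage note: with the default single-channel cross-attention the decoder block takes a list of four tensors — target sequence, encoder memory, cross-attention mask and self-attention mask — and returns an (output, cache) pair; setting multi_channel_cross_attention=True appends a fifth tensor of per-document attention probabilities. A minimal sketch with illustrative shapes:

import tensorflow as tf
from official.nlp.modeling.layers import transformer

batch, tgt_len, src_len, hidden = 2, 4, 6, 16
decoder_block = transformer.TransformerDecoderLayer(
    num_attention_heads=2,
    intermediate_size=32,
    intermediate_activation='relu')

targets = tf.zeros([batch, tgt_len, hidden])
memory = tf.zeros([batch, src_len, hidden])
cross_mask = tf.ones([batch, tgt_len, src_len])
self_mask = tf.ones([batch, tgt_len, tgt_len])

# Without a cache, the second return value is passed through unchanged (None).
output, _ = decoder_block([targets, memory, cross_mask, self_mask])
print(output.shape)  # (2, 4, 16)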
official/nlp/modeling/layers/transformer_test.py
View file @ 47bc1813
...
...
@@ -215,5 +215,39 @@ class TransformerLayerTest(keras_parameterized.TestCase):
    self.assertAllEqual([1, input_length, width], output_data.shape)


def _create_cache(batch_size, init_decode_length, num_heads, head_size):
  return {
      'key':
          tf.zeros([batch_size, init_decode_length, num_heads, head_size],
                   dtype=tf.float32),
      'value':
          tf.zeros([batch_size, init_decode_length, num_heads, head_size],
                   dtype=tf.float32)
  }


@keras_parameterized.run_all_keras_modes
class TransformerDecoderLayerTest(keras_parameterized.TestCase):

  def test_decoder_block_with_cache(self):
    num_attention_heads = 2
    hidden_size = 16
    decoder_block = transformer.TransformerDecoderLayer(
        num_attention_heads=num_attention_heads,
        intermediate_size=32,
        intermediate_activation='relu',
        dropout_rate=0.1,
        attention_dropout_rate=0.1)
    # Forward path.
    dummy_tensor = tf.zeros([2, 4, 16], dtype=tf.float32)
    dummy_mask = tf.zeros([2, 4, 4], dtype=tf.float32)
    inputs = [dummy_tensor, dummy_tensor, dummy_mask, dummy_mask]
    cache = _create_cache(2, 0, num_attention_heads,
                          hidden_size // num_attention_heads)
    output, cache = decoder_block(inputs, cache)
    self.assertEqual(output.shape, (2, 4, hidden_size))
    self.assertEqual(cache['value'].shape, (2, 4, 2, 8))


if __name__ == '__main__':
  tf.test.main()
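The final assertion pins down the cache layout: each cache entry is [batch, accumulated_length, num_heads, head_size], with head_size derived as hidden_size // num_attention_heads. A quick check of that arithmetic for the sizes used in the test:

batch_size = 2
decoded_length = 4       # the four positions appended to the initially empty cache
num_attention_heads = 2
hidden_size = 16
head_size = hidden_size // num_attention_heads   # 8
assert (batch_size, decoded_length, num_attention_heads, head_size) == (2, 4, 2, 8)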
official/nlp/modeling/losses/README.md
View file @ 47bc1813
...
...
@@ -4,6 +4,3 @@ Losses contains common loss computation used in NLP tasks.
* `weighted_sparse_categorical_crossentropy_loss` computes per-batch sparse
  categorical crossentropy loss.
* `weighted_sparse_categorical_crossentropy_per_example_loss` computes
  per-example sparse categorical crossentropy loss.
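A small sketch of the surviving per-batch entry point after this change (the per-example variant is removed below); the numbers are illustrative only:

import numpy as np
import tensorflow as tf
from official.nlp.modeling.losses import weighted_sparse_categorical_crossentropy

logits = tf.constant([[2.0, 0.5, -1.0],
                      [0.1, 1.5, 0.3]])
labels = np.array([0, 1])
weights = np.array([1.0, 0.0])  # mask out the second example
loss_value = weighted_sparse_categorical_crossentropy.loss(
    labels=labels, predictions=logits, weights=weights, from_logits=True)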
official/nlp/modeling/losses/__init__.py
View file @ 47bc1813
...
...
@@ -14,4 +14,3 @@
# ==============================================================================
"""Activations package definition. Subject to change."""
from
official.nlp.modeling.losses.weighted_sparse_categorical_crossentropy
import
loss
as
weighted_sparse_categorical_crossentropy_loss
from
official.nlp.modeling.losses.weighted_sparse_categorical_crossentropy
import
per_example_loss
as
weighted_sparse_categorical_crossentropy_per_example_loss
official/nlp/modeling/losses/weighted_sparse_categorical_crossentropy.py
View file @ 47bc1813
...
...
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
S
parse categorical cross-entropy losses."""
"""
Weighted s
parse categorical cross-entropy losses."""
from
__future__
import
absolute_import
from
__future__
import
division
...
...
@@ -43,37 +43,7 @@ def _validate_rank(labels, predictions, weights):
"predictions.shape was %s."
)
%
(
labels
.
shape
,
predictions
.
shape
))
def per_example_loss(labels, predictions, weights=None):
  """Calculate a per-example sparse categorical crossentropy loss.

  This loss function assumes that the predictions are post-softmax.
  Args:
    labels: The labels to evaluate against. Should be a set of integer indices
      ranging from 0 to (vocab_size-1).
    predictions: The network predictions. Should have softmax already applied.
    weights: An optional weight array of the same shape as the 'labels' array.
      If None, all examples will be used.

  Returns:
    A tensor of shape predictions.shape[:-1] containing the per-example loss.
  """
  # When using these functions with the Keras core API, we will need to squeeze
  # the labels tensor - Keras adds a spurious inner dimension.
  labels, predictions = _adjust_labels(labels, predictions)
  _validate_rank(labels, predictions, weights)

  labels_one_hot = tf.one_hot(labels, predictions.shape[-1])
  labels_one_hot = tf.cast(labels_one_hot, predictions.dtype)
  per_example_loss_data = -tf.reduce_sum(
      predictions * labels_one_hot, axis=[-1])
  if weights is not None:
    weights = tf.cast(weights, per_example_loss_data.dtype)
    per_example_loss_data = weights * per_example_loss_data
  return per_example_loss_data
def loss(labels, predictions, weights=None):
def loss(labels, predictions, weights=None, from_logits=False):
  """Calculate a per-batch sparse categorical crossentropy loss.

  This loss function assumes that the predictions are post-softmax.
...
...
@@ -83,6 +53,7 @@ def loss(labels, predictions, weights=None):
predictions: The network predictions. Should have softmax already applied.
weights: An optional weight array of the same shape as the 'labels' array.
If None, all examples will be used.
from_logits: Whether the input predictions are logits.
Returns:
A loss scalar.
...
...
@@ -95,12 +66,11 @@ def loss(labels, predictions, weights=None):
  labels, predictions = _adjust_labels(labels, predictions)
  _validate_rank(labels, predictions, weights)

  per_example_loss_data = per_example_loss(labels, predictions, weights)
  example_losses = tf.keras.losses.sparse_categorical_crossentropy(
      labels, predictions, from_logits=from_logits)

  if weights is None:
    return tf.reduce_mean(per_example_loss_data)
  else:
    numerator = tf.reduce_sum(per_example_loss_data)
    weights = tf.cast(weights, predictions.dtype)
    denominator = tf.reduce_sum(weights) + 1e-5
    return numerator / denominator
    return tf.reduce_mean(example_losses)
  weights = tf.cast(weights, predictions.dtype)
  return tf.math.divide_no_nan(
      tf.reduce_sum(example_losses * weights), tf.reduce_sum(weights))
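The rewritten reduction computes a weighted mean with tf.math.divide_no_nan — sum(example_losses * weights) / sum(weights) — returning 0 when every weight is zero, where the old code added 1e-5 to the denominator instead. A quick numeric sketch (values are arbitrary):

import tensorflow as tf

example_losses = tf.constant([2.0, 4.0, 6.0])
weights = tf.constant([1.0, 1.0, 0.0])
weighted_mean = tf.math.divide_no_nan(
    tf.reduce_sum(example_losses * weights), tf.reduce_sum(weights))
print(float(weighted_mean))  # 3.0: only the first two examples contribute

all_masked = tf.constant([0.0, 0.0, 0.0])
print(float(tf.math.divide_no_nan(
    tf.reduce_sum(example_losses * all_masked),
    tf.reduce_sum(all_masked))))  # 0.0 instead of a near-zero ratio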
official/nlp/modeling/losses/weighted_sparse_categorical_crossentropy_test.py
View file @ 47bc1813
...
...
@@ -23,6 +23,7 @@ import numpy as np
import tensorflow as tf

from tensorflow.python.keras import keras_parameterized  # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling import layers
from official.nlp.modeling import networks
from official.nlp.modeling.losses import weighted_sparse_categorical_crossentropy
...
...
@@ -48,139 +49,19 @@ class ClassificationLossTest(keras_parameterized.TestCase):
word_ids
=
tf
.
keras
.
Input
(
shape
=
(
sequence_length
,),
dtype
=
tf
.
int32
)
mask
=
tf
.
keras
.
Input
(
shape
=
(
sequence_length
,),
dtype
=
tf
.
int32
)
type_ids
=
tf
.
keras
.
Input
(
shape
=
(
sequence_length
,),
dtype
=
tf
.
int32
)
lm_outputs
,
_
=
xformer_stack
([
word_ids
,
mask
,
type_ids
])
_
=
xformer_stack
([
word_ids
,
mask
,
type_ids
])
# Create a maskedLM from the transformer stack.
test_network
=
networks
.
MaskedLM
(
num_predictions
=
num_predictions
,
input_width
=
lm_outputs
.
shape
[
-
1
],
source_network
=
xformer_stack
,
output
=
output
)
test_layer
=
layers
.
MaskedLM
(
embedding_table
=
xformer_stack
.
get_embedding_table
(),
output
=
output
)
# Create a model from the masked LM layer.
lm_input_tensor
=
tf
.
keras
.
Input
(
shape
=
(
sequence_length
,
hidden_size
))
masked_lm_positions
=
tf
.
keras
.
Input
(
shape
=
(
num_predictions
,),
dtype
=
tf
.
int32
)
output
=
test_
network
([
lm_input_tensor
,
masked_lm_positions
]
)
output
=
test_
layer
(
lm_input_tensor
,
masked_positions
=
masked_lm_positions
)
return
tf
.
keras
.
Model
([
lm_input_tensor
,
masked_lm_positions
],
output
)
def
create_classification_model
(
self
,
input_width
,
num_classes
):
test_object
=
networks
.
Classification
(
input_width
=
input_width
,
num_classes
=
num_classes
)
# Create a 2-dimensional input (the first dimension is implicit).
pooled_data
=
tf
.
keras
.
Input
(
shape
=
(
input_width
,),
dtype
=
tf
.
float32
)
output
=
test_object
(
pooled_data
)
return
tf
.
keras
.
Model
(
pooled_data
,
output
)
def
test_per_example_loss_3d_input
(
self
):
"""Test per-example loss with a 3-dimensional input, from a masked LM."""
vocab_size
=
100
sequence_length
=
32
hidden_size
=
64
num_predictions
=
21
model
=
self
.
create_lm_model
(
vocab_size
=
vocab_size
,
sequence_length
=
sequence_length
,
hidden_size
=
hidden_size
,
num_predictions
=
num_predictions
)
# Get the output of the masked LM.
batch_size
=
3
lm_input_data
=
10
*
np
.
random
.
random_sample
(
(
batch_size
,
sequence_length
,
hidden_size
))
masked_position_data
=
np
.
random
.
randint
(
2
,
size
=
(
batch_size
,
num_predictions
))
output_data
=
model
.
predict
([
lm_input_data
,
masked_position_data
])
# Calculate per-example loss.
labels
=
np
.
random
.
randint
(
vocab_size
,
size
=
(
batch_size
,
num_predictions
))
per_example_loss_data
=
weighted_sparse_categorical_crossentropy
.
per_example_loss
(
predictions
=
output_data
,
labels
=
labels
)
# Per-example loss data should have one value per prediction, and those
# values shouldn't be zero in this case (as we're using random data).
expected_shape
=
[
batch_size
,
num_predictions
]
self
.
assertEqual
(
expected_shape
,
per_example_loss_data
.
shape
.
as_list
())
self
.
assertNotAllClose
(
tf
.
zeros_like
(
per_example_loss_data
),
per_example_loss_data
)
def
test_per_example_loss_2d_input
(
self
):
"""Test per-example loss with a 2-d input, from a classifier."""
input_width
=
512
num_classes
=
10
model
=
self
.
create_classification_model
(
input_width
,
num_classes
)
# Invoke the network as part of a Model.
batch_size
=
3
input_data
=
10
*
np
.
random
.
random_sample
((
batch_size
,
input_width
))
output_data
=
model
.
predict
(
input_data
)
# Calculate per example loss.
labels
=
np
.
random
.
randint
(
num_classes
,
size
=
(
batch_size
))
per_example_loss_data
=
weighted_sparse_categorical_crossentropy
.
per_example_loss
(
predictions
=
output_data
,
labels
=
labels
)
# Per-example loss data should have one value per batch item, and those
# values shouldn't be zero in this case (as we're using random data).
self
.
assertEqual
([
batch_size
],
per_example_loss_data
.
shape
.
as_list
())
self
.
assertNotAllClose
(
tf
.
zeros_like
(
per_example_loss_data
),
per_example_loss_data
)
def
test_per_example_loss_weights_3d_input
(
self
):
"""Test weighted per-example loss with a 3-d input, from a masked LM."""
vocab_size
=
100
sequence_length
=
32
hidden_size
=
64
num_predictions
=
21
model
=
self
.
create_lm_model
(
vocab_size
=
vocab_size
,
sequence_length
=
sequence_length
,
hidden_size
=
hidden_size
,
num_predictions
=
num_predictions
)
# Get the output of the masked LM.
batch_size
=
3
lm_input_data
=
10
*
np
.
random
.
random_sample
(
(
batch_size
,
sequence_length
,
hidden_size
))
masked_position_data
=
np
.
random
.
randint
(
2
,
size
=
(
batch_size
,
num_predictions
))
output_data
=
model
.
predict
([
lm_input_data
,
masked_position_data
])
# Calculate per-example loss with weights.
labels
=
np
.
random
.
randint
(
vocab_size
,
size
=
(
batch_size
,
num_predictions
))
weights
=
np
.
random
.
randint
(
2
,
size
=
(
batch_size
,
num_predictions
))
per_example_loss_data
=
weighted_sparse_categorical_crossentropy
.
per_example_loss
(
predictions
=
output_data
,
labels
=
labels
,
weights
=
weights
)
# Weighted per-example loss data should be equivalent to multiplying the
# loss tensor by the weights tensor.
expected_weighted_loss
=
per_example_loss_data
*
weights
self
.
assertAllClose
(
expected_weighted_loss
,
per_example_loss_data
)
def
test_per_example_loss_weights_2d_input
(
self
):
"""Test weighted per-example loss with a 2-d input, from a classifier."""
input_width
=
512
num_classes
=
10
model
=
self
.
create_classification_model
(
input_width
,
num_classes
)
# Invoke the network as part of a Model.
batch_size
=
3
input_data
=
10
*
np
.
random
.
random_sample
((
batch_size
,
input_width
))
output_data
=
model
.
predict
(
input_data
)
# Calculate per-example loss with weights.
labels
=
np
.
random
.
randint
(
num_classes
,
size
=
(
batch_size
))
weights
=
np
.
random
.
randint
(
2
,
size
=
(
batch_size
))
per_example_loss_data
=
weighted_sparse_categorical_crossentropy
.
per_example_loss
(
predictions
=
output_data
,
labels
=
labels
,
weights
=
weights
)
# Weighted per-example loss data should be equivalent to multiplying the
# loss tensor by the weights tensor.
expected_weighted_loss
=
per_example_loss_data
*
weights
self
.
assertAllClose
(
expected_weighted_loss
,
per_example_loss_data
)
def
test_loss_3d_input
(
self
):
"""Test overall loss with a 3-dimensional input, from a masked LM."""
vocab_size
=
100
...
...
@@ -214,26 +95,6 @@ class ClassificationLossTest(keras_parameterized.TestCase):
self
.
assertNotAllClose
(
tf
.
zeros_like
(
per_example_loss_data
),
per_example_loss_data
)
def
test_loss_2d_input
(
self
):
"""Test overall loss with a 2-d input, from a classifier."""
input_width
=
512
num_classes
=
10
model
=
self
.
create_classification_model
(
input_width
,
num_classes
)
# Invoke the network as part of a Model.
batch_size
=
3
input_data
=
10
*
np
.
random
.
random_sample
((
batch_size
,
input_width
))
output_data
=
model
.
predict
(
input_data
)
# Calculate per example loss.
labels
=
np
.
random
.
randint
(
num_classes
,
size
=
(
batch_size
))
loss_data
=
weighted_sparse_categorical_crossentropy
.
loss
(
predictions
=
output_data
,
labels
=
labels
)
# Loss data should have one value only, and that value shouldn't be zero in
# this case (as we're using random data).
self
.
assertNotAllClose
(
0
,
loss_data
)
def
test_loss_weights_3d_input
(
self
):
"""Test masked loss with a 3-dimensional input, from a masked LM."""
vocab_size
=
100
...
...
@@ -263,26 +124,6 @@ class ClassificationLossTest(keras_parameterized.TestCase):
# Because the tensor is fully masked, the loss should be 0.
self
.
assertAllClose
(
0
,
weighted_loss_data
)
def
test_loss_weights_2d_input
(
self
):
"""Test masked loss with a 2-d input, from a classifier."""
input_width
=
512
num_classes
=
10
model
=
self
.
create_classification_model
(
input_width
,
num_classes
)
# Invoke the network as part of a Model.
batch_size
=
3
input_data
=
10
*
np
.
random
.
random_sample
((
batch_size
,
input_width
))
output_data
=
model
.
predict
(
input_data
)
# Calculate a fully masked weight tensor. This should give a loss of zero.
labels
=
np
.
random
.
randint
(
num_classes
,
size
=
(
batch_size
))
null_weights
=
np
.
zeros
((
batch_size
))
weighted_loss_data
=
weighted_sparse_categorical_crossentropy
.
loss
(
predictions
=
output_data
,
labels
=
labels
,
weights
=
null_weights
)
# Because the tensor is fully masked, the loss should be 0.
self
.
assertAllClose
(
0
,
weighted_loss_data
)
def
test_mismatched_predictions_and_labels_ranks_squeezes
(
self
):
"""Test that the loss asserts when rank(predictions)-1 != rank(labels)."""
batch_size
=
3
...
...
@@ -290,7 +131,7 @@ class ClassificationLossTest(keras_parameterized.TestCase):
labels
=
np
.
random
.
randint
(
10
,
size
=
(
batch_size
,
1
))
# All that this test tests is that the squeeze is successful.
_
=
weighted_sparse_categorical_crossentropy
.
per_example_
loss
(
_
=
weighted_sparse_categorical_crossentropy
.
loss
(
predictions
=
output_data
,
labels
=
labels
)
def
test_mismatched_weights_and_labels_ranks_fail
(
self
):
...
...
@@ -300,9 +141,6 @@ class ClassificationLossTest(keras_parameterized.TestCase):
labels
=
np
.
random
.
randint
(
10
,
size
=
(
batch_size
,
10
))
weights
=
np
.
random
.
randint
(
2
,
size
=
(
batch_size
))
with
self
.
assertRaisesRegex
(
RuntimeError
,
".*of the same rank.*"
):
_
=
weighted_sparse_categorical_crossentropy
.
per_example_loss
(
predictions
=
output_data
,
labels
=
labels
,
weights
=
weights
)
with
self
.
assertRaisesRegex
(
RuntimeError
,
".*of the same rank.*"
):
_
=
weighted_sparse_categorical_crossentropy
.
loss
(
predictions
=
output_data
,
labels
=
labels
,
weights
=
weights
)
...
...
@@ -318,8 +156,6 @@ class ClassificationLossTest(keras_parameterized.TestCase):
# We're not trying to validate numerical correctness, just ensure that
# we can in fact pass tensors to these functions without causing runtime
# errors from the shape checking code.
_
=
weighted_sparse_categorical_crossentropy
.
per_example_loss
(
predictions
=
output_data
,
labels
=
labels
,
weights
=
weights
)
_
=
weighted_sparse_categorical_crossentropy
.
loss
(
predictions
=
output_data
,
labels
=
labels
,
weights
=
weights
)
...
...
@@ -339,20 +175,15 @@ class ClassificationLossTest(keras_parameterized.TestCase):
[
-
2.7760355
,
-
1.8219438
,
-
3.0924666
,
-
1.0779881
,
-
0.9407509
]]])
labels
=
np
.
array
([[
4
,
0
],
[
2
,
2
],
[
2
,
1
]])
# Validate that per_example loss calculations are the same.
per_example_loss_data
=
weighted_sparse_categorical_crossentropy
.
per_example_loss
(
predictions
=
output_data
,
labels
=
labels
)
expected_per_example_loss_data
=
[[
1.2923571
,
2.7117882
],
[
2.287932
,
2.287932
],
[
3.0924666
,
1.8219438
]]
self
.
assertAllClose
(
expected_per_example_loss_data
,
per_example_loss_data
)
# Validate that overall loss calculations are the same.
weights
=
np
.
array
([[
1
,
0
],
[
0
,
0
],
[
0
,
0
]])
loss_data
=
weighted_sparse_categorical_crossentropy
.
loss
(
predictions
=
output_data
,
labels
=
labels
,
weights
=
weights
)
predictions
=
output_data
,
labels
=
labels
,
weights
=
weights
,
from_logits
=
True
)
expected_loss_data
=
1.2923441
self
.
assertAllClose
(
expected_loss_data
,
loss_data
)
self
.
assertAllClose
(
expected_loss_data
,
loss_data
,
rtol
=
1e-3
)
def
test_legacy_classification_loss_compatibility
(
self
):
"""Test to validate computational correctness during refactors."""
...
...
@@ -363,19 +194,15 @@ class ClassificationLossTest(keras_parameterized.TestCase):
[
-
1.6975292e-03
,
-
6.4009643e+00
,
-
1.0226612e+01
]])
labels
=
np
.
array
([
2
,
1
])
# Validate that per_example loss calculations are the same.
per_example_loss_data
=
weighted_sparse_categorical_crossentropy
.
per_example_loss
(
predictions
=
output_data
,
labels
=
labels
)
expected_per_example_loss_data
=
[
6.4434357
,
6.4009643
]
self
.
assertAllClose
(
expected_per_example_loss_data
,
per_example_loss_data
)
# Validate that overall loss calculations are the same.
weights
=
None
loss_data
=
weighted_sparse_categorical_crossentropy
.
loss
(
predictions
=
output_data
,
labels
=
labels
,
weights
=
weights
)
predictions
=
output_data
,
labels
=
labels
,
weights
=
weights
,
from_logits
=
True
)
expected_loss_data
=
6.4222
self
.
assertAllClose
(
expected_loss_data
,
loss_data
)
self
.
assertAllClose
(
expected_loss_data
,
loss_data
,
rtol
=
1e-3
)
if __name__ == "__main__":
  tf.test.main()
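With per_example_loss gone from the API, per-example values can be taken directly from the Keras primitive that the refactored loss() now builds on; a short sketch (values illustrative):

import tensorflow as tf

logits = tf.constant([[2.0, 0.5, -1.0],
                      [0.1, 1.5, 0.3]])
labels = tf.constant([0, 1])

# One loss value per example -- the quantity loss() averages with the weights.
per_example = tf.keras.losses.sparse_categorical_crossentropy(
    labels, logits, from_logits=True)
print(per_example.shape)  # (2,)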
official/nlp/modeling/models/bert_pretrainer.py
View file @ 47bc1813
...
...
@@ -25,6 +25,7 @@ from typing import List, Optional
import gin
import tensorflow as tf

from official.nlp.modeling import layers
from official.nlp.modeling import networks
...
...
@@ -47,8 +48,8 @@ class BertPretrainer(tf.keras.Model):
    num_token_predictions: Number of tokens to predict from the masked LM.
    embedding_table: Embedding table of a network. If None, the
      "network.get_embedding_table()" is used.
    activation: The activation (if any) to use in the masked LM network. If
      None, no activation will be used.
    activation: The activation (if any) to use in the masked LM network. If
      None, no activation will be used.
    initializer: The initializer (if any) to use in the masked LM and
      classification networks. Defaults to a Glorot uniform initializer.
    output: The output style for this network. Can be either 'logits' or
...
...
@@ -106,16 +107,16 @@ class BertPretrainer(tf.keras.Model):
        dtype=tf.int32)
    inputs.append(masked_lm_positions)
    self.masked_lm = networks.MaskedLM(
        num_predictions=num_token_predictions,
        input_width=sequence_output.shape[-1],
        source_network=network,
    if embedding_table is None:
      embedding_table = self.encoder.get_embedding_table()
    self.masked_lm = layers.MaskedLM(
        embedding_table=embedding_table,
        activation=activation,
        initializer=initializer,
        output=output,
        name='masked_lm')
    lm_outputs = self.masked_lm([sequence_output, masked_lm_positions])
        name='cls/predictions')
    lm_outputs = self.masked_lm(
        sequence_output, masked_positions=masked_lm_positions)
    self.classification = networks.Classification(
        input_width=cls_output.shape[-1],
...
...
@@ -126,7 +127,9 @@ class BertPretrainer(tf.keras.Model):
    sentence_outputs = self.classification(cls_output)

    super(BertPretrainer, self).__init__(
        inputs=inputs, outputs=[lm_outputs, sentence_outputs], **kwargs)
        inputs=inputs,
        outputs=dict(masked_lm=lm_outputs, classification=sentence_outputs),
        **kwargs)

  def get_config(self):
    return self._config
...
...
@@ -151,8 +154,8 @@ class BertPretrainerV2(tf.keras.Model):
    num_masked_tokens: Number of tokens to predict from the masked LM.
    encoder_network: A transformer network. This network should output a
      sequence output and a classification output.
    mlm_activation: The activation (if any) to use in the masked LM network. If
      None, no activation will be used.
    mlm_activation: The activation (if any) to use in the masked LM network. If
      None, no activation will be used.
    mlm_initializer: The initializer (if any) to use in the masked LM. Defaults
      to a Glorot uniform initializer.
    classification_heads: A list of optional head layers to transform on encoder
...
...
@@ -193,17 +196,18 @@ class BertPretrainerV2(tf.keras.Model):
    outputs = dict()
    if num_masked_tokens > 0:
      self.masked_lm = networks.MaskedLM(
          num_predictions=num_masked_tokens,
          input_width=sequence_output.shape[-1],
          source_network=self.encoder_network,
      self.masked_lm = layers.MaskedLM(
          embedding_table=self.encoder_network.get_embedding_table(),
          activation=mlm_activation,
          initializer=mlm_initializer,
          name='masked_lm')
      masked_lm_positions = copy.copy(self.masked_lm.inputs[-1])
          name='cls/predictions')
      masked_lm_positions = tf.keras.layers.Input(
          shape=(num_masked_tokens,),
          name='masked_lm_positions',
          dtype=tf.int32)
      inputs.append(masked_lm_positions)
      outputs['lm_output'] = self.masked_lm(
          [sequence_output, masked_lm_positions])
          sequence_output, masked_positions=masked_lm_positions)
    for cls_head in self.classification_heads:
      outputs[cls_head.name] = cls_head(sequence_output)
...
...
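After this change both pretrainer models return a dictionary of outputs ('masked_lm' and 'classification' for BertPretrainer; 'lm_output' plus one entry per classification head for BertPretrainerV2) instead of a list, so downstream code indexes results by name. A hedged sketch — the encoder arguments and input names are assumed from the surrounding model-garden conventions, and all sizes are illustrative:

import tensorflow as tf
from official.nlp.modeling import models
from official.nlp.modeling import networks

encoder = networks.TransformerEncoder(
    vocab_size=100, num_layers=1, sequence_length=16)
pretrainer = models.BertPretrainer(
    network=encoder, num_classes=2, num_token_predictions=4)

outputs = pretrainer({
    'input_word_ids': tf.zeros([2, 16], tf.int32),
    'input_mask': tf.ones([2, 16], tf.int32),
    'input_type_ids': tf.zeros([2, 16], tf.int32),
    'masked_lm_positions': tf.zeros([2, 4], tf.int32),
})
print(outputs['masked_lm'].shape)       # (2, 4, 100)
print(outputs['classification'].shape)  # (2, 2)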