"...resnet50_tensorflow.git" did not exist on "3e967b826d73026eeaa47ba7dd0c53705e3b0857"
Commit 78c43ef1 authored by Gunho Park's avatar Gunho Park
Browse files

Merge branch 'master' of https://github.com/tensorflow/models

parents 67cfc95b e3c7e300
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Adafactor optimizer.
A new optimizer that will be open sourced soon.
"""
# pylint: disable=invalid-name
# Placeholder that represents an unimplemented class definition.
Adafactor = "Unimplemented"
...@@ -56,10 +56,12 @@ class StepwiseLrConfig(base_config.Config):
values[0] [boundaries[0], boundaries[1]] -> values[1]
[boundaries[n-1], boundaries[n]] -> values[n] [boundaries[n],
end] -> values[n+1] Defaults to None.
offset: An int. The offset applied to steps. Defaults to 0.
"""
name: str = 'PiecewiseConstantDecay'
boundaries: Optional[List[int]] = None
values: Optional[List[float]] = None
offset: int = 0
@dataclasses.dataclass
...@@ -76,12 +78,14 @@ class ExponentialLrConfig(base_config.Config):
decay_rate: A float. Defaults to None.
staircase: A boolean, if true, learning rate is decreased at discrete
intervals. Defaults to False.
offset: An int. The offset applied to steps. Defaults to 0.
"""
name: str = 'ExponentialDecay'
initial_learning_rate: Optional[float] = None
decay_steps: Optional[int] = None
decay_rate: Optional[float] = None
staircase: Optional[bool] = None
offset: int = 0
@dataclasses.dataclass
...@@ -99,6 +103,7 @@ class PolynomialLrConfig(base_config.Config):
power: A float. The power of the polynomial. Defaults to linear, 1.0.
cycle: A boolean, whether or not it should cycle beyond decay_steps.
Defaults to False.
offset: An int. The offset applied to steps. Defaults to 0.
"""
name: str = 'PolynomialDecay'
initial_learning_rate: Optional[float] = None
...@@ -106,6 +111,7 @@ class PolynomialLrConfig(base_config.Config):
end_learning_rate: float = 0.0001
power: float = 1.0
cycle: bool = False
offset: int = 0
@dataclasses.dataclass
...@@ -122,11 +128,13 @@ class CosineLrConfig(base_config.Config):
to None.
alpha: A float. Minimum learning rate value as a fraction of
initial_learning_rate.
offset: An int. The offset applied to steps. Defaults to 0.
"""
name: str = 'CosineDecay'
initial_learning_rate: Optional[float] = None
decay_steps: Optional[int] = None
alpha: float = 0.0
offset: int = 0
@dataclasses.dataclass
...
...@@ -52,6 +52,7 @@ class OptimizerConfig(oneof.OneOfConfig):
lars: opt_cfg.LARSConfig = opt_cfg.LARSConfig()
adagrad: opt_cfg.AdagradConfig = opt_cfg.AdagradConfig()
slide: opt_cfg.SLIDEConfig = opt_cfg.SLIDEConfig()
adafactor: opt_cfg.AdafactorConfig = opt_cfg.AdafactorConfig()
@dataclasses.dataclass
...
...@@ -247,3 +247,22 @@ class SLIDEConfig(BaseOptimizerConfig):
do_gradient_rescaling: bool = True
norm_type: str = "layer"
ratio_clip_norm: float = 1e5
@dataclasses.dataclass
class AdafactorConfig(BaseOptimizerConfig):
"""Configuration for Adafactor optimizer.
The attributes for this class match the arguments of the Adafactor
implementation.
"""
name: str = "Adafactor"
factored: bool = True
multiply_by_parameter_scale: bool = True
beta1: Optional[float] = None
decay_rate: float = 0.8
step_offset: int = 0
clipping_threshold: float = 1.0
min_dim_size_to_factor: int = 128
epsilon1: float = 1e-30
epsilon2: float = 1e-3
...@@ -19,6 +19,75 @@ from typing import Mapping, Any, Union, Optional
import tensorflow as tf
def _make_offset_wrapper(new_class_name: str, base_lr_class):
"""Generates a offset wrapper of learning rate schedule.
It will returns a subclass of the the `base_lr_class`, the subclass takes an
`offset` argument in the constructor. When the new class instance is called,
the behavior is:
new_class_object(step) = base_lr_class_object(step - offset)
Example:
CosineDecayWithOffset = _make_offset_wrapper(
'CosineDecayWithOffset', tf.keras.experimental.CosineDecay)
# Use the lr:
lr = CosineDecayWithOffset(offset=100, initial_learning_rate=0.1,
decay_steps=1000)
lr(101) # equals to tf.keras.experimental.CosineDecay(...)(101-100)
Args:
new_class_name: the name of the new class.
base_lr_class: the base learning rate schedule class. Should be subclass of
tf.keras.optimizers.schedules.LearningRateSchedule
Returns:
A new class (subclass of the base_lr_class) that can take an offset.
"""
assert issubclass(base_lr_class,
tf.keras.optimizers.schedules.LearningRateSchedule), (
"base_lr_class should be subclass of keras "
f"LearningRateSchedule, got {base_lr_class}")
# pylint: disable=protected-access,pointless-statement
def offset_learning_rate_init(self, offset=0, **kwargs):
"""Construct learning rate schedule object.
When this object is called, its behavior is
self.__call__(step) == base_lr_class.__call__(step - offset)
Args:
self: this object.
offset: The offset when computing the learning rate schedule.
**kwargs: Pass through to base learning rate class constructor.
"""
base_lr_class.__init__(self, **kwargs)
self._offset = offset
def offset_learning_rate_call(self, step):
step = tf.cast(step - self._offset, tf.float32)
return base_lr_class.__call__(self, step)
# pylint: enable=protected-access,pointless-statement
return type(
new_class_name, (base_lr_class,), {
"base_lr_class": base_lr_class,
"__init__": offset_learning_rate_init,
"__call__": offset_learning_rate_call
})
PiecewiseConstantDecayWithOffset = _make_offset_wrapper(
"PiecewiseConstantDecayWithOffset",
tf.keras.optimizers.schedules.PiecewiseConstantDecay)
PolynomialDecayWithOffset = _make_offset_wrapper(
"PolynomialDecayWithOffset", tf.keras.optimizers.schedules.PolynomialDecay)
ExponentialDecayWithOffset = _make_offset_wrapper(
"ExponentialDecayWithOffset",
tf.keras.optimizers.schedules.ExponentialDecay)
CosineDecayWithOffset = _make_offset_wrapper("CosineDecayWithOffset",
tf.keras.experimental.CosineDecay)
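As a quick illustration of the offset semantics (a sketch only; the step and hyperparameter values are made up), the wrapper simply shifts the step that reaches the base schedule:

# Hedged sketch: the wrapped schedule delays the base schedule by `offset` steps.
wrapped = CosineDecayWithOffset(offset=500, initial_learning_rate=0.1, decay_steps=1000)
base = tf.keras.experimental.CosineDecay(initial_learning_rate=0.1, decay_steps=1000)
assert float(wrapped(600)) == float(base(100))  # wrapped(step) == base(step - offset)
# Steps below `offset` are typically covered by a separate warmup schedule.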
class LinearWarmup(tf.keras.optimizers.schedules.LearningRateSchedule):
"""Linear warmup schedule."""
...
...@@ -70,5 +70,40 @@ class PowerAndLinearDecayTest(tf.test.TestCase, parameterized.TestCase):
self.assertAlmostEqual(lr(step).numpy(), value)
class OffsetLearningRateTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(
dict(class_name=lr_schedule.PiecewiseConstantDecayWithOffset),
dict(class_name=lr_schedule.PolynomialDecayWithOffset),
dict(class_name=lr_schedule.ExponentialDecayWithOffset),
dict(class_name=lr_schedule.CosineDecayWithOffset),
)
def test_generated_docstring(self, class_name):
self.assertNotEmpty(class_name.__init__.__doc__)
@parameterized.parameters(
dict(
class_name=lr_schedule.PiecewiseConstantDecayWithOffset,
kwarg=dict(boundaries=[50, 80], values=[1.0, 0.5, 0.1])),
dict(
class_name=lr_schedule.PolynomialDecayWithOffset,
kwarg=dict(initial_learning_rate=1.0, decay_steps=100)),
dict(
class_name=lr_schedule.ExponentialDecayWithOffset,
kwarg=dict(
initial_learning_rate=1.0, decay_steps=100, decay_rate=0.5)),
dict(
class_name=lr_schedule.CosineDecayWithOffset,
kwarg=dict(initial_learning_rate=1.0, decay_steps=100)),
)
def test_offset(self, class_name, kwarg):
offset = 10
offset_lr = class_name(offset=offset, **kwarg)
base_lr = class_name.base_lr_class(**kwarg)
self.assertIsInstance(offset_lr, class_name)
for step in range(10, 101, 10):
self.assertEqual(offset_lr(step), base_lr(step - offset))
if __name__ == '__main__':
tf.test.main()
...@@ -20,6 +20,7 @@ import tensorflow as tf
import tensorflow_addons.optimizers as tfa_optimizers
from official.modeling.optimization import slide_optimizer
from official.modeling.optimization import adafactor_optimizer
from official.modeling.optimization import ema_optimizer
from official.modeling.optimization import lars_optimizer
from official.modeling.optimization import lr_schedule
...@@ -34,14 +35,15 @@ OPTIMIZERS_CLS = {
'rmsprop': tf.keras.optimizers.RMSprop,
'lars': lars_optimizer.LARS,
'adagrad': tf.keras.optimizers.Adagrad,
-'slide': slide_optimizer.SLIDE
+'slide': slide_optimizer.SLIDE,
+'adafactor': adafactor_optimizer.Adafactor,
}
LR_CLS = {
-'stepwise': tf.keras.optimizers.schedules.PiecewiseConstantDecay,
-'polynomial': tf.keras.optimizers.schedules.PolynomialDecay,
-'exponential': tf.keras.optimizers.schedules.ExponentialDecay,
-'cosine': tf.keras.experimental.CosineDecay,
+'stepwise': lr_schedule.PiecewiseConstantDecayWithOffset,
+'polynomial': lr_schedule.PolynomialDecayWithOffset,
+'exponential': lr_schedule.ExponentialDecayWithOffset,
+'cosine': lr_schedule.CosineDecayWithOffset,
'power': lr_schedule.DirectPowerDecay,
'power_linear': lr_schedule.PowerAndLinearDecay,
'power_with_offset': lr_schedule.PowerDecayWithOffset,
...
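For context, a sketch of how these registries are normally reached through OptimizationConfig and OptimizerFactory; the field names follow the configs in this change, but the end-to-end values are illustrative assumptions:

# Hedged sketch: select an offset-aware cosine schedule purely through configuration.
from official.modeling import optimization

opt_config = optimization.OptimizationConfig({
    'optimizer': {'type': 'sgd'},  # 'adafactor' becomes selectable once implemented
    'learning_rate': {
        'type': 'cosine',
        'cosine': {'initial_learning_rate': 0.1, 'decay_steps': 1000, 'offset': 100},
    },
})
factory = optimization.OptimizerFactory(opt_config)
lr = factory.build_learning_rate()
optimizer = factory.build_optimizer(lr)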
...@@ -14,29 +14,16 @@
"""Functions and classes related to training performance."""
-from absl import logging
import tensorflow as tf
def configure_optimizer(optimizer,
use_float16=False,
use_graph_rewrite=False,
-loss_scale='dynamic',
-use_experimental_api=False):
+loss_scale=None):
"""Configures optimizer object with performance options."""
-if use_experimental_api:
-logging.warning('Passing use_experimental_api=True is deprecated. The '
-'argument will be removed in the future.')
if use_float16:
-# TODO(b/171936854): Move all methods to non-experimental api.
-if use_experimental_api:
-# Wraps optimizer with a LossScaleOptimizer. This is done automatically
-# in compile() with the "mixed_float16" policy, but since we do not call
-# compile(), we must wrap the optimizer manually.
-optimizer = (
-tf.keras.mixed_precision.experimental.LossScaleOptimizer(
-optimizer, loss_scale=loss_scale))
-elif loss_scale == 'dynamic':
+if loss_scale in (None, 'dynamic'):
optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)
else:
# loss_scale is a number. We interpret that as a fixed loss scale.
...@@ -52,34 +39,17 @@ def configure_optimizer(optimizer,
return optimizer
-def set_mixed_precision_policy(dtype, loss_scale=None,
-use_experimental_api=False):
-"""Sets mix precision policy."""
-if use_experimental_api:
-logging.warning('Passing use_experimental_api=True is deprecated. The '
-'argument will be removed in the future.')
-assert use_experimental_api or loss_scale is None, (
-'loss_scale cannot be specified if use_experimental_api is False. If the '
-'non-experimental API is used, specify the loss scaling configuration '
-'when creating the LossScaleOptimizer instead.'
-)
+def set_mixed_precision_policy(dtype, loss_scale=None):
+"""Sets the global `tf.keras.mixed_precision.Policy`."""
+# TODO(b/191894773): Remove loss_scale argument
+assert loss_scale is None, (
+'The loss_scale argument must be None. The argument exists for '
+'historical reasons and will be removed soon.')
if dtype == tf.float16:
-# TODO(b/171936854): Move all methods to non-experimental api.
-if use_experimental_api:
-policy = tf.keras.mixed_precision.experimental.Policy(
-'mixed_float16', loss_scale=loss_scale)
-tf.keras.mixed_precision.experimental.set_policy(policy)
-else:
-tf.keras.mixed_precision.set_global_policy('mixed_float16')
+tf.keras.mixed_precision.set_global_policy('mixed_float16')
elif dtype == tf.bfloat16:
-if use_experimental_api:
-tf.keras.mixed_precision.experimental.set_policy('mixed_bfloat16')
-else:
-tf.keras.mixed_precision.set_global_policy('mixed_bfloat16')
+tf.keras.mixed_precision.set_global_policy('mixed_bfloat16')
elif dtype == tf.float32:
-if use_experimental_api:
-tf.keras.mixed_precision.experimental.set_policy('float32')
-else:
-tf.keras.mixed_precision.set_global_policy('float32')
+tf.keras.mixed_precision.set_global_policy('float32')
else:
raise ValueError('Unexpected dtype: %s' % dtype)
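A short sketch of the intended calling pattern after this simplification (the training-loop context is assumed, not part of the diff):

# Hedged sketch: set the global policy first, then wrap the optimizer.
# With the non-experimental API, dynamic loss scaling is the default for float16.
import tensorflow as tf
from official.modeling import performance

performance.set_mixed_precision_policy(tf.float16)
optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
optimizer = performance.configure_optimizer(optimizer, use_float16=True)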
...@@ -108,6 +108,7 @@ def get_activation(identifier, use_keras_layer=False):
"linear": "linear",
"identity": "linear",
"swish": "swish",
"sigmoid": "sigmoid",
"relu6": tf.nn.relu6,
}
if identifier in keras_layer_allowlist:
...
...@@ -46,6 +46,8 @@ class BertEncoderConfig(hyperparams.Config):
embedding_size: Optional[int] = None
output_range: Optional[int] = None
return_all_encoder_outputs: bool = False
# Pre/Post-LN Transformer
norm_first: bool = False
@dataclasses.dataclass
...@@ -132,6 +134,8 @@ class BigBirdEncoderConfig(hyperparams.Config):
intermediate_size: int = 3072
dropout_rate: float = 0.1
attention_dropout_rate: float = 0.1
# Pre/Post-LN Transformer
norm_first: bool = False
max_position_embeddings: int = 4096
num_rand_blocks: int = 3
block_size: int = 64
...@@ -152,6 +156,8 @@ class KernelEncoderConfig(hyperparams.Config):
intermediate_size: int = 3072
dropout_rate: float = 0.1
attention_dropout_rate: float = 0.1
# Pre/Post-LN Transformer
norm_first: bool = False
max_position_embeddings: int = 512
type_vocab_size: int = 2
initializer_range: float = 0.02
...@@ -161,6 +167,7 @@ class KernelEncoderConfig(hyperparams.Config):
redraw: bool = False
is_short_seq: bool = False
begin_kernel: int = 0
scale: Optional[float] = None
@dataclasses.dataclass
...@@ -339,6 +346,7 @@ def build_encoder(config: EncoderConfig,
encoder_cfg.hidden_activation),
dropout_rate=encoder_cfg.dropout_rate,
attention_dropout_rate=encoder_cfg.attention_dropout_rate,
norm_first=encoder_cfg.norm_first,
kernel_initializer=tf.keras.initializers.TruncatedNormal(
stddev=encoder_cfg.initializer_range),
attention_cls=layers.BigBirdAttention,
...@@ -377,6 +385,7 @@ def build_encoder(config: EncoderConfig,
redraw=encoder_cfg.redraw,
is_short_seq=encoder_cfg.is_short_seq,
begin_kernel=encoder_cfg.begin_kernel,
scale=encoder_cfg.scale,
)
hidden_cfg = dict(
num_attention_heads=encoder_cfg.num_attention_heads,
...@@ -385,6 +394,7 @@ def build_encoder(config: EncoderConfig,
encoder_cfg.hidden_activation),
dropout_rate=encoder_cfg.dropout_rate,
attention_dropout_rate=encoder_cfg.attention_dropout_rate,
norm_first=encoder_cfg.norm_first,
kernel_initializer=tf.keras.initializers.TruncatedNormal(
stddev=encoder_cfg.initializer_range),
attention_cls=layers.KernelAttention,
...@@ -445,4 +455,5 @@ def build_encoder(config: EncoderConfig,
embedding_width=encoder_cfg.embedding_size,
embedding_layer=embedding_layer,
return_all_encoder_outputs=encoder_cfg.return_all_encoder_outputs,
-dict_outputs=True)
+dict_outputs=True,
+norm_first=encoder_cfg.norm_first)
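To show where the new flag lands, a sketch of requesting a pre-LN BERT encoder through this config path; the sizes are illustrative and only norm_first is the knob added here:

# Hedged sketch: build a small pre-LN encoder via the config machinery.
from official.nlp.configs import encoders

encoder_config = encoders.EncoderConfig(
    type='bert',
    bert=encoders.BertEncoderConfig(num_layers=4, norm_first=True))
encoder = encoders.build_encoder(encoder_config)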
...@@ -28,7 +28,6 @@ from official.core import train_lib
from official.core import train_utils
from official.modeling import performance
from official.modeling.multitask import configs
-from official.modeling.multitask import multitask
from official.modeling.multitask import train_lib as multitask_train_lib
...@@ -167,7 +166,10 @@ def run_continuous_finetune(
with distribution_strategy.scope():
if isinstance(params, configs.MultiEvalExperimentConfig):
task = task_factory.get_task(params_replaced.task)
-eval_tasks = multitask.MultiTask.from_config(params_replaced.eval_tasks)
+eval_tasks = [
+task_factory.get_task(config.task_config, name=config.task_name)
+for config in params.eval_tasks
+]
(_,
eval_metrics) = multitask_train_lib.run_experiment_with_multitask_eval(
distribution_strategy=distribution_strategy,
...
This diff is collapsed.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for third_party.tensorflow_models.official.nlp.data.classifier_data_lib."""
import os
import tempfile
from absl.testing import parameterized
import tensorflow as tf
import tensorflow_datasets as tfds
from official.nlp.bert import tokenization
from official.nlp.data import classifier_data_lib
def decode_record(record, name_to_features):
"""Decodes a record to a TensorFlow example."""
return tf.io.parse_single_example(record, name_to_features)
class BertClassifierLibTest(tf.test.TestCase, parameterized.TestCase):
def setUp(self):
super(BertClassifierLibTest, self).setUp()
self.model_dir = self.get_temp_dir()
self.processors = {
"CB": classifier_data_lib.CBProcessor,
"SUPERGLUE-RTE": classifier_data_lib.SuperGLUERTEProcessor,
"BOOLQ": classifier_data_lib.BoolQProcessor,
"WIC": classifier_data_lib.WiCProcessor,
}
vocab_tokens = [
"[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
"##ing", ","
]
with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens
]).encode("utf-8"))
vocab_file = vocab_writer.name
self.tokenizer = tokenization.FullTokenizer(vocab_file)
@parameterized.parameters(
{"task_type": "CB"},
{"task_type": "BOOLQ"},
{"task_type": "SUPERGLUE-RTE"},
{"task_type": "WIC"},
)
def test_generate_dataset_from_tfds_processor(self, task_type):
with tfds.testing.mock_data(num_examples=5):
output_path = os.path.join(self.model_dir, task_type)
processor = self.processors[task_type]()
classifier_data_lib.generate_tf_record_from_data_file(
processor,
None,
self.tokenizer,
train_data_output_path=output_path,
eval_data_output_path=output_path,
test_data_output_path=output_path)
files = tf.io.gfile.glob(output_path)
self.assertNotEmpty(files)
train_dataset = tf.data.TFRecordDataset(output_path)
seq_length = 128
label_type = tf.int64
name_to_features = {
"input_ids": tf.io.FixedLenFeature([seq_length], tf.int64),
"input_mask": tf.io.FixedLenFeature([seq_length], tf.int64),
"segment_ids": tf.io.FixedLenFeature([seq_length], tf.int64),
"label_ids": tf.io.FixedLenFeature([], label_type),
}
train_dataset = train_dataset.map(
lambda record: decode_record(record, name_to_features))
# If data is retrieved without error, then all requirements
# including data type/shapes are met.
_ = next(iter(train_dataset))
if __name__ == "__main__":
tf.test.main()
...@@ -50,7 +50,7 @@ flags.DEFINE_enum(
"classification_task_name", "MNLI", [
"AX", "COLA", "IMDB", "MNLI", "MRPC", "PAWS-X", "QNLI", "QQP", "RTE",
"SST-2", "STS-B", "WNLI", "XNLI", "XTREME-XNLI", "XTREME-PAWS-X",
-"AX-g", "SUPERGLUE-RTE", "CB", "BoolQ"
+"AX-g", "SUPERGLUE-RTE", "CB", "BoolQ", "WIC"
], "The name of the task to train BERT classifier. The "
"difference between XTREME-XNLI and XNLI is: 1. the format "
"of input tsv files; 2. the dev set for XTREME is english "
...@@ -173,8 +173,26 @@ flags.DEFINE_string(
def generate_classifier_dataset():
"""Generates classifier dataset and returns input meta data."""
-assert (FLAGS.input_data_dir and FLAGS.classification_task_name or
-FLAGS.tfds_params)
+if FLAGS.classification_task_name in [
+"COLA",
+"WNLI",
+"SST-2",
+"MRPC",
+"QQP",
+"STS-B",
+"MNLI",
+"QNLI",
+"RTE",
+"AX",
+"SUPERGLUE-RTE",
+"CB",
+"BoolQ",
+"WIC",
+]:
+assert not FLAGS.input_data_dir or FLAGS.tfds_params
+else:
+assert (FLAGS.input_data_dir and FLAGS.classification_task_name or
+FLAGS.tfds_params)
if FLAGS.tokenization == "WordPiece":
tokenizer = tokenization.FullTokenizer(
...@@ -248,6 +266,8 @@ def generate_classifier_dataset():
classifier_data_lib.CBProcessor,
"boolq":
classifier_data_lib.BoolQProcessor,
"wic":
classifier_data_lib.WnliProcessor,
}
task_name = FLAGS.classification_task_name.lower()
if task_name not in processors:
...
...@@ -60,8 +60,8 @@ class SentencePredictionDataLoader(data_loader.DataLoader):
else:
self._label_name_mapping = dict()
-def _decode(self, record: tf.Tensor):
-"""Decodes a serialized tf.Example."""
+def name_to_features_spec(self):
+"""Defines features to decode. Subclass may override to append features."""
label_type = LABEL_TYPES_MAP[self._params.label_type]
name_to_features = {
'input_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
...@@ -72,7 +72,11 @@ class SentencePredictionDataLoader(data_loader.DataLoader):
if self._include_example_id:
name_to_features['example_id'] = tf.io.FixedLenFeature([], tf.int64)
-example = tf.io.parse_single_example(record, name_to_features)
+return name_to_features
+def _decode(self, record: tf.Tensor):
+"""Decodes a serialized tf.Example."""
+example = tf.io.parse_single_example(record, self.name_to_features_spec())
# tf.Example only supports tf.int64, but the TPU only supports tf.int32.
# So cast all int64 to int32.
...@@ -86,20 +90,23 @@ class SentencePredictionDataLoader(data_loader.DataLoader):
def _parse(self, record: Mapping[str, tf.Tensor]):
"""Parses raw tensors into a dict of tensors to be consumed by the model."""
-x = {
-'input_word_ids': record['input_ids'],
-'input_mask': record['input_mask'],
-'input_type_ids': record['segment_ids']
-}
-if self._include_example_id:
-x['example_id'] = record['example_id']
-x[self._label_field] = record[self._label_field]
+key_mapping = {
+'input_ids': 'input_word_ids',
+'input_mask': 'input_mask',
+'segment_ids': 'input_type_ids'
+}
+ret = {}
+for record_key in record:
+if record_key in key_mapping:
+ret[key_mapping[record_key]] = record[record_key]
+else:
+ret[record_key] = record[record_key]
if self._label_field in self._label_name_mapping:
-x[self._label_name_mapping[self._label_field]] = record[self._label_field]
+ret[self._label_name_mapping[self._label_field]] = record[
+self._label_field]
-return x
+return ret
def load(self, input_context: Optional[tf.distribute.InputContext] = None):
"""Returns a tf.dataset.Dataset."""
...@@ -215,13 +222,12 @@ class SentencePredictionTextDataLoader(data_loader.DataLoader):
"""Berts preprocess."""
segments = [record[x] for x in self._text_fields]
model_inputs = self._text_processor(segments)
-if self._include_example_id:
-model_inputs['example_id'] = record['example_id']
-model_inputs[self._label_field] = record[self._label_field]
+for key in record:
+if key not in self._text_fields:
+model_inputs[key] = record[key]
return model_inputs
-def _decode(self, record: tf.Tensor):
-"""Decodes a serialized tf.Example."""
+def name_to_features_spec(self):
name_to_features = {}
for text_field in self._text_fields:
name_to_features[text_field] = tf.io.FixedLenFeature([], tf.string)
...@@ -230,8 +236,11 @@ class SentencePredictionTextDataLoader(data_loader.DataLoader):
name_to_features[self._label_field] = tf.io.FixedLenFeature([], label_type)
if self._include_example_id:
name_to_features['example_id'] = tf.io.FixedLenFeature([], tf.int64)
-example = tf.io.parse_single_example(record, name_to_features)
+return name_to_features
+def _decode(self, record: tf.Tensor):
+"""Decodes a serialized tf.Example."""
+example = tf.io.parse_single_example(record, self.name_to_features_spec())
# tf.Example only supports tf.int64, but the TPU only supports tf.int32.
# So cast all int64 to int32.
for name in example:
...
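The new name_to_features_spec hook is intended as an override point; a sketch of a subclass that appends one extra feature (the subclass name and feature name are hypothetical, not part of this change):

# Hedged sketch: decode one additional int64 feature on top of the defaults.
import tensorflow as tf
from official.nlp.data import sentence_prediction_dataloader as spd

class SentencePredictionWithWeightDataLoader(spd.SentencePredictionDataLoader):

  def name_to_features_spec(self):
    name_to_features = super().name_to_features_spec()
    name_to_features['example_weight'] = tf.io.FixedLenFeature([], tf.int64)
    return name_to_features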
...@@ -198,9 +198,12 @@ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
dataset = loader.SentencePredictionTextDataLoader(data_config).load()
features = next(iter(dataset))
label_field = data_config.label_field
-self.assertCountEqual(
-['input_word_ids', 'input_type_ids', 'input_mask', label_field],
-features.keys())
+expected_keys = [
+'input_word_ids', 'input_type_ids', 'input_mask', label_field
+]
+if use_tfds:
+expected_keys += ['idx']
+self.assertCountEqual(expected_keys, features.keys())
self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
...@@ -233,9 +236,12 @@ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
dataset = loader.SentencePredictionTextDataLoader(data_config).load()
features = next(iter(dataset))
label_field = data_config.label_field
-self.assertCountEqual(
-['input_word_ids', 'input_type_ids', 'input_mask', label_field],
-features.keys())
+expected_keys = [
+'input_word_ids', 'input_type_ids', 'input_mask', label_field
+]
+if use_tfds:
+expected_keys += ['idx']
+self.assertCountEqual(expected_keys, features.keys())
self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
...@@ -268,9 +274,12 @@ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
dataset = loader.SentencePredictionTextDataLoader(data_config).load()
features = next(iter(dataset))
label_field = data_config.label_field
-self.assertCountEqual(
-['input_word_ids', 'input_type_ids', 'input_mask', label_field],
-features.keys())
+expected_keys = [
+'input_word_ids', 'input_type_ids', 'input_mask', label_field
+]
+if use_tfds:
+expected_keys += ['idx']
+self.assertCountEqual(expected_keys, features.keys())
self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
...
...@@ -69,6 +69,9 @@ class BertEncoder(tf.keras.Model):
smaller than 'hidden_size').
embedding_layer: An optional Layer instance which will be called to
generate embeddings for the input word IDs.
norm_first: Whether to normalize inputs to attention and intermediate
dense layers. If set False, output of attention and intermediate dense
layers is normalized.
"""
def __init__(
...@@ -87,6 +90,7 @@ class BertEncoder(tf.keras.Model):
output_range=None,
embedding_width=None,
embedding_layer=None,
norm_first=False,
**kwargs):
activation = tf.keras.activations.get(inner_activation)
initializer = tf.keras.initializers.get(initializer)
...@@ -162,6 +166,7 @@ class BertEncoder(tf.keras.Model):
inner_activation=inner_activation,
output_dropout=output_dropout,
attention_dropout=attention_dropout,
norm_first=norm_first,
output_range=transformer_output_range,
kernel_initializer=initializer,
name='transformer/layer_%d' % i)
...@@ -211,6 +216,7 @@ class BertEncoder(tf.keras.Model):
'output_range': output_range,
'embedding_width': embedding_width,
'embedding_layer': embedding_layer,
'norm_first': norm_first,
}
# We are storing the config dict as a namedtuple here to ensure checkpoint
...
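For completeness, a sketch of exercising the new argument directly on the network class (the vocabulary size and layer count are illustrative):

# Hedged sketch: a small pre-LN BertEncoder instantiated directly.
from official.nlp.modeling import networks

encoder = networks.BertEncoder(
    vocab_size=30522,
    num_layers=2,
    norm_first=True)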
...@@ -205,7 +205,8 @@ class BertEncoderTest(keras_parameterized.TestCase):
initializer="glorot_uniform",
output_range=-1,
embedding_width=16,
-embedding_layer=None)
+embedding_layer=None,
+norm_first=False)
network = bert_encoder.BertEncoder(**kwargs)
expected_config = dict(kwargs)
expected_config["inner_activation"] = tf.keras.activations.serialize(
...
...@@ -48,12 +48,12 @@ class PositionEmbeddingLayerTest(keras_parameterized.TestCase):
test_layer = position_embedding.PositionEmbedding(
max_length=sequence_length, seq_axis=2)
width = 30
-input_tensor = tf.keras.Input(shape=(sequence_length, width, width))
+input_tensor = tf.keras.Input(shape=(width, sequence_length, width))
output_tensor = test_layer(input_tensor)
# When using static positional embedding shapes, the output is expected
# to be the same as the input shape in all dimensions save batch.
-expected_output_shape = [None, sequence_length, width, width]
+expected_output_shape = [None, width, sequence_length, width]
self.assertEqual(expected_output_shape, output_tensor.shape.as_list())
# The default output dtype for this layer should be tf.float32.
self.assertEqual(tf.float32, output_tensor.dtype)
...
...@@ -249,7 +249,7 @@ class TransformerEncoderBlock(tf.keras.layers.Layer):
attention.
Returns:
-An ouput tensor with the same dimensions as input/query tensor.
+An output tensor with the same dimensions as input/query tensor.
"""
if isinstance(inputs, (list, tuple)):
if len(inputs) == 2:
...