Commit 32e4ca51 authored by qianyj

Update code to v2.11.0

parents 9485aa1d 71060f67
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Adam optimizer with weight decay that exactly matches the original BERT."""
import re
from absl import logging
import tensorflow as tf
class AdamWeightDecay(tf.keras.optimizers.legacy.Adam):
"""Adam enables L2 weight decay and clip_by_global_norm on gradients.
[Warning!]: The Keras optimizer supports gradient clipping and has an AdamW
implementation. Please consider evaluating that option in the Keras package.
Just adding the square of the weights to the loss function is *not* the
correct way of using L2 regularization/weight decay with Adam, since that will
interact with the m and v parameters in strange ways.
Instead we want to decay the weights in a manner that doesn't interact with
the m/v parameters. This is equivalent to adding the square of the weights to
the loss with plain (non-momentum) SGD.
"""
def __init__(self,
learning_rate=0.001,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-7,
amsgrad=False,
weight_decay_rate=0.0,
include_in_weight_decay=None,
exclude_from_weight_decay=None,
gradient_clip_norm=1.0,
name='AdamWeightDecay',
**kwargs):
super(AdamWeightDecay, self).__init__(learning_rate, beta_1, beta_2,
epsilon, amsgrad, name, **kwargs)
self.weight_decay_rate = weight_decay_rate
self.gradient_clip_norm = gradient_clip_norm
self._include_in_weight_decay = include_in_weight_decay
self._exclude_from_weight_decay = exclude_from_weight_decay
logging.info('AdamWeightDecay gradient_clip_norm=%f', gradient_clip_norm)
def _prepare_local(self, var_device, var_dtype, apply_state):
super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype, # pytype: disable=attribute-error # typed-keras
apply_state)
apply_state[(var_device, var_dtype)]['weight_decay_rate'] = tf.constant(
self.weight_decay_rate, name='adam_weight_decay_rate')
def _decay_weights_op(self, var, learning_rate, apply_state):
do_decay = self._do_use_weight_decay(var.name)
if do_decay:
return var.assign_sub(
learning_rate * var *
apply_state[(var.device, var.dtype.base_dtype)]['weight_decay_rate'],
use_locking=self._use_locking)
return tf.no_op()
def apply_gradients(self,
grads_and_vars,
name=None,
experimental_aggregate_gradients=True):
grads, tvars = list(zip(*grads_and_vars))
if experimental_aggregate_gradients and self.gradient_clip_norm > 0.0:
# When experimental_aggregate_gradients = False, apply_gradients() no
# longer implicitly allreduces gradients; users manually allreduce the
# gradients and pass the allreduced grads_and_vars. For now,
# clip_by_global_norm is applied before the explicit allreduce to
# keep the math the same as the TF 1 and pre-TF 2.2 implementations.
(grads, _) = tf.clip_by_global_norm(
grads, clip_norm=self.gradient_clip_norm)
return super(AdamWeightDecay, self).apply_gradients(
zip(grads, tvars),
name=name,
experimental_aggregate_gradients=experimental_aggregate_gradients)
def _get_lr(self, var_device, var_dtype, apply_state):
"""Retrieves the learning rate with the given state."""
if apply_state is None:
return self._decayed_lr_t[var_dtype], {}
apply_state = apply_state or {}
coefficients = apply_state.get((var_device, var_dtype))
if coefficients is None:
coefficients = self._fallback_apply_state(var_device, var_dtype)
apply_state[(var_device, var_dtype)] = coefficients
return coefficients['lr_t'], dict(apply_state=apply_state)
def _resource_apply_dense(self, grad, var, apply_state=None):
lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
decay = self._decay_weights_op(var, lr_t, apply_state)
with tf.control_dependencies([decay]):
return super(AdamWeightDecay,
self)._resource_apply_dense(grad, var, **kwargs) # pytype: disable=attribute-error # typed-keras
def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
decay = self._decay_weights_op(var, lr_t, apply_state)
with tf.control_dependencies([decay]):
return super(AdamWeightDecay,
self)._resource_apply_sparse(grad, var, indices, **kwargs) # pytype: disable=attribute-error # typed-keras
def get_config(self):
config = super(AdamWeightDecay, self).get_config()
config.update({
'weight_decay_rate': self.weight_decay_rate,
})
return config
def _do_use_weight_decay(self, param_name):
"""Whether to use L2 weight decay for `param_name`."""
if self.weight_decay_rate == 0:
return False
if self._include_in_weight_decay:
for r in self._include_in_weight_decay:
if re.search(r, param_name) is not None:
return True
if self._exclude_from_weight_decay:
for r in self._exclude_from_weight_decay:
if re.search(r, param_name) is not None:
return False
return True
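For reference, a minimal usage sketch follows; the exclusion patterns are illustrative assumptions that mirror the common BERT setup, which keeps LayerNorm and bias variables out of weight decay.

```python
# Minimal sketch (illustrative): decoupled weight decay with BERT-style
# exclusions. The regex patterns below are assumptions, not part of this file.
optimizer = AdamWeightDecay(
    learning_rate=1e-4,
    weight_decay_rate=0.01,
    gradient_clip_norm=1.0,
    exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'])
```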
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -23,7 +23,7 @@ import tensorflow as tf
def _make_offset_wrapper(new_class_name: str, base_lr_class):
"""Generates a offset wrapper of learning rate schedule.
It returns a subclass of the `base_lr_class`; the subclass takes an
`offset` argument in the constructor. When the new class instance is called,
the behavior is:
new_class_object(step) = base_lr_class_object(step - offset)
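As a rough sketch of the described behavior, the generated subclass is equivalent to something like the following, using `tf.keras.optimizers.schedules.PolynomialDecay` as an assumed base class:

```python
import tensorflow as tf

# Sketch of the wrapper's behavior: evaluate the base schedule at
# (step - offset). The concrete generated subclass may differ in details.
class PolynomialDecayWithOffset(tf.keras.optimizers.schedules.PolynomialDecay):

  def __init__(self, offset=0, **kwargs):
    super().__init__(**kwargs)
    self.offset = offset

  def __call__(self, step):
    return super().__call__(step - self.offset)

lr = PolynomialDecayWithOffset(
    offset=1000, initial_learning_rate=1e-3, decay_steps=10000)
lr(1000)  # Equals the base schedule evaluated at step 0.
```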
......@@ -386,11 +386,11 @@ class PowerDecayWithOffset(tf.keras.optimizers.schedules.LearningRateSchedule):
}
class StepCosineDecayWithOffset(
tf.keras.optimizers.schedules.LearningRateSchedule):
"""Stepwise cosine learning rate decay with offset.
Learning rate is equivalent to one or more cosine decay(s) starting and
ending at each interval.
Example:
......@@ -399,7 +399,7 @@ class StepConsineDecayWithOffset(
boundaries: [100000, 110000]
values: [1.0, 0.5]
lr_decayed_fn = (
lr_schedule.StepCosineDecayWithOffset(
boundaries,
values))
```
......@@ -412,7 +412,7 @@ class StepConsineDecayWithOffset(
boundaries,
values,
offset: int = 0,
name: str = "StepCosineDecayWithOffset"):
"""Initialize configuration of the learning rate schedule.
Args:
......@@ -444,7 +444,7 @@ class StepConsineDecayWithOffset(
] + [0])
def __call__(self, global_step):
with tf.name_scope(self.name or "StepCosineDecayWithOffset"):
global_step = tf.cast(global_step - self.offset, tf.float32)
lr_levels = self.values
lr_steps = self.boundaries
......
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -23,22 +23,38 @@ from official.modeling.optimization import slide_optimizer
from official.modeling.optimization import adafactor_optimizer
from official.modeling.optimization import ema_optimizer
from official.modeling.optimization import lars_optimizer
from official.modeling.optimization import legacy_adamw
from official.modeling.optimization import lr_schedule
from official.modeling.optimization.configs import optimization_config as opt_cfg
# Optimizer CLS to be used in both legacy and new path.
SHARED_OPTIMIZERS = {
'sgd_experimental': tf.keras.optimizers.experimental.SGD,
'adam_experimental': tf.keras.optimizers.experimental.Adam,
'adamw': legacy_adamw.AdamWeightDecay,
'adamw_experimental': tf.keras.optimizers.experimental.AdamW,
'lamb': tfa_optimizers.LAMB,
'rmsprop': tf.keras.optimizers.RMSprop,
'lars': lars_optimizer.LARS,
'adagrad': tf.keras.optimizers.Adagrad,
'slide': slide_optimizer.SLIDE,
'adafactor': adafactor_optimizer.Adafactor,
}
LEGACY_OPTIMIZERS_CLS = {
'sgd': tf.keras.optimizers.legacy.SGD,
'adam': tf.keras.optimizers.legacy.Adam,
'rmsprop': tf.keras.optimizers.legacy.RMSprop,
'adagrad': tf.keras.optimizers.legacy.Adagrad,
}
LEGACY_OPTIMIZERS_CLS.update(SHARED_OPTIMIZERS)
NEW_OPTIMIZERS_CLS = {
'sgd': tf.keras.optimizers.experimental.SGD,
'adam': tf.keras.optimizers.experimental.Adam,
'rmsprop': tf.keras.optimizers.experimental.RMSprop,
'adagrad': tf.keras.optimizers.experimental.Adagrad,
}
NEW_OPTIMIZERS_CLS.update(SHARED_OPTIMIZERS)
LR_CLS = {
'stepwise': lr_schedule.PiecewiseConstantDecayWithOffset,
'polynomial': lr_schedule.PolynomialDecayWithOffset,
......@@ -47,7 +63,7 @@ LR_CLS = {
'power': lr_schedule.DirectPowerDecay,
'power_linear': lr_schedule.PowerAndLinearDecay,
'power_with_offset': lr_schedule.PowerDecayWithOffset,
'step_cosine_with_offset': lr_schedule.StepConsineDecayWithOffset,
'step_cosine_with_offset': lr_schedule.StepCosineDecayWithOffset,
}
WARMUP_CLS = {
......@@ -56,8 +72,13 @@ WARMUP_CLS = {
}
def register_optimizer_cls(key: str,
optimizer_config_cls: Union[
tf.keras.optimizers.Optimizer,
tf.keras.optimizers.legacy.Optimizer,
tf.keras.optimizers.experimental.Optimizer
],
use_legacy_optimizer: bool = True):
"""Register customize optimizer cls.
The user will still need to subclass data classes in
......@@ -66,10 +87,16 @@ def register_optimizer_cls(
Args:
key: A string to that the optimizer_config_cls is registered with.
optimizer_config_cls: A class which inherits tf.keras.optimizers.Optimizer.
use_legacy_optimizer: A boolean that indicates if using legacy optimizers.
"""
if use_legacy_optimizer:
if key in LEGACY_OPTIMIZERS_CLS:
raise ValueError('%s already registered in LEGACY_OPTIMIZERS_CLS.' % key)
LEGACY_OPTIMIZERS_CLS[key] = optimizer_config_cls
else:
if key in NEW_OPTIMIZERS_CLS:
raise ValueError('%s already registered in NEW_OPTIMIZERS_CLS.' % key)
NEW_OPTIMIZERS_CLS[key] = optimizer_config_cls
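A short sketch of registering a custom optimizer; the key and subclass below are hypothetical:

```python
# Sketch: register a hypothetical optimizer subclass under a new key.
class MySGD(tf.keras.optimizers.legacy.SGD):
  pass

register_optimizer_cls('my_sgd', MySGD, use_legacy_optimizer=True)
assert 'my_sgd' in LEGACY_OPTIMIZERS_CLS
```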
class OptimizerFactory:
......@@ -84,6 +111,8 @@ class OptimizerFactory:
(4) Build optimizer.
This is a typical example for using this class:
```
params = {
'optimizer': {
'type': 'sgd',
......@@ -103,6 +132,7 @@ class OptimizerFactory:
opt_factory = OptimizerFactory(opt_config)
lr = opt_factory.build_learning_rate()
optimizer = opt_factory.build_optimizer(lr)
```
"""
def __init__(self, config: opt_cfg.OptimizationConfig):
......@@ -155,11 +185,15 @@ class OptimizerFactory:
def build_optimizer(
self,
lr: Union[tf.keras.optimizers.schedules.LearningRateSchedule, float],
gradient_aggregator: Optional[Callable[
    [List[Tuple[tf.Tensor, tf.Tensor]]],
    List[Tuple[tf.Tensor, tf.Tensor]]]] = None,
gradient_transformers: Optional[List[Callable[
    [List[Tuple[tf.Tensor, tf.Tensor]]],
    List[Tuple[tf.Tensor, tf.Tensor]]]]] = None,
postprocessor: Optional[Callable[[tf.keras.optimizers.Optimizer],
                                 tf.keras.optimizers.Optimizer]] = None,
use_legacy_optimizer: bool = True):
"""Build optimizer.
Builds optimizer from config. It takes learning rate as input, and builds
......@@ -169,6 +203,7 @@ class OptimizerFactory:
Args:
lr: A floating point value, or a
tf.keras.optimizers.schedules.LearningRateSchedule instance.
gradient_aggregator: Optional function to overwrite gradient aggregation.
gradient_transformers: Optional list of functions to use to transform
gradients before applying updates to Variables. The functions are
applied after gradient_aggregator. The functions should accept and
......@@ -176,9 +211,11 @@ class OptimizerFactory:
global_clipnorm should not be set when gradient_transformers is passed.
postprocessor: An optional function for postprocessing the optimizer. It
takes an optimizer and returns an optimizer.
use_legacy_optimizer: A boolean that indicates if using legacy optimizers.
Returns:
`tf.keras.optimizers.legacy.Optimizer` or
`tf.keras.optimizers.experimental.Optimizer` instance.
"""
optimizer_dict = self._optimizer_config.as_dict()
......@@ -191,18 +228,39 @@ class OptimizerFactory:
del optimizer_dict['global_clipnorm']
optimizer_dict['learning_rate'] = lr
if gradient_aggregator is not None:
optimizer_dict['gradient_aggregator'] = gradient_aggregator
if gradient_transformers is not None:
optimizer_dict['gradient_transformers'] = gradient_transformers
if use_legacy_optimizer:
optimizer = LEGACY_OPTIMIZERS_CLS[self._optimizer_type](**optimizer_dict)
else:
if 'decay' in optimizer_dict:
raise ValueError(
'`decay` is deprecated in new Keras optimizer, please reflect the '
'decay logic in `lr` or set `use_legacy_optimizer=True` to use the '
'legacy optimizer.')
optimizer = NEW_OPTIMIZERS_CLS[self._optimizer_type](**optimizer_dict)
if self._use_ema:
if not use_legacy_optimizer:
raise ValueError(
'EMA can only work with the legacy optimizer, please set '
'`use_legacy_optimizer=True`.')
optimizer = ema_optimizer.ExponentialMovingAverage(
optimizer, **self._ema_config.as_dict())
if postprocessor:
optimizer = postprocessor(optimizer)
if isinstance(optimizer, tf.keras.optimizers.Optimizer):
return optimizer
# The following check makes sure the function won't break in older TF
# version because of missing the experimental/legacy package.
if hasattr(tf.keras.optimizers, 'experimental'):
if isinstance(optimizer, tf.keras.optimizers.experimental.Optimizer):
return optimizer
if hasattr(tf.keras.optimizers, 'legacy'):
if isinstance(optimizer, tf.keras.optimizers.legacy.Optimizer):
return optimizer
raise TypeError('OptimizerFactory.build_optimizer returning a '
'non-optimizer object: {}'.format(optimizer))
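An end-to-end sketch of the factory with the new flag; the config values are illustrative, and `adamw_experimental` assumes the corresponding field exists in `OptimizationConfig`:

```python
# Sketch (illustrative): build a new-style Keras optimizer via the factory.
params = {
    'optimizer': {'type': 'adamw_experimental'},
    'learning_rate': {'type': 'constant', 'constant': {'learning_rate': 1e-4}},
}
opt_config = opt_cfg.OptimizationConfig(params)
opt_factory = OptimizerFactory(opt_config)
lr = opt_factory.build_learning_rate()
optimizer = opt_factory.build_optimizer(lr, use_legacy_optimizer=False)
```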
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -37,7 +37,7 @@ class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
}
}
}
optimizer_cls = optimizer_factory.LEGACY_OPTIMIZERS_CLS[optimizer_type]
expected_optimizer_config = optimizer_cls().get_config()
expected_optimizer_config['learning_rate'] = 0.1
......@@ -49,6 +49,72 @@ class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
self.assertIsInstance(optimizer, optimizer_cls)
self.assertEqual(expected_optimizer_config, optimizer.get_config())
@parameterized.parameters(('sgd'), ('rmsprop'), ('adam'), ('adamw'), ('lamb'),
('lars'), ('adagrad'))
def test_new_optimizers(self, optimizer_type):
params = {
'optimizer': {
'type': optimizer_type
},
'learning_rate': {
'type': 'constant',
'constant': {
'learning_rate': 0.1
}
}
}
optimizer_cls = optimizer_factory.NEW_OPTIMIZERS_CLS[optimizer_type]
expected_optimizer_config = optimizer_cls().get_config()
expected_optimizer_config['learning_rate'] = 0.1
opt_config = optimization_config.OptimizationConfig(params)
if optimizer_type == 'sgd':
# Delete unsupported arg `decay` from SGDConfig.
delattr(opt_config.optimizer.sgd, 'decay')
opt_factory = optimizer_factory.OptimizerFactory(opt_config)
lr = opt_factory.build_learning_rate()
optimizer = opt_factory.build_optimizer(
lr, postprocessor=lambda x: x, use_legacy_optimizer=False)
self.assertIsInstance(optimizer, optimizer_cls)
self.assertEqual(expected_optimizer_config, optimizer.get_config())
def test_gradient_aggregator(self):
params = {
'optimizer': {
'type': 'adam',
},
'learning_rate': {
'type': 'constant',
'constant': {
'learning_rate': 1.0
}
}
}
opt_config = optimization_config.OptimizationConfig(params)
opt_factory = optimizer_factory.OptimizerFactory(opt_config)
lr = opt_factory.build_learning_rate()
# Dummy function to zero out gradients.
zero_grads = lambda gv: [(tf.zeros_like(g), v) for g, v in gv]
optimizer = opt_factory.build_optimizer(lr, gradient_aggregator=zero_grads)
if isinstance(optimizer, tf.keras.optimizers.experimental.Optimizer):
self.skipTest('New Keras optimizer does not support '
'`gradient_aggregator` arg.')
var0 = tf.Variable([1.0, 2.0])
var1 = tf.Variable([3.0, 4.0])
grads0 = tf.constant([1.0, 1.0])
grads1 = tf.constant([1.0, 1.0])
grads_and_vars = list(zip([grads0, grads1], [var0, var1]))
optimizer.apply_gradients(grads_and_vars)
self.assertAllClose(np.array([1.0, 2.0]), var0.numpy())
self.assertAllClose(np.array([3.0, 4.0]), var1.numpy())
@parameterized.parameters((None, None), (1.0, None), (None, 1.0))
def test_gradient_clipping(self, clipnorm, clipvalue):
params = {
......@@ -107,6 +173,25 @@ class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
optimizer_factory.OptimizerFactory(
optimization_config.OptimizationConfig(params))
def test_wrong_return_type(self):
optimizer_type = 'sgd'
params = {
'optimizer': {
'type': optimizer_type
},
'learning_rate': {
'type': 'constant',
'constant': {
'learning_rate': 0.1
}
}
}
opt_config = optimization_config.OptimizationConfig(params)
opt_factory = optimizer_factory.OptimizerFactory(opt_config)
with self.assertRaises(TypeError):
_ = opt_factory.build_optimizer(0.1, postprocessor=lambda x: None)
# TODO(b/187559334) refactor lr_schedule tests into `lr_schedule_test.py`.
......@@ -418,7 +503,7 @@ class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
}
}
}
expected_lr_step_values = [[0, 0.0], [5000, 1e-4 / 2.0], [10000, 1e-4],
[20000, 9.994863e-05], [499999, 5e-05]]
opt_config = optimization_config.OptimizationConfig(params)
opt_factory = optimizer_factory.OptimizerFactory(opt_config)
......@@ -434,10 +519,12 @@ class OptimizerFactoryRegistryTest(tf.test.TestCase):
class MyClass():
pass
optimizer_factory.register_optimizer_cls('test', MyClass)
self.assertIn('test', optimizer_factory.LEGACY_OPTIMIZERS_CLS)
with self.assertRaisesRegex(ValueError, 'test already registered.*'):
optimizer_factory.register_optimizer_cls('test', MyClass)
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Configs for differential privacy."""
import dataclasses
from official.modeling.hyperparams import base_config
@dataclasses.dataclass
class DifferentialPrivacyConfig(base_config.Config):
# Applied to the gradients
# Setting to a large number so nothing is clipped.
clipping_norm: float = 100000000.0  # 10^8
noise_multiplier: float = 0.0
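A small sketch of constructing the config with non-default values; in DP-SGD the noise stddev is commonly derived as `clipping_norm * noise_multiplier`, though that wiring is an assumption about the surrounding trainer, not something this file defines:

```python
# Sketch (illustrative values): a config that actually clips and adds noise.
dp_config = DifferentialPrivacyConfig(clipping_norm=1.0, noise_multiplier=1.1)
# Assumed wiring: the stddev passed to the noise transform.
noise_stddev = dp_config.clipping_norm * dp_config.noise_multiplier
```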
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for configs."""
import tensorflow as tf
from official.modeling.privacy import configs
class ConfigsTest(tf.test.TestCase):
def test_clipping_norm_default(self):
clipping_norm = configs.DifferentialPrivacyConfig().clipping_norm
self.assertEqual(100000000.0, clipping_norm)
def test_noise_multiplier_default(self):
noise_multiplier = configs.DifferentialPrivacyConfig().noise_multiplier
self.assertEqual(0.0, noise_multiplier)
def test_config(self):
dp_config = configs.DifferentialPrivacyConfig(
clipping_norm=1.0,
noise_multiplier=1.0,
)
self.assertEqual(1.0, dp_config.clipping_norm)
self.assertEqual(1.0, dp_config.noise_multiplier)
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Ops for differential privacy (gradient) transforms."""
from typing import List, Tuple
import tensorflow as tf
def clip_l2_norm(grads_vars: List[Tuple[tf.Tensor, tf.Tensor]],
l2_norm_clip: float) -> List[Tuple[tf.Tensor, tf.Tensor]]:
"""Clip gradients by global norm."""
gradients = []
variables = []
for (g, v) in grads_vars:
gradients.append(g)
variables.append(v)
clipped_gradients = tf.clip_by_global_norm(gradients, l2_norm_clip)[0]
return list(zip(clipped_gradients, variables))
def add_noise(grads_vars: List[Tuple[tf.Tensor, tf.Tensor]],
noise_stddev: float) -> List[Tuple[tf.Tensor, tf.Tensor]]:
"""Add noise to gradients."""
ret = []
for (g, v) in grads_vars:
noise = tf.random.normal(tf.shape(g), stddev=noise_stddev)
ret.append((g + noise, v))
return ret
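A brief sketch chaining the two transforms in the usual DP-SGD order, clip then noise; the values are illustrative:

```python
import tensorflow as tf

# Sketch: clip-then-noise over a single gradient/variable pair.
v = tf.Variable([3.0, 4.0])
grads_vars = [(tf.constant([3.0, 4.0]), v)]
clipped = clip_l2_norm(grads_vars, l2_norm_clip=1.0)  # global norm 5.0 -> 1.0
noised = add_noise(clipped, noise_stddev=1.1)  # e.g. clipping_norm * multiplier
```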
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for ops."""
from unittest import mock
import tensorflow as tf
from official.modeling.privacy import ops
class OpsTest(tf.test.TestCase):
def test_clip_l2_norm(self):
x = tf.constant([4.0, 3.0])
y = tf.constant([[12.0]])
tensors = [(x, x), (y, y)]
clipped = ops.clip_l2_norm(tensors, 1.0)
for a, b in zip(clipped, tensors):
self.assertAllClose(a[0], b[0] / 13.0)  # sqrt(4^2 + 3^2 + 12^2) = 13
self.assertAllClose(a[1], b[1])
@mock.patch.object(tf.random,
'normal',
autospec=True)
def test_add_noise(self, mock_random):
x = tf.constant([0.0, 0.0])
y = tf.constant([[0.0]])
tensors = [(x, x), (y, y)]
mock_random.side_effect = [tf.constant([1.0, 1.0]), tf.constant([[1.0]])]
added = ops.add_noise(tensors, 10.0)
for a, b in zip(added, tensors):
self.assertAllClose(a[0], b[0] + 1.0)
self.assertAllClose(a[1], b[1])
_, kwargs = mock_random.call_args
self.assertEqual(kwargs['stddev'], 10.0)
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -14,6 +14,7 @@
"""Common TF utilities."""
import functools
import six
import tensorflow as tf
......@@ -82,19 +83,22 @@ def is_special_none_tensor(tensor):
return tensor.shape.ndims == 0 and tensor.dtype == tf.int32
def get_activation(identifier, use_keras_layer=False, **kwargs):
"""Maps an identifier to a Python function, e.g., "relu" => `tf.nn.relu`.
It checks the string first, and if it is one of the customized activations not
in TF, the corresponding activation will be returned. For non-customized
activation names and callable identifiers, it always falls back to
tf.keras.activations.get.
Prefers using keras layers when use_keras_layer=True. Now it only supports
'relu', 'linear', 'identity', 'swish', 'mish', 'leaky_relu', and 'gelu'.
Args:
identifier: String name of the activation function or callable.
use_keras_layer: If True, use keras layer if identifier is allow-listed.
**kwargs: Keyword arguments to use to instantiate an activation function.
Available only for 'leaky_relu' and 'gelu' when using keras layers.
For example: get_activation('leaky_relu', use_keras_layer=True, alpha=0.1)
Returns:
A Python function corresponding to the activation function or a keras
......@@ -110,8 +114,11 @@ def get_activation(identifier, use_keras_layer=False):
"swish": "swish",
"sigmoid": "sigmoid",
"relu6": tf.nn.relu6,
"leaky_relu": functools.partial(tf.nn.leaky_relu, **kwargs),
"hard_swish": activations.hard_swish,
"hard_sigmoid": activations.hard_sigmoid,
"mish": activations.mish,
"gelu": functools.partial(tf.nn.gelu, **kwargs),
}
if identifier in keras_layer_allowlist:
return tf.keras.layers.Activation(keras_layer_allowlist[identifier])
......@@ -122,6 +129,7 @@ def get_activation(identifier, use_keras_layer=False):
"relu6": activations.relu6,
"hard_sigmoid": activations.hard_sigmoid,
"identity": activations.identity,
"mish": activations.mish,
}
if identifier in name_to_fn:
return tf.keras.activations.get(name_to_fn[identifier])
......@@ -201,3 +209,85 @@ def safe_mean(losses):
total = tf.reduce_sum(losses)
num_elements = tf.cast(tf.size(losses), dtype=losses.dtype)
return tf.math.divide_no_nan(total, num_elements)
def get_replica_id():
"""Gets replica id depending on the environment."""
context = tf.distribute.get_replica_context()
if context is not None:
return context.replica_id_in_sync_group
else:
raise RuntimeError("Unknown replica context. The `get_replica_id` method "
"relies on TF 2.x tf.distribute API.")
def cross_replica_concat(value, axis, name="cross_replica_concat"):
"""Concatenates the given `value` across (GPU/TPU) cores, along `axis`.
In general, each core ("replica") will pass a
replica-specific value as `value` (corresponding to some element of a
data-parallel computation taking place across replicas).
The resulting concatenated `Tensor` will have the same shape as `value` for
all dimensions except `axis`, where it will be larger by a factor of the
number of replicas. It will also have the same `dtype` as `value`.
The position of a given replica's `value` within the resulting concatenation
is determined by that replica's replica ID. For
example:
With `value` for replica 0 given as
0 0 0
0 0 0
and `value` for replica 1 given as
1 1 1
1 1 1
the resulting concatenation along axis 0 will be
0 0 0
0 0 0
1 1 1
1 1 1
and this result will be identical across all replicas.
Note that this API only works in TF2 with `tf.distribute`.
Args:
value: The `Tensor` to concatenate across replicas. Each replica will have a
different value for this `Tensor`, and these replica-specific values will
be concatenated.
axis: The axis along which to perform the concatenation as a Python integer
(not a `Tensor`). E.g., `axis=0` to concatenate along the batch dimension.
name: A name for the operation (used to create a name scope).
Returns:
The result of concatenating `value` along `axis` across replicas.
Raises:
RuntimeError: when the batch (0-th) dimension is None.
"""
with tf.name_scope(name):
context = tf.distribute.get_replica_context()
# Typically this is hit only if the tensor is derived from a dataset
# with finite epochs and drop_remainder=False, where the last batch may
# have a different batch size, making dim 0 dynamic.
if value.shape.as_list()[0] is None:
raise RuntimeError(f"{value} has unknown batch.")
return context.all_gather(value, axis=axis)
def clone_initializer(initializer):
# Keras initializers are going to be stateless, which means reusing the
# same initializer will produce the same init value when the shapes are
# the same.
if isinstance(initializer, tf.keras.initializers.Initializer):
return initializer.__class__.from_config(initializer.get_config())
# When the input is a string/dict or other serialized config, the caller
# will create a new Keras initializer instance based on it, so we don't
# need to do anything.
return initializer
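A quick sketch of the intended use: give each layer its own clone so that, once Keras initializers become stateless, same-shaped weights do not end up with identical values (layer sizes here are illustrative):

```python
# Sketch: clone per layer instead of sharing one initializer instance.
init = tf.keras.initializers.GlorotUniform()
dense_a = tf.keras.layers.Dense(8, kernel_initializer=clone_initializer(init))
dense_b = tf.keras.layers.Dense(8, kernel_initializer=clone_initializer(init))
```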
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for tf_utils."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.modeling import tf_utils
def all_strategy_combinations():
return combinations.combine(
strategy=[
strategy_combinations.cloud_tpu_strategy,
strategy_combinations.mirrored_strategy_with_two_gpus,
],
mode='eager',
)
class TFUtilsTest(tf.test.TestCase, parameterized.TestCase):
@combinations.generate(all_strategy_combinations())
def test_cross_replica_concat(self, strategy):
num_cores = strategy.num_replicas_in_sync
shape = (2, 3, 4)
def concat(axis):
@tf.function
def function():
replica_value = tf.fill(shape, tf_utils.get_replica_id())
return tf_utils.cross_replica_concat(replica_value, axis=axis)
return function
def expected(axis):
values = [np.full(shape, i) for i in range(num_cores)]
return np.concatenate(values, axis=axis)
per_replica_results = strategy.run(concat(axis=0))
replica_0_result = per_replica_results.values[0].numpy()
for value in per_replica_results.values[1:]:
self.assertAllClose(value.numpy(), replica_0_result)
self.assertAllClose(replica_0_result, expected(axis=0))
replica_0_result = strategy.run(concat(axis=1)).values[0].numpy()
self.assertAllClose(replica_0_result, expected(axis=1))
replica_0_result = strategy.run(concat(axis=2)).values[0].numpy()
self.assertAllClose(replica_0_result, expected(axis=2))
@combinations.generate(all_strategy_combinations())
def test_cross_replica_concat_gradient(self, strategy):
num_cores = strategy.num_replicas_in_sync
shape = (10, 5)
@tf.function
def function():
replica_value = tf.random.normal(shape)
with tf.GradientTape() as tape:
tape.watch(replica_value)
concat_value = tf_utils.cross_replica_concat(replica_value, axis=0)
output = tf.reduce_sum(concat_value)
return tape.gradient(output, replica_value)
per_replica_gradients = strategy.run(function)
for gradient in per_replica_gradients.values:
self.assertAllClose(gradient, num_cores * tf.ones(shape))
@parameterized.parameters(('relu', True), ('relu', False),
('leaky_relu', False), ('leaky_relu', True),
('mish', True), ('mish', False), ('gelu', True))
def test_get_activations(self, name, use_keras_layer):
fn = tf_utils.get_activation(name, use_keras_layer)
self.assertIsNotNone(fn)
@combinations.generate(all_strategy_combinations())
def test_get_leaky_relu_layer(self, strategy):
@tf.function
def forward(x):
fn = tf_utils.get_activation(
'leaky_relu', use_keras_layer=True, alpha=0.1)
return strategy.run(fn, args=(x,)).values[0]
got = forward(tf.constant([-1]))
self.assertAllClose(got, tf.constant([-0.1]))
if __name__ == '__main__':
tf.test.main()
six
google-api-python-client>=1.6.7
kaggle>=1.3.9
numpy>=1.20
oauth2client
pandas>=0.22.0
psutil>=5.4.3
py-cpuinfo>=3.3.0
scipy>=0.19.1
tensorflow-hub>=0.6.0
tensorflow-model-optimization>=0.4.1
tensorflow-datasets
tfa-nightly
gin-config
tf_slim>=1.1.0
Cython
matplotlib
# Loader becomes a required positional argument of yaml.load in PyYAML 6.0
pyyaml>=5.1,<6.0
# CV related dependencies
opencv-python-headless==4.5.2.52
Pillow
pycocotools
# NLP related dependencies
seqeval
sentencepiece
sacrebleu
# Projects/vit dependencies
immutabledict
......@@ -2,53 +2,69 @@
## Introduction
The TF-NLP library provides a collection of scripts for training and
evaluating transformer-based models on various tasks such as sentence
classification, question answering, and translation. Additionally, we provide
checkpoints of pretrained models which can be finetuned on downstream tasks.
### How to Train Models
Model Garden can be easily installed with
`pip install tf-models-nightly`. After installation, check out
[this instruction](https://github.com/tensorflow/models/blob/master/official/nlp/docs/train.md)
on how to train models with this codebase.
By default, the experiment runs on GPUs. To run on TPUs, one should overwrite
`runtime.distribution_strategy` and set the TPU address. See [RuntimeConfig](https://github.com/tensorflow/models/blob/master/official/core/config_definitions.py) for details.
In general, the experiments can be run with the following command by setting
the corresponding `${EXPERIMENT}`, `${TASK_CONFIG}`, and `${MODEL_CONFIG}`.
```
EXPERIMENT=???
TASK_CONFIG=???
MODEL_CONFIG=???
EXTRA_PARAMS=???
MODEL_DIR=??? # a-folder-to-hold-checkpoints-and-logs
python3 train.py \
--experiment=${EXPERIMENT} \
--mode=train_and_eval \
--model_dir=${MODEL_DIR} \
--config_file=${TASK_CONFIG} \
--config_file=${MODEL_CONFIG} \
--params_override=${EXTRA_PARAMS}
```
* `EXPERIMENT` can be found under `configs/`
* `TASK_CONFIG` can be found under `configs/experiments/`
* `MODEL_CONFIG` can be found under `configs/models/`
#### Order of params override:
1. `train.py` looks up the registered `ExperimentConfig` with `${EXPERIMENT}`
2. Overrides params in `TaskConfig` in `${TASK_CONFIG}`
3. Overrides params `model` in `TaskConfig` with `${MODEL_CONFIG}`
4. Overrides any params in `ExperimentConfig` with `${EXTRA_PARAMS}`
Note that
1. `${TASK_CONFIG}`, `${MODEL_CONFIG}`, and `${EXTRA_PARAMS}` can be omitted when the `${EXPERIMENT}` defaults are enough.
2. `${TASK_CONFIG}`, `${MODEL_CONFIG}`, and `${EXTRA_PARAMS}` are only guaranteed to be compatible with the `${EXPERIMENT}` that defines them.
## Experiments
| NAME | EXPERIMENT | TASK_CONFIG | MODEL_CONFIG | EXTRA_PARAMS |
| ----------------- | ------------------------ | ------- | -------- | ----------- |
| BERT-base GLUE/MNLI-matched finetune | [bert/sentence_prediction](https://github.com/tensorflow/models/blob/master/official/nlp/configs/finetuning_experiments.py) | [glue_mnli_matched.yaml](https://github.com/tensorflow/models/blob/master/official/nlp/configs/experiments/glue_mnli_matched.yaml) | [bert_en_uncased_base.yaml](https://github.com/tensorflow/models/blob/master/official/nlp/configs/models/bert_en_uncased_base.yaml) | <details> <summary>data and bert-base hub init</summary>task.train_data.input_path=/path-to-your-training-data,task.validation_data.input_path=/path-to-your-val-data,task.hub_module_url=https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4 </details> |
| BERT-base GLUE/MNLI-matched finetune | [bert/sentence_prediction](https://github.com/tensorflow/models/blob/master/official/nlp/configs/finetuning_experiments.py) | [glue_mnli_matched.yaml](https://github.com/tensorflow/models/blob/master/official/nlp/configs/experiments/glue_mnli_matched.yaml) | [bert_en_uncased_base.yaml](https://github.com/tensorflow/models/blob/master/official/nlp/configs/models/bert_en_uncased_base.yaml) | <details> <summary>data and bert-base ckpt init</summary>task.train_data.input_path=/path-to-your-training-data,task.validation_data.input_path=/path-to-your-val-data,task.init_checkpoint=gs://tf_model_garden/nlp/bert/uncased_L-12_H-768_A-12/bert_model.ckpt </details> |
| BERT-base SQuAD v1.1 finetune | [bert/squad](https://github.com/tensorflow/models/blob/master/official/nlp/configs/finetuning_experiments.py) | [squad_v1.yaml](https://github.com/tensorflow/models/blob/master/official/nlp/configs/experiments/squad_v1.yaml) | [bert_en_uncased_base.yaml](https://github.com/tensorflow/models/blob/master/official/nlp/configs/models/bert_en_uncased_base.yaml) | <details> <summary>data and bert-base hub init</summary>task.train_data.input_path=/path-to-your-training-data,task.validation_data.input_path=/path-to-your-val-data,task.hub_module_url=https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4 </details> |
|ALBERT-base SQuAD v1.1 finetune | [bert/squad](https://github.com/tensorflow/models/blob/master/official/nlp/configs/finetuning_experiments.py) | [squad_v1.yaml](https://github.com/tensorflow/models/blob/master/official/nlp/configs/experiments/squad_v1.yaml) | [albert_base.yaml](https://github.com/tensorflow/models/blob/master/official/nlp/configs/models/albert_base.yaml)| <details> <summary>data and albert-base hub init</summary>task.train_data.input_path=/path-to-your-training-data,task.validation_data.input_path=/path-to-your-val-data,task.hub_module_url=https://tfhub.dev/tensorflow/albert_en_base/3 </details>|
| Transformer-large WMT14/en-de scratch |[wmt_transformer/large](https://github.com/tensorflow/models/blob/master/official/nlp/configs/wmt_transformer_experiments.py)| | | <details> <summary>ende-32k sentencepiece</summary>task.sentencepiece_model_path='gs://tf_model_garden/nlp/transformer_wmt/ende_bpe_32k.model'</details> |
## Useful links
[How to Train Models](https://github.com/tensorflow/models/blob/master/official/nlp/docs/train.md)
[List of Pretrained Models for finetuning](https://github.com/tensorflow/models/blob/master/official/nlp/docs/pretrained_models.md)
[How to Publish Models](https://github.com/tensorflow/models/blob/master/official/nlp/docs/tfhub.md)
......
# TF-NLP Model Garden
⚠️ Disclaimer: All datasets hyperlinked from this page are not owned or
distributed by Google. The datasets are made available by third parties. Please
review the terms and conditions made available by the third parties before using
the data.
This codebase provides a Natural Language Processing modeling toolkit written in
[TF2](https://www.tensorflow.org/guide/effective_tf2). It allows researchers and
developers to reproduce state-of-the-art model results and train custom models
to experiment with new research ideas.
## Features
* Reusable and modularized modeling building blocks
* State-of-the-art reproducible
* Easy to customize and extend
* End-to-end training
* Distributed trainable on both GPUs and TPUs
## Major components
### Libraries
We provide a modeling library to allow users to train custom models for new
research ideas. Detailed instructions can be found in READMEs in each folder.
* [modeling/](modeling): modeling library that provides building blocks
(e.g., Layers, Networks, and Models) that can be assembled into
transformer-based architectures.
* [data/](data): binaries and utils for input preprocessing, tokenization,
etc.
......@@ -30,27 +35,29 @@ research ideas. Detailed intructions can be found in READMEs in each folder.
We provide SoTA model implementations, pre-trained models, training and
evaluation examples, and command lines. Detailed instructions can be found in the
READMEs for specific papers. Below are some papers implemented in the
repository; more NLP projects can be found in the
[`projects`](https://github.com/tensorflow/models/tree/master/official/projects)
folder:
1. [BERT](MODEL_GARDEN.md#available-model-configs): [BERT: Pre-training of Deep
Bidirectional Transformers for Language
Understanding](https://arxiv.org/abs/1810.04805) by Devlin et al., 2018
2. [ALBERT](MODEL_GARDEN.md#available-model-configs):
[A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942)
by Lan et al., 2019
3. [XLNet](MODEL_GARDEN.md):
[XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237)
by Yang et al., 2019
4. [Transformer for translation](MODEL_GARDEN.md#available-model-configs):
[Attention Is All You Need](https://arxiv.org/abs/1706.03762) by Vaswani et
al., 2017
### Common Training Driver
We provide a single common driver [train.py](train.py) to train the above SoTA
models on popular tasks. Please see [docs/train.md](docs/train.md) for more
details.
### Pre-trained models with checkpoints and TF-Hub
......
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# BERT (Bidirectional Encoder Representations from Transformers)
**WARNING**: We are in the process of deprecating most of the code in this directory.
Please see
[this link](https://github.com/tensorflow/models/blob/master/official/nlp/docs/train.md)
for the new tutorial and use the new code in `nlp/modeling`. This README is
still correct for this legacy implementation.
The academic paper which describes BERT in detail and provides full results on a
number of tasks can be found here: https://arxiv.org/abs/1810.04805.
This repository contains TensorFlow 2.x implementation for BERT.
## Contents
* [Contents](#contents)
* [Pre-trained Models](#pre-trained-models)
* [Restoring from Checkpoints](#restoring-from-checkpoints)
* [Set Up](#set-up)
* [Process Datasets](#process-datasets)
* [Fine-tuning with BERT](#fine-tuning-with-bert)
* [Cloud GPUs and TPUs](#cloud-gpus-and-tpus)
* [Sentence and Sentence-pair Classification Tasks](#sentence-and-sentence-pair-classification-tasks)
* [SQuAD 1.1](#squad-1.1)
## Pre-trained Models
We released both checkpoints and tf.hub modules as the pretrained models for
fine-tuning. They are TF 2.x compatible and are converted from the checkpoints
released in the TF 1.x official BERT repository
[google-research/bert](https://github.com/google-research/bert)
in order to stay consistent with the BERT paper.
### Access to Pretrained Checkpoints
Pretrained checkpoints can be found in the following links:
**Note: We have switched the BERT implementation
to use Keras functional-style networks in [nlp/modeling](../modeling).
The new checkpoints are:**
* **[`BERT-Large, Uncased (Whole Word Masking)`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/wwm_uncased_L-24_H-1024_A-16.tar.gz)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Large, Cased (Whole Word Masking)`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/wwm_cased_L-24_H-1024_A-16.tar.gz)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Base, Uncased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/uncased_L-12_H-768_A-12.tar.gz)**:
12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Large, Uncased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16.tar.gz)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Base, Cased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/cased_L-12_H-768_A-12.tar.gz)**:
12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Large, Cased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/cased_L-24_H-1024_A-16.tar.gz)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Base, Multilingual Cased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/multi_cased_L-12_H-768_A-12.tar.gz)**:
104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
We recommend hosting checkpoints in Google Cloud Storage buckets when you use
Cloud GPUs/TPUs.
### Restoring from Checkpoints
`tf.train.Checkpoint` is used to manage model checkpoints in TF 2. To restore
weights from provided pre-trained checkpoints, you can use the following code:
```python
init_checkpoint='the pretrained model checkpoint path.'
model=tf.keras.Model() # Bert pre-trained model as feature extractor.
checkpoint = tf.train.Checkpoint(model=model)
checkpoint.restore(init_checkpoint)
```
Checkpoints featuring native serialized Keras models
(i.e. model.load()/load_weights()) will be available soon.
### Access to Pretrained Hub Modules
Pretrained tf.hub modules in TF 2.x SavedModel format can be found in the
following links:
* **[`BERT-Large, Uncased (Whole Word Masking)`](https://tfhub.dev/tensorflow/bert_en_wwm_uncased_L-24_H-1024_A-16/)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Large, Cased (Whole Word Masking)`](https://tfhub.dev/tensorflow/bert_en_wwm_cased_L-24_H-1024_A-16/)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Base, Uncased`](https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/)**:
12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Large, Uncased`](https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Base, Cased`](https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/)**:
12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Large, Cased`](https://tfhub.dev/tensorflow/bert_en_cased_L-24_H-1024_A-16/)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Base, Multilingual Cased`](https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/)**:
104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Base, Chinese`](https://tfhub.dev/tensorflow/bert_zh_L-12_H-768_A-12/)**:
Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads,
110M parameters
## Set Up
```shell
export PYTHONPATH="$PYTHONPATH:/path/to/models"
```
Install `tf-nightly` to get latest updates:
```shell
pip install tf-nightly-gpu
```
When using a TPU, GPU support is not necessary. First, you need to create a `tf-nightly`
TPU with [ctpu tool](https://github.com/tensorflow/tpu/tree/master/tools/ctpu):
```shell
ctpu up -name <instance name> --tf-version="nightly"
```
Second, you need to install TF 2 `tf-nightly` on your VM:
```shell
pip install tf-nightly
```
## Process Datasets
### Pre-training
Generating pre-training data is unchanged. Please use the script
[`../data/create_pretraining_data.py`](../data/create_pretraining_data.py),
which is essentially branched from the
[BERT research repo](https://github.com/google-research/bert)
and adapted to TF2 symbols and Python 3 compatibility.
Running the pre-training script requires an input and output directory, as well
as a vocab file. Note that `max_seq_length` will need to match the sequence
length parameter you specify when you run pre-training.

Example shell script to call `create_pretraining_data.py`:
```
export WORKING_DIR='local disk or cloud location'
export BERT_DIR='local disk or cloud location'
python models/official/nlp/data/create_pretraining_data.py \
--input_file=$WORKING_DIR/input/input.txt \
--output_file=$WORKING_DIR/output/tf_examples.tfrecord \
--vocab_file=$BERT_DIR/wwm_uncased_L-24_H-1024_A-16/vocab.txt \
--do_lower_case=True \
--max_seq_length=512 \
--max_predictions_per_seq=76 \
--masked_lm_prob=0.15 \
--random_seed=12345 \
--dupe_factor=5
```
### Fine-tuning
To prepare the fine-tuning data for final model training, use the
[`../data/create_finetuning_data.py`](../data/create_finetuning_data.py) script.
Resulting datasets in `tf_record` format and training metadata should later be
passed to training or evaluation scripts. The task-specific arguments are
described in the following sections:
* GLUE
Users can download the
[GLUE data](https://gluebenchmark.com/tasks) by running
[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
and unpack it to some directory `$GLUE_DIR`.
Also, users can download a [pretrained checkpoint](#access-to-pretrained-checkpoints) and place it in some directory `$BERT_DIR` instead of using checkpoints on Google Cloud Storage.
```shell
export GLUE_DIR=~/glue
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export TASK_NAME=MNLI
export OUTPUT_DIR=gs://some_bucket/datasets
python ../data/create_finetuning_data.py \
--input_data_dir=${GLUE_DIR}/${TASK_NAME}/ \
--vocab_file=${BERT_DIR}/vocab.txt \
--train_data_output_path=${OUTPUT_DIR}/${TASK_NAME}_train.tf_record \
--eval_data_output_path=${OUTPUT_DIR}/${TASK_NAME}_eval.tf_record \
--meta_data_file_path=${OUTPUT_DIR}/${TASK_NAME}_meta_data \
--fine_tuning_task_type=classification --max_seq_length=128 \
--classification_task_name=${TASK_NAME}
```
* SQUAD
The [SQuAD website](https://rajpurkar.github.io/SQuAD-explorer/) contains
detailed information about the SQuAD datasets and evaluation.
The necessary files can be found here:
* [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
* [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
* [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
* [train-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json)
* [dev-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json)
* [evaluate-v2.0.py](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)
```shell
export SQUAD_DIR=~/squad
export SQUAD_VERSION=v1.1
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export OUTPUT_DIR=gs://some_bucket/datasets
python ../data/create_finetuning_data.py \
--squad_data_file=${SQUAD_DIR}/train-${SQUAD_VERSION}.json \
--vocab_file=${BERT_DIR}/vocab.txt \
--train_data_output_path=${OUTPUT_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
--meta_data_file_path=${OUTPUT_DIR}/squad_${SQUAD_VERSION}_meta_data \
--fine_tuning_task_type=squad --max_seq_length=384
```
Note: To create fine-tuning data with SQuAD 2.0, you need to add the flag `--version_2_with_negative=True`.
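For example, a v2.0 variant of the command above (the same flags as the v1.1 example, plus the negative-examples switch) might look like:
```shell
export SQUAD_VERSION=v2.0
python ../data/create_finetuning_data.py \
  --squad_data_file=${SQUAD_DIR}/train-${SQUAD_VERSION}.json \
  --vocab_file=${BERT_DIR}/vocab.txt \
  --train_data_output_path=${OUTPUT_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
  --meta_data_file_path=${OUTPUT_DIR}/squad_${SQUAD_VERSION}_meta_data \
  --fine_tuning_task_type=squad --max_seq_length=384 \
  --version_2_with_negative=True
```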
## Fine-tuning with BERT
### Cloud GPUs and TPUs
* Cloud Storage
The unzipped pre-trained model files can also be found in the Google Cloud
Storage folder `gs://cloud-tpu-checkpoints/bert/keras_bert`. For example:
```shell
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export MODEL_DIR=gs://some_bucket/my_output_dir
```
Currently, users are able to access `tf-nightly` TPUs, and the following TPU
script should run with `tf-nightly`.
* GPU -> TPU
Just add the following flags to `run_classifier.py` or `run_squad.py`:
```shell
--distribution_strategy=tpu
--tpu=grpc://${TPU_IP_ADDRESS}:8470
```
### Sentence and Sentence-pair Classification Tasks
This example code fine-tunes `BERT-Large` on the Microsoft Research Paraphrase
Corpus (MRPC), which contains only 3,600 examples and can fine-tune in a few
minutes on most GPUs.
We use `BERT-Large` (uncased_L-24_H-1024_A-16) as an example throughout the
workflow.
If your GPU has 16GB of memory or less, you may try `BERT-Base`
(uncased_L-12_H-768_A-12).
```shell
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export MODEL_DIR=gs://some_bucket/my_output_dir
export GLUE_DIR=gs://some_bucket/datasets
export TASK=MRPC
python run_classifier.py \
--mode='train_and_eval' \
--input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
--train_data_path=${GLUE_DIR}/${TASK}_train.tf_record \
--eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
--bert_config_file=${BERT_DIR}/bert_config.json \
--init_checkpoint=${BERT_DIR}/bert_model.ckpt \
--train_batch_size=4 \
--eval_batch_size=4 \
--steps_per_loop=1 \
--learning_rate=2e-5 \
--num_train_epochs=3 \
--model_dir=${MODEL_DIR} \
--distribution_strategy=mirrored
```
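To monitor fine-tuning progress, you can point TensorBoard at the model directory (summaries are written under `MODEL_DIR` by the training loop; the exact subdirectory layout may vary across versions):
```shell
# Inspect training curves written to the model directory.
tensorboard --logdir=${MODEL_DIR}
```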
Alternatively, instead of specifying `init_checkpoint`, you can specify
`hub_module_url` to employ a pretrained BERT hub module, e.g.,
`--hub_module_url=https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1`.
After training a model, to get predictions from the classifier, you can set
`--mode=predict` and supply the test set tfrecords via `--eval_data_path`.
The output will be written to a file called `test_results.tsv` in the output
folder; each line contains the output for one sample, with the columns being
the class probabilities (see the parsing sketch after the command below).
```shell
python run_classifier.py \
--mode='predict' \
--input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
--eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
--bert_config_file=${BERT_DIR}/bert_config.json \
--eval_batch_size=4 \
--model_dir=${MODEL_DIR} \
--distribution_strategy=mirrored
```
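To turn the per-class probabilities in `test_results.tsv` into predicted labels, a small sketch (this assumes one whitespace-separated probability column per class, and that the file lands in `MODEL_DIR` as the output folder):
```shell
# Sketch: print each sample's index and the argmax column as its class id.
awk '{best=1; for (i=2; i<=NF; i++) if ($i > $best) best=i; print NR-1, best-1}' \
  ${MODEL_DIR}/test_results.tsv
```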
To use a TPU, you only need to switch the distribution strategy type to `tpu`,
provide the TPU information, and use remote storage for model checkpoints.
```shell
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export TPU_IP_ADDRESS='???'
export MODEL_DIR=gs://some_bucket/my_output_dir
export GLUE_DIR=gs://some_bucket/datasets
export TASK=MRPC
python run_classifier.py \
--mode='train_and_eval' \
--input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
--train_data_path=${GLUE_DIR}/${TASK}_train.tf_record \
--eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
--bert_config_file=${BERT_DIR}/bert_config.json \
--init_checkpoint=${BERT_DIR}/bert_model.ckpt \
--train_batch_size=32 \
--eval_batch_size=32 \
--steps_per_loop=1000 \
--learning_rate=2e-5 \
--num_train_epochs=3 \
--model_dir=${MODEL_DIR} \
--distribution_strategy=tpu \
--tpu=grpc://${TPU_IP_ADDRESS}:8470
```
Note that we specify `steps_per_loop=1000` for TPU because running a loop of
training steps inside a `tf.function` can significantly increase TPU
utilization; callbacks will not be called inside the loop.
### SQuAD 1.1
The Stanford Question Answering Dataset (SQuAD) is a popular question answering
benchmark dataset. See the [SQuAD website](https://rajpurkar.github.io/SQuAD-explorer/) for more details.
We use `BERT-Large` (uncased_L-24_H-1024_A-16) as an example throughout the
workflow.
If your GPU has 16GB of memory or less, you may try `BERT-Base`
(uncased_L-12_H-768_A-12).
```shell
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export SQUAD_DIR=gs://some_bucket/datasets
export MODEL_DIR=gs://some_bucket/my_output_dir
export SQUAD_VERSION=v1.1
python run_squad.py \
--input_meta_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_meta_data \
--train_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
--predict_file=${SQUAD_DIR}/dev-v1.1.json \
--vocab_file=${BERT_DIR}/vocab.txt \
--bert_config_file=${BERT_DIR}/bert_config.json \
--init_checkpoint=${BERT_DIR}/bert_model.ckpt \
--train_batch_size=4 \
--predict_batch_size=4 \
--learning_rate=8e-5 \
--num_train_epochs=2 \
--model_dir=${MODEL_DIR} \
--distribution_strategy=mirrored
```
Similarly, you can replace the `init_checkpoint` flag with `hub_module_url` to
specify a hub module path.
`run_squad.py` writes the predictions for `--predict_file` by default. If you
set `--mode=predict` and supply the SQuAD test data, the script will generate
the prediction JSON file, as sketched below.
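A minimal sketch of such a predict-only invocation, reusing the variables above (the exact flag set may vary by version):
```shell
# Sketch: generate predictions only, from a previously trained model_dir.
python run_squad.py \
  --mode=predict \
  --input_meta_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_meta_data \
  --predict_file=${SQUAD_DIR}/dev-v1.1.json \
  --vocab_file=${BERT_DIR}/vocab.txt \
  --bert_config_file=${BERT_DIR}/bert_config.json \
  --model_dir=${MODEL_DIR} \
  --distribution_strategy=mirrored
```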
To use a TPU, you only need to switch the distribution strategy type to `tpu`
and provide the TPU information.
```shell
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export TPU_IP_ADDRESS='???'
export MODEL_DIR=gs://some_bucket/my_output_dir
export SQUAD_DIR=gs://some_bucket/datasets
export SQUAD_VERSION=v1.1
python run_squad.py \
--input_meta_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_meta_data \
--train_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
--predict_file=${SQUAD_DIR}/dev-v1.1.json \
--vocab_file=${BERT_DIR}/vocab.txt \
--bert_config_file=${BERT_DIR}/bert_config.json \
--init_checkpoint=${BERT_DIR}/bert_model.ckpt \
--train_batch_size=32 \
--learning_rate=8e-5 \
--num_train_epochs=2 \
--model_dir=${MODEL_DIR} \
--distribution_strategy=tpu \
--tpu=grpc://${TPU_IP_ADDRESS}:8470
```
The dev set predictions will be saved into a file called `predictions.json` in
the `model_dir`:
```shell
python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ./squad/predictions.json
```
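If `MODEL_DIR` is on Cloud Storage, you may first need to copy the predictions locally before running the evaluation script above (a sketch using `gsutil`; the local paths are illustrative):
```shell
# Sketch: fetch predictions.json from the remote model directory.
mkdir -p ./squad
gsutil cp ${MODEL_DIR}/predictions.json ./squad/predictions.json
```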