Commit 78c43ef1 authored by Gunho Park

Merge branch 'master' of https://github.com/tensorflow/models

parents 67cfc95b e3c7e300
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Adafactor optimizer.
A new optimizer that will be open sourced soon.
"""
# pylint: disable=invalid-name
# Placeholder for the Adafactor class until the implementation is open sourced.
Adafactor = "Unimplemented"
......@@ -56,10 +56,12 @@ class StepwiseLrConfig(base_config.Config):
values[0]
[boundaries[0], boundaries[1]] -> values[1]
...
[boundaries[n-1], boundaries[n]] -> values[n]
[boundaries[n], end] -> values[n+1]
Defaults to None.
offset: An int. The offset applied to steps. Defaults to 0.
"""
name: str = 'PiecewiseConstantDecay'
boundaries: Optional[List[int]] = None
values: Optional[List[float]] = None
offset: int = 0
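For illustration, a minimal sketch of the new field on this config (assuming keyword construction of the dataclass config, and that the optimizer factory forwards `offset` to the *WithOffset schedules registered later in this change):

stepwise_lr = StepwiseLrConfig(
    boundaries=[1000, 2000],
    values=[0.1, 0.01, 0.001],
    offset=100)  # the schedule is evaluated at (step - 100) at runtime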
@dataclasses.dataclass
......@@ -76,12 +78,14 @@ class ExponentialLrConfig(base_config.Config):
decay_rate: A float. Defaults to None.
staircase: A boolean. If true, the learning rate is decreased at discrete
intervals. Defaults to False.
offset: An int. The offset applied to steps. Defaults to 0.
"""
name: str = 'ExponentialDecay'
initial_learning_rate: Optional[float] = None
decay_steps: Optional[int] = None
decay_rate: Optional[float] = None
staircase: Optional[bool] = None
offset: int = 0
@dataclasses.dataclass
......@@ -99,6 +103,7 @@ class PolynomialLrConfig(base_config.Config):
power: A float. The power of the polynomial. Defaults to linear, 1.0.
cycle: A boolean, whether or not it should cycle beyond decay_steps.
Defaults to False.
offset: An int. The offset applied to steps. Defaults to 0.
"""
name: str = 'PolynomialDecay'
initial_learning_rate: Optional[float] = None
......@@ -106,6 +111,7 @@ class PolynomialLrConfig(base_config.Config):
end_learning_rate: float = 0.0001
power: float = 1.0
cycle: bool = False
offset: int = 0
@dataclasses.dataclass
......@@ -122,11 +128,13 @@ class CosineLrConfig(base_config.Config):
to None.
alpha: A float. Minimum learning rate value as a fraction of
initial_learning_rate.
offset: An int. The offset applied to steps. Defaults to 0.
"""
name: str = 'CosineDecay'
initial_learning_rate: Optional[float] = None
decay_steps: Optional[int] = None
alpha: float = 0.0
offset: int = 0
@dataclasses.dataclass
......
......@@ -52,6 +52,7 @@ class OptimizerConfig(oneof.OneOfConfig):
lars: opt_cfg.LARSConfig = opt_cfg.LARSConfig()
adagrad: opt_cfg.AdagradConfig = opt_cfg.AdagradConfig()
slide: opt_cfg.SLIDEConfig = opt_cfg.SLIDEConfig()
adafactor: opt_cfg.AdafactorConfig = opt_cfg.AdafactorConfig()
@dataclasses.dataclass
......
......@@ -247,3 +247,22 @@ class SLIDEConfig(BaseOptimizerConfig):
do_gradient_rescaling: bool = True
norm_type: str = "layer"
ratio_clip_norm: float = 1e5
@dataclasses.dataclass
class AdafactorConfig(BaseOptimizerConfig):
"""Configuration for Adafactor optimizer.
The attributes of this class match the arguments of the Adafactor
implementation.
"""
name: str = "Adafactor"
factored: bool = True
multiply_by_parameter_scale: bool = True
beta1: Optional[float] = None
decay_rate: float = 0.8
step_offset: int = 0
clipping_threshold: float = 1.0
min_dim_size_to_factor: int = 128
epsilon1: float = 1e-30
epsilon2: float = 1e-3
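A hedged sketch of selecting the new optimizer through the `OptimizerConfig` oneof shown earlier (assuming the usual `type` selector of `oneof.OneOfConfig`; exact wiring may differ):

optimizer_config = OptimizerConfig(
    type='adafactor',
    adafactor=AdafactorConfig(
        decay_rate=0.8,
        clipping_threshold=1.0))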
......@@ -19,6 +19,75 @@ from typing import Mapping, Any, Union, Optional
import tensorflow as tf
def _make_offset_wrapper(new_class_name: str, base_lr_class):
"""Generates a offset wrapper of learning rate schedule.
It will returns a subclass of the the `base_lr_class`, the subclass takes an
`offset` argument in the constructor. When the new class instance is called,
the behavior is:
new_class_object(step) = base_lr_class_object(step - offset)
Example:
CosineDecayWithOffset = _make_offset_wrapper(
'CosineDecayWithOffset', tf.keras.experimental.CosineDecay)
# Use the lr:
lr = CosineDecayWithOffset(offset=100, initial_learning_rate=0.1,
decay_steps=1000)
lr(101)  # equivalent to tf.keras.experimental.CosineDecay(...)(101 - 100)
Args:
new_class_name: the name of the new class.
base_lr_class: the base learning rate schedule class. Should be a subclass
of tf.keras.optimizers.schedules.LearningRateSchedule.
Returns:
A new class (subclass of the base_lr_class) that can take an offset.
"""
assert issubclass(base_lr_class,
tf.keras.optimizers.schedules.LearningRateSchedule), (
"base_lr_class should be subclass of keras "
f"LearningRateSchedule, got {base_lr_class}")
# pylint: disable=protected-access,pointless-statement
def offset_learning_rate_init(self, offset=0, **kwargs):
"""Construct learning rate schedule object.
When this object is called, its behavior is
self.__call__(step) == base_lr_class.__call__(step - offset)
Args:
self: this object.
offset: The offset when computing the learning rate schedule.
**kwargs: Pass through to base learning rate class constructor.
"""
base_lr_class.__init__(self, **kwargs)
self._offset = offset
def offset_learning_rate_call(self, step):
step = tf.cast(step - self._offset, tf.float32)
return base_lr_class.__call__(self, step)
# pylint: enable=protected-access,pointless-statement
return type(
new_class_name, (base_lr_class,), {
"base_lr_class": base_lr_class,
"__init__": offset_learning_rate_init,
"__call__": offset_learning_rate_call
})
PiecewiseConstantDecayWithOffset = _make_offset_wrapper(
"PiecewiseConstantDecayWithOffset",
tf.keras.optimizers.schedules.PiecewiseConstantDecay)
PolynomialDecayWithOffset = _make_offset_wrapper(
"PolynomialDecayWithOffset", tf.keras.optimizers.schedules.PolynomialDecay)
ExponentialDecayWithOffset = _make_offset_wrapper(
"ExponentialDecayWithOffset",
tf.keras.optimizers.schedules.ExponentialDecay)
CosineDecayWithOffset = _make_offset_wrapper("CosineDecayWithOffset",
tf.keras.experimental.CosineDecay)
class LinearWarmup(tf.keras.optimizers.schedules.LearningRateSchedule):
"""Linear warmup schedule."""
......
......@@ -70,5 +70,40 @@ class PowerAndLinearDecayTest(tf.test.TestCase, parameterized.TestCase):
self.assertAlmostEqual(lr(step).numpy(), value)
class OffsetLearningRateTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(
dict(class_name=lr_schedule.PiecewiseConstantDecayWithOffset),
dict(class_name=lr_schedule.PolynomialDecayWithOffset),
dict(class_name=lr_schedule.ExponentialDecayWithOffset),
dict(class_name=lr_schedule.CosineDecayWithOffset),
)
def test_generated_docstring(self, class_name):
self.assertNotEmpty(class_name.__init__.__doc__)
@parameterized.parameters(
dict(
class_name=lr_schedule.PiecewiseConstantDecayWithOffset,
kwarg=dict(boundaries=[50, 80], values=[1.0, 0.5, 0.1])),
dict(
class_name=lr_schedule.PolynomialDecayWithOffset,
kwarg=dict(initial_learning_rate=1.0, decay_steps=100)),
dict(
class_name=lr_schedule.ExponentialDecayWithOffset,
kwarg=dict(
initial_learning_rate=1.0, decay_steps=100, decay_rate=0.5)),
dict(
class_name=lr_schedule.CosineDecayWithOffset,
kwarg=dict(initial_learning_rate=1.0, decay_steps=100)),
)
def test_offset(self, class_name, kwarg):
offset = 10
offset_lr = class_name(offset=offset, **kwarg)
base_lr = class_name.base_lr_class(**kwarg)
self.assertIsInstance(offset_lr, class_name)
for step in range(10, 101, 10):
self.assertEqual(offset_lr(step), base_lr(step - offset))
if __name__ == '__main__':
tf.test.main()
......@@ -20,6 +20,7 @@ import tensorflow as tf
import tensorflow_addons.optimizers as tfa_optimizers
from official.modeling.optimization import slide_optimizer
from official.modeling.optimization import adafactor_optimizer
from official.modeling.optimization import ema_optimizer
from official.modeling.optimization import lars_optimizer
from official.modeling.optimization import lr_schedule
......@@ -34,14 +35,15 @@ OPTIMIZERS_CLS = {
'rmsprop': tf.keras.optimizers.RMSprop,
'lars': lars_optimizer.LARS,
'adagrad': tf.keras.optimizers.Adagrad,
'slide': slide_optimizer.SLIDE
'slide': slide_optimizer.SLIDE,
'adafactor': adafactor_optimizer.Adafactor,
}
LR_CLS = {
'stepwise': tf.keras.optimizers.schedules.PiecewiseConstantDecay,
'polynomial': tf.keras.optimizers.schedules.PolynomialDecay,
'exponential': tf.keras.optimizers.schedules.ExponentialDecay,
'cosine': tf.keras.experimental.CosineDecay,
'stepwise': lr_schedule.PiecewiseConstantDecayWithOffset,
'polynomial': lr_schedule.PolynomialDecayWithOffset,
'exponential': lr_schedule.ExponentialDecayWithOffset,
'cosine': lr_schedule.CosineDecayWithOffset,
'power': lr_schedule.DirectPowerDecay,
'power_linear': lr_schedule.PowerAndLinearDecay,
'power_with_offset': lr_schedule.PowerDecayWithOffset,
......
......@@ -14,29 +14,16 @@
"""Functions and classes related to training performance."""
from absl import logging
import tensorflow as tf
def configure_optimizer(optimizer,
use_float16=False,
use_graph_rewrite=False,
loss_scale='dynamic',
use_experimental_api=False):
loss_scale=None):
"""Configures optimizer object with performance options."""
if use_experimental_api:
logging.warning('Passing use_experimental_api=True is deprecated. The '
'argument will be removed in the future.')
if use_float16:
# TODO(b/171936854): Move all methods to non-experimental api.
if use_experimental_api:
# Wraps optimizer with a LossScaleOptimizer. This is done automatically
# in compile() with the "mixed_float16" policy, but since we do not call
# compile(), we must wrap the optimizer manually.
optimizer = (
tf.keras.mixed_precision.experimental.LossScaleOptimizer(
optimizer, loss_scale=loss_scale))
elif loss_scale == 'dynamic':
if loss_scale in (None, 'dynamic'):
optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)
else:
# loss_scale is a number. We interpret that as a fixed loss scale.
......@@ -52,34 +39,17 @@ def configure_optimizer(optimizer,
return optimizer
def set_mixed_precision_policy(dtype, loss_scale=None,
use_experimental_api=False):
"""Sets mix precision policy."""
if use_experimental_api:
logging.warning('Passing use_experimental_api=True is deprecated. The '
'argument will be removed in the future.')
assert use_experimental_api or loss_scale is None, (
'loss_scale cannot be specified if use_experimental_api is False. If the '
'non-experimental API is used, specify the loss scaling configuration '
'when creating the LossScaleOptimizer instead.'
)
def set_mixed_precision_policy(dtype, loss_scale=None):
"""Sets the global `tf.keras.mixed_precision.Policy`."""
# TODO(b/191894773): Remove loss_scale argument
assert loss_scale is None, (
'The loss_scale argument must be None. The argument exists for '
'historical reasons and will be removed soon.')
if dtype == tf.float16:
# TODO(b/171936854): Move all methods to non-experimental api.
if use_experimental_api:
policy = tf.keras.mixed_precision.experimental.Policy(
'mixed_float16', loss_scale=loss_scale)
tf.keras.mixed_precision.experimental.set_policy(policy)
else:
tf.keras.mixed_precision.set_global_policy('mixed_float16')
tf.keras.mixed_precision.set_global_policy('mixed_float16')
elif dtype == tf.bfloat16:
if use_experimental_api:
tf.keras.mixed_precision.experimental.set_policy('mixed_bfloat16')
else:
tf.keras.mixed_precision.set_global_policy('mixed_bfloat16')
tf.keras.mixed_precision.set_global_policy('mixed_bfloat16')
elif dtype == tf.float32:
if use_experimental_api:
tf.keras.mixed_precision.experimental.set_policy('float32')
else:
tf.keras.mixed_precision.set_global_policy('float32')
tf.keras.mixed_precision.set_global_policy('float32')
else:
raise ValueError('Unexpected dtype: %s' % dtype)
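A minimal usage sketch of the simplified, non-experimental API after this change (assuming these helpers remain in official.modeling.performance):

import tensorflow as tf
from official.modeling import performance

performance.set_mixed_precision_policy(tf.float16)  # sets the global 'mixed_float16' policy
optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
# With loss_scale left as None (the new default), a dynamic LossScaleOptimizer is used.
optimizer = performance.configure_optimizer(optimizer, use_float16=True)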
......@@ -108,6 +108,7 @@ def get_activation(identifier, use_keras_layer=False):
"linear": "linear",
"identity": "linear",
"swish": "swish",
"sigmoid": "sigmoid",
"relu6": tf.nn.relu6,
}
if identifier in keras_layer_allowlist:
......
......@@ -46,6 +46,8 @@ class BertEncoderConfig(hyperparams.Config):
embedding_size: Optional[int] = None
output_range: Optional[int] = None
return_all_encoder_outputs: bool = False
# Pre/Post-LN Transformer
norm_first: bool = False
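A brief, hedged sketch of enabling the flag on the config (keyword construction as for other hyperparams.Config dataclasses):

encoder_cfg = BertEncoderConfig(norm_first=True)  # pre-LN transformer blocks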
@dataclasses.dataclass
......@@ -132,6 +134,8 @@ class BigBirdEncoderConfig(hyperparams.Config):
intermediate_size: int = 3072
dropout_rate: float = 0.1
attention_dropout_rate: float = 0.1
# Pre/Post-LN Transformer
norm_first: bool = False
max_position_embeddings: int = 4096
num_rand_blocks: int = 3
block_size: int = 64
......@@ -152,6 +156,8 @@ class KernelEncoderConfig(hyperparams.Config):
intermediate_size: int = 3072
dropout_rate: float = 0.1
attention_dropout_rate: float = 0.1
# Pre/Post-LN Transformer
norm_first: bool = False
max_position_embeddings: int = 512
type_vocab_size: int = 2
initializer_range: float = 0.02
......@@ -161,6 +167,7 @@ class KernelEncoderConfig(hyperparams.Config):
redraw: bool = False
is_short_seq: bool = False
begin_kernel: int = 0
scale: Optional[float] = None
@dataclasses.dataclass
......@@ -339,6 +346,7 @@ def build_encoder(config: EncoderConfig,
encoder_cfg.hidden_activation),
dropout_rate=encoder_cfg.dropout_rate,
attention_dropout_rate=encoder_cfg.attention_dropout_rate,
norm_first=encoder_cfg.norm_first,
kernel_initializer=tf.keras.initializers.TruncatedNormal(
stddev=encoder_cfg.initializer_range),
attention_cls=layers.BigBirdAttention,
......@@ -377,6 +385,7 @@ def build_encoder(config: EncoderConfig,
redraw=encoder_cfg.redraw,
is_short_seq=encoder_cfg.is_short_seq,
begin_kernel=encoder_cfg.begin_kernel,
scale=encoder_cfg.scale,
)
hidden_cfg = dict(
num_attention_heads=encoder_cfg.num_attention_heads,
......@@ -385,6 +394,7 @@ def build_encoder(config: EncoderConfig,
encoder_cfg.hidden_activation),
dropout_rate=encoder_cfg.dropout_rate,
attention_dropout_rate=encoder_cfg.attention_dropout_rate,
norm_first=encoder_cfg.norm_first,
kernel_initializer=tf.keras.initializers.TruncatedNormal(
stddev=encoder_cfg.initializer_range),
attention_cls=layers.KernelAttention,
......@@ -445,4 +455,5 @@ def build_encoder(config: EncoderConfig,
embedding_width=encoder_cfg.embedding_size,
embedding_layer=embedding_layer,
return_all_encoder_outputs=encoder_cfg.return_all_encoder_outputs,
dict_outputs=True)
dict_outputs=True,
norm_first=encoder_cfg.norm_first)
......@@ -28,7 +28,6 @@ from official.core import train_lib
from official.core import train_utils
from official.modeling import performance
from official.modeling.multitask import configs
from official.modeling.multitask import multitask
from official.modeling.multitask import train_lib as multitask_train_lib
......@@ -167,7 +166,10 @@ def run_continuous_finetune(
with distribution_strategy.scope():
if isinstance(params, configs.MultiEvalExperimentConfig):
task = task_factory.get_task(params_replaced.task)
eval_tasks = multitask.MultiTask.from_config(params_replaced.eval_tasks)
eval_tasks = [
task_factory.get_task(config.task_config, name=config.task_name)
for config in params.eval_tasks
]
(_,
eval_metrics) = multitask_train_lib.run_experiment_with_multitask_eval(
distribution_strategy=distribution_strategy,
......
......@@ -129,24 +129,52 @@ class DataProcessor(object):
lines.append(json.loads(json_str))
return lines
def featurize_example(self, *args, **kwargs):
"""Converts a single `InputExample` into a single `InputFeatures`."""
return convert_single_example(*args, **kwargs)
class DefaultGLUEDataProcessor(DataProcessor):
"""Processor for the SuperGLUE dataset."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples_tfds("train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples_tfds("validation")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples_tfds("test")
def _create_examples_tfds(self, set_type):
"""Creates examples for the training/dev/test sets."""
raise NotImplementedError()
class AxProcessor(DataProcessor):
"""Processor for the AX dataset (GLUE diagnostics dataset)."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
train_mnli_dataset = tfds.load(
"glue/mnli", split="train", try_gcs=True).as_numpy_iterator()
return self._create_examples_tfds(train_mnli_dataset, "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
val_mnli_dataset = tfds.load(
"glue/mnli", split="validation_matched",
try_gcs=True).as_numpy_iterator()
return self._create_examples_tfds(val_mnli_dataset, "validation")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
test_ax_dataset = tfds.load(
"glue/ax", split="test", try_gcs=True).as_numpy_iterator()
return self._create_examples_tfds(test_ax_dataset, "test")
def get_labels(self):
"""See base class."""
......@@ -157,46 +185,26 @@ class AxProcessor(DataProcessor):
"""See base class."""
return "AX"
def _create_examples(self, lines, set_type):
def _create_examples_tfds(self, dataset, set_type):
"""Creates examples for the training/dev/test sets."""
text_a_index = 1 if set_type == "test" else 8
text_b_index = 2 if set_type == "test" else 9
examples = []
for i, line in enumerate(lines):
# Skip header.
if i == 0:
continue
guid = "%s-%s" % (set_type, self.process_text_fn(line[0]))
text_a = self.process_text_fn(line[text_a_index])
text_b = self.process_text_fn(line[text_b_index])
if set_type == "test":
label = "contradiction"
else:
label = self.process_text_fn(line[-1])
for i, example in enumerate(dataset):
guid = "%s-%s" % (set_type, i)
label = "contradiction"
text_a = self.process_text_fn(example["hypothesis"])
text_b = self.process_text_fn(example["premise"])
if set_type != "test":
label = self.get_labels()[example["label"]]
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label,
weight=None))
return examples
class ColaProcessor(DataProcessor):
class ColaProcessor(DefaultGLUEDataProcessor):
"""Processor for the CoLA data set (GLUE version)."""
def __init__(self, process_text_fn=tokenization.convert_to_unicode):
super(ColaProcessor, self).__init__(process_text_fn)
self.dataset = tfds.load("glue/cola", try_gcs=True)
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples_tfds("train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples_tfds("validation")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples_tfds("test")
def get_labels(self):
"""See base class."""
return ["0", "1"]
......@@ -208,7 +216,8 @@ class ColaProcessor(DataProcessor):
def _create_examples_tfds(self, set_type):
"""Creates examples for the training/dev/test sets."""
dataset = self.dataset[set_type].as_numpy_iterator()
dataset = tfds.load(
"glue/cola", split=set_type, try_gcs=True).as_numpy_iterator()
examples = []
for i, example in enumerate(dataset):
guid = "%s-%s" % (set_type, i)
......@@ -267,34 +276,28 @@ class MnliProcessor(DataProcessor):
mnli_type="matched",
process_text_fn=tokenization.convert_to_unicode):
super(MnliProcessor, self).__init__(process_text_fn)
self.dataset = tfds.load("glue/mnli", try_gcs=True)
if mnli_type not in ("matched", "mismatched"):
raise ValueError("Invalid `mnli_type`: %s" % mnli_type)
self.mnli_type = mnli_type
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
return self._create_examples_tfds("train")
def get_dev_examples(self, data_dir):
"""See base class."""
if self.mnli_type == "matched":
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")),
"dev_matched")
return self._create_examples_tfds("validation_matched")
else:
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")),
"dev_mismatched")
return self._create_examples_tfds("validation_mismatched")
def get_test_examples(self, data_dir):
"""See base class."""
if self.mnli_type == "matched":
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test")
return self._create_examples_tfds("test_matched")
else:
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "test_mismatched.tsv")), "test")
return self._create_examples_tfds("test_mismatched")
def get_labels(self):
"""See base class."""
......@@ -305,42 +308,28 @@ class MnliProcessor(DataProcessor):
"""See base class."""
return "MNLI"
def _create_examples(self, lines, set_type):
def _create_examples_tfds(self, set_type):
"""Creates examples for the training/dev/test sets."""
dataset = tfds.load(
"glue/mnli", split=set_type, try_gcs=True).as_numpy_iterator()
examples = []
for i, line in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, self.process_text_fn(line[0]))
text_a = self.process_text_fn(line[8])
text_b = self.process_text_fn(line[9])
if set_type == "test":
label = "contradiction"
else:
label = self.process_text_fn(line[-1])
for i, example in enumerate(dataset):
guid = "%s-%s" % (set_type, i)
label = "contradiction"
text_a = self.process_text_fn(example["hypothesis"])
text_b = self.process_text_fn(example["premise"])
if set_type != "test":
label = self.get_labels()[example["label"]]
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label,
weight=None))
return examples
class MrpcProcessor(DataProcessor):
class MrpcProcessor(DefaultGLUEDataProcessor):
"""Processor for the MRPC data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
def get_labels(self):
"""See base class."""
return ["0", "1"]
......@@ -350,21 +339,22 @@ class MrpcProcessor(DataProcessor):
"""See base class."""
return "MRPC"
def _create_examples(self, lines, set_type):
def _create_examples_tfds(self, set_type):
"""Creates examples for the training/dev/test sets."""
dataset = tfds.load(
"glue/mrpc", split=set_type, try_gcs=True).as_numpy_iterator()
examples = []
for i, line in enumerate(lines):
if i == 0:
continue
for i, example in enumerate(dataset):
guid = "%s-%s" % (set_type, i)
text_a = self.process_text_fn(line[3])
text_b = self.process_text_fn(line[4])
if set_type == "test":
label = "0"
else:
label = self.process_text_fn(line[0])
label = "0"
text_a = self.process_text_fn(example["sentence1"])
text_b = self.process_text_fn(example["sentence2"])
if set_type != "test":
label = str(example["label"])
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label,
weight=None))
return examples
......@@ -447,24 +437,9 @@ class PawsxProcessor(DataProcessor):
return "XTREME-PAWS-X"
class QnliProcessor(DataProcessor):
class QnliProcessor(DefaultGLUEDataProcessor):
"""Processor for the QNLI data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev_matched")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
def get_labels(self):
"""See base class."""
return ["entailment", "not_entailment"]
......@@ -474,44 +449,28 @@ class QnliProcessor(DataProcessor):
"""See base class."""
return "QNLI"
def _create_examples(self, lines, set_type):
def _create_examples_tfds(self, set_type):
"""Creates examples for the training/dev/test sets."""
dataset = tfds.load(
"glue/qnli", split=set_type, try_gcs=True).as_numpy_iterator()
examples = []
for i, line in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, 1)
if set_type == "test":
text_a = tokenization.convert_to_unicode(line[1])
text_b = tokenization.convert_to_unicode(line[2])
label = "entailment"
else:
text_a = tokenization.convert_to_unicode(line[1])
text_b = tokenization.convert_to_unicode(line[2])
label = tokenization.convert_to_unicode(line[-1])
for i, example in enumerate(dataset):
guid = "%s-%s" % (set_type, i)
label = "entailment"
text_a = self.process_text_fn(example["question"])
text_b = self.process_text_fn(example["sentence"])
if set_type != "test":
label = self.get_labels()[example["label"]]
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label,
weight=None))
return examples
class QqpProcessor(DataProcessor):
class QqpProcessor(DefaultGLUEDataProcessor):
"""Processor for the QQP data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
def get_labels(self):
"""See base class."""
return ["0", "1"]
......@@ -521,48 +480,28 @@ class QqpProcessor(DataProcessor):
"""See base class."""
return "QQP"
def _create_examples(self, lines, set_type):
def _create_examples_tfds(self, set_type):
"""Creates examples for the training/dev/test sets."""
dataset = tfds.load(
"glue/qqp", split=set_type, try_gcs=True).as_numpy_iterator()
examples = []
for i, line in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, line[0])
if set_type == "test":
text_a = line[1]
text_b = line[2]
label = "0"
else:
# There appear to be some garbage lines in the train dataset.
try:
text_a = line[3]
text_b = line[4]
label = line[5]
except IndexError:
continue
for i, example in enumerate(dataset):
guid = "%s-%s" % (set_type, i)
label = "0"
text_a = self.process_text_fn(example["question1"])
text_b = self.process_text_fn(example["question2"])
if set_type != "test":
label = str(example["label"])
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label,
weight=None))
return examples
class RteProcessor(DataProcessor):
class RteProcessor(DefaultGLUEDataProcessor):
"""Processor for the RTE data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
def get_labels(self):
"""See base class."""
# All datasets are converted to 2-class split, where for 3-class datasets we
......@@ -574,42 +513,28 @@ class RteProcessor(DataProcessor):
"""See base class."""
return "RTE"
def _create_examples(self, lines, set_type):
def _create_examples_tfds(self, set_type):
"""Creates examples for the training/dev/test sets."""
dataset = tfds.load(
"glue/rte", split=set_type, try_gcs=True).as_numpy_iterator()
examples = []
for i, line in enumerate(lines):
if i == 0:
continue
for i, example in enumerate(dataset):
guid = "%s-%s" % (set_type, i)
text_a = tokenization.convert_to_unicode(line[1])
text_b = tokenization.convert_to_unicode(line[2])
if set_type == "test":
label = "entailment"
else:
label = tokenization.convert_to_unicode(line[3])
label = "entailment"
text_a = self.process_text_fn(example["sentence1"])
text_b = self.process_text_fn(example["sentence2"])
if set_type != "test":
label = self.get_labels()[example["label"]]
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label,
weight=None))
return examples
class SstProcessor(DataProcessor):
class SstProcessor(DefaultGLUEDataProcessor):
"""Processor for the SST-2 data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
def get_labels(self):
"""See base class."""
return ["0", "1"]
......@@ -619,25 +544,24 @@ class SstProcessor(DataProcessor):
"""See base class."""
return "SST-2"
def _create_examples(self, lines, set_type):
def _create_examples_tfds(self, set_type):
"""Creates examples for the training/dev/test sets."""
dataset = tfds.load(
"glue/sst2", split=set_type, try_gcs=True).as_numpy_iterator()
examples = []
for i, line in enumerate(lines):
if i == 0:
continue
for i, example in enumerate(dataset):
guid = "%s-%s" % (set_type, i)
if set_type == "test":
text_a = tokenization.convert_to_unicode(line[1])
label = "0"
else:
text_a = tokenization.convert_to_unicode(line[0])
label = tokenization.convert_to_unicode(line[1])
label = "0"
text_a = self.process_text_fn(example["sentence"])
if set_type != "test":
label = str(example["label"])
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
InputExample(
guid=guid, text_a=text_a, text_b=None, label=label, weight=None))
return examples
class StsBProcessor(DataProcessor):
class StsBProcessor(DefaultGLUEDataProcessor):
"""Processor for the STS-B data set (GLUE version)."""
def __init__(self, process_text_fn=tokenization.convert_to_unicode):
......@@ -646,20 +570,23 @@ class StsBProcessor(DataProcessor):
self.label_type = float
self._labels = None
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
def _create_examples_tfds(self, set_type):
"""Creates examples for the training/dev/test sets."""
dataset = tfds.load(
"glue/stsb", split=set_type, try_gcs=True).as_numpy_iterator()
examples = []
for i, example in enumerate(dataset):
guid = "%s-%s" % (set_type, i)
label = 0.0
text_a = self.process_text_fn(example["sentence1"])
text_b = self.process_text_fn(example["sentence2"])
if set_type != "test":
label = self.label_type(example["label"])
examples.append(
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label,
weight=None))
return examples
def get_labels(self):
"""See base class."""
......@@ -670,23 +597,6 @@ class StsBProcessor(DataProcessor):
"""See base class."""
return "STS-B"
def _create_examples(self, lines, set_type):
"""Creates examples for the training/dev/test sets."""
examples = []
for i, line in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, i)
text_a = tokenization.convert_to_unicode(line[7])
text_b = tokenization.convert_to_unicode(line[8])
if set_type == "test":
label = 0.0
else:
label = self.label_type(tokenization.convert_to_unicode(line[9]))
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
class TfdsProcessor(DataProcessor):
"""Processor for generic text classification and regression TFDS data set.
......@@ -816,24 +726,9 @@ class TfdsProcessor(DataProcessor):
return examples
class WnliProcessor(DataProcessor):
class WnliProcessor(DefaultGLUEDataProcessor):
"""Processor for the WNLI data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
def get_labels(self):
"""See base class."""
return ["0", "1"]
......@@ -843,21 +738,22 @@ class WnliProcessor(DataProcessor):
"""See base class."""
return "WNLI"
def _create_examples(self, lines, set_type):
def _create_examples_tfds(self, set_type):
"""Creates examples for the training/dev/test sets."""
dataset = tfds.load(
"glue/wnli", split=set_type, try_gcs=True).as_numpy_iterator()
examples = []
for i, line in enumerate(lines):
if i == 0:
continue
for i, example in enumerate(dataset):
guid = "%s-%s" % (set_type, i)
text_a = tokenization.convert_to_unicode(line[1])
text_b = tokenization.convert_to_unicode(line[2])
if set_type == "test":
label = "0"
else:
label = tokenization.convert_to_unicode(line[3])
label = "0"
text_a = self.process_text_fn(example["sentence1"])
text_b = self.process_text_fn(example["sentence2"])
if set_type != "test":
label = str(example["label"])
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label,
weight=None))
return examples
......@@ -1314,30 +1210,7 @@ class AXgProcessor(DataProcessor):
return examples
class SuperGLUEDataProcessor(DataProcessor):
"""Processor for the SuperGLUE dataset."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_jsonl(os.path.join(data_dir, "train.jsonl")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_jsonl(os.path.join(data_dir, "val.jsonl")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_jsonl(os.path.join(data_dir, "test.jsonl")), "test")
def _create_examples(self, lines, set_type):
"""Creates examples for the training/dev/test sets."""
raise NotImplementedError()
class BoolQProcessor(SuperGLUEDataProcessor):
class BoolQProcessor(DefaultGLUEDataProcessor):
"""Processor for the BoolQ dataset (SuperGLUE diagnostics dataset)."""
def get_labels(self):
......@@ -1349,23 +1222,24 @@ class BoolQProcessor(SuperGLUEDataProcessor):
"""See base class."""
return "BoolQ"
def _create_examples(self, lines, set_type):
def _create_examples_tfds(self, set_type):
"""Creates examples for the training/dev/test sets."""
dataset = tfds.load(
"super_glue/boolq", split=set_type, try_gcs=True).as_numpy_iterator()
examples = []
for line in lines:
guid = "%s-%s" % (set_type, self.process_text_fn(str(line["idx"])))
text_a = self.process_text_fn(line["question"])
text_b = self.process_text_fn(line["passage"])
if set_type == "test":
label = "False"
else:
label = str(line["label"])
for example in dataset:
guid = "%s-%s" % (set_type, self.process_text_fn(str(example["idx"])))
text_a = self.process_text_fn(example["question"])
text_b = self.process_text_fn(example["passage"])
label = "False"
if set_type != "test":
label = self.get_labels()[example["label"]]
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
class CBProcessor(SuperGLUEDataProcessor):
class CBProcessor(DefaultGLUEDataProcessor):
"""Processor for the CB dataset (SuperGLUE diagnostics dataset)."""
def get_labels(self):
......@@ -1377,23 +1251,24 @@ class CBProcessor(SuperGLUEDataProcessor):
"""See base class."""
return "CB"
def _create_examples(self, lines, set_type):
def _create_examples_tfds(self, set_type):
"""Creates examples for the training/dev/test sets."""
dataset = tfds.load(
"super_glue/cb", split=set_type, try_gcs=True).as_numpy_iterator()
examples = []
for line in lines:
guid = "%s-%s" % (set_type, self.process_text_fn(str(line["idx"])))
text_a = self.process_text_fn(line["premise"])
text_b = self.process_text_fn(line["hypothesis"])
if set_type == "test":
label = "entailment"
else:
label = self.process_text_fn(line["label"])
for example in dataset:
guid = "%s-%s" % (set_type, self.process_text_fn(str(example["idx"])))
text_a = self.process_text_fn(example["premise"])
text_b = self.process_text_fn(example["hypothesis"])
label = "entailment"
if set_type != "test":
label = self.get_labels()[example["label"]]
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
class SuperGLUERTEProcessor(SuperGLUEDataProcessor):
class SuperGLUERTEProcessor(DefaultGLUEDataProcessor):
"""Processor for the RTE dataset (SuperGLUE version)."""
def get_labels(self):
......@@ -1407,28 +1282,163 @@ class SuperGLUERTEProcessor(SuperGLUEDataProcessor):
"""See base class."""
return "RTESuperGLUE"
def _create_examples(self, lines, set_type):
def _create_examples_tfds(self, set_type):
"""Creates examples for the training/dev/test sets."""
examples = []
for i, line in enumerate(lines):
guid = "%s-%s" % (set_type, i)
text_a = self.process_text_fn(line["premise"])
text_b = self.process_text_fn(line["hypothesis"])
if set_type == "test":
label = "entailment"
else:
label = self.process_text_fn(line["label"])
dataset = tfds.load(
"super_glue/rte", split=set_type, try_gcs=True).as_numpy_iterator()
for example in dataset:
guid = "%s-%s" % (set_type, self.process_text_fn(str(example["idx"])))
text_a = self.process_text_fn(example["premise"])
text_b = self.process_text_fn(example["hypothesis"])
label = "entailment"
if set_type != "test":
label = self.get_labels()[example["label"]]
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
class WiCInputExample(InputExample):
"""Processor for the WiC dataset (SuperGLUE version)."""
def __init__(self,
guid,
text_a,
text_b=None,
label=None,
word=None,
weight=None,
example_id=None):
"""A single training/test example for simple seq regression/classification."""
super(WiCInputExample, self).__init__(guid, text_a, text_b, label, weight,
example_id)
self.word = word
class WiCProcessor(DefaultGLUEDataProcessor):
"""Processor for the RTE dataset (SuperGLUE version)."""
def get_labels(self):
"""Not used."""
return []
@staticmethod
def get_processor_name():
"""See base class."""
return "RTESuperGLUE"
def _create_examples_tfds(self, set_type):
"""Creates examples for the training/dev/test sets."""
examples = []
dataset = tfds.load(
"super_glue/wic", split=set_type, try_gcs=True).as_numpy_iterator()
for example in dataset:
guid = "%s-%s" % (set_type, self.process_text_fn(str(example["idx"])))
text_a = self.process_text_fn(example["sentence1"])
text_b = self.process_text_fn(example["sentence2"])
word = self.process_text_fn(example["word"])
label = 0
if set_type != "test":
label = example["label"]
examples.append(
WiCInputExample(
guid=guid, text_a=text_a, text_b=text_b, word=word, label=label))
return examples
def featurize_example(self, ex_index, example, label_list, max_seq_length,
tokenizer):
"""Here we concate sentence1, sentence2, word together with [SEP] tokens."""
del label_list
tokens_a = tokenizer.tokenize(example.text_a)
tokens_b = tokenizer.tokenize(example.text_b)
tokens_word = tokenizer.tokenize(example.word)
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP], [SEP] with "- 4"
# Here we only pop out the first two sentence tokens.
_truncate_seq_pair(tokens_a, tokens_b,
max_seq_length - 4 - len(tokens_word))
seg_id_a = 0
seg_id_b = 1
seg_id_c = 2
seg_id_cls = 0
seg_id_pad = 0
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(seg_id_cls)
for token in tokens_a:
tokens.append(token)
segment_ids.append(seg_id_a)
tokens.append("[SEP]")
segment_ids.append(seg_id_a)
for token in tokens_b:
tokens.append(token)
segment_ids.append(seg_id_b)
tokens.append("[SEP]")
segment_ids.append(seg_id_b)
for token in tokens_word:
tokens.append(token)
segment_ids.append(seg_id_c)
tokens.append("[SEP]")
segment_ids.append(seg_id_c)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
while len(input_ids) < max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(seg_id_pad)
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
label_id = example.label
if ex_index < 5:
logging.info("*** Example ***")
logging.info("guid: %s", (example.guid))
logging.info("tokens: %s",
" ".join([tokenization.printable_text(x) for x in tokens]))
logging.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
logging.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
logging.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
logging.info("label: %s (id = %s)", example.label, str(label_id))
logging.info("weight: %s", example.weight)
logging.info("example_id: %s", example.example_id)
feature = InputFeatures(
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
label_id=label_id,
is_real_example=True,
weight=example.weight,
example_id=example.example_id)
return feature
def file_based_convert_examples_to_features(examples,
label_list,
max_seq_length,
tokenizer,
output_file,
label_type=None):
label_type=None,
featurize_fn=None):
"""Convert a set of `InputExample`s to a TFRecord file."""
tf.io.gfile.makedirs(os.path.dirname(output_file))
......@@ -1438,8 +1448,12 @@ def file_based_convert_examples_to_features(examples,
if ex_index % 10000 == 0:
logging.info("Writing example %d of %d", ex_index, len(examples))
feature = convert_single_example(ex_index, example, label_list,
max_seq_length, tokenizer)
if featurize_fn:
feature = featurize_fn(ex_index, example, label_list, max_seq_length,
tokenizer)
else:
feature = convert_single_example(ex_index, example, label_list,
max_seq_length, tokenizer)
def create_int_feature(values):
f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
......@@ -1528,7 +1542,8 @@ def generate_tf_record_from_data_file(processor,
file_based_convert_examples_to_features(train_input_data_examples,
label_list, max_seq_length,
tokenizer, train_data_output_path,
label_type)
label_type,
processor.featurize_example)
num_training_data = len(train_input_data_examples)
if eval_data_output_path:
......@@ -1536,7 +1551,8 @@ def generate_tf_record_from_data_file(processor,
file_based_convert_examples_to_features(eval_input_data_examples,
label_list, max_seq_length,
tokenizer, eval_data_output_path,
label_type)
label_type,
processor.featurize_example)
meta_data = {
"processor_type": processor.get_processor_name(),
......@@ -1550,13 +1566,15 @@ def generate_tf_record_from_data_file(processor,
for language, examples in test_input_data_examples.items():
file_based_convert_examples_to_features(
examples, label_list, max_seq_length, tokenizer,
test_data_output_path.format(language), label_type)
test_data_output_path.format(language), label_type,
processor.featurize_example)
meta_data["test_{}_data_size".format(language)] = len(examples)
else:
file_based_convert_examples_to_features(test_input_data_examples,
label_list, max_seq_length,
tokenizer, test_data_output_path,
label_type)
label_type,
processor.featurize_example)
meta_data["test_data_size"] = len(test_input_data_examples)
if is_regression:
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for third_party.tensorflow_models.official.nlp.data.classifier_data_lib."""
import os
import tempfile
from absl.testing import parameterized
import tensorflow as tf
import tensorflow_datasets as tfds
from official.nlp.bert import tokenization
from official.nlp.data import classifier_data_lib
def decode_record(record, name_to_features):
"""Decodes a record to a TensorFlow example."""
return tf.io.parse_single_example(record, name_to_features)
class BertClassifierLibTest(tf.test.TestCase, parameterized.TestCase):
def setUp(self):
super(BertClassifierLibTest, self).setUp()
self.model_dir = self.get_temp_dir()
self.processors = {
"CB": classifier_data_lib.CBProcessor,
"SUPERGLUE-RTE": classifier_data_lib.SuperGLUERTEProcessor,
"BOOLQ": classifier_data_lib.BoolQProcessor,
"WIC": classifier_data_lib.WiCProcessor,
}
vocab_tokens = [
"[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
"##ing", ","
]
with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens
]).encode("utf-8"))
vocab_file = vocab_writer.name
self.tokenizer = tokenization.FullTokenizer(vocab_file)
@parameterized.parameters(
{"task_type": "CB"},
{"task_type": "BOOLQ"},
{"task_type": "SUPERGLUE-RTE"},
{"task_type": "WIC"},
)
def test_generate_dataset_from_tfds_processor(self, task_type):
with tfds.testing.mock_data(num_examples=5):
output_path = os.path.join(self.model_dir, task_type)
processor = self.processors[task_type]()
classifier_data_lib.generate_tf_record_from_data_file(
processor,
None,
self.tokenizer,
train_data_output_path=output_path,
eval_data_output_path=output_path,
test_data_output_path=output_path)
files = tf.io.gfile.glob(output_path)
self.assertNotEmpty(files)
train_dataset = tf.data.TFRecordDataset(output_path)
seq_length = 128
label_type = tf.int64
name_to_features = {
"input_ids": tf.io.FixedLenFeature([seq_length], tf.int64),
"input_mask": tf.io.FixedLenFeature([seq_length], tf.int64),
"segment_ids": tf.io.FixedLenFeature([seq_length], tf.int64),
"label_ids": tf.io.FixedLenFeature([], label_type),
}
train_dataset = train_dataset.map(
lambda record: decode_record(record, name_to_features))
# If data is retrieved without error, then all requirements
# including data type/shapes are met.
_ = next(iter(train_dataset))
if __name__ == "__main__":
tf.test.main()
......@@ -50,7 +50,7 @@ flags.DEFINE_enum(
"classification_task_name", "MNLI", [
"AX", "COLA", "IMDB", "MNLI", "MRPC", "PAWS-X", "QNLI", "QQP", "RTE",
"SST-2", "STS-B", "WNLI", "XNLI", "XTREME-XNLI", "XTREME-PAWS-X",
"AX-g", "SUPERGLUE-RTE", "CB", "BoolQ"
"AX-g", "SUPERGLUE-RTE", "CB", "BoolQ", "WIC"
], "The name of the task to train BERT classifier. The "
"difference between XTREME-XNLI and XNLI is: 1. the format "
"of input tsv files; 2. the dev set for XTREME is english "
......@@ -173,8 +173,26 @@ flags.DEFINE_string(
def generate_classifier_dataset():
"""Generates classifier dataset and returns input meta data."""
assert (FLAGS.input_data_dir and FLAGS.classification_task_name or
FLAGS.tfds_params)
if FLAGS.classification_task_name in [
"COLA",
"WNLI",
"SST-2",
"MRPC",
"QQP",
"STS-B",
"MNLI",
"QNLI",
"RTE",
"AX",
"SUPERGLUE-RTE",
"CB",
"BoolQ",
"WIC",
]:
assert not FLAGS.input_data_dir or FLAGS.tfds_params
else:
assert (FLAGS.input_data_dir and FLAGS.classification_task_name or
FLAGS.tfds_params)
if FLAGS.tokenization == "WordPiece":
tokenizer = tokenization.FullTokenizer(
......@@ -248,6 +266,8 @@ def generate_classifier_dataset():
classifier_data_lib.CBProcessor,
"boolq":
classifier_data_lib.BoolQProcessor,
"wic":
classifier_data_lib.WiCProcessor,
}
task_name = FLAGS.classification_task_name.lower()
if task_name not in processors:
......
......@@ -60,8 +60,8 @@ class SentencePredictionDataLoader(data_loader.DataLoader):
else:
self._label_name_mapping = dict()
def _decode(self, record: tf.Tensor):
"""Decodes a serialized tf.Example."""
def name_to_features_spec(self):
"""Defines features to decode. Subclass may override to append features."""
label_type = LABEL_TYPES_MAP[self._params.label_type]
name_to_features = {
'input_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
......@@ -72,7 +72,11 @@ class SentencePredictionDataLoader(data_loader.DataLoader):
if self._include_example_id:
name_to_features['example_id'] = tf.io.FixedLenFeature([], tf.int64)
example = tf.io.parse_single_example(record, name_to_features)
return name_to_features
def _decode(self, record: tf.Tensor):
"""Decodes a serialized tf.Example."""
example = tf.io.parse_single_example(record, self.name_to_features_spec())
# tf.Example only supports tf.int64, but the TPU only supports tf.int32.
# So cast all int64 to int32.
......@@ -86,20 +90,23 @@ class SentencePredictionDataLoader(data_loader.DataLoader):
def _parse(self, record: Mapping[str, tf.Tensor]):
"""Parses raw tensors into a dict of tensors to be consumed by the model."""
x = {
'input_word_ids': record['input_ids'],
'input_mask': record['input_mask'],
'input_type_ids': record['segment_ids']
key_mapping = {
'input_ids': 'input_word_ids',
'input_mask': 'input_mask',
'segment_ids': 'input_type_ids'
}
if self._include_example_id:
x['example_id'] = record['example_id']
x[self._label_field] = record[self._label_field]
ret = {}
for record_key in record:
if record_key in key_mapping:
ret[key_mapping[record_key]] = record[record_key]
else:
ret[record_key] = record[record_key]
if self._label_field in self._label_name_mapping:
x[self._label_name_mapping[self._label_field]] = record[self._label_field]
ret[self._label_name_mapping[self._label_field]] = record[
self._label_field]
return x
return ret
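Because the feature spec now lives in name_to_features_spec, a subclass can append features without re-implementing _decode or _parse. A hypothetical sketch (ExtraIdDataLoader is not part of this change):

class ExtraIdDataLoader(SentencePredictionDataLoader):

  def name_to_features_spec(self):
    name_to_features = super().name_to_features_spec()
    # An extra int64 scalar; _parse passes unknown record keys through unchanged.
    name_to_features['extra_id'] = tf.io.FixedLenFeature([], tf.int64)
    return name_to_features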
def load(self, input_context: Optional[tf.distribute.InputContext] = None):
"""Returns a tf.dataset.Dataset."""
......@@ -215,13 +222,12 @@ class SentencePredictionTextDataLoader(data_loader.DataLoader):
"""Berts preprocess."""
segments = [record[x] for x in self._text_fields]
model_inputs = self._text_processor(segments)
if self._include_example_id:
model_inputs['example_id'] = record['example_id']
model_inputs[self._label_field] = record[self._label_field]
for key in record:
if key not in self._text_fields:
model_inputs[key] = record[key]
return model_inputs
def _decode(self, record: tf.Tensor):
"""Decodes a serialized tf.Example."""
def name_to_features_spec(self):
name_to_features = {}
for text_field in self._text_fields:
name_to_features[text_field] = tf.io.FixedLenFeature([], tf.string)
......@@ -230,8 +236,11 @@ class SentencePredictionTextDataLoader(data_loader.DataLoader):
name_to_features[self._label_field] = tf.io.FixedLenFeature([], label_type)
if self._include_example_id:
name_to_features['example_id'] = tf.io.FixedLenFeature([], tf.int64)
example = tf.io.parse_single_example(record, name_to_features)
return name_to_features
def _decode(self, record: tf.Tensor):
"""Decodes a serialized tf.Example."""
example = tf.io.parse_single_example(record, self.name_to_features_spec())
# tf.Example only supports tf.int64, but the TPU only supports tf.int32.
# So cast all int64 to int32.
for name in example:
......
......@@ -198,9 +198,12 @@ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
dataset = loader.SentencePredictionTextDataLoader(data_config).load()
features = next(iter(dataset))
label_field = data_config.label_field
self.assertCountEqual(
['input_word_ids', 'input_type_ids', 'input_mask', label_field],
features.keys())
expected_keys = [
'input_word_ids', 'input_type_ids', 'input_mask', label_field
]
if use_tfds:
expected_keys += ['idx']
self.assertCountEqual(expected_keys, features.keys())
self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
......@@ -233,9 +236,12 @@ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
dataset = loader.SentencePredictionTextDataLoader(data_config).load()
features = next(iter(dataset))
label_field = data_config.label_field
self.assertCountEqual(
['input_word_ids', 'input_type_ids', 'input_mask', label_field],
features.keys())
expected_keys = [
'input_word_ids', 'input_type_ids', 'input_mask', label_field
]
if use_tfds:
expected_keys += ['idx']
self.assertCountEqual(expected_keys, features.keys())
self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
......@@ -268,9 +274,12 @@ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
dataset = loader.SentencePredictionTextDataLoader(data_config).load()
features = next(iter(dataset))
label_field = data_config.label_field
self.assertCountEqual(
['input_word_ids', 'input_type_ids', 'input_mask', label_field],
features.keys())
expected_keys = [
'input_word_ids', 'input_type_ids', 'input_mask', label_field
]
if use_tfds:
expected_keys += ['idx']
self.assertCountEqual(expected_keys, features.keys())
self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
......
......@@ -69,6 +69,9 @@ class BertEncoder(tf.keras.Model):
smaller than 'hidden_size').
embedding_layer: An optional Layer instance which will be called to
generate embeddings for the input word IDs.
norm_first: Whether to normalize the inputs to the attention and
intermediate dense layers (pre-layer-norm). If set to False, the outputs
of the attention and intermediate dense layers are normalized instead
(post-layer-norm).
"""
def __init__(
......@@ -87,6 +90,7 @@ class BertEncoder(tf.keras.Model):
output_range=None,
embedding_width=None,
embedding_layer=None,
norm_first=False,
**kwargs):
activation = tf.keras.activations.get(inner_activation)
initializer = tf.keras.initializers.get(initializer)
......@@ -162,6 +166,7 @@ class BertEncoder(tf.keras.Model):
inner_activation=inner_activation,
output_dropout=output_dropout,
attention_dropout=attention_dropout,
norm_first=norm_first,
output_range=transformer_output_range,
kernel_initializer=initializer,
name='transformer/layer_%d' % i)
......@@ -211,6 +216,7 @@ class BertEncoder(tf.keras.Model):
'output_range': output_range,
'embedding_width': embedding_width,
'embedding_layer': embedding_layer,
'norm_first': norm_first,
}
# We are storing the config dict as a namedtuple here to ensure checkpoint
......
......@@ -205,7 +205,8 @@ class BertEncoderTest(keras_parameterized.TestCase):
initializer="glorot_uniform",
output_range=-1,
embedding_width=16,
embedding_layer=None)
embedding_layer=None,
norm_first=False)
network = bert_encoder.BertEncoder(**kwargs)
expected_config = dict(kwargs)
expected_config["inner_activation"] = tf.keras.activations.serialize(
......
......@@ -48,12 +48,12 @@ class PositionEmbeddingLayerTest(keras_parameterized.TestCase):
test_layer = position_embedding.PositionEmbedding(
max_length=sequence_length, seq_axis=2)
width = 30
input_tensor = tf.keras.Input(shape=(sequence_length, width, width))
input_tensor = tf.keras.Input(shape=(width, sequence_length, width))
output_tensor = test_layer(input_tensor)
# When using static positional embedding shapes, the output is expected
# to be the same as the input shape in all dimensions save batch.
expected_output_shape = [None, sequence_length, width, width]
expected_output_shape = [None, width, sequence_length, width]
self.assertEqual(expected_output_shape, output_tensor.shape.as_list())
# The default output dtype for this layer should be tf.float32.
self.assertEqual(tf.float32, output_tensor.dtype)
......
......@@ -249,7 +249,7 @@ class TransformerEncoderBlock(tf.keras.layers.Layer):
attention.
Returns:
An ouput tensor with the same dimensions as input/query tensor.
An output tensor with the same dimensions as input/query tensor.
"""
if isinstance(inputs, (list, tuple)):
if len(inputs) == 2:
......