Commit 35112d1c authored by Hongkun Yu's avatar Hongkun Yu Committed by saberkun
Browse files

Internal change

PiperOrigin-RevId: 268543563
parent 1577ed07
# TensorFlow Natural Language Processing Models
tensorflow/models/official/nlp is a library of state-of-the-art models for
Natural Language Processing (NLP).
The library currently contains TensorFlow 2.x implementations, pre-trained
model weights, usage scripts and conversion utilities for the following models:
* BERT
* [XLNet](xlnet)
* Transformer for translation
# XLNet: Generalized Autoregressive Pretraining for Language Understanding
The academic paper which describes XLNet in detail and provides full results on
a number of tasks can be found here: https://arxiv.org/abs/1906.08237.
Instructions and user guide will be added soon.
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for pre-processing classification data."""
from absl import flags
from absl import logging
from official.nlp.xlnet import data_utils
FLAGS = flags.FLAGS
# Segment ids attached to each input position (see convert_single_example):
SEG_ID_A = 0  # Tokens of the first sequence, including its trailing SEP.
SEG_ID_B = 1  # Tokens of the second sequence, including its trailing SEP.
SEG_ID_CLS = 2  # The CLS token, which XLNet places at the END of the sequence.
SEG_ID_SEP = 3  # Dedicated SEP segment id (not used in this file).
SEG_ID_PAD = 4  # Left-padding positions added to reach max_seq_length.
class PaddingInputExample(object):
  """Fake example so the num input examples is a multiple of the batch size.

  When running eval/predict on the TPU, we need to pad the number of examples
  to be a multiple of the batch size, because the TPU requires a fixed batch
  size. The alternative is to drop the last batch, which is bad because it means
  the entire output data won't be generated.

  We use this class instead of `None` because treating `None` as padding
  batches could cause silent errors.
  """
class InputFeatures(object):
  """Holds the model-ready features for a single classification example."""

  def __init__(self, input_ids, input_mask, segment_ids, label_id,
               is_real_example=True):
    # `is_real_example=False` marks the fake padding examples that only
    # exist to round the example count up to a batch-size multiple.
    self.input_ids = input_ids
    self.input_mask = input_mask
    self.segment_ids = segment_ids
    self.label_id = label_id
    self.is_real_example = is_real_example
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
def convert_single_example(ex_index, example, label_list, max_seq_length,
                           tokenize_fn):
  """Converts a single `InputExample` into a single `InputFeatures`.

  Args:
    ex_index: int, index of the example; the first 5 examples are logged.
    example: an `InputExample` (or `PaddingInputExample`) to convert.
    label_list: list of possible labels, or None for regression tasks.
    max_seq_length: int, fixed total length of the produced features.
    tokenize_fn: callable mapping text to a list of token ids.

  Returns:
    An `InputFeatures` whose ids/mask/segments are left-padded to
    `max_seq_length`.
  """
  if isinstance(example, PaddingInputExample):
    # Padding examples are all-pad inputs that only exist to make the
    # number of examples a multiple of the batch size.
    return InputFeatures(
        input_ids=[0] * max_seq_length,
        input_mask=[1] * max_seq_length,
        segment_ids=[0] * max_seq_length,
        label_id=0,
        is_real_example=False)

  if label_list is not None:
    label_map = {}
    for (i, label) in enumerate(label_list):
      label_map[label] = i

  tokens_a = tokenize_fn(example.text_a)
  tokens_b = None
  if example.text_b:
    tokens_b = tokenize_fn(example.text_b)

  if tokens_b:
    # Modifies `tokens_a` and `tokens_b` in place so that the total
    # length is less than the specified length.
    # Account for two [SEP] & one [CLS] with "- 3"
    _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
  else:
    # Account for one [SEP] & one [CLS] with "- 2"
    if len(tokens_a) > max_seq_length - 2:
      tokens_a = tokens_a[:max_seq_length - 2]

  tokens = []
  segment_ids = []
  for token in tokens_a:
    tokens.append(token)
    segment_ids.append(SEG_ID_A)
  tokens.append(data_utils.SEP_ID)
  segment_ids.append(SEG_ID_A)

  if tokens_b:
    for token in tokens_b:
      tokens.append(token)
      segment_ids.append(SEG_ID_B)
    tokens.append(data_utils.SEP_ID)
    segment_ids.append(SEG_ID_B)

  # XLNet places the CLS token at the END of the sequence.
  tokens.append(data_utils.CLS_ID)
  segment_ids.append(SEG_ID_CLS)

  input_ids = tokens

  # The mask has 0 for real tokens and 1 for padding tokens. Only real
  # tokens are attended to.
  input_mask = [0] * len(input_ids)

  # Zero-pad up to the sequence length, padding on the LEFT.
  if len(input_ids) < max_seq_length:
    delta_len = max_seq_length - len(input_ids)
    input_ids = [0] * delta_len + input_ids
    input_mask = [1] * delta_len + input_mask
    segment_ids = [SEG_ID_PAD] * delta_len + segment_ids

  assert len(input_ids) == max_seq_length
  assert len(input_mask) == max_seq_length
  assert len(segment_ids) == max_seq_length

  if label_list is not None:
    label_id = label_map[example.label]
  else:
    label_id = example.label
  if ex_index < 5:
    logging.info("*** Example ***")
    logging.info("guid: %s", (example.guid))
    logging.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
    logging.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
    logging.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
    # Bug fix: labels can be strings (classification) or floats
    # (regression), so the old "%d" format raised TypeError for string
    # labels; use "%s" for both fields.
    logging.info("label: %s (id = %s)", example.label, label_id)

  feature = InputFeatures(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids,
      label_id=label_id)
  return feature
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Common flags used in XLNet model."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
from absl import flags
# Device / cluster flags.
flags.DEFINE_string("master", default=None, help="master")
flags.DEFINE_string(
    "tpu",
    default=None,
    help="The Cloud TPU to use for training. This should be "
    "either the name used when creating the Cloud TPU, or a "
    "url like grpc://ip.address.of.tpu:8470.")
flags.DEFINE_bool(
    "use_tpu", default=True, help="Use TPUs rather than plain CPUs.")
flags.DEFINE_string("tpu_topology", "2x2", help="TPU topology.")
flags.DEFINE_integer(
    "num_core_per_host", default=8, help="number of cores per host")
flags.DEFINE_string("model_dir", default=None, help="Estimator model_dir.")
flags.DEFINE_string(
    "init_checkpoint",
    default=None,
    help="Checkpoint path for initializing the model.")
# Optimization config
flags.DEFINE_float("learning_rate", default=1e-4, help="Maximum learning rate.")
flags.DEFINE_float("clip", default=1.0, help="Gradient clipping value.")
flags.DEFINE_float("weight_decay_rate", default=0.0, help="Weight decay rate.")
# lr decay
flags.DEFINE_integer(
    "warmup_steps", default=0, help="Number of steps for linear lr warmup.")
flags.DEFINE_float("adam_epsilon", default=1e-8, help="Adam epsilon.")
flags.DEFINE_float(
    "lr_layer_decay_rate",
    default=1.0,
    help="Top layer: lr[L] = FLAGS.learning_rate."
    "Lower layers: lr[l-1] = lr[l] * lr_layer_decay_rate.")
flags.DEFINE_float(
    "min_lr_ratio", default=0.0, help="Minimum ratio learning rate.")
# Training config
flags.DEFINE_integer(
    "train_batch_size",
    default=16,
    help="Size of the train batch across all hosts.")
flags.DEFINE_integer(
    "train_steps", default=100000, help="Total number of training steps.")
flags.DEFINE_integer(
    "iterations", default=1000, help="Number of iterations per repeat loop.")
# Data config
flags.DEFINE_integer(
    "seq_len", default=0, help="Sequence length for pretraining.")
flags.DEFINE_integer(
    "reuse_len",
    default=0,
    help="How many tokens to be reused in the next batch. "
    "Could be half of `seq_len`.")
flags.DEFINE_bool("uncased", False, help="Use uncased inputs or not.")
flags.DEFINE_bool(
    "bi_data",
    default=False,
    help="Use bidirectional data streams, "
    "i.e., forward & backward.")
flags.DEFINE_integer("n_token", 32000, help="Vocab size")
# Model config
flags.DEFINE_integer("mem_len", default=0, help="Number of steps to cache")
flags.DEFINE_bool("same_length", default=False, help="Same length attention")
flags.DEFINE_integer("clamp_len", default=-1, help="Clamp length")
flags.DEFINE_integer("n_layer", default=6, help="Number of layers.")
flags.DEFINE_integer("d_model", default=32, help="Dimension of the model.")
flags.DEFINE_integer("d_embed", default=32, help="Dimension of the embeddings.")
flags.DEFINE_integer("n_head", default=4, help="Number of attention heads.")
flags.DEFINE_integer(
    "d_head", default=8, help="Dimension of each attention head.")
flags.DEFINE_integer(
    "d_inner",
    default=32,
    help="Dimension of inner hidden size in positionwise "
    "feed-forward.")
flags.DEFINE_float("dropout", default=0.1, help="Dropout rate.")
flags.DEFINE_float("dropout_att", default=0.1, help="Attention dropout rate.")
flags.DEFINE_bool("untie_r", default=False, help="Untie r_w_bias and r_r_bias")
flags.DEFINE_string(
    "ff_activation",
    default="relu",
    help="Activation type used in position-wise feed-forward.")
# Bug fix: the help text below was a copy-paste of `ff_activation`'s help;
# this flag actually selects the distribution strategy.
flags.DEFINE_string(
    "strategy_type",
    default="tpu",
    help="The distribution strategy type to use for training, e.g. `tpu`.")
flags.DEFINE_bool("use_bfloat16", False, help="Whether to use bfloat16.")
# Parameter initialization
flags.DEFINE_enum(
    "init_method",
    default="normal",
    enum_values=["normal", "uniform"],
    help="Initialization method.")
flags.DEFINE_float(
    "init_std", default=0.02, help="Initialization std when init is normal.")
flags.DEFINE_float(
    "init_range", default=0.1, help="Initialization std when init is uniform.")
flags.DEFINE_integer(
    "train_data_size", default=130738, help="Number of training data samples.")
flags.DEFINE_integer(
    "test_data_size", default=12048, help="Number of test data samples.")
flags.DEFINE_string(
    "train_tfrecord_path",
    default=None,
    help="Path to preprocessed training set tfrecord.")
flags.DEFINE_string(
    "test_tfrecord_path",
    default=None,
    help="Path to preprocessed test set tfrecord.")
flags.DEFINE_integer(
    "test_batch_size",
    default=16,
    help="Size of the test batch across all hosts.")
flags.DEFINE_integer(
    "save_steps", default=None, help="Number of steps for saving checkpoint.")
FLAGS = flags.FLAGS
This diff is collapsed.
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions and classes related to optimization (weight updates)."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
from absl import logging
import tensorflow as tf
from official.bert.optimization import AdamWeightDecay
class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Applies a warmup schedule on top of a given decay schedule.

  While `step < warmup_steps`, the learning rate ramps up as
  `initial_learning_rate * (step / warmup_steps) ** power`; afterwards it
  follows `decay_schedule_fn`, evaluated at `step - warmup_steps`.
  """

  def __init__(self,
               initial_learning_rate,
               decay_schedule_fn,
               warmup_steps,
               power=1.0,
               name=None):
    super(WarmUp, self).__init__()
    self.initial_learning_rate = initial_learning_rate
    self.decay_schedule_fn = decay_schedule_fn
    self.warmup_steps = warmup_steps
    self.power = power
    self.name = name

  def __call__(self, step):
    with tf.name_scope(self.name or "WarmUp") as name:
      # Fraction of warmup completed; the rate rises polynomially from 0
      # to `initial_learning_rate` over the first `warmup_steps` steps.
      step_f = tf.cast(step, tf.float32)
      warmup_f = tf.cast(self.warmup_steps, tf.float32)
      warmup_fraction = step_f / warmup_f
      warmup_lr = (
          self.initial_learning_rate *
          tf.math.pow(warmup_fraction, self.power))
      return tf.cond(
          step_f < warmup_f,
          lambda: warmup_lr,
          lambda: self.decay_schedule_fn(step - self.warmup_steps),
          name=name)

  def get_config(self):
    """Returns the schedule configuration for serialization."""
    return {
        "initial_learning_rate": self.initial_learning_rate,
        "decay_schedule_fn": self.decay_schedule_fn,
        "warmup_steps": self.warmup_steps,
        "power": self.power,
        "name": self.name
    }
def create_optimizer(init_lr,
                     num_train_steps,
                     num_warmup_steps,
                     min_lr_ratio=0.0,
                     adam_epsilon=1e-8,
                     weight_decay_rate=0.0):
  """Creates an optimizer together with its learning rate schedule.

  The schedule decays linearly from `init_lr` to `init_lr * min_lr_ratio`
  over the post-warmup steps, optionally preceded by a linear warmup.
  `AdamWeightDecay` is used when `weight_decay_rate > 0`, plain Adam
  otherwise.

  Returns:
    An `(optimizer, learning_rate_fn)` tuple.
  """
  # Implements linear decay of the learning rate.
  lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
      initial_learning_rate=init_lr,
      decay_steps=num_train_steps - num_warmup_steps,
      end_learning_rate=init_lr * min_lr_ratio)
  if num_warmup_steps:
    lr_schedule = WarmUp(
        initial_learning_rate=init_lr,
        decay_schedule_fn=lr_schedule,
        warmup_steps=num_warmup_steps)

  if weight_decay_rate > 0.0:
    logging.info(
        "Using AdamWeightDecay with adam_epsilon=%.9f weight_decay_rate=%.3f",
        adam_epsilon, weight_decay_rate)
    # Biases and layer-norm parameters are excluded from weight decay;
    # the relative-attention biases are explicitly included.
    opt = AdamWeightDecay(
        learning_rate=lr_schedule,
        weight_decay_rate=weight_decay_rate,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=adam_epsilon,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
        include_in_weight_decay=["r_s_bias", "r_r_bias", "r_w_bias"])
  else:
    logging.info("Using Adam with adam_epsilon=%.9f", (adam_epsilon))
    opt = tf.keras.optimizers.Adam(
        learning_rate=lr_schedule, epsilon=adam_epsilon)
  return opt, lr_schedule
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Script to pre-process classification data into tfrecords."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import csv
import os
from absl import app
from absl import flags
from absl import logging
import numpy as np
import tensorflow as tf
import sentencepiece as spm
from official.nlp.xlnet import classifier_utils
from official.nlp.xlnet import preprocess_utils
# I/O flags for the classification preprocessing script.
flags.DEFINE_bool(
    "overwrite_data",
    default=False,
    help="If False, will use cached data if available.")
flags.DEFINE_string("output_dir", default="", help="Output dir for TF records.")
flags.DEFINE_string(
    "spiece_model_file", default="", help="Sentence Piece model path.")
flags.DEFINE_string("data_dir", default="", help="Directory for input data.")

# task specific
flags.DEFINE_string("eval_split", default="dev", help="could be dev or test")
flags.DEFINE_string("task_name", default=None, help="Task name")
flags.DEFINE_integer(
    "eval_batch_size", default=64, help="batch size for evaluation")
flags.DEFINE_integer("max_seq_length", default=128, help="Max sequence length")
flags.DEFINE_integer(
    "num_passes",
    default=1,
    help="Num passes for processing training data. "
    "This is use to batch data without loss for TPUs.")
flags.DEFINE_bool("uncased", default=False, help="Use uncased.")
flags.DEFINE_bool(
    "is_regression", default=False, help="Whether it's a regression task.")

FLAGS = flags.FLAGS
class InputExample(object):
  """A single training/test example for simple sequence classification."""

  def __init__(self, guid, text_a, text_b=None, label=None):
    """Constructs a InputExample.

    Args:
      guid: Unique id for the example.
      text_a: string. The untokenized text of the first sequence. For
        single-sequence tasks, only this sequence must be specified.
      text_b: (Optional) string. The untokenized text of the second
        sequence; required only for sequence-pair tasks.
      label: (Optional) string. The label of the example; set for train
        and dev examples but not for test examples.
    """
    self.guid = guid
    self.label = label
    self.text_a = text_a
    self.text_b = text_b
class DataProcessor(object):
  """Base class for data converters for sequence classification data sets."""

  def get_train_examples(self, data_dir):
    """Gets a collection of `InputExample`s for the train set."""
    raise NotImplementedError()

  def get_dev_examples(self, data_dir):
    """Gets a collection of `InputExample`s for the dev set."""
    raise NotImplementedError()

  def get_test_examples(self, data_dir):
    """Gets a collection of `InputExample`s for prediction."""
    raise NotImplementedError()

  def get_labels(self):
    """Gets the list of labels for this data set."""
    raise NotImplementedError()

  @classmethod
  def _read_tsv(cls, input_file, quotechar=None):
    """Reads a tab separated value file, dropping empty rows."""
    with tf.io.gfile.GFile(input_file, "r") as f:
      reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
      # pylint: disable=g-explicit-length-test
      return [row for row in reader if len(row) > 0]
class GLUEProcessor(DataProcessor):
  """Generic processor for GLUE-style TSV datasets."""

  def __init__(self):
    self.train_file = "train.tsv"
    self.dev_file = "dev.tsv"
    self.test_file = "test.tsv"
    self.label_column = None
    self.text_a_column = None
    self.text_b_column = None
    self.contains_header = True
    self.test_text_a_column = None
    self.test_text_b_column = None
    self.test_contains_header = True

  def get_train_examples(self, data_dir):
    """See base class."""
    lines = self._read_tsv(os.path.join(data_dir, self.train_file))
    return self._create_examples(lines, "train")

  def get_dev_examples(self, data_dir):
    """See base class."""
    lines = self._read_tsv(os.path.join(data_dir, self.dev_file))
    return self._create_examples(lines, "dev")

  def get_test_examples(self, data_dir):
    """See base class."""
    # Fall back to the train/dev column layout when no test-specific
    # layout was configured.
    if self.test_text_a_column is None:
      self.test_text_a_column = self.text_a_column
    if self.test_text_b_column is None:
      self.test_text_b_column = self.text_b_column
    lines = self._read_tsv(os.path.join(data_dir, self.test_file))
    return self._create_examples(lines, "test")

  def get_labels(self):
    """See base class."""
    return ["0", "1"]

  def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    is_test = set_type == "test"
    # Column layout and header presence depend on the split.
    a_column = self.test_text_a_column if is_test else self.text_a_column
    b_column = self.test_text_b_column if is_test else self.text_b_column
    has_header = self.test_contains_header if is_test else self.contains_header

    examples = []
    for i, line in enumerate(lines):
      if i == 0 and has_header:
        continue
      # there are some incomplete lines in QNLI
      if len(line) <= a_column:
        logging.warning("Incomplete line, ignored.")
        continue
      text_a = line[a_column]
      text_b = None
      if b_column is not None:
        if len(line) <= b_column:
          logging.warning("Incomplete line, ignored.")
          continue
        text_b = line[b_column]
      if is_test:
        # Test splits carry no gold label; use a fixed placeholder.
        label = self.get_labels()[0]
      else:
        if len(line) <= self.label_column:
          logging.warning("Incomplete line, ignored.")
          continue
        label = line[self.label_column]
      examples.append(
          InputExample(
              guid="%s-%s" % (set_type, i),
              text_a=text_a,
              text_b=text_b,
              label=label))
    return examples
class Yelp5Processor(DataProcessor):
  """Processor for the Yelp-5 star-rating CSV dataset."""

  def get_train_examples(self, data_dir):
    return self._create_examples(os.path.join(data_dir, "train.csv"))

  def get_dev_examples(self, data_dir):
    return self._create_examples(os.path.join(data_dir, "test.csv"))

  def get_labels(self):
    """See base class."""
    return ["1", "2", "3", "4", "5"]

  def _create_examples(self, input_file):
    """Creates examples for the training and dev sets."""
    examples = []
    with tf.io.gfile.GFile(input_file) as f:
      for idx, row in enumerate(csv.reader(f)):
        # Column 0 is the star rating; column 1 is the review text.
        # Undo the CSV-level quote escaping in the text.
        text = row[1].replace('""', '"').replace('\\"', '"')
        examples.append(
            InputExample(guid=str(idx), text_a=text, text_b=None,
                         label=row[0]))
    return examples
class ImdbProcessor(DataProcessor):
  """Processor for the IMDB sentiment dataset (one directory per label)."""

  def get_labels(self):
    return ["neg", "pos"]

  def get_train_examples(self, data_dir):
    return self._create_examples(os.path.join(data_dir, "train"))

  def get_dev_examples(self, data_dir):
    return self._create_examples(os.path.join(data_dir, "test"))

  def _create_examples(self, data_dir):
    """Creates examples."""
    examples = []
    for label in ["neg", "pos"]:
      cur_dir = os.path.join(data_dir, label)
      for filename in tf.io.gfile.listdir(cur_dir):
        if not filename.endswith("txt"):
          continue
        # Progress logging every 1000 loaded examples.
        if len(examples) % 1000 == 0:
          logging.info("Loading dev example %d", len(examples))
        with tf.io.gfile.GFile(os.path.join(cur_dir, filename)) as f:
          # Strip HTML line breaks left over from the raw reviews.
          text = f.read().strip().replace("<br />", " ")
        examples.append(
            InputExample(guid="unused_id", text_a=text, text_b=None,
                         label=label))
    return examples
class MnliMatchedProcessor(GLUEProcessor):
  """Processor for the MNLI-matched split."""

  def __init__(self):
    super(MnliMatchedProcessor, self).__init__()
    self.dev_file = "dev_matched.tsv"
    self.test_file = "test_matched.tsv"
    # Premise/hypothesis are in columns 8/9; the gold label is the last
    # column of each row.
    self.label_column = -1
    self.text_a_column = 8
    self.text_b_column = 9

  def get_labels(self):
    return ["contradiction", "entailment", "neutral"]
class MnliMismatchedProcessor(MnliMatchedProcessor):
  """Processor for the MNLI-mismatched split; same layout, different files."""

  def __init__(self):
    super(MnliMismatchedProcessor, self).__init__()
    self.dev_file = "dev_mismatched.tsv"
    self.test_file = "test_mismatched.tsv"
class StsbProcessor(GLUEProcessor):
  """Processor for STS-B, a sentence-pair regression task.

  Identical to the generic GLUE processor except that labels are
  continuous similarity scores (floats) instead of class strings.
  """

  def __init__(self):
    super(StsbProcessor, self).__init__()
    self.label_column = 9
    self.text_a_column = 7
    self.text_b_column = 8

  def get_labels(self):
    # Regression task: a single dummy float, also used as the placeholder
    # label for test examples.
    return [0.0]

  def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets, with float labels.

    Reuses `GLUEProcessor._create_examples` for all parsing (header and
    incomplete-line handling included) and only converts each label to
    `float`. This removes the previous near-byte duplicate of the parent
    method. For the test split the placeholder label is already the
    float 0.0, so the conversion is a no-op there.
    """
    examples = super(StsbProcessor, self)._create_examples(lines, set_type)
    for example in examples:
      example.label = float(example.label)
    return examples
def file_based_convert_examples_to_features(examples,
                                            label_list,
                                            max_seq_length,
                                            tokenize_fn,
                                            output_file,
                                            num_passes=1):
  """Convert a set of `InputExample`s to a TFRecord file.

  Args:
    examples: list of `InputExample`s; left unmodified by this call.
    label_list: list of labels, or None for regression tasks.
    max_seq_length: fixed sequence length of the written features.
    tokenize_fn: callable mapping text to token ids.
    output_file: path of the TFRecord file to create.
    num_passes: number of times each example is written (used to batch
      training data without loss for TPUs).
  """
  # do not create duplicated records
  if tf.io.gfile.exists(output_file) and not FLAGS.overwrite_data:
    logging.info("Do not overwrite tfrecord %s exists.", output_file)
    return
  logging.info("Create new tfrecord %s.", output_file)

  # Helpers hoisted out of the example loop (they were previously
  # re-defined on every iteration).
  def create_int_feature(values):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

  def create_float_feature(values):
    return tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))

  # Bug fix: `examples *= num_passes` extended the caller's list in place
  # (list __imul__ mutates); build a new list so the argument is untouched.
  examples = examples * num_passes

  with tf.io.TFRecordWriter(output_file) as writer:
    for (ex_index, example) in enumerate(examples):
      if ex_index % 10000 == 0:
        logging.info("Writing example %d of %d", ex_index, len(examples))

      feature = classifier_utils.convert_single_example(
          ex_index, example, label_list, max_seq_length, tokenize_fn)

      features = collections.OrderedDict()
      features["input_ids"] = create_int_feature(feature.input_ids)
      # NOTE(review): the mask is written as a float feature while ids and
      # segments are ints — presumably to match the reader's schema;
      # confirm against the input pipeline before changing.
      features["input_mask"] = create_float_feature(feature.input_mask)
      features["segment_ids"] = create_int_feature(feature.segment_ids)
      if label_list is not None:
        features["label_ids"] = create_int_feature([feature.label_id])
      else:
        features["label_ids"] = create_float_feature(
            [float(feature.label_id)])
      features["is_real_example"] = create_int_feature(
          [int(feature.is_real_example)])

      tf_example = tf.train.Example(
          features=tf.train.Features(feature=features))
      writer.write(tf_example.SerializeToString())
def main(_):
  """Preprocesses the selected classification task into train/eval tfrecords."""
  logging.set_verbosity(logging.INFO)
  # Registry of supported tasks.
  processors = {
      "mnli_matched": MnliMatchedProcessor,
      "mnli_mismatched": MnliMismatchedProcessor,
      "sts-b": StsbProcessor,
      "imdb": ImdbProcessor,
      "yelp5": Yelp5Processor
  }
  task_name = FLAGS.task_name.lower()
  if task_name not in processors:
    raise ValueError("Task not found: %s" % (task_name))
  processor = processors[task_name]()
  # Regression tasks keep raw float labels instead of a label vocabulary.
  label_list = processor.get_labels() if not FLAGS.is_regression else None

  sp = spm.SentencePieceProcessor()
  sp.Load(FLAGS.spiece_model_file)

  def tokenize_fn(text):
    # Normalize (and optionally lowercase) before SentencePiece encoding.
    text = preprocess_utils.preprocess_text(text, lower=FLAGS.uncased)
    return preprocess_utils.encode_ids(sp, text)

  spm_basename = os.path.basename(FLAGS.spiece_model_file)
  train_file_base = "{}.len-{}.train.tf_record".format(spm_basename,
                                                       FLAGS.max_seq_length)
  train_file = os.path.join(FLAGS.output_dir, train_file_base)
  logging.info("Use tfrecord file %s", train_file)
  train_examples = processor.get_train_examples(FLAGS.data_dir)
  # Shuffle once here so the input pipeline does not need a large
  # shuffle buffer.
  np.random.shuffle(train_examples)
  logging.info("Num of train samples: %d", len(train_examples))
  file_based_convert_examples_to_features(train_examples, label_list,
                                          FLAGS.max_seq_length, tokenize_fn,
                                          train_file, FLAGS.num_passes)
  if FLAGS.eval_split == "dev":
    eval_examples = processor.get_dev_examples(FLAGS.data_dir)
  else:
    eval_examples = processor.get_test_examples(FLAGS.data_dir)
  logging.info("Num of eval samples: %d", len(eval_examples))
  # TPU requires a fixed batch size for all batches, therefore the number
  # of examples must be a multiple of the batch size, or else examples
  # will get dropped. So we pad with fake examples which are ignored
  # later on. These do NOT count towards the metric (all tf.metrics
  # support a per-instance weight, and these get a weight of 0.0).
  #
  # Modified in XL: We also adopt the same mechanism for GPUs.
  while len(eval_examples) % FLAGS.eval_batch_size != 0:
    eval_examples.append(classifier_utils.PaddingInputExample())
  eval_file_base = "{}.len-{}.{}.eval.tf_record".format(spm_basename,
                                                        FLAGS.max_seq_length,
                                                        FLAGS.eval_split)
  eval_file = os.path.join(FLAGS.output_dir, eval_file_base)
  file_based_convert_examples_to_features(eval_examples, label_list,
                                          FLAGS.max_seq_length, tokenize_fn,
                                          eval_file)
if __name__ == "__main__":
assert tf.version.VERSION.startswith('2.')
app.run(main)
This diff is collapsed.
# coding=utf-8
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Script to pre-process SQUAD data into tfrecords."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import pickle
import random
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
import sentencepiece as spm
from official.nlp.xlnet import squad_utils
# Sharding flags: the raw data can be split across several preprocessing
# processes; each process handles examples[proc_id::num_proc].
flags.DEFINE_integer(
    "num_proc", default=1, help="Number of preprocessing processes.")
flags.DEFINE_integer("proc_id", default=0, help="Process id for preprocessing.")

# I/O paths
flags.DEFINE_string("output_dir", default="", help="Output dir for TF records.")
flags.DEFINE_string(
    "spiece_model_file", default="", help="Sentence Piece model path.")
flags.DEFINE_string("train_file", default="", help="Path of train file.")
flags.DEFINE_string("predict_file", default="", help="Path of prediction file.")

# Data preprocessing config
flags.DEFINE_integer("max_seq_length", default=512, help="Max sequence length")
flags.DEFINE_integer("max_query_length", default=64, help="Max query length")
flags.DEFINE_integer("doc_stride", default=128, help="Doc stride")
flags.DEFINE_bool("uncased", default=False, help="Use uncased data.")
flags.DEFINE_bool(
    "create_train_data", default=True, help="Whether to create training data.")
flags.DEFINE_bool(
    "create_eval_data", default=False, help="Whether to create eval data.")

FLAGS = flags.FLAGS
def _get_spm_basename():
  """Returns the basename of the configured SentencePiece model file."""
  return os.path.basename(FLAGS.spiece_model_file)
def preprocess():
  """Preprocesses SQUAD data.

  Reads raw examples from FLAGS.train_file / FLAGS.predict_file, converts
  them to features, and writes train and/or eval tfrecords (plus a pickled
  feature list for eval) into FLAGS.output_dir, depending on
  FLAGS.create_train_data / FLAGS.create_eval_data.
  """
  sp_model = spm.SentencePieceProcessor()
  sp_model.Load(FLAGS.spiece_model_file)
  spm_basename = _get_spm_basename()
  if FLAGS.create_train_data:
    train_rec_file = os.path.join(
        FLAGS.output_dir,
        "{}.{}.slen-{}.qlen-{}.train.tf_record".format(spm_basename,
                                                       FLAGS.proc_id,
                                                       FLAGS.max_seq_length,
                                                       FLAGS.max_query_length))
    logging.info("Read examples from %s", FLAGS.train_file)
    train_examples = squad_utils.read_squad_examples(
        FLAGS.train_file, is_training=True)
    # Keep only this process's shard of the examples.
    train_examples = train_examples[FLAGS.proc_id::FLAGS.num_proc]
    # Pre-shuffle the input to avoid having to make a very large shuffle
    # buffer in the `input_fn`.
    random.shuffle(train_examples)
    write_to_logging = "Write to " + train_rec_file
    logging.info(write_to_logging)
    train_writer = squad_utils.FeatureWriter(
        filename=train_rec_file, is_training=True)
    squad_utils.convert_examples_to_features(
        examples=train_examples,
        sp_model=sp_model,
        max_seq_length=FLAGS.max_seq_length,
        doc_stride=FLAGS.doc_stride,
        max_query_length=FLAGS.max_query_length,
        is_training=True,
        output_fn=train_writer.process_feature,
        uncased=FLAGS.uncased)
    train_writer.close()
  if FLAGS.create_eval_data:
    eval_examples = squad_utils.read_squad_examples(
        FLAGS.predict_file, is_training=False)
    eval_rec_file = os.path.join(
        FLAGS.output_dir,
        "{}.slen-{}.qlen-{}.eval.tf_record".format(spm_basename,
                                                   FLAGS.max_seq_length,
                                                   FLAGS.max_query_length))
    eval_feature_file = os.path.join(
        FLAGS.output_dir,
        "{}.slen-{}.qlen-{}.eval.features.pkl".format(spm_basename,
                                                      FLAGS.max_seq_length,
                                                      FLAGS.max_query_length))
    eval_writer = squad_utils.FeatureWriter(
        filename=eval_rec_file, is_training=False)
    eval_features = []

    def append_feature(feature):
      # Keep features in memory (for the pickle below) while also
      # streaming them to the tfrecord writer.
      eval_features.append(feature)
      eval_writer.process_feature(feature)

    squad_utils.convert_examples_to_features(
        examples=eval_examples,
        sp_model=sp_model,
        max_seq_length=FLAGS.max_seq_length,
        doc_stride=FLAGS.doc_stride,
        max_query_length=FLAGS.max_query_length,
        is_training=False,
        output_fn=append_feature,
        uncased=FLAGS.uncased)
    eval_writer.close()
    # Persist eval features so downstream evaluation can map predictions
    # back to examples.
    with tf.io.gfile.GFile(eval_feature_file, "wb") as fout:
      pickle.dump(eval_features, fout)
def main(_):
  """Entry point: configures logging, ensures output dir exists, preprocesses."""
  logging.set_verbosity(logging.INFO)
  if not tf.io.gfile.exists(FLAGS.output_dir):
    # Use makedirs (not mkdir) so a missing parent directory does not make
    # the job fail before preprocessing even starts.
    tf.io.gfile.makedirs(FLAGS.output_dir)
  preprocess()


if __name__ == "__main__":
  app.run(main)
# coding=utf-8
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for pre-processing."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unicodedata
import six
SPIECE_UNDERLINE = '▁'
def printable_text(text):
  """Coerces `text` to the native `str` type for printing or `tf.logging`.

  On Python 3 this means a unicode `str`; on Python 2 a utf-8 encoded byte
  `str`. Raises ValueError for any non-string input.
  """
  if six.PY3:
    if isinstance(text, str):
      return text
    if isinstance(text, bytes):
      return text.decode('utf-8', 'ignore')
    raise ValueError('Unsupported string type: %s' % (type(text)))
  if six.PY2:
    if isinstance(text, str):
      return text
    if isinstance(text, unicode):
      return text.encode('utf-8')
    raise ValueError('Unsupported string type: %s' % (type(text)))
  raise ValueError('Not running on Python2 or Python 3?')
def print_(*args):
  """Prints the arguments, converting each (and list elements) via printable_text."""
  converted = []
  for arg in args:
    if isinstance(arg, list):
      converted.append(' '.join(printable_text(item) for item in arg))
    else:
      converted.append(printable_text(arg))
  print(*converted)
def preprocess_text(inputs, lower=False, remove_space=True, keep_accents=False):
  """Normalizes text: whitespace, LaTeX-style quotes, accents and casing."""
  # Collapse runs of whitespace and strip the ends, unless disabled.
  text = ' '.join(inputs.strip().split()) if remove_space else inputs
  text = text.replace('``', '"').replace("''", '"')
  if six.PY2 and isinstance(text, str):
    text = text.decode('utf-8')
  if not keep_accents:
    # Decompose characters and drop the combining (accent) marks.
    decomposed = unicodedata.normalize('NFKD', text)
    text = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
  if lower:
    text = text.lower()
  return text
def encode_pieces(sp_model, text, return_unicode=True, sample=False):
  """Encodes `text` into SentencePiece tokens.

  Pieces that look like digits followed by a comma (e.g. "1934,") are split
  so the comma becomes its own token and the digits are re-tokenized.

  Args:
    sp_model: a loaded SentencePiece model (must provide `EncodeAsPieces` and
      `SampleEncodeAsPieces`).
    text: the string to tokenize.
    return_unicode: Python 2 only — convert byte pieces back to unicode.
    sample: if True, sample a segmentation instead of the deterministic one.

  Returns:
    A list of piece strings.
  """
  # return_unicode is used only for py2
  if six.PY2 and isinstance(text, unicode):
    text = text.encode('utf-8')
  if not sample:
    pieces = sp_model.EncodeAsPieces(text)
  else:
    # Sampled segmentation; 64 and 0.1 are presumably nbest_size and alpha —
    # see the SentencePiece API docs to confirm.
    pieces = sp_model.SampleEncodeAsPieces(text, 64, 0.1)
  new_pieces = []
  for piece in pieces:
    # Detect pieces ending in "<digit>," and split off the trailing comma.
    if len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit():
      # Re-tokenize the digits without the word-boundary marker.
      cur_pieces = sp_model.EncodeAsPieces(
          piece[:-1].replace(SPIECE_UNDERLINE, ''))
      # If the original piece did not start a word but the re-tokenization
      # added the boundary marker, strip that marker from the first sub-piece.
      if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
        if len(cur_pieces[0]) == 1:
          cur_pieces = cur_pieces[1:]
        else:
          cur_pieces[0] = cur_pieces[0][1:]
      cur_pieces.append(piece[-1])
      new_pieces.extend(cur_pieces)
    else:
      new_pieces.append(piece)
  # note(zhiliny): convert back to unicode for py2
  if six.PY2 and return_unicode:
    ret_pieces = []
    for piece in new_pieces:
      if isinstance(piece, str):
        piece = piece.decode('utf-8')
      ret_pieces.append(piece)
    new_pieces = ret_pieces
  return new_pieces
def encode_ids(sp_model, text, sample=False):
  """Encodes `text` into a list of SentencePiece token ids."""
  return [
      sp_model.PieceToId(piece) for piece in encode_pieces(
          sp_model, text, return_unicode=False, sample=sample)
  ]
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""XLNet classification finetuning runner in tf2.0."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import functools
from absl import app
from absl import flags
from absl import logging
import numpy as np
import tensorflow as tf
# pylint: disable=unused-import
# Initialize TPU System.
from official.nlp import xlnet_config
from official.nlp import xlnet_modeling as modeling
from official.nlp.xlnet import common_flags
from official.nlp.xlnet import data_utils
from official.nlp.xlnet import optimization
from official.nlp.xlnet import training_utils
# Number of target classes for the classification head.
flags.DEFINE_integer("n_class", default=2, help="Number of classes.")
FLAGS = flags.FLAGS
def get_classificationxlnet_model(model_config, run_config, n_class):
  """Builds a ClassificationXLNetModel from the given configs."""
  return modeling.ClassificationXLNetModel(
      model_config, run_config, n_class, name="model")
def run_evaluation(strategy,
                   test_input_fn,
                   eval_steps,
                   model,
                   step,
                   eval_summary_writer=None):
  """Runs evaluation for the classification task.

  The validation set contains fake padding samples, so the per-example
  `is_real_example` mask is used to exclude them from the accuracy. Because
  the masked selection yields dynamic-shape tensors, logits, labels and masks
  are collected from all replicas and the accuracy is computed in numpy on
  the host.

  Args:
    strategy: distribution strategy.
    test_input_fn: input function for evaluation data.
    eval_steps: total number of evaluation steps.
    model: keras model object.
    step: current train step, used to tag the summary.
    eval_summary_writer: optional summary writer to record eval metrics.
  """

  def _test_step_fn(inputs):
    """Replicated validation step."""
    inputs["mems"] = None
    _, logits = model(inputs, training=False)
    return logits, inputs["label_ids"], inputs["is_real_example"]

  @tf.function
  def _run_evaluation(test_iterator):
    """Runs validation steps."""
    logits, labels, masks = strategy.experimental_run_v2(
        _test_step_fn, args=(next(test_iterator),))
    return logits, labels, masks

  # pylint: disable=protected-access
  test_iterator = data_utils._get_input_iterator(test_input_fn, strategy)
  # pylint: enable=protected-access
  correct = 0
  total = 0
  for _ in range(eval_steps):
    logits, labels, masks = _run_evaluation(test_iterator)
    logits = strategy.experimental_local_results(logits)
    labels = strategy.experimental_local_results(labels)
    masks = strategy.experimental_local_results(masks)
    merged_logits = []
    merged_labels = []
    merged_masks = []
    # Gather the per-replica results on the host.
    for i in range(strategy.num_replicas_in_sync):
      merged_logits.append(logits[i].numpy())
      merged_labels.append(labels[i].numpy())
      merged_masks.append(masks[i].numpy())
    merged_logits = np.vstack(np.array(merged_logits))
    merged_labels = np.hstack(np.array(merged_labels))
    merged_masks = np.hstack(np.array(merged_masks))
    # Only real (mask == 1) examples count towards the accuracy.
    real_index = np.where(np.equal(merged_masks, 1))
    correct += np.sum(
        np.equal(
            np.argmax(merged_logits[real_index], axis=-1),
            merged_labels[real_index]))
    total += np.shape(real_index)[-1]
  # Guard against division by zero when every example was a padding example.
  accuracy = float(correct) / float(total) if total else 0.0
  logging.info("Train step: %d / acc = %d/%d = %f", step, correct, total,
               accuracy)
  if eval_summary_writer:
    with eval_summary_writer.as_default():
      tf.summary.scalar("eval_acc", accuracy, step=step)
      eval_summary_writer.flush()
def get_metric_fn():
  """Returns a Keras metric that tracks sparse categorical accuracy."""
  return tf.keras.metrics.SparseCategoricalAccuracy("acc", dtype=tf.float32)
def get_primary_cpu_task(use_remote_tpu=False):
  """Returns primary CPU task to which input pipeline Ops are put."""
  # Remote Eager Borg job configures the TPU worker with job name 'worker'.
  if use_remote_tpu:
    return "/job:worker"
  return ""
def main(unused_argv):
  """Runs XLNet classification finetuning: wires inputs, model, and training."""
  del unused_argv
  use_remote_tpu = False
  if FLAGS.strategy_type == "mirror":
    strategy = tf.distribute.MirroredStrategy()
  elif FLAGS.strategy_type == "tpu":
    # NOTE(review): `tpu_lib` is not imported in this module — confirm the
    # TPU code path is supported before relying on it.
    cluster_resolver = tpu_lib.tpu_initialize(FLAGS.tpu)
    strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver)
    use_remote_tpu = True
  else:
    raise ValueError("The distribution strategy type is not supported: %s" %
                     FLAGS.strategy_type)
  if strategy:
    logging.info("***** Number of cores used : %d",
                 strategy.num_replicas_in_sync)
  train_input_fn = functools.partial(data_utils.get_classification_input_data,
                                     FLAGS.train_batch_size, FLAGS.seq_len,
                                     strategy, True, FLAGS.train_tfrecord_path)
  test_input_fn = functools.partial(data_utils.get_classification_input_data,
                                    FLAGS.test_batch_size, FLAGS.seq_len,
                                    strategy, False, FLAGS.test_tfrecord_path)
  total_training_steps = FLAGS.train_steps
  steps_per_epoch = int(FLAGS.train_data_size / FLAGS.train_batch_size)
  steps_per_loop = FLAGS.iterations
  eval_steps = int(FLAGS.test_data_size / FLAGS.test_batch_size)
  eval_fn = functools.partial(run_evaluation, strategy, test_input_fn,
                              eval_steps)
  optimizer, learning_rate_fn = optimization.create_optimizer(
      FLAGS.learning_rate,
      total_training_steps,
      FLAGS.warmup_steps,
      adam_epsilon=FLAGS.adam_epsilon)
  model_config = xlnet_config.XLNetConfig(FLAGS)
  run_config = xlnet_config.create_run_config(True, False, FLAGS)
  model_fn = functools.partial(get_classificationxlnet_model, model_config,
                               run_config, FLAGS.n_class)
  # Meta data consumed by training_utils.train (memory cache, layer-wise LR
  # decay, and logits initialization).
  input_meta_data = {}
  input_meta_data["d_model"] = FLAGS.d_model
  input_meta_data["mem_len"] = FLAGS.mem_len
  input_meta_data["batch_size_per_core"] = int(FLAGS.train_batch_size /
                                               strategy.num_replicas_in_sync)
  input_meta_data["n_layer"] = FLAGS.n_layer
  input_meta_data["lr_layer_decay_rate"] = FLAGS.lr_layer_decay_rate
  input_meta_data["n_class"] = FLAGS.n_class
  # Replaced a leftover `print("DEBUG: ...")` with structured logging.
  logging.info("Input meta data: %s", input_meta_data)

  def logits_init_fn():
    """Creates a dummy per-core logits tensor used to seed the train loop."""
    return tf.zeros(
        shape=(input_meta_data["batch_size_per_core"],
               input_meta_data["n_class"]),
        dtype=tf.float32)

  with tf.device(get_primary_cpu_task(use_remote_tpu)):
    training_utils.train(
        strategy=strategy,
        model_fn=model_fn,
        input_meta_data=input_meta_data,
        eval_fn=eval_fn,
        metric_fn=get_metric_fn,
        logits_init_fn=logits_init_fn,
        train_input_fn=train_input_fn,
        test_input_fn=test_input_fn,
        init_checkpoint=FLAGS.init_checkpoint,
        total_training_steps=total_training_steps,
        steps_per_epoch=steps_per_epoch,
        steps_per_loop=steps_per_loop,
        optimizer=optimizer,
        learning_rate_fn=learning_rate_fn,
        model_dir=FLAGS.model_dir)


if __name__ == "__main__":
  assert tf.version.VERSION.startswith('2.')
  app.run(main)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""XLNet classification finetuning runner in tf2.0."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import functools
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
# pylint: disable=unused-import
# Initialize TPU System.
from official.nlp import xlnet_config
from official.nlp import xlnet_modeling as modeling
from official.nlp.xlnet import common_flags
from official.nlp.xlnet import data_utils
from official.nlp.xlnet import optimization
from official.nlp.xlnet import training_utils
# Masking hyperparameters for the pretraining input pipeline.
flags.DEFINE_integer(
    "mask_alpha", default=6, help="How many tokens to form a group.")
flags.DEFINE_integer(
    "mask_beta", default=1, help="How many tokens to mask within each group.")
flags.DEFINE_integer(
    "num_predict",
    default=None,
    help="Number of tokens to predict in partial prediction.")
# Use keyword `default=` for consistency with the flag definitions above.
flags.DEFINE_integer("perm_size", default=0, help="Window size of permutation.")
FLAGS = flags.FLAGS
def get_pretrainxlnet_model(model_config, run_config):
  """Builds a PretrainingXLNetModel from the given configs."""
  return modeling.PretrainingXLNetModel(model_config, run_config, name="model")
def get_primary_cpu_task(use_remote_tpu=False):
  """Returns the device string that should host the input pipeline ops."""
  # Remote Eager Borg job configures the TPU worker with job name 'worker'.
  worker_device = "/job:worker"
  return worker_device if use_remote_tpu else ""
def main(unused_argv):
  """Runs XLNet pretraining under the selected distribution strategy."""
  del unused_argv
  use_remote_tpu = False
  num_hosts = 1
  if FLAGS.strategy_type == "mirror":
    strategy = tf.distribute.MirroredStrategy()
  elif FLAGS.strategy_type == "tpu":
    # NOTE(review): `tpu_lib` is not imported in this module — confirm the
    # TPU code path before relying on it.
    cluster_resolver = tpu_lib.tpu_initialize(FLAGS.tpu)
    strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver)
    use_remote_tpu = True
    # Core count is derived from an "AxB" topology string as 2 * A * B.
    topology = FLAGS.tpu_topology.split("x")
    total_num_core = 2 * int(topology[0]) * int(topology[1])
    num_hosts = total_num_core // FLAGS.num_core_per_host
  else:
    raise ValueError("The distribution strategy type is not supported: %s" %
                     FLAGS.strategy_type)
  if strategy:
    logging.info("***** Number of cores used : %d",
                 strategy.num_replicas_in_sync)
    logging.info("***** Number of hosts used : %d",
                 num_hosts)
  train_input_fn = functools.partial(
      data_utils.get_pretrain_input_data, FLAGS.train_batch_size, FLAGS.seq_len,
      strategy, FLAGS.train_tfrecord_path, FLAGS.reuse_len, FLAGS.perm_size,
      FLAGS.mask_alpha, FLAGS.mask_beta, FLAGS.num_predict, FLAGS.bi_data,
      FLAGS.uncased, num_hosts)
  total_training_steps = FLAGS.train_steps
  steps_per_epoch = int(FLAGS.train_data_size / FLAGS.train_batch_size)
  steps_per_loop = FLAGS.iterations
  optimizer, learning_rate_fn = optimization.create_optimizer(
      init_lr=FLAGS.learning_rate,
      num_train_steps=total_training_steps,
      num_warmup_steps=FLAGS.warmup_steps,
      min_lr_ratio=FLAGS.min_lr_ratio,
      adam_epsilon=FLAGS.adam_epsilon,
      weight_decay_rate=FLAGS.weight_decay_rate)
  model_config = xlnet_config.XLNetConfig(FLAGS)
  run_config = xlnet_config.create_run_config(True, False, FLAGS)
  # Meta data consumed by training_utils.train (memory cache and LR decay).
  input_meta_data = {}
  input_meta_data["d_model"] = FLAGS.d_model
  input_meta_data["mem_len"] = FLAGS.mem_len
  input_meta_data["batch_size_per_core"] = int(FLAGS.train_batch_size /
                                               strategy.num_replicas_in_sync)
  input_meta_data["n_layer"] = FLAGS.n_layer
  input_meta_data["lr_layer_decay_rate"] = FLAGS.lr_layer_decay_rate
  model_fn = functools.partial(get_pretrainxlnet_model, model_config,
                               run_config)
  def logits_init_fn():
    # Dummy logits tensor used to seed the training loop.
    return tf.zeros(
        shape=(FLAGS.num_predict, input_meta_data["batch_size_per_core"],
               FLAGS.d_model),
        dtype=tf.float32)
  with tf.device(get_primary_cpu_task(use_remote_tpu)):
    training_utils.train(
        strategy=strategy,
        model_fn=model_fn,
        input_meta_data=input_meta_data,
        eval_fn=None,
        metric_fn=None,
        logits_init_fn=logits_init_fn,
        train_input_fn=train_input_fn,
        test_input_fn=None,
        init_checkpoint=FLAGS.init_checkpoint,
        total_training_steps=total_training_steps,
        steps_per_epoch=steps_per_epoch,
        steps_per_loop=steps_per_loop,
        optimizer=optimizer,
        learning_rate_fn=learning_rate_fn,
        model_dir=FLAGS.model_dir,
        save_steps=FLAGS.save_steps)
if __name__ == "__main__":
  assert tf.version.VERSION.startswith('2.')
  app.run(main)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""XLNet SQUAD finetuning runner in tf2.0."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import functools
import json
import os
import pickle
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
# pylint: disable=unused-import
# Initialize TPU System.
from official.nlp import xlnet_config
from official.nlp import xlnet_modeling as modeling
from official.nlp.xlnet import common_flags
from official.nlp.xlnet import data_utils
from official.nlp.xlnet import optimization
from official.nlp.xlnet import squad_utils
from official.nlp.xlnet import training_utils
# Evaluation/prediction configuration for SQuAD finetuning.
flags.DEFINE_string(
    "test_feature_path", default=None, help="Path to feature of test set.")
flags.DEFINE_integer("query_len", default=64, help="Max query length.")
# Beam sizes for the start/end span search.
flags.DEFINE_integer("start_n_top", default=5, help="Beam size for span start.")
flags.DEFINE_integer("end_n_top", default=5, help="Beam size for span end.")
flags.DEFINE_string(
    "predict_dir", default=None, help="Path to write predictions.")
flags.DEFINE_string(
    "predict_file", default=None, help="Path to json file of test set.")
flags.DEFINE_integer(
    "n_best_size", default=5, help="n best size for predictions.")
flags.DEFINE_integer("max_answer_length", default=64, help="Max answer length.")
FLAGS = flags.FLAGS
class InputFeatures(object):
  """A single set of features of data.

  Plain value holder: every constructor argument is stored verbatim as an
  attribute of the same name.
  """

  def __init__(self,
               unique_id,
               example_index,
               doc_span_index,
               tok_start_to_orig_index,
               tok_end_to_orig_index,
               token_is_max_context,
               input_ids,
               input_mask,
               p_mask,
               segment_ids,
               paragraph_len,
               cls_index,
               start_position=None,
               end_position=None,
               is_impossible=None):
    # Copy each argument onto the instance under its own name.
    params = dict(locals())
    del params["self"]
    for name, value in params.items():
      setattr(self, name, value)
def get_primary_cpu_task(use_remote_tpu=False):
  """Returns primary CPU task to which input pipeline Ops are put."""
  # A remote Eager Borg job names its TPU worker job 'worker'.
  if not use_remote_tpu:
    return ""
  return "/job:worker"
# pylint: disable=unused-argument
def run_evaluation(strategy,
                   test_input_fn,
                   eval_steps,
                   input_meta_data,
                   model,
                   step,
                   eval_summary_writer=None):
  """Run evaluation for SQUAD task.

  Collects per-replica span predictions, converts them to
  `squad_utils.RawResult`s, and writes prediction/n-best/null-odds files via
  `squad_utils.write_predictions`.

  Args:
    strategy: distribution strategy.
    test_input_fn: input function for evaluation data.
    eval_steps: total number of evaluation steps.
    input_meta_data: input meta data (predict file/dir, eval features,
      beam sizes, batch size, etc.).
    model: keras model object.
    step: current training step, used to tag the summaries.
    eval_summary_writer: summary writer used to record evaluation metrics.
  """

  def _test_step_fn(inputs):
    """Replicated validation step."""
    inputs["mems"] = None
    res = model(inputs, training=False)
    return res, inputs["unique_ids"]

  @tf.function
  def _run_evaluation(test_iterator):
    """Runs validation steps."""
    res, unique_ids = strategy.experimental_run_v2(
        _test_step_fn, args=(next(test_iterator),))
    return res, unique_ids

  # pylint: disable=protected-access
  test_iterator = data_utils._get_input_iterator(test_input_fn, strategy)
  # pylint: enable=protected-access
  cur_results = []
  eval_examples = squad_utils.read_squad_examples(
      input_meta_data["predict_file"], is_training=False)
  with tf.io.gfile.GFile(input_meta_data["predict_file"]) as f:
    orig_data = json.load(f)["data"]
  for _ in range(eval_steps):
    results, unique_ids = _run_evaluation(test_iterator)
    # Pull every per-replica tensor back to the host.
    unique_ids = strategy.experimental_local_results(unique_ids)
    for result_key in results:
      results[result_key] = (
          strategy.experimental_local_results(results[result_key]))
    for core_i in range(strategy.num_replicas_in_sync):
      bsz = int(input_meta_data["test_batch_size"] /
                strategy.num_replicas_in_sync)
      for j in range(bsz):
        result = {}
        for result_key in results:
          result[result_key] = results[result_key][core_i].numpy()[j]
        result["unique_ids"] = unique_ids[core_i].numpy()[j]
        # We appended a fake example into dev set to make data size can be
        # divided by test_batch_size. Ignores this fake example during
        # evaluation.
        # NOTE(review): the fake example's id is hard-coded — confirm it
        # matches the id used when the eval data was generated.
        if result["unique_ids"] == 1000012047:
          continue
        unique_id = int(result["unique_ids"])
        start_top_log_probs = ([
            float(x) for x in result["start_top_log_probs"].flat
        ])
        start_top_index = [int(x) for x in result["start_top_index"].flat]
        end_top_log_probs = ([
            float(x) for x in result["end_top_log_probs"].flat
        ])
        end_top_index = [int(x) for x in result["end_top_index"].flat]
        cls_logits = float(result["cls_logits"].flat[0])
        cur_results.append(
            squad_utils.RawResult(
                unique_id=unique_id,
                start_top_log_probs=start_top_log_probs,
                start_top_index=start_top_index,
                end_top_log_probs=end_top_log_probs,
                end_top_index=end_top_index,
                cls_logits=cls_logits))
        if len(cur_results) % 1000 == 0:
          logging.info("Processing example: %d", len(cur_results))
  output_prediction_file = os.path.join(input_meta_data["predict_dir"],
                                        "predictions.json")
  output_nbest_file = os.path.join(input_meta_data["predict_dir"],
                                   "nbest_predictions.json")
  output_null_log_odds_file = os.path.join(input_meta_data["predict_dir"],
                                           "null_odds.json")
  ret = squad_utils.write_predictions(
      eval_examples, input_meta_data["eval_features"], cur_results,
      input_meta_data["n_best_size"], input_meta_data["max_answer_length"],
      output_prediction_file, output_nbest_file, output_null_log_odds_file,
      orig_data, input_meta_data["start_n_top"], input_meta_data["end_n_top"])
  # Log current result
  log_str = "Result | "
  for key, val in ret.items():
    log_str += "{} {} | ".format(key, val)
  logging.info(log_str)
  if eval_summary_writer:
    with eval_summary_writer.as_default():
      tf.summary.scalar("best_f1", ret["best_f1"], step=step)
      tf.summary.scalar("best_exact", ret["best_exact"], step=step)
      eval_summary_writer.flush()
def get_qaxlnet_model(model_config, run_config, start_n_top, end_n_top):
  """Builds a QAXLNetModel for SQuAD span prediction."""
  return modeling.QAXLNetModel(
      model_config,
      run_config,
      start_n_top=start_n_top,
      end_n_top=end_n_top,
      name="model")
def main(unused_argv):
  """Runs XLNet SQuAD finetuning under the selected distribution strategy."""
  del unused_argv
  use_remote_tpu = False
  if FLAGS.strategy_type == "mirror":
    strategy = tf.distribute.MirroredStrategy()
  elif FLAGS.strategy_type == "tpu":
    # NOTE(review): `tpu_lib` is not imported in this module — confirm the
    # TPU code path before relying on it.
    cluster_resolver = tpu_lib.tpu_initialize(FLAGS.tpu)
    strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver)
    use_remote_tpu = True
  else:
    raise ValueError("The distribution strategy type is not supported: %s" %
                     FLAGS.strategy_type)
  if strategy:
    logging.info("***** Number of cores used : %d",
                 strategy.num_replicas_in_sync)
  train_input_fn = functools.partial(data_utils.get_squad_input_data,
                                     FLAGS.train_batch_size, FLAGS.seq_len,
                                     FLAGS.query_len, strategy, True,
                                     FLAGS.train_tfrecord_path)
  test_input_fn = functools.partial(data_utils.get_squad_input_data,
                                    FLAGS.test_batch_size, FLAGS.seq_len,
                                    FLAGS.query_len, strategy, False,
                                    FLAGS.test_tfrecord_path)
  total_training_steps = FLAGS.train_steps
  steps_per_epoch = int(FLAGS.train_data_size / FLAGS.train_batch_size)
  steps_per_loop = FLAGS.iterations
  eval_steps = int(FLAGS.test_data_size / FLAGS.test_batch_size)
  optimizer, learning_rate_fn = optimization.create_optimizer(
      FLAGS.learning_rate,
      total_training_steps,
      FLAGS.warmup_steps,
      adam_epsilon=FLAGS.adam_epsilon)
  model_config = xlnet_config.XLNetConfig(FLAGS)
  run_config = xlnet_config.create_run_config(True, False, FLAGS)
  # Meta data consumed by training_utils.train and run_evaluation.
  input_meta_data = {}
  input_meta_data["start_n_top"] = FLAGS.start_n_top
  input_meta_data["end_n_top"] = FLAGS.end_n_top
  input_meta_data["lr_layer_decay_rate"] = FLAGS.lr_layer_decay_rate
  input_meta_data["predict_dir"] = FLAGS.predict_dir
  input_meta_data["predict_file"] = FLAGS.predict_file
  input_meta_data["n_best_size"] = FLAGS.n_best_size
  input_meta_data["max_answer_length"] = FLAGS.max_answer_length
  input_meta_data["test_feature_path"] = FLAGS.test_feature_path
  input_meta_data["test_batch_size"] = FLAGS.test_batch_size
  input_meta_data["batch_size_per_core"] = int(FLAGS.train_batch_size /
                                               strategy.num_replicas_in_sync)
  input_meta_data["mem_len"] = FLAGS.mem_len
  model_fn = functools.partial(get_qaxlnet_model, model_config, run_config,
                               FLAGS.start_n_top, FLAGS.end_n_top)
  def logits_init_fn():
    # Dummy logits tensor used to seed the training loop.
    return tf.zeros(
        shape=(input_meta_data["batch_size_per_core"]), dtype=tf.float32)
  # Pre-computed eval features are loaded once and shared with run_evaluation.
  logging.info("start reading pickle file...")
  with tf.io.gfile.GFile(input_meta_data["test_feature_path"], "rb") as f:
    eval_features = pickle.load(f)
  logging.info("finishing reading pickle file...")
  input_meta_data["eval_features"] = eval_features
  eval_fn = functools.partial(run_evaluation, strategy, test_input_fn,
                              eval_steps, input_meta_data)
  with tf.device(get_primary_cpu_task(use_remote_tpu)):
    training_utils.train(
        strategy=strategy,
        model_fn=model_fn,
        input_meta_data=input_meta_data,
        eval_fn=eval_fn,
        metric_fn=None,
        logits_init_fn=logits_init_fn,
        train_input_fn=train_input_fn,
        test_input_fn=test_input_fn,
        init_checkpoint=FLAGS.init_checkpoint,
        total_training_steps=total_training_steps,
        steps_per_epoch=steps_per_epoch,
        steps_per_loop=steps_per_loop,
        optimizer=optimizer,
        learning_rate_fn=learning_rate_fn,
        model_dir=FLAGS.model_dir)
if __name__ == "__main__":
  assert tf.version.VERSION.startswith('2.')
  app.run(main)
This diff is collapsed.
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""XLNet classification finetuning runner in tf2.0."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import os
import re
from absl import logging
# pytype: disable=attribute-error
# pylint: disable=g-bare-generic,unused-import
import tensorflow as tf
# Initialize TPU System.
from official.nlp.xlnet import data_utils
from official.nlp import xlnet_modeling as modeling
from typing import Any, Callable, Dict, Text, Optional
_MIN_SUMMARY_STEPS = 10
def _save_checkpoint(checkpoint, model_dir, checkpoint_prefix):
  """Saves the model checkpoint with the provided prefix under `model_dir`.

  Args:
    checkpoint: a tf.train.Checkpoint wrapping the objects to save.
    model_dir: directory in which to write the checkpoint files.
    checkpoint_prefix: filename prefix for the saved checkpoint.
  """
  checkpoint_path = os.path.join(model_dir, checkpoint_prefix)
  saved_path = checkpoint.save(checkpoint_path)
  logging.info("Saving model as TF checkpoint: %s", saved_path)
def _float_metric_value(metric):
"""Gets the value of a float-value keras metric."""
return metric.result().numpy().astype(float)
def _steps_to_run(current_step, steps_per_epoch, steps_per_loop):
"""Calculates steps to run on device."""
if steps_per_loop <= 0:
raise ValueError("steps_per_loop should be positive integer.")
if steps_per_loop == 1:
return steps_per_loop
remainder_in_epoch = current_step % steps_per_epoch
if remainder_in_epoch != 0:
return min(steps_per_epoch - remainder_in_epoch, steps_per_loop)
else:
return steps_per_loop
def train(
strategy: tf.distribute.Strategy,
model_fn: Callable,
input_meta_data: Dict,
logits_init_fn: Callable[[], tf.Tensor],
train_input_fn: Callable,
total_training_steps: int,
steps_per_epoch: int,
steps_per_loop: int,
optimizer: tf.keras.optimizers.Optimizer,
learning_rate_fn: tf.keras.optimizers.schedules.LearningRateSchedule,
eval_fn: Optional[Callable[[tf.keras.Model, int, tf.summary.SummaryWriter],
Any]] = None,
metric_fn: Optional[Callable[[], tf.keras.metrics.Metric]] = None,
test_input_fn: Optional[Callable] = None,
init_checkpoint: Optional[Text] = None,
model_dir: Optional[Text] = None,
save_steps: Optional[int] = None):
"""Runs customized training.
Args:
strategy: Distribution strategy on which to run low level training loop.
model_fn: The function returns a keras.Model.
input_meta_data: A dictionary of params: `mem_len`, `lr_layer_decay_rate`,
`n_layer`, `batch_size_per_core` and `d_model`.
logits_init_fn: Function creates a dummy logits tensor.
train_input_fn: Function returns a tf.data.Dataset used for training.
total_training_steps: Number of steps to train in total.
steps_per_epoch: Number of steps to run per epoch. At the end of each
epoch, model checkpoint will be saved and evaluation will be conducted
if evaluation dataset is provided.
steps_per_loop: Number of steps per graph-mode loop. In order to reduce
communication in eager context, training logs are printed every
steps_per_loop.
optimizer: The optimizer for model.
learning_rate_fn: the learning rate schedule.
eval_fn: A callback of evaluation function, that takes a keras.Model,
current step and evaluation summary writer.
metric_fn: A metrics function returns a Keras Metric object to record
evaluation result using evaluation dataset or with training dataset
after every epoch.
test_input_fn: Function returns a evaluation dataset. If none, evaluation
is skipped.
init_checkpoint: Optional checkpoint to load to `sub_model` returned by
`model_fn`.
model_dir: The directory of model (checkpoints, summaries).
save_steps: The frequency to save checkpoints. Every save_steps, we save a
model checkpoint.
Returns:
Last training step logits if training happens, otherwise returns None.
Raises:
TypeError: if model directory is not specified.
"""
required_arguments = [
logits_init_fn, train_input_fn, total_training_steps, steps_per_epoch,
steps_per_loop, optimizer, learning_rate_fn
]
if [arg for arg in required_arguments if arg is None]:
raise ValueError(
"`logits_init_fn`, `train_input_fn`, `total_training_steps`, "
"`steps_per_epoch`, `steps_per_loop`, `optimizer` and "
"`learning_rate_fn` are required parameters.")
if not model_dir:
raise TypeError("Model directory must be specified.")
# pylint: disable=protected-access
train_iterator = data_utils._get_input_iterator(train_input_fn, strategy)
# pylint: enable=protected-access
train_summary_writer = None
eval_summary_writer = None
if not tf.io.gfile.exists(model_dir):
tf.io.gfile.mkdir(model_dir)
if test_input_fn:
eval_summary_writer = tf.summary.create_file_writer(
os.path.join(model_dir, "summaries/eval"))
if steps_per_loop >= _MIN_SUMMARY_STEPS:
# Only writes summary when the stats are collected sufficiently over
# enough steps.
train_summary_writer = tf.summary.create_file_writer(
os.path.join(model_dir, "summaries/train"))
with strategy.scope():
model = model_fn()
if init_checkpoint:
logging.info("restore from %s", init_checkpoint)
checkpoint = tf.train.Checkpoint(model=model)
checkpoint.restore(init_checkpoint)
model.optimizer = optimizer
if not hasattr(model, "optimizer"):
raise ValueError("User should set optimizer attribute to model.")
train_loss_metric = tf.keras.metrics.Mean("training_loss", dtype=tf.float32)
train_metric = None
if metric_fn:
train_metric = metric_fn()
def _replicated_step(inputs, mem=None):
"""Replicated training step."""
inputs["mems"] = mem
with tf.GradientTape() as tape:
mem, logits = model(inputs, training=True)
loss = model.losses
train_loss_metric.update_state(loss)
if train_metric:
train_metric.update_state(inputs["label_ids"], logits)
scaled_loss = loss[0] * 1.0 / float(strategy.num_replicas_in_sync)
# Collects training variables.
tvars = model.trainable_variables
grads = tape.gradient(scaled_loss, tvars)
clipped, _ = tf.clip_by_global_norm(grads, clip_norm=1.0)
if input_meta_data["lr_layer_decay_rate"] != 1.0:
n_layer = 0
for i in range(len(clipped)):
m = re.search(r"model/transformer/layer_(\d+?)/", tvars[i].name)
if not m:
continue
n_layer = max(n_layer, int(m.group(1)) + 1)
for i in range(len(clipped)):
for l in range(n_layer):
if "model/transformer/layer_{}/".format(l) in tvars[i].name:
abs_rate = input_meta_data["lr_layer_decay_rate"]**(
n_layer - 1 - l)
clipped[i] *= abs_rate
logging.info("Apply mult {:.4f} to layer-{} grad of {}".format(
abs_rate, l, tvars[i].name))
break
optimizer.apply_gradients(zip(clipped, tvars))
if input_meta_data["mem_len"] > 0:
return mem, logits
else:
return logits
@tf.function
def train_steps(iterator, steps):
  """Performs distributed training steps in a loop.

  Wrapped in `tf.function`, so the Python loop below is traced once and the
  `tf.range` loop runs inside the graph on-host.

  Args:
    iterator: the distributed iterator of training datasets.
    steps: an tf.int32 integer tensor to specify number of steps to run
      inside host training loop. Passing a Python int would retrace the
      function on every distinct value.

  Raises:
    ValueError: Any of the arguments or tensor shapes are invalid.

  Returns:
    logits: logits computed by the last replicated step.
  """
  if not isinstance(steps, tf.Tensor):
    raise ValueError("steps should be an Tensor. Python object may cause "
                     "retracing.")

  def cache_fn():
    """Initializes memory tensor used in XLNet pretraining."""
    # One zero tensor of shape [mem_len, batch_size_per_core, d_model]
    # per transformer layer; empty list when memory is disabled.
    mems = []
    if input_meta_data["mem_len"] > 0:
      for _ in range(input_meta_data["n_layer"]):
        zeros = tf.zeros([
            input_meta_data["mem_len"],
            input_meta_data["batch_size_per_core"],
            input_meta_data["d_model"]
        ],
                         dtype=tf.float32)
        mems.append(zeros)
    return mems

  # NOTE(review): `logits_init_fn` is not defined anywhere in the visible
  # scope of this file — confirm it is defined in the enclosing function,
  # otherwise this raises NameError at trace time.
  logits = strategy.experimental_run_v2(logits_init_fn)
  if input_meta_data["mem_len"] > 0:
    # Pretraining with memory: thread the per-replica cache through steps.
    mem = strategy.experimental_run_v2(cache_fn)
    for _ in tf.range(steps):
      mem, logits = strategy.experimental_run_v2(
          _replicated_step, args=(
              next(iterator),
              mem,
          ))
  else:
    for _ in tf.range(steps):
      logits = strategy.experimental_run_v2(
          _replicated_step, args=(next(iterator),))
  return logits
logging.info("Start training...")
checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
latest_checkpoint_file = tf.train.latest_checkpoint(model_dir)
if latest_checkpoint_file:
logging.info("Checkpoint file %s found and restoring from checkpoint",
latest_checkpoint_file)
checkpoint.restore(latest_checkpoint_file)
logging.info("Loading from checkpoint file completed")
current_step = optimizer.iterations.numpy()
checkpoint_name = "xlnet_step_{step}.ckpt"
logits = None
while current_step < total_training_steps:
train_loss_metric.reset_states()
if train_metric:
train_metric.reset_states()
steps = _steps_to_run(current_step, steps_per_epoch, steps_per_loop)
logits = train_steps(train_iterator,
tf.convert_to_tensor(steps, dtype=tf.int32))
current_step += steps
train_loss = _float_metric_value(train_loss_metric)
log_stream = "Train step: %d/%d / lr = %.9f / loss = %.7f" % (
current_step, total_training_steps, learning_rate_fn(current_step),
train_loss)
if train_metric:
log_stream += " / %s = %f" % (train_metric.name,
_float_metric_value(train_metric))
logging.info(log_stream)
if train_summary_writer:
with train_summary_writer.as_default():
tf.summary.scalar(
"learning_rate",
learning_rate_fn(current_step),
step=current_step)
tf.summary.scalar(
train_loss_metric.name, train_loss, step=current_step)
if train_metric:
tf.summary.scalar(
train_metric.name,
_float_metric_value(train_metric),
step=current_step)
train_summary_writer.flush()
if model_dir:
if (save_steps is None) or (save_steps and
current_step % save_steps == 0):
_save_checkpoint(checkpoint, model_dir,
checkpoint_name.format(step=current_step))
if test_input_fn and current_step % steps_per_epoch == 0:
logging.info("Running evaluation after step: %s.", current_step)
eval_fn(model, current_step, eval_summary_writer)
if model_dir:
_save_checkpoint(checkpoint, model_dir,
checkpoint_name.format(step=current_step))
if test_input_fn:
logging.info("Running final evaluation after training is complete.")
eval_fn(model, current_step, eval_summary_writer)
return logits
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utility functions used in XLNet model."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import json
import os
import tensorflow as tf
def create_run_config(is_training, is_finetune, flags):
  """Builds a `RunConfig` from parsed command-line flags.

  Args:
    is_training: bool, whether the model is in training mode.
    is_finetune: bool, whether this run is fine-tuning. When False
      (pretraining), the memory/caching options (`mem_len`, `reuse_len`,
      `bi_data`, `same_length`) are additionally copied from the flags.
    flags: a parsed flags object carrying the hyperparameter values.

  Returns:
    A `RunConfig` instance populated from `flags`.
  """
  kwargs = dict(
      is_training=is_training,
      use_tpu=flags.use_tpu,
      use_bfloat16=flags.use_bfloat16,
      dropout=flags.dropout,
      dropout_att=flags.dropout_att,
      init_method=flags.init_method,
      init_range=flags.init_range,
      init_std=flags.init_std,
      clamp_len=flags.clamp_len)

  if not is_finetune:
    # Pretraining additionally configures the memory mechanism.
    # (`clamp_len` is already set above, so it is not repeated here.)
    kwargs.update(dict(
        mem_len=flags.mem_len,
        reuse_len=flags.reuse_len,
        bi_data=flags.bi_data,
        same_length=flags.same_length))

  return RunConfig(**kwargs)
class XLNetConfig(object):
  """Configs for XLNet model.

  XLNetConfig contains hyperparameters that are specific to a model
  checkpoint; i.e., these hyperparameters should be the same between
  pretraining and finetuning.

  The following hyperparameters are defined:
    n_layer: int, the number of layers.
    d_model: int, the hidden size.
    n_head: int, the number of attention heads.
    d_head: int, the dimension size of each attention head.
    d_inner: int, the hidden size in feed-forward layers.
    ff_activation: str, "relu" or "gelu".
    untie_r: bool, whether to untie the biases in attention.
    n_token: int, the vocab size.
  """

  def __init__(self, FLAGS=None, json_path=None, args_dict=None):
    """Constructing an XLNetConfig.

    One of FLAGS or json_path should be provided.

    Args:
      FLAGS: An FLAGS instance.
      json_path: A path to a json config file.
      args_dict: A dict for args.
    """
    assert FLAGS is not None or json_path is not None or args_dict is not None

    self.keys = ['n_layer', 'd_model', 'n_head', 'd_head', 'd_inner',
                 'ff_activation', 'untie_r', 'n_token']

    if FLAGS is not None:
      self.init_from_flags(FLAGS)

    if json_path is not None:
      self.init_from_json(json_path)

    if args_dict is not None:
      self.init_from_dict(args_dict)

  def init_from_dict(self, args_dict):
    """Constructs an `XLNetConfig` from a Python dictionary of parameters."""
    for key in self.keys:
      setattr(self, key, args_dict[key])

  def init_from_flags(self, flags):
    """Copies each config key from an already-parsed flags object."""
    for key in self.keys:
      setattr(self, key, getattr(flags, key))

  def init_from_json(self, json_path):
    """Loads config keys from a json file.

    Uses the TF2 `tf.io.gfile` API (the TF1 `tf.gfile` alias was removed in
    TensorFlow 2.x), so `json_path` may live on any filesystem TensorFlow
    supports (e.g. GCS).
    """
    with tf.io.gfile.GFile(json_path) as f:
      json_data = json.load(f)
      self.init_from_dict(json_data)

  def to_json(self, json_path):
    """Save XLNetConfig to a json file."""
    json_data = {}
    for key in self.keys:
      json_data[key] = getattr(self, key)

    json_dir = os.path.dirname(json_path)
    # Create the target directory if needed; TF2 filesystem API.
    if not tf.io.gfile.exists(json_dir):
      tf.io.gfile.makedirs(json_dir)
    with tf.io.gfile.GFile(json_path, 'w') as f:
      json.dump(json_data, f, indent=4, sort_keys=True)
class RunConfig(object):
  """Run-time hyperparameters for XLNet.

  RunConfig contains hyperparameters that could be different between
  pretraining and finetuning and may change from run to run; they are kept
  separate from `XLNetConfig` (checkpoint-tied hyperparameters) for
  flexibility.
  """

  def __init__(self,
               is_training,
               use_tpu,
               use_bfloat16,
               dropout,
               dropout_att,
               init_method='normal',
               init_range=0.1,
               init_std=0.02,
               mem_len=None,
               reuse_len=None,
               bi_data=False,
               clamp_len=-1,
               same_length=False):
    """Initializes RunConfig.

    Args:
      is_training: bool, whether in training mode.
      use_tpu: bool, whether TPUs are used.
      use_bfloat16: bool, use bfloat16 instead of float32.
      dropout: float, dropout rate.
      dropout_att: float, dropout rate on attention probabilities.
      init_method: str, the initialization scheme, either "normal" or
        "uniform".
      init_range: float, initialize the parameters with a uniform
        distribution in [-init_range, init_range]. Only effective when
        init_method="uniform".
      init_std: float, initialize the parameters with a normal distribution
        with mean 0 and stddev init_std. Only effective when
        init_method="normal".
      mem_len: int, the number of tokens to cache.
      reuse_len: int, the number of tokens in the current batch to be cached
        and reused in the future.
      bi_data: bool, whether to use bidirectional input pipeline. Usually
        set to True during pretraining and False during finetuning.
      clamp_len: int, clamp all relative distances larger than clamp_len.
        -1 means no clamping.
      same_length: bool, whether to use the same attention length for each
        token.
    """
    # Run mode and hardware options.
    self.is_training = is_training
    self.use_tpu = use_tpu
    self.use_bfloat16 = use_bfloat16
    # Regularization.
    self.dropout = dropout
    self.dropout_att = dropout_att
    # Parameter initialization scheme.
    self.init_method = init_method
    self.init_range = init_range
    self.init_std = init_std
    # Memory / relative-attention options.
    self.mem_len = mem_len
    self.reuse_len = reuse_len
    self.bi_data = bi_data
    self.clamp_len = clamp_len
    self.same_length = same_length
This diff is collapsed.
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import logging
import numpy as np
import tensorflow as tf
from official.nlp import xlnet_modeling
class PositionalEmbeddingLayerTest(tf.test.TestCase):

  def test_positional_embedding(self):
    """Checks the layer against a hand-computed low-dimensional example.

    With len(pos_seq)=2 and d_model=4:
      pos_seq            = [[1.], [0.]]
      inv_freq           = [1., 0.01]
      pos_seq x inv_freq = [[1, 0.01], [0., 0.]]
      pos_emb = [[sin(1.), sin(0.01), cos(1.), cos(0.01)],
                 [sin(0.), sin(0.), cos(0.), cos(0.)]]
              = [[0.84147096, 0.00999983, 0.54030228, 0.99994999],
                 [0., 0., 1., 1.]]
    """
    expected = np.array([[[0.84147096, 0.00999983, 0.54030228, 0.99994999]],
                         [[0., 0., 1., 1.]]])
    hidden_size = 4
    positions = tf.range(1, -1, -1.0)  # [1., 0.]
    layer = xlnet_modeling.PositionalEmbedding(hidden_size)
    actual = layer(pos_seq=positions, batch_size=None).numpy().astype(float)
    logging.info(actual)
    self.assertAllClose(actual, expected)
if __name__ == "__main__":
  # This test exercises TF2-only APIs (eager `.numpy()`, `tf.version`), so
  # fail fast when an incompatible TensorFlow is installed.
  # NOTE(review): `assert` is stripped under `python -O`; an explicit
  # version check with a raised error would be more robust.
  assert tf.version.VERSION.startswith('2.')
  tf.test.main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment