Commit 35112d1c authored by Hongkun Yu's avatar Hongkun Yu Committed by saberkun
Browse files

Internal change

PiperOrigin-RevId: 268543563
parent 1577ed07
# TensorFlow Natural Language Processing Models
tensorflow/models/official/nlp is a library of state-of-the-art models for
Natural Language Processing (NLP).
The library currently contains TensorFlow 2.x implementations, pre-trained
model weights, usage scripts and conversion utilities for the following models:
* BERT
* [XLNet](xlnet)
* Transformer for translation
# XLNet: Generalized Autoregressive Pretraining for Language Understanding
The academic paper which describes XLNet in detail and provides full results on
a number of tasks can be found here: https://arxiv.org/abs/1906.08237.
Instructions and user guide will be added soon.
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for pre-processing classification data."""
from absl import flags
from absl import logging
from official.nlp.xlnet import data_utils
FLAGS = flags.FLAGS
# Segment ids attached to each input position (see convert_single_example):
SEG_ID_A = 0  # Tokens of the first sequence, including its trailing SEP.
SEG_ID_B = 1  # Tokens of the second sequence, including its trailing SEP.
SEG_ID_CLS = 2  # The CLS token, which XLNet places at the END of the sequence.
SEG_ID_SEP = 3  # Dedicated SEP segment id (not used in this file).
SEG_ID_PAD = 4  # Left-padding positions added to reach max_seq_length.
class PaddingInputExample(object):
  """Fake example so the num input examples is a multiple of the batch size.

  When running eval/predict on the TPU, we need to pad the number of examples
  to be a multiple of the batch size, because the TPU requires a fixed batch
  size. The alternative is to drop the last batch, which is bad because it means
  the entire output data won't be generated.

  We use this class instead of `None` because treating `None` as padding
  batches could cause silent errors.
  """
class InputFeatures(object):
  """Holds the model-ready features for a single classification example."""

  def __init__(self, input_ids, input_mask, segment_ids, label_id,
               is_real_example=True):
    # `is_real_example=False` marks the fake padding examples that only
    # exist to round the example count up to a batch-size multiple.
    self.input_ids = input_ids
    self.input_mask = input_mask
    self.segment_ids = segment_ids
    self.label_id = label_id
    self.is_real_example = is_real_example
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
def convert_single_example(ex_index, example, label_list, max_seq_length,
                           tokenize_fn):
  """Converts a single `InputExample` into a single `InputFeatures`.

  Args:
    ex_index: int, index of the example; the first 5 examples are logged.
    example: an `InputExample` (or `PaddingInputExample`) to convert.
    label_list: list of possible labels, or None for regression tasks.
    max_seq_length: int, fixed total length of the produced features.
    tokenize_fn: callable mapping text to a list of token ids.

  Returns:
    An `InputFeatures` whose ids/mask/segments are left-padded to
    `max_seq_length`.
  """
  if isinstance(example, PaddingInputExample):
    # Padding examples are all-pad inputs that only exist to make the
    # number of examples a multiple of the batch size.
    return InputFeatures(
        input_ids=[0] * max_seq_length,
        input_mask=[1] * max_seq_length,
        segment_ids=[0] * max_seq_length,
        label_id=0,
        is_real_example=False)

  if label_list is not None:
    label_map = {}
    for (i, label) in enumerate(label_list):
      label_map[label] = i

  tokens_a = tokenize_fn(example.text_a)
  tokens_b = None
  if example.text_b:
    tokens_b = tokenize_fn(example.text_b)

  if tokens_b:
    # Modifies `tokens_a` and `tokens_b` in place so that the total
    # length is less than the specified length.
    # Account for two [SEP] & one [CLS] with "- 3"
    _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
  else:
    # Account for one [SEP] & one [CLS] with "- 2"
    if len(tokens_a) > max_seq_length - 2:
      tokens_a = tokens_a[:max_seq_length - 2]

  tokens = []
  segment_ids = []
  for token in tokens_a:
    tokens.append(token)
    segment_ids.append(SEG_ID_A)
  tokens.append(data_utils.SEP_ID)
  segment_ids.append(SEG_ID_A)

  if tokens_b:
    for token in tokens_b:
      tokens.append(token)
      segment_ids.append(SEG_ID_B)
    tokens.append(data_utils.SEP_ID)
    segment_ids.append(SEG_ID_B)

  # XLNet places the CLS token at the END of the sequence.
  tokens.append(data_utils.CLS_ID)
  segment_ids.append(SEG_ID_CLS)

  input_ids = tokens

  # The mask has 0 for real tokens and 1 for padding tokens. Only real
  # tokens are attended to.
  input_mask = [0] * len(input_ids)

  # Zero-pad up to the sequence length, padding on the LEFT.
  if len(input_ids) < max_seq_length:
    delta_len = max_seq_length - len(input_ids)
    input_ids = [0] * delta_len + input_ids
    input_mask = [1] * delta_len + input_mask
    segment_ids = [SEG_ID_PAD] * delta_len + segment_ids

  assert len(input_ids) == max_seq_length
  assert len(input_mask) == max_seq_length
  assert len(segment_ids) == max_seq_length

  if label_list is not None:
    label_id = label_map[example.label]
  else:
    label_id = example.label
  if ex_index < 5:
    logging.info("*** Example ***")
    logging.info("guid: %s", (example.guid))
    logging.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
    logging.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
    logging.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
    # Bug fix: labels can be strings (classification) or floats
    # (regression), so the old "%d" format raised TypeError for string
    # labels; use "%s" for both fields.
    logging.info("label: %s (id = %s)", example.label, label_id)

  feature = InputFeatures(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids,
      label_id=label_id)
  return feature
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Common flags used in XLNet model."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
from absl import flags
# Device / cluster flags.
flags.DEFINE_string("master", default=None, help="master")
flags.DEFINE_string(
    "tpu",
    default=None,
    help="The Cloud TPU to use for training. This should be "
    "either the name used when creating the Cloud TPU, or a "
    "url like grpc://ip.address.of.tpu:8470.")
flags.DEFINE_bool(
    "use_tpu", default=True, help="Use TPUs rather than plain CPUs.")
flags.DEFINE_string("tpu_topology", "2x2", help="TPU topology.")
flags.DEFINE_integer(
    "num_core_per_host", default=8, help="number of cores per host")
flags.DEFINE_string("model_dir", default=None, help="Estimator model_dir.")
flags.DEFINE_string(
    "init_checkpoint",
    default=None,
    help="Checkpoint path for initializing the model.")
# Optimization config
flags.DEFINE_float("learning_rate", default=1e-4, help="Maximum learning rate.")
flags.DEFINE_float("clip", default=1.0, help="Gradient clipping value.")
flags.DEFINE_float("weight_decay_rate", default=0.0, help="Weight decay rate.")
# lr decay
flags.DEFINE_integer(
    "warmup_steps", default=0, help="Number of steps for linear lr warmup.")
flags.DEFINE_float("adam_epsilon", default=1e-8, help="Adam epsilon.")
flags.DEFINE_float(
    "lr_layer_decay_rate",
    default=1.0,
    help="Top layer: lr[L] = FLAGS.learning_rate."
    "Lower layers: lr[l-1] = lr[l] * lr_layer_decay_rate.")
flags.DEFINE_float(
    "min_lr_ratio", default=0.0, help="Minimum ratio learning rate.")
# Training config
flags.DEFINE_integer(
    "train_batch_size",
    default=16,
    help="Size of the train batch across all hosts.")
flags.DEFINE_integer(
    "train_steps", default=100000, help="Total number of training steps.")
flags.DEFINE_integer(
    "iterations", default=1000, help="Number of iterations per repeat loop.")
# Data config
flags.DEFINE_integer(
    "seq_len", default=0, help="Sequence length for pretraining.")
flags.DEFINE_integer(
    "reuse_len",
    default=0,
    help="How many tokens to be reused in the next batch. "
    "Could be half of `seq_len`.")
flags.DEFINE_bool("uncased", False, help="Use uncased inputs or not.")
flags.DEFINE_bool(
    "bi_data",
    default=False,
    help="Use bidirectional data streams, "
    "i.e., forward & backward.")
flags.DEFINE_integer("n_token", 32000, help="Vocab size")
# Model config
flags.DEFINE_integer("mem_len", default=0, help="Number of steps to cache")
flags.DEFINE_bool("same_length", default=False, help="Same length attention")
flags.DEFINE_integer("clamp_len", default=-1, help="Clamp length")
flags.DEFINE_integer("n_layer", default=6, help="Number of layers.")
flags.DEFINE_integer("d_model", default=32, help="Dimension of the model.")
flags.DEFINE_integer("d_embed", default=32, help="Dimension of the embeddings.")
flags.DEFINE_integer("n_head", default=4, help="Number of attention heads.")
flags.DEFINE_integer(
    "d_head", default=8, help="Dimension of each attention head.")
flags.DEFINE_integer(
    "d_inner",
    default=32,
    help="Dimension of inner hidden size in positionwise "
    "feed-forward.")
flags.DEFINE_float("dropout", default=0.1, help="Dropout rate.")
flags.DEFINE_float("dropout_att", default=0.1, help="Attention dropout rate.")
flags.DEFINE_bool("untie_r", default=False, help="Untie r_w_bias and r_r_bias")
flags.DEFINE_string(
    "ff_activation",
    default="relu",
    help="Activation type used in position-wise feed-forward.")
# Bug fix: the help text below was a copy-paste of `ff_activation`'s help;
# this flag actually selects the distribution strategy.
flags.DEFINE_string(
    "strategy_type",
    default="tpu",
    help="The distribution strategy type to use for training, e.g. `tpu`.")
flags.DEFINE_bool("use_bfloat16", False, help="Whether to use bfloat16.")
# Parameter initialization
flags.DEFINE_enum(
    "init_method",
    default="normal",
    enum_values=["normal", "uniform"],
    help="Initialization method.")
flags.DEFINE_float(
    "init_std", default=0.02, help="Initialization std when init is normal.")
flags.DEFINE_float(
    "init_range", default=0.1, help="Initialization std when init is uniform.")
flags.DEFINE_integer(
    "train_data_size", default=130738, help="Number of training data samples.")
flags.DEFINE_integer(
    "test_data_size", default=12048, help="Number of test data samples.")
flags.DEFINE_string(
    "train_tfrecord_path",
    default=None,
    help="Path to preprocessed training set tfrecord.")
flags.DEFINE_string(
    "test_tfrecord_path",
    default=None,
    help="Path to preprocessed test set tfrecord.")
flags.DEFINE_integer(
    "test_batch_size",
    default=16,
    help="Size of the test batch across all hosts.")
flags.DEFINE_integer(
    "save_steps", default=None, help="Number of steps for saving checkpoint.")
FLAGS = flags.FLAGS
This diff is collapsed.
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions and classes related to optimization (weight updates)."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
from absl import logging
import tensorflow as tf
from official.bert.optimization import AdamWeightDecay
class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Applies a warmup schedule on top of a given decay schedule.

  While `step < warmup_steps`, the learning rate ramps up as
  `initial_learning_rate * (step / warmup_steps) ** power`; afterwards it
  follows `decay_schedule_fn`, evaluated at `step - warmup_steps`.
  """

  def __init__(self,
               initial_learning_rate,
               decay_schedule_fn,
               warmup_steps,
               power=1.0,
               name=None):
    super(WarmUp, self).__init__()
    self.initial_learning_rate = initial_learning_rate
    self.decay_schedule_fn = decay_schedule_fn
    self.warmup_steps = warmup_steps
    self.power = power
    self.name = name

  def __call__(self, step):
    with tf.name_scope(self.name or "WarmUp") as name:
      # Fraction of warmup completed; the rate rises polynomially from 0
      # to `initial_learning_rate` over the first `warmup_steps` steps.
      step_f = tf.cast(step, tf.float32)
      warmup_f = tf.cast(self.warmup_steps, tf.float32)
      warmup_fraction = step_f / warmup_f
      warmup_lr = (
          self.initial_learning_rate *
          tf.math.pow(warmup_fraction, self.power))
      return tf.cond(
          step_f < warmup_f,
          lambda: warmup_lr,
          lambda: self.decay_schedule_fn(step - self.warmup_steps),
          name=name)

  def get_config(self):
    """Returns the schedule configuration for serialization."""
    return {
        "initial_learning_rate": self.initial_learning_rate,
        "decay_schedule_fn": self.decay_schedule_fn,
        "warmup_steps": self.warmup_steps,
        "power": self.power,
        "name": self.name
    }
def create_optimizer(init_lr,
                     num_train_steps,
                     num_warmup_steps,
                     min_lr_ratio=0.0,
                     adam_epsilon=1e-8,
                     weight_decay_rate=0.0):
  """Creates an optimizer together with its learning rate schedule.

  The schedule decays linearly from `init_lr` to `init_lr * min_lr_ratio`
  over the post-warmup steps, optionally preceded by a linear warmup.
  `AdamWeightDecay` is used when `weight_decay_rate > 0`, plain Adam
  otherwise.

  Returns:
    An `(optimizer, learning_rate_fn)` tuple.
  """
  # Implements linear decay of the learning rate.
  lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
      initial_learning_rate=init_lr,
      decay_steps=num_train_steps - num_warmup_steps,
      end_learning_rate=init_lr * min_lr_ratio)
  if num_warmup_steps:
    lr_schedule = WarmUp(
        initial_learning_rate=init_lr,
        decay_schedule_fn=lr_schedule,
        warmup_steps=num_warmup_steps)

  if weight_decay_rate > 0.0:
    logging.info(
        "Using AdamWeightDecay with adam_epsilon=%.9f weight_decay_rate=%.3f",
        adam_epsilon, weight_decay_rate)
    # Biases and layer-norm parameters are excluded from weight decay;
    # the relative-attention biases are explicitly included.
    opt = AdamWeightDecay(
        learning_rate=lr_schedule,
        weight_decay_rate=weight_decay_rate,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=adam_epsilon,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
        include_in_weight_decay=["r_s_bias", "r_r_bias", "r_w_bias"])
  else:
    logging.info("Using Adam with adam_epsilon=%.9f", (adam_epsilon))
    opt = tf.keras.optimizers.Adam(
        learning_rate=lr_schedule, epsilon=adam_epsilon)
  return opt, lr_schedule
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Script to pre-process classification data into tfrecords."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import csv
import os
from absl import app
from absl import flags
from absl import logging
import numpy as np
import tensorflow as tf
import sentencepiece as spm
from official.nlp.xlnet import classifier_utils
from official.nlp.xlnet import preprocess_utils
# I/O flags for the classification preprocessing script.
flags.DEFINE_bool(
    "overwrite_data",
    default=False,
    help="If False, will use cached data if available.")
flags.DEFINE_string("output_dir", default="", help="Output dir for TF records.")
flags.DEFINE_string(
    "spiece_model_file", default="", help="Sentence Piece model path.")
flags.DEFINE_string("data_dir", default="", help="Directory for input data.")

# task specific
flags.DEFINE_string("eval_split", default="dev", help="could be dev or test")
flags.DEFINE_string("task_name", default=None, help="Task name")
flags.DEFINE_integer(
    "eval_batch_size", default=64, help="batch size for evaluation")
flags.DEFINE_integer("max_seq_length", default=128, help="Max sequence length")
flags.DEFINE_integer(
    "num_passes",
    default=1,
    help="Num passes for processing training data. "
    "This is use to batch data without loss for TPUs.")
flags.DEFINE_bool("uncased", default=False, help="Use uncased.")
flags.DEFINE_bool(
    "is_regression", default=False, help="Whether it's a regression task.")

FLAGS = flags.FLAGS
class InputExample(object):
  """A single training/test example for simple sequence classification."""

  def __init__(self, guid, text_a, text_b=None, label=None):
    """Constructs a InputExample.

    Args:
      guid: Unique id for the example.
      text_a: string. The untokenized text of the first sequence. For
        single-sequence tasks, only this sequence must be specified.
      text_b: (Optional) string. The untokenized text of the second
        sequence; required only for sequence-pair tasks.
      label: (Optional) string. The label of the example; set for train
        and dev examples but not for test examples.
    """
    self.guid = guid
    self.label = label
    self.text_a = text_a
    self.text_b = text_b
class DataProcessor(object):
  """Base class for data converters for sequence classification data sets."""

  def get_train_examples(self, data_dir):
    """Gets a collection of `InputExample`s for the train set."""
    raise NotImplementedError()

  def get_dev_examples(self, data_dir):
    """Gets a collection of `InputExample`s for the dev set."""
    raise NotImplementedError()

  def get_test_examples(self, data_dir):
    """Gets a collection of `InputExample`s for prediction."""
    raise NotImplementedError()

  def get_labels(self):
    """Gets the list of labels for this data set."""
    raise NotImplementedError()

  @classmethod
  def _read_tsv(cls, input_file, quotechar=None):
    """Reads a tab separated value file, dropping empty rows."""
    with tf.io.gfile.GFile(input_file, "r") as f:
      reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
      # pylint: disable=g-explicit-length-test
      return [row for row in reader if len(row) > 0]
class GLUEProcessor(DataProcessor):
  """Generic processor for GLUE-style TSV datasets."""

  def __init__(self):
    self.train_file = "train.tsv"
    self.dev_file = "dev.tsv"
    self.test_file = "test.tsv"
    self.label_column = None
    self.text_a_column = None
    self.text_b_column = None
    self.contains_header = True
    self.test_text_a_column = None
    self.test_text_b_column = None
    self.test_contains_header = True

  def get_train_examples(self, data_dir):
    """See base class."""
    lines = self._read_tsv(os.path.join(data_dir, self.train_file))
    return self._create_examples(lines, "train")

  def get_dev_examples(self, data_dir):
    """See base class."""
    lines = self._read_tsv(os.path.join(data_dir, self.dev_file))
    return self._create_examples(lines, "dev")

  def get_test_examples(self, data_dir):
    """See base class."""
    # Fall back to the train/dev column layout when no test-specific
    # layout was configured.
    if self.test_text_a_column is None:
      self.test_text_a_column = self.text_a_column
    if self.test_text_b_column is None:
      self.test_text_b_column = self.text_b_column
    lines = self._read_tsv(os.path.join(data_dir, self.test_file))
    return self._create_examples(lines, "test")

  def get_labels(self):
    """See base class."""
    return ["0", "1"]

  def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    is_test = set_type == "test"
    # Column layout and header presence depend on the split.
    a_column = self.test_text_a_column if is_test else self.text_a_column
    b_column = self.test_text_b_column if is_test else self.text_b_column
    has_header = self.test_contains_header if is_test else self.contains_header

    examples = []
    for i, line in enumerate(lines):
      if i == 0 and has_header:
        continue
      # there are some incomplete lines in QNLI
      if len(line) <= a_column:
        logging.warning("Incomplete line, ignored.")
        continue
      text_a = line[a_column]
      text_b = None
      if b_column is not None:
        if len(line) <= b_column:
          logging.warning("Incomplete line, ignored.")
          continue
        text_b = line[b_column]
      if is_test:
        # Test splits carry no gold label; use a fixed placeholder.
        label = self.get_labels()[0]
      else:
        if len(line) <= self.label_column:
          logging.warning("Incomplete line, ignored.")
          continue
        label = line[self.label_column]
      examples.append(
          InputExample(
              guid="%s-%s" % (set_type, i),
              text_a=text_a,
              text_b=text_b,
              label=label))
    return examples
class Yelp5Processor(DataProcessor):
  """Processor for the Yelp-5 star-rating CSV dataset."""

  def get_train_examples(self, data_dir):
    return self._create_examples(os.path.join(data_dir, "train.csv"))

  def get_dev_examples(self, data_dir):
    return self._create_examples(os.path.join(data_dir, "test.csv"))

  def get_labels(self):
    """See base class."""
    return ["1", "2", "3", "4", "5"]

  def _create_examples(self, input_file):
    """Creates examples for the training and dev sets."""
    examples = []
    with tf.io.gfile.GFile(input_file) as f:
      for idx, row in enumerate(csv.reader(f)):
        # Column 0 is the star rating; column 1 is the review text.
        # Undo the CSV-level quote escaping in the text.
        text = row[1].replace('""', '"').replace('\\"', '"')
        examples.append(
            InputExample(guid=str(idx), text_a=text, text_b=None,
                         label=row[0]))
    return examples
class ImdbProcessor(DataProcessor):
  """Processor for the IMDB sentiment dataset (one directory per label)."""

  def get_labels(self):
    return ["neg", "pos"]

  def get_train_examples(self, data_dir):
    return self._create_examples(os.path.join(data_dir, "train"))

  def get_dev_examples(self, data_dir):
    return self._create_examples(os.path.join(data_dir, "test"))

  def _create_examples(self, data_dir):
    """Creates examples."""
    examples = []
    for label in ["neg", "pos"]:
      cur_dir = os.path.join(data_dir, label)
      for filename in tf.io.gfile.listdir(cur_dir):
        if not filename.endswith("txt"):
          continue
        # Progress logging every 1000 loaded examples.
        if len(examples) % 1000 == 0:
          logging.info("Loading dev example %d", len(examples))
        with tf.io.gfile.GFile(os.path.join(cur_dir, filename)) as f:
          # Strip HTML line breaks left over from the raw reviews.
          text = f.read().strip().replace("<br />", " ")
        examples.append(
            InputExample(guid="unused_id", text_a=text, text_b=None,
                         label=label))
    return examples
class MnliMatchedProcessor(GLUEProcessor):
  """Processor for the MNLI-matched split."""

  def __init__(self):
    super(MnliMatchedProcessor, self).__init__()
    self.dev_file = "dev_matched.tsv"
    self.test_file = "test_matched.tsv"
    # Premise/hypothesis are in columns 8/9; the gold label is the last
    # column of each row.
    self.label_column = -1
    self.text_a_column = 8
    self.text_b_column = 9

  def get_labels(self):
    return ["contradiction", "entailment", "neutral"]
class MnliMismatchedProcessor(MnliMatchedProcessor):
  """Processor for the MNLI-mismatched split; same layout, different files."""

  def __init__(self):
    super(MnliMismatchedProcessor, self).__init__()
    self.dev_file = "dev_mismatched.tsv"
    self.test_file = "test_mismatched.tsv"
class StsbProcessor(GLUEProcessor):
  """Processor for STS-B, a sentence-pair regression task.

  Identical to the generic GLUE processor except that labels are
  continuous similarity scores (floats) instead of class strings.
  """

  def __init__(self):
    super(StsbProcessor, self).__init__()
    self.label_column = 9
    self.text_a_column = 7
    self.text_b_column = 8

  def get_labels(self):
    # Regression task: a single dummy float, also used as the placeholder
    # label for test examples.
    return [0.0]

  def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets, with float labels.

    Reuses `GLUEProcessor._create_examples` for all parsing (header and
    incomplete-line handling included) and only converts each label to
    `float`. This removes the previous near-byte duplicate of the parent
    method. For the test split the placeholder label is already the
    float 0.0, so the conversion is a no-op there.
    """
    examples = super(StsbProcessor, self)._create_examples(lines, set_type)
    for example in examples:
      example.label = float(example.label)
    return examples
def file_based_convert_examples_to_features(examples,
                                            label_list,
                                            max_seq_length,
                                            tokenize_fn,
                                            output_file,
                                            num_passes=1):
  """Convert a set of `InputExample`s to a TFRecord file.

  Args:
    examples: list of `InputExample`s; left unmodified by this call.
    label_list: list of labels, or None for regression tasks.
    max_seq_length: fixed sequence length of the written features.
    tokenize_fn: callable mapping text to token ids.
    output_file: path of the TFRecord file to create.
    num_passes: number of times each example is written (used to batch
      training data without loss for TPUs).
  """
  # do not create duplicated records
  if tf.io.gfile.exists(output_file) and not FLAGS.overwrite_data:
    logging.info("Do not overwrite tfrecord %s exists.", output_file)
    return
  logging.info("Create new tfrecord %s.", output_file)

  # Helpers hoisted out of the example loop (they were previously
  # re-defined on every iteration).
  def create_int_feature(values):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

  def create_float_feature(values):
    return tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))

  # Bug fix: `examples *= num_passes` extended the caller's list in place
  # (list __imul__ mutates); build a new list so the argument is untouched.
  examples = examples * num_passes

  with tf.io.TFRecordWriter(output_file) as writer:
    for (ex_index, example) in enumerate(examples):
      if ex_index % 10000 == 0:
        logging.info("Writing example %d of %d", ex_index, len(examples))

      feature = classifier_utils.convert_single_example(
          ex_index, example, label_list, max_seq_length, tokenize_fn)

      features = collections.OrderedDict()
      features["input_ids"] = create_int_feature(feature.input_ids)
      # NOTE(review): the mask is written as a float feature while ids and
      # segments are ints — presumably to match the reader's schema;
      # confirm against the input pipeline before changing.
      features["input_mask"] = create_float_feature(feature.input_mask)
      features["segment_ids"] = create_int_feature(feature.segment_ids)
      if label_list is not None:
        features["label_ids"] = create_int_feature([feature.label_id])
      else:
        features["label_ids"] = create_float_feature(
            [float(feature.label_id)])
      features["is_real_example"] = create_int_feature(
          [int(feature.is_real_example)])

      tf_example = tf.train.Example(
          features=tf.train.Features(feature=features))
      writer.write(tf_example.SerializeToString())
def main(_):
  """Preprocesses the selected classification task into train/eval tfrecords."""
  logging.set_verbosity(logging.INFO)
  # Registry of supported tasks.
  processors = {
      "mnli_matched": MnliMatchedProcessor,
      "mnli_mismatched": MnliMismatchedProcessor,
      "sts-b": StsbProcessor,
      "imdb": ImdbProcessor,
      "yelp5": Yelp5Processor
  }
  task_name = FLAGS.task_name.lower()
  if task_name not in processors:
    raise ValueError("Task not found: %s" % (task_name))
  processor = processors[task_name]()
  # Regression tasks keep raw float labels instead of a label vocabulary.
  label_list = processor.get_labels() if not FLAGS.is_regression else None

  sp = spm.SentencePieceProcessor()
  sp.Load(FLAGS.spiece_model_file)

  def tokenize_fn(text):
    # Normalize (and optionally lowercase) before SentencePiece encoding.
    text = preprocess_utils.preprocess_text(text, lower=FLAGS.uncased)
    return preprocess_utils.encode_ids(sp, text)

  spm_basename = os.path.basename(FLAGS.spiece_model_file)
  train_file_base = "{}.len-{}.train.tf_record".format(spm_basename,
                                                       FLAGS.max_seq_length)
  train_file = os.path.join(FLAGS.output_dir, train_file_base)
  logging.info("Use tfrecord file %s", train_file)
  train_examples = processor.get_train_examples(FLAGS.data_dir)
  # Shuffle once here so the input pipeline does not need a large
  # shuffle buffer.
  np.random.shuffle(train_examples)
  logging.info("Num of train samples: %d", len(train_examples))
  file_based_convert_examples_to_features(train_examples, label_list,
                                          FLAGS.max_seq_length, tokenize_fn,
                                          train_file, FLAGS.num_passes)
  if FLAGS.eval_split == "dev":
    eval_examples = processor.get_dev_examples(FLAGS.data_dir)
  else:
    eval_examples = processor.get_test_examples(FLAGS.data_dir)
  logging.info("Num of eval samples: %d", len(eval_examples))
  # TPU requires a fixed batch size for all batches, therefore the number
  # of examples must be a multiple of the batch size, or else examples
  # will get dropped. So we pad with fake examples which are ignored
  # later on. These do NOT count towards the metric (all tf.metrics
  # support a per-instance weight, and these get a weight of 0.0).
  #
  # Modified in XL: We also adopt the same mechanism for GPUs.
  while len(eval_examples) % FLAGS.eval_batch_size != 0:
    eval_examples.append(classifier_utils.PaddingInputExample())
  eval_file_base = "{}.len-{}.{}.eval.tf_record".format(spm_basename,
                                                        FLAGS.max_seq_length,
                                                        FLAGS.eval_split)
  eval_file = os.path.join(FLAGS.output_dir, eval_file_base)
  file_based_convert_examples_to_features(eval_examples, label_list,
                                          FLAGS.max_seq_length, tokenize_fn,
                                          eval_file)
if __name__ == "__main__":
assert tf.version.VERSION.startswith('2.')
app.run(main)
This diff is collapsed.
# coding=utf-8
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Script to pre-process SQUAD data into tfrecords."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import pickle
import random
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
import sentencepiece as spm
from official.nlp.xlnet import squad_utils
# Sharding flags: the raw data can be split across several preprocessing
# processes; each process handles examples[proc_id::num_proc].
flags.DEFINE_integer(
    "num_proc", default=1, help="Number of preprocessing processes.")
flags.DEFINE_integer("proc_id", default=0, help="Process id for preprocessing.")

# I/O paths
flags.DEFINE_string("output_dir", default="", help="Output dir for TF records.")
flags.DEFINE_string(
    "spiece_model_file", default="", help="Sentence Piece model path.")
flags.DEFINE_string("train_file", default="", help="Path of train file.")
flags.DEFINE_string("predict_file", default="", help="Path of prediction file.")

# Data preprocessing config
flags.DEFINE_integer("max_seq_length", default=512, help="Max sequence length")
flags.DEFINE_integer("max_query_length", default=64, help="Max query length")
flags.DEFINE_integer("doc_stride", default=128, help="Doc stride")
flags.DEFINE_bool("uncased", default=False, help="Use uncased data.")
flags.DEFINE_bool(
    "create_train_data", default=True, help="Whether to create training data.")
flags.DEFINE_bool(
    "create_eval_data", default=False, help="Whether to create eval data.")

FLAGS = flags.FLAGS
def _get_spm_basename():
  """Returns the basename of the configured SentencePiece model file."""
  return os.path.basename(FLAGS.spiece_model_file)
def preprocess():
  """Preprocesses SQUAD data.

  Reads raw examples from FLAGS.train_file / FLAGS.predict_file, converts
  them to features, and writes train and/or eval tfrecords (plus a pickled
  feature list for eval) into FLAGS.output_dir, depending on
  FLAGS.create_train_data / FLAGS.create_eval_data.
  """
  sp_model = spm.SentencePieceProcessor()
  sp_model.Load(FLAGS.spiece_model_file)
  spm_basename = _get_spm_basename()
  if FLAGS.create_train_data:
    train_rec_file = os.path.join(
        FLAGS.output_dir,
        "{}.{}.slen-{}.qlen-{}.train.tf_record".format(spm_basename,
                                                       FLAGS.proc_id,
                                                       FLAGS.max_seq_length,
                                                       FLAGS.max_query_length))
    logging.info("Read examples from %s", FLAGS.train_file)
    train_examples = squad_utils.read_squad_examples(
        FLAGS.train_file, is_training=True)
    # Keep only this process's shard of the examples.
    train_examples = train_examples[FLAGS.proc_id::FLAGS.num_proc]
    # Pre-shuffle the input to avoid having to make a very large shuffle
    # buffer in the `input_fn`.
    random.shuffle(train_examples)
    write_to_logging = "Write to " + train_rec_file
    logging.info(write_to_logging)
    train_writer = squad_utils.FeatureWriter(
        filename=train_rec_file, is_training=True)
    squad_utils.convert_examples_to_features(
        examples=train_examples,
        sp_model=sp_model,
        max_seq_length=FLAGS.max_seq_length,
        doc_stride=FLAGS.doc_stride,
        max_query_length=FLAGS.max_query_length,
        is_training=True,
        output_fn=train_writer.process_feature,
        uncased=FLAGS.uncased)
    train_writer.close()
  if FLAGS.create_eval_data:
    eval_examples = squad_utils.read_squad_examples(
        FLAGS.predict_file, is_training=False)
    eval_rec_file = os.path.join(
        FLAGS.output_dir,
        "{}.slen-{}.qlen-{}.eval.tf_record".format(spm_basename,
                                                   FLAGS.max_seq_length,
                                                   FLAGS.max_query_length))
    eval_feature_file = os.path.join(
        FLAGS.output_dir,
        "{}.slen-{}.qlen-{}.eval.features.pkl".format(spm_basename,
                                                      FLAGS.max_seq_length,
                                                      FLAGS.max_query_length))
    eval_writer = squad_utils.FeatureWriter(
        filename=eval_rec_file, is_training=False)
    eval_features = []

    def append_feature(feature):
      # Keep features in memory (for the pickle below) while also
      # streaming them to the tfrecord writer.
      eval_features.append(feature)
      eval_writer.process_feature(feature)

    squad_utils.convert_examples_to_features(
        examples=eval_examples,
        sp_model=sp_model,
        max_seq_length=FLAGS.max_seq_length,
        doc_stride=FLAGS.doc_stride,
        max_query_length=FLAGS.max_query_length,
        is_training=False,
        output_fn=append_feature,
        uncased=FLAGS.uncased)
    eval_writer.close()
    # Persist eval features so downstream evaluation can map predictions
    # back to examples.
    with tf.io.gfile.GFile(eval_feature_file, "wb") as fout:
      pickle.dump(eval_features, fout)
def main(_):
  """Entry point: configures logging, ensures output dir exists, preprocesses."""
  logging.set_verbosity(logging.INFO)
  if not tf.io.gfile.exists(FLAGS.output_dir):
    # Use makedirs (not mkdir) so a missing parent directory does not make
    # the job fail before preprocessing even starts.
    tf.io.gfile.makedirs(FLAGS.output_dir)
  preprocess()


if __name__ == "__main__":
  app.run(main)
# coding=utf-8
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for pre-processing."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unicodedata
import six
SPIECE_UNDERLINE = '▁'
def printable_text(text):
  """Coerces `text` to the native `str` type for printing or `tf.logging`.

  On Python 3 this means a unicode `str`; on Python 2 a utf-8 encoded byte
  `str`. Raises ValueError for any non-string input.
  """
  if six.PY3:
    if isinstance(text, str):
      return text
    if isinstance(text, bytes):
      return text.decode('utf-8', 'ignore')
    raise ValueError('Unsupported string type: %s' % (type(text)))
  if six.PY2:
    if isinstance(text, str):
      return text
    if isinstance(text, unicode):
      return text.encode('utf-8')
    raise ValueError('Unsupported string type: %s' % (type(text)))
  raise ValueError('Not running on Python2 or Python 3?')
def print_(*args):
  """Prints the arguments, converting each (and list elements) via printable_text."""
  converted = []
  for arg in args:
    if isinstance(arg, list):
      converted.append(' '.join(printable_text(item) for item in arg))
    else:
      converted.append(printable_text(arg))
  print(*converted)
def preprocess_text(inputs, lower=False, remove_space=True, keep_accents=False):
  """Normalizes text: whitespace, LaTeX-style quotes, accents and casing."""
  # Collapse runs of whitespace and strip the ends, unless disabled.
  text = ' '.join(inputs.strip().split()) if remove_space else inputs
  text = text.replace('``', '"').replace("''", '"')
  if six.PY2 and isinstance(text, str):
    text = text.decode('utf-8')
  if not keep_accents:
    # Decompose characters and drop the combining (accent) marks.
    decomposed = unicodedata.normalize('NFKD', text)
    text = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
  if lower:
    text = text.lower()
  return text
def encode_pieces(sp_model, text, return_unicode=True, sample=False):
  """Encodes `text` into SentencePiece tokens.

  Pieces that look like digits followed by a comma (e.g. "1934,") are split
  so the comma becomes its own token and the digits are re-tokenized.

  Args:
    sp_model: a loaded SentencePiece model (must provide `EncodeAsPieces` and
      `SampleEncodeAsPieces`).
    text: the string to tokenize.
    return_unicode: Python 2 only — convert byte pieces back to unicode.
    sample: if True, sample a segmentation instead of the deterministic one.

  Returns:
    A list of piece strings.
  """
  # return_unicode is used only for py2
  if six.PY2 and isinstance(text, unicode):
    text = text.encode('utf-8')
  if not sample:
    pieces = sp_model.EncodeAsPieces(text)
  else:
    # Sampled segmentation; 64 and 0.1 are presumably nbest_size and alpha —
    # see the SentencePiece API docs to confirm.
    pieces = sp_model.SampleEncodeAsPieces(text, 64, 0.1)
  new_pieces = []
  for piece in pieces:
    # Detect pieces ending in "<digit>," and split off the trailing comma.
    if len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit():
      # Re-tokenize the digits without the word-boundary marker.
      cur_pieces = sp_model.EncodeAsPieces(
          piece[:-1].replace(SPIECE_UNDERLINE, ''))
      # If the original piece did not start a word but the re-tokenization
      # added the boundary marker, strip that marker from the first sub-piece.
      if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
        if len(cur_pieces[0]) == 1:
          cur_pieces = cur_pieces[1:]
        else:
          cur_pieces[0] = cur_pieces[0][1:]
      cur_pieces.append(piece[-1])
      new_pieces.extend(cur_pieces)
    else:
      new_pieces.append(piece)
  # note(zhiliny): convert back to unicode for py2
  if six.PY2 and return_unicode:
    ret_pieces = []
    for piece in new_pieces:
      if isinstance(piece, str):
        piece = piece.decode('utf-8')
      ret_pieces.append(piece)
    new_pieces = ret_pieces
  return new_pieces
def encode_ids(sp_model, text, sample=False):
  """Encodes `text` into a list of SentencePiece token ids."""
  return [
      sp_model.PieceToId(piece) for piece in encode_pieces(
          sp_model, text, return_unicode=False, sample=sample)
  ]
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""XLNet classification finetuning runner in tf2.0."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import functools
from absl import app
from absl import flags
from absl import logging
import numpy as np
import tensorflow as tf
# pylint: disable=unused-import
# Initialize TPU System.
from official.nlp import xlnet_config
from official.nlp import xlnet_modeling as modeling
from official.nlp.xlnet import common_flags
from official.nlp.xlnet import data_utils
from official.nlp.xlnet import optimization
from official.nlp.xlnet import training_utils
# Number of target classes for the classification head.
flags.DEFINE_integer("n_class", default=2, help="Number of classes.")
FLAGS = flags.FLAGS
def get_classificationxlnet_model(model_config, run_config, n_class):
  """Builds a ClassificationXLNetModel from the given configs."""
  return modeling.ClassificationXLNetModel(
      model_config, run_config, n_class, name="model")
def run_evaluation(strategy,
                   test_input_fn,
                   eval_steps,
                   model,
                   step,
                   eval_summary_writer=None):
  """Runs evaluation for the classification task.

  The validation set contains fake padding samples, so the per-example
  `is_real_example` mask is used to exclude them from the accuracy. Because
  the masked selection yields dynamic-shape tensors, logits, labels and masks
  are collected from all replicas and the accuracy is computed in numpy on
  the host.

  Args:
    strategy: distribution strategy.
    test_input_fn: input function for evaluation data.
    eval_steps: total number of evaluation steps.
    model: keras model object.
    step: current train step, used to tag the summary.
    eval_summary_writer: optional summary writer to record eval metrics.
  """

  def _test_step_fn(inputs):
    """Replicated validation step."""
    inputs["mems"] = None
    _, logits = model(inputs, training=False)
    return logits, inputs["label_ids"], inputs["is_real_example"]

  @tf.function
  def _run_evaluation(test_iterator):
    """Runs validation steps."""
    logits, labels, masks = strategy.experimental_run_v2(
        _test_step_fn, args=(next(test_iterator),))
    return logits, labels, masks

  # pylint: disable=protected-access
  test_iterator = data_utils._get_input_iterator(test_input_fn, strategy)
  # pylint: enable=protected-access
  correct = 0
  total = 0
  for _ in range(eval_steps):
    logits, labels, masks = _run_evaluation(test_iterator)
    logits = strategy.experimental_local_results(logits)
    labels = strategy.experimental_local_results(labels)
    masks = strategy.experimental_local_results(masks)
    merged_logits = []
    merged_labels = []
    merged_masks = []
    # Gather the per-replica results on the host.
    for i in range(strategy.num_replicas_in_sync):
      merged_logits.append(logits[i].numpy())
      merged_labels.append(labels[i].numpy())
      merged_masks.append(masks[i].numpy())
    merged_logits = np.vstack(np.array(merged_logits))
    merged_labels = np.hstack(np.array(merged_labels))
    merged_masks = np.hstack(np.array(merged_masks))
    # Only real (mask == 1) examples count towards the accuracy.
    real_index = np.where(np.equal(merged_masks, 1))
    correct += np.sum(
        np.equal(
            np.argmax(merged_logits[real_index], axis=-1),
            merged_labels[real_index]))
    total += np.shape(real_index)[-1]
  # Guard against division by zero when every example was a padding example.
  accuracy = float(correct) / float(total) if total else 0.0
  logging.info("Train step: %d / acc = %d/%d = %f", step, correct, total,
               accuracy)
  if eval_summary_writer:
    with eval_summary_writer.as_default():
      tf.summary.scalar("eval_acc", accuracy, step=step)
      eval_summary_writer.flush()
def get_metric_fn():
  """Returns a Keras metric that tracks sparse categorical accuracy."""
  return tf.keras.metrics.SparseCategoricalAccuracy("acc", dtype=tf.float32)
def get_primary_cpu_task(use_remote_tpu=False):
  """Returns primary CPU task to which input pipeline Ops are put."""
  # Remote Eager Borg job configures the TPU worker with job name 'worker'.
  if use_remote_tpu:
    return "/job:worker"
  return ""
def main(unused_argv):
  """Runs XLNet classification finetuning: wires inputs, model, and training."""
  del unused_argv
  use_remote_tpu = False
  if FLAGS.strategy_type == "mirror":
    strategy = tf.distribute.MirroredStrategy()
  elif FLAGS.strategy_type == "tpu":
    # NOTE(review): `tpu_lib` is not imported in this module — confirm the
    # TPU code path is supported before relying on it.
    cluster_resolver = tpu_lib.tpu_initialize(FLAGS.tpu)
    strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver)
    use_remote_tpu = True
  else:
    raise ValueError("The distribution strategy type is not supported: %s" %
                     FLAGS.strategy_type)
  if strategy:
    logging.info("***** Number of cores used : %d",
                 strategy.num_replicas_in_sync)
  train_input_fn = functools.partial(data_utils.get_classification_input_data,
                                     FLAGS.train_batch_size, FLAGS.seq_len,
                                     strategy, True, FLAGS.train_tfrecord_path)
  test_input_fn = functools.partial(data_utils.get_classification_input_data,
                                    FLAGS.test_batch_size, FLAGS.seq_len,
                                    strategy, False, FLAGS.test_tfrecord_path)
  total_training_steps = FLAGS.train_steps
  steps_per_epoch = int(FLAGS.train_data_size / FLAGS.train_batch_size)
  steps_per_loop = FLAGS.iterations
  eval_steps = int(FLAGS.test_data_size / FLAGS.test_batch_size)
  eval_fn = functools.partial(run_evaluation, strategy, test_input_fn,
                              eval_steps)
  optimizer, learning_rate_fn = optimization.create_optimizer(
      FLAGS.learning_rate,
      total_training_steps,
      FLAGS.warmup_steps,
      adam_epsilon=FLAGS.adam_epsilon)
  model_config = xlnet_config.XLNetConfig(FLAGS)
  run_config = xlnet_config.create_run_config(True, False, FLAGS)
  model_fn = functools.partial(get_classificationxlnet_model, model_config,
                               run_config, FLAGS.n_class)
  # Meta data consumed by training_utils.train (memory cache, layer-wise LR
  # decay, and logits initialization).
  input_meta_data = {}
  input_meta_data["d_model"] = FLAGS.d_model
  input_meta_data["mem_len"] = FLAGS.mem_len
  input_meta_data["batch_size_per_core"] = int(FLAGS.train_batch_size /
                                               strategy.num_replicas_in_sync)
  input_meta_data["n_layer"] = FLAGS.n_layer
  input_meta_data["lr_layer_decay_rate"] = FLAGS.lr_layer_decay_rate
  input_meta_data["n_class"] = FLAGS.n_class
  # Replaced a leftover `print("DEBUG: ...")` with structured logging.
  logging.info("Input meta data: %s", input_meta_data)

  def logits_init_fn():
    """Creates a dummy per-core logits tensor used to seed the train loop."""
    return tf.zeros(
        shape=(input_meta_data["batch_size_per_core"],
               input_meta_data["n_class"]),
        dtype=tf.float32)

  with tf.device(get_primary_cpu_task(use_remote_tpu)):
    training_utils.train(
        strategy=strategy,
        model_fn=model_fn,
        input_meta_data=input_meta_data,
        eval_fn=eval_fn,
        metric_fn=get_metric_fn,
        logits_init_fn=logits_init_fn,
        train_input_fn=train_input_fn,
        test_input_fn=test_input_fn,
        init_checkpoint=FLAGS.init_checkpoint,
        total_training_steps=total_training_steps,
        steps_per_epoch=steps_per_epoch,
        steps_per_loop=steps_per_loop,
        optimizer=optimizer,
        learning_rate_fn=learning_rate_fn,
        model_dir=FLAGS.model_dir)


if __name__ == "__main__":
  assert tf.version.VERSION.startswith('2.')
  app.run(main)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""XLNet classification finetuning runner in tf2.0."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import functools
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
# pylint: disable=unused-import
# Initialize TPU System.
from official.nlp import xlnet_config
from official.nlp import xlnet_modeling as modeling
from official.nlp.xlnet import common_flags
from official.nlp.xlnet import data_utils
from official.nlp.xlnet import optimization
from official.nlp.xlnet import training_utils
# Masking hyperparameters for the pretraining input pipeline.
flags.DEFINE_integer(
    "mask_alpha", default=6, help="How many tokens to form a group.")
flags.DEFINE_integer(
    "mask_beta", default=1, help="How many tokens to mask within each group.")
flags.DEFINE_integer(
    "num_predict",
    default=None,
    help="Number of tokens to predict in partial prediction.")
# Use keyword `default=` for consistency with the flag definitions above.
flags.DEFINE_integer("perm_size", default=0, help="Window size of permutation.")
FLAGS = flags.FLAGS
def get_pretrainxlnet_model(model_config, run_config):
  """Builds a PretrainingXLNetModel from the given configs."""
  return modeling.PretrainingXLNetModel(model_config, run_config, name="model")
def get_primary_cpu_task(use_remote_tpu=False):
  """Returns the device string that should host the input pipeline ops."""
  # Remote Eager Borg job configures the TPU worker with job name 'worker'.
  worker_device = "/job:worker"
  return worker_device if use_remote_tpu else ""
def main(unused_argv):
  """Runs XLNet pretraining under the selected distribution strategy."""
  del unused_argv
  use_remote_tpu = False
  num_hosts = 1
  if FLAGS.strategy_type == "mirror":
    strategy = tf.distribute.MirroredStrategy()
  elif FLAGS.strategy_type == "tpu":
    # NOTE(review): `tpu_lib` is not imported in this module — confirm the
    # TPU code path before relying on it.
    cluster_resolver = tpu_lib.tpu_initialize(FLAGS.tpu)
    strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver)
    use_remote_tpu = True
    # Core count is derived from an "AxB" topology string as 2 * A * B.
    topology = FLAGS.tpu_topology.split("x")
    total_num_core = 2 * int(topology[0]) * int(topology[1])
    num_hosts = total_num_core // FLAGS.num_core_per_host
  else:
    raise ValueError("The distribution strategy type is not supported: %s" %
                     FLAGS.strategy_type)
  if strategy:
    logging.info("***** Number of cores used : %d",
                 strategy.num_replicas_in_sync)
    logging.info("***** Number of hosts used : %d",
                 num_hosts)
  train_input_fn = functools.partial(
      data_utils.get_pretrain_input_data, FLAGS.train_batch_size, FLAGS.seq_len,
      strategy, FLAGS.train_tfrecord_path, FLAGS.reuse_len, FLAGS.perm_size,
      FLAGS.mask_alpha, FLAGS.mask_beta, FLAGS.num_predict, FLAGS.bi_data,
      FLAGS.uncased, num_hosts)
  total_training_steps = FLAGS.train_steps
  steps_per_epoch = int(FLAGS.train_data_size / FLAGS.train_batch_size)
  steps_per_loop = FLAGS.iterations
  optimizer, learning_rate_fn = optimization.create_optimizer(
      init_lr=FLAGS.learning_rate,
      num_train_steps=total_training_steps,
      num_warmup_steps=FLAGS.warmup_steps,
      min_lr_ratio=FLAGS.min_lr_ratio,
      adam_epsilon=FLAGS.adam_epsilon,
      weight_decay_rate=FLAGS.weight_decay_rate)
  model_config = xlnet_config.XLNetConfig(FLAGS)
  run_config = xlnet_config.create_run_config(True, False, FLAGS)
  # Meta data consumed by training_utils.train (memory cache and LR decay).
  input_meta_data = {}
  input_meta_data["d_model"] = FLAGS.d_model
  input_meta_data["mem_len"] = FLAGS.mem_len
  input_meta_data["batch_size_per_core"] = int(FLAGS.train_batch_size /
                                               strategy.num_replicas_in_sync)
  input_meta_data["n_layer"] = FLAGS.n_layer
  input_meta_data["lr_layer_decay_rate"] = FLAGS.lr_layer_decay_rate
  model_fn = functools.partial(get_pretrainxlnet_model, model_config,
                               run_config)
  def logits_init_fn():
    # Dummy logits tensor used to seed the training loop.
    return tf.zeros(
        shape=(FLAGS.num_predict, input_meta_data["batch_size_per_core"],
               FLAGS.d_model),
        dtype=tf.float32)
  with tf.device(get_primary_cpu_task(use_remote_tpu)):
    training_utils.train(
        strategy=strategy,
        model_fn=model_fn,
        input_meta_data=input_meta_data,
        eval_fn=None,
        metric_fn=None,
        logits_init_fn=logits_init_fn,
        train_input_fn=train_input_fn,
        test_input_fn=None,
        init_checkpoint=FLAGS.init_checkpoint,
        total_training_steps=total_training_steps,
        steps_per_epoch=steps_per_epoch,
        steps_per_loop=steps_per_loop,
        optimizer=optimizer,
        learning_rate_fn=learning_rate_fn,
        model_dir=FLAGS.model_dir,
        save_steps=FLAGS.save_steps)
if __name__ == "__main__":
  assert tf.version.VERSION.startswith('2.')
  app.run(main)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""XLNet SQUAD finetuning runner in tf2.0."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import functools
import json
import os
import pickle
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
# pylint: disable=unused-import
# Initialize TPU System.
from official.nlp import xlnet_config
from official.nlp import xlnet_modeling as modeling
from official.nlp.xlnet import common_flags
from official.nlp.xlnet import data_utils
from official.nlp.xlnet import optimization
from official.nlp.xlnet import squad_utils
from official.nlp.xlnet import training_utils
# Evaluation/prediction configuration for SQuAD finetuning.
flags.DEFINE_string(
    "test_feature_path", default=None, help="Path to feature of test set.")
flags.DEFINE_integer("query_len", default=64, help="Max query length.")
# Beam sizes for the start/end span search.
flags.DEFINE_integer("start_n_top", default=5, help="Beam size for span start.")
flags.DEFINE_integer("end_n_top", default=5, help="Beam size for span end.")
flags.DEFINE_string(
    "predict_dir", default=None, help="Path to write predictions.")
flags.DEFINE_string(
    "predict_file", default=None, help="Path to json file of test set.")
flags.DEFINE_integer(
    "n_best_size", default=5, help="n best size for predictions.")
flags.DEFINE_integer("max_answer_length", default=64, help="Max answer length.")
FLAGS = flags.FLAGS
class InputFeatures(object):
  """A single set of features of data.

  Plain value holder: every constructor argument is stored verbatim as an
  attribute of the same name.
  """

  def __init__(self,
               unique_id,
               example_index,
               doc_span_index,
               tok_start_to_orig_index,
               tok_end_to_orig_index,
               token_is_max_context,
               input_ids,
               input_mask,
               p_mask,
               segment_ids,
               paragraph_len,
               cls_index,
               start_position=None,
               end_position=None,
               is_impossible=None):
    # Copy each argument onto the instance under its own name.
    params = dict(locals())
    del params["self"]
    for name, value in params.items():
      setattr(self, name, value)
def get_primary_cpu_task(use_remote_tpu=False):
  """Returns primary CPU task to which input pipeline Ops are put."""
  # A remote Eager Borg job names its TPU worker job 'worker'.
  if not use_remote_tpu:
    return ""
  return "/job:worker"
# pylint: disable=unused-argument
def run_evaluation(strategy,
                   test_input_fn,
                   eval_steps,
                   input_meta_data,
                   model,
                   step,
                   eval_summary_writer=None):
  """Run evaluation for SQUAD task.

  Collects per-replica span predictions, converts them to
  `squad_utils.RawResult`s, and writes prediction/n-best/null-odds files via
  `squad_utils.write_predictions`.

  Args:
    strategy: distribution strategy.
    test_input_fn: input function for evaluation data.
    eval_steps: total number of evaluation steps.
    input_meta_data: input meta data (predict file/dir, eval features,
      beam sizes, batch size, etc.).
    model: keras model object.
    step: current training step, used to tag the summaries.
    eval_summary_writer: summary writer used to record evaluation metrics.
  """

  def _test_step_fn(inputs):
    """Replicated validation step."""
    inputs["mems"] = None
    res = model(inputs, training=False)
    return res, inputs["unique_ids"]

  @tf.function
  def _run_evaluation(test_iterator):
    """Runs validation steps."""
    res, unique_ids = strategy.experimental_run_v2(
        _test_step_fn, args=(next(test_iterator),))
    return res, unique_ids

  # pylint: disable=protected-access
  test_iterator = data_utils._get_input_iterator(test_input_fn, strategy)
  # pylint: enable=protected-access
  cur_results = []
  eval_examples = squad_utils.read_squad_examples(
      input_meta_data["predict_file"], is_training=False)
  with tf.io.gfile.GFile(input_meta_data["predict_file"]) as f:
    orig_data = json.load(f)["data"]
  for _ in range(eval_steps):
    results, unique_ids = _run_evaluation(test_iterator)
    # Pull every per-replica tensor back to the host.
    unique_ids = strategy.experimental_local_results(unique_ids)
    for result_key in results:
      results[result_key] = (
          strategy.experimental_local_results(results[result_key]))
    for core_i in range(strategy.num_replicas_in_sync):
      bsz = int(input_meta_data["test_batch_size"] /
                strategy.num_replicas_in_sync)
      for j in range(bsz):
        result = {}
        for result_key in results:
          result[result_key] = results[result_key][core_i].numpy()[j]
        result["unique_ids"] = unique_ids[core_i].numpy()[j]
        # We appended a fake example into dev set to make data size can be
        # divided by test_batch_size. Ignores this fake example during
        # evaluation.
        # NOTE(review): the fake example's id is hard-coded — confirm it
        # matches the id used when the eval data was generated.
        if result["unique_ids"] == 1000012047:
          continue
        unique_id = int(result["unique_ids"])
        start_top_log_probs = ([
            float(x) for x in result["start_top_log_probs"].flat
        ])
        start_top_index = [int(x) for x in result["start_top_index"].flat]
        end_top_log_probs = ([
            float(x) for x in result["end_top_log_probs"].flat
        ])
        end_top_index = [int(x) for x in result["end_top_index"].flat]
        cls_logits = float(result["cls_logits"].flat[0])
        cur_results.append(
            squad_utils.RawResult(
                unique_id=unique_id,
                start_top_log_probs=start_top_log_probs,
                start_top_index=start_top_index,
                end_top_log_probs=end_top_log_probs,
                end_top_index=end_top_index,
                cls_logits=cls_logits))
        if len(cur_results) % 1000 == 0:
          logging.info("Processing example: %d", len(cur_results))
  output_prediction_file = os.path.join(input_meta_data["predict_dir"],
                                        "predictions.json")
  output_nbest_file = os.path.join(input_meta_data["predict_dir"],
                                   "nbest_predictions.json")
  output_null_log_odds_file = os.path.join(input_meta_data["predict_dir"],
                                           "null_odds.json")
  ret = squad_utils.write_predictions(
      eval_examples, input_meta_data["eval_features"], cur_results,
      input_meta_data["n_best_size"], input_meta_data["max_answer_length"],
      output_prediction_file, output_nbest_file, output_null_log_odds_file,
      orig_data, input_meta_data["start_n_top"], input_meta_data["end_n_top"])
  # Log current result
  log_str = "Result | "
  for key, val in ret.items():
    log_str += "{} {} | ".format(key, val)
  logging.info(log_str)
  if eval_summary_writer:
    with eval_summary_writer.as_default():
      tf.summary.scalar("best_f1", ret["best_f1"], step=step)
      tf.summary.scalar("best_exact", ret["best_exact"], step=step)
      eval_summary_writer.flush()
def get_qaxlnet_model(model_config, run_config, start_n_top, end_n_top):
  """Builds a QAXLNetModel for SQuAD span prediction."""
  return modeling.QAXLNetModel(
      model_config,
      run_config,
      start_n_top=start_n_top,
      end_n_top=end_n_top,
      name="model")
def main(unused_argv):
  """Runs XLNet SQuAD finetuning under the selected distribution strategy."""
  del unused_argv
  use_remote_tpu = False
  if FLAGS.strategy_type == "mirror":
    strategy = tf.distribute.MirroredStrategy()
  elif FLAGS.strategy_type == "tpu":
    # NOTE(review): `tpu_lib` is not imported in this module — confirm the
    # TPU code path before relying on it.
    cluster_resolver = tpu_lib.tpu_initialize(FLAGS.tpu)
    strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver)
    use_remote_tpu = True
  else:
    raise ValueError("The distribution strategy type is not supported: %s" %
                     FLAGS.strategy_type)
  if strategy:
    logging.info("***** Number of cores used : %d",
                 strategy.num_replicas_in_sync)
  train_input_fn = functools.partial(data_utils.get_squad_input_data,
                                     FLAGS.train_batch_size, FLAGS.seq_len,
                                     FLAGS.query_len, strategy, True,
                                     FLAGS.train_tfrecord_path)
  test_input_fn = functools.partial(data_utils.get_squad_input_data,
                                    FLAGS.test_batch_size, FLAGS.seq_len,
                                    FLAGS.query_len, strategy, False,
                                    FLAGS.test_tfrecord_path)
  total_training_steps = FLAGS.train_steps
  steps_per_epoch = int(FLAGS.train_data_size / FLAGS.train_batch_size)
  steps_per_loop = FLAGS.iterations
  eval_steps = int(FLAGS.test_data_size / FLAGS.test_batch_size)
  optimizer, learning_rate_fn = optimization.create_optimizer(
      FLAGS.learning_rate,
      total_training_steps,
      FLAGS.warmup_steps,
      adam_epsilon=FLAGS.adam_epsilon)
  model_config = xlnet_config.XLNetConfig(FLAGS)
  run_config = xlnet_config.create_run_config(True, False, FLAGS)
  # Meta data consumed by training_utils.train and run_evaluation.
  input_meta_data = {}
  input_meta_data["start_n_top"] = FLAGS.start_n_top
  input_meta_data["end_n_top"] = FLAGS.end_n_top
  input_meta_data["lr_layer_decay_rate"] = FLAGS.lr_layer_decay_rate
  input_meta_data["predict_dir"] = FLAGS.predict_dir
  input_meta_data["predict_file"] = FLAGS.predict_file
  input_meta_data["n_best_size"] = FLAGS.n_best_size
  input_meta_data["max_answer_length"] = FLAGS.max_answer_length
  input_meta_data["test_feature_path"] = FLAGS.test_feature_path
  input_meta_data["test_batch_size"] = FLAGS.test_batch_size
  input_meta_data["batch_size_per_core"] = int(FLAGS.train_batch_size /
                                               strategy.num_replicas_in_sync)
  input_meta_data["mem_len"] = FLAGS.mem_len
  model_fn = functools.partial(get_qaxlnet_model, model_config, run_config,
                               FLAGS.start_n_top, FLAGS.end_n_top)
  def logits_init_fn():
    # Dummy logits tensor used to seed the training loop.
    return tf.zeros(
        shape=(input_meta_data["batch_size_per_core"]), dtype=tf.float32)
  # Pre-computed eval features are loaded once and shared with run_evaluation.
  logging.info("start reading pickle file...")
  with tf.io.gfile.GFile(input_meta_data["test_feature_path"], "rb") as f:
    eval_features = pickle.load(f)
  logging.info("finishing reading pickle file...")
  input_meta_data["eval_features"] = eval_features
  eval_fn = functools.partial(run_evaluation, strategy, test_input_fn,
                              eval_steps, input_meta_data)
  with tf.device(get_primary_cpu_task(use_remote_tpu)):
    training_utils.train(
        strategy=strategy,
        model_fn=model_fn,
        input_meta_data=input_meta_data,
        eval_fn=eval_fn,
        metric_fn=None,
        logits_init_fn=logits_init_fn,
        train_input_fn=train_input_fn,
        test_input_fn=test_input_fn,
        init_checkpoint=FLAGS.init_checkpoint,
        total_training_steps=total_training_steps,
        steps_per_epoch=steps_per_epoch,
        steps_per_loop=steps_per_loop,
        optimizer=optimizer,
        learning_rate_fn=learning_rate_fn,
        model_dir=FLAGS.model_dir)
if __name__ == "__main__":
  assert tf.version.VERSION.startswith('2.')
  app.run(main)
This diff is collapsed.
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""XLNet classification finetuning runner in tf2.0."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import os
import re
from absl import logging
# pytype: disable=attribute-error
# pylint: disable=g-bare-generic,unused-import
import tensorflow as tf
# Initialize TPU System.
from official.nlp.xlnet import data_utils
from official.nlp import xlnet_modeling as modeling
from typing import Any, Callable, Dict, Text, Optional
_MIN_SUMMARY_STEPS = 10
def _save_checkpoint(checkpoint, model_dir, checkpoint_prefix):
  """Saves the model checkpoint with the provided prefix under `model_dir`.

  Args:
    checkpoint: a tf.train.Checkpoint wrapping the objects to save.
    model_dir: directory in which to write the checkpoint files.
    checkpoint_prefix: filename prefix for the saved checkpoint.
  """
  checkpoint_path = os.path.join(model_dir, checkpoint_prefix)
  saved_path = checkpoint.save(checkpoint_path)
  logging.info("Saving model as TF checkpoint: %s", saved_path)
def _float_metric_value(metric):
"""Gets the value of a float-value keras metric."""
return metric.result().numpy().astype(float)
def _steps_to_run(current_step, steps_per_epoch, steps_per_loop):
"""Calculates steps to run on device."""
if steps_per_loop <= 0:
raise ValueError("steps_per_loop should be positive integer.")
if steps_per_loop == 1:
return steps_per_loop
remainder_in_epoch = current_step % steps_per_epoch
if remainder_in_epoch != 0:
return min(steps_per_epoch - remainder_in_epoch, steps_per_loop)
else:
return steps_per_loop
def train(
strategy: tf.distribute.Strategy,
model_fn: Callable,
input_meta_data: Dict,
logits_init_fn: Callable[[], tf.Tensor],
train_input_fn: Callable,
total_training_steps: int,
steps_per_epoch: int,
steps_per_loop: int,
optimizer: tf.keras.optimizers.Optimizer,
learning_rate_fn: tf.keras.optimizers.schedules.LearningRateSchedule,
eval_fn: Optional[Callable[[tf.keras.Model, int, tf.summary.SummaryWriter],
Any]] = None,
metric_fn: Optional[Callable[[], tf.keras.metrics.Metric]] = None,
test_input_fn: Optional[Callable] = None,
init_checkpoint: Optional[Text] = None,
model_dir: Optional[Text] = None,
save_steps: Optional[int] = None):
"""Runs customized training.
Args:
strategy: Distribution strategy on which to run low level training loop.
model_fn: The function returns a keras.Model.
input_meta_data: A dictionary of params: `mem_len`, `lr_layer_decay_rate`,
`n_layer`, `batch_size_per_core` and `d_model`.
logits_init_fn: Function creates a dummy logits tensor.
train_input_fn: Function returns a tf.data.Dataset used for training.
total_training_steps: Number of steps to train in total.
steps_per_epoch: Number of steps to run per epoch. At the end of each
epoch, model checkpoint will be saved and evaluation will be conducted
if evaluation dataset is provided.
steps_per_loop: Number of steps per graph-mode loop. In order to reduce
communication in eager context, training logs are printed every
steps_per_loop.
optimizer: The optimizer for model.
learning_rate_fn: the learning rate schedule.
eval_fn: A callback of evaluation function, that takes a keras.Model,
current step and evaluation summary writer.
metric_fn: A metrics function returns a Keras Metric object to record
evaluation result using evaluation dataset or with training dataset
after every epoch.
test_input_fn: Function returns a evaluation dataset. If none, evaluation
is skipped.
init_checkpoint: Optional checkpoint to load to `sub_model` returned by
`model_fn`.
model_dir: The directory of model (checkpoints, summaries).
save_steps: The frequency to save checkpoints. Every save_steps, we save a
model checkpoint.
Returns:
Last training step logits if training happens, otherwise returns None.
Raises:
TypeError: if model directory is not specified.
"""
required_arguments = [
logits_init_fn, train_input_fn, total_training_steps, steps_per_epoch,
steps_per_loop, optimizer, learning_rate_fn
]
if [arg for arg in required_arguments if arg is None]:
raise ValueError(
"`logits_init_fn`, `train_input_fn`, `total_training_steps`, "
"`steps_per_epoch`, `steps_per_loop`, `optimizer` and "
"`learning_rate_fn` are required parameters.")
if not model_dir:
raise TypeError("Model directory must be specified.")
# pylint: disable=protected-access
train_iterator = data_utils._get_input_iterator(train_input_fn, strategy)
# pylint: enable=protected-access
train_summary_writer = None
eval_summary_writer = None
if not tf.io.gfile.exists(model_dir):
tf.io.gfile.mkdir(model_dir)
if test_input_fn:
eval_summary_writer = tf.summary.create_file_writer(
os.path.join(model_dir, "summaries/eval"))
if steps_per_loop >= _MIN_SUMMARY_STEPS:
# Only writes summary when the stats are collected sufficiently over
# enough steps.
train_summary_writer = tf.summary.create_file_writer(
os.path.join(model_dir, "summaries/train"))
with strategy.scope():
model = model_fn()
if init_checkpoint:
logging.info("restore from %s", init_checkpoint)
checkpoint = tf.train.Checkpoint(model=model)
checkpoint.restore(init_checkpoint)
model.optimizer = optimizer
if not hasattr(model, "optimizer"):
raise ValueError("User should set optimizer attribute to model.")
train_loss_metric = tf.keras.metrics.Mean("training_loss", dtype=tf.float32)
train_metric = None
if metric_fn:
train_metric = metric_fn()
def _replicated_step(inputs, mem=None):
"""Replicated training step."""
inputs["mems"] = mem
with tf.GradientTape() as tape:
mem, logits = model(inputs, training=True)
loss = model.losses
train_loss_metric.update_state(loss)
if train_metric:
train_metric.update_state(inputs["label_ids"], logits)
scaled_loss = loss[0] * 1.0 / float(strategy.num_replicas_in_sync)
# Collects training variables.
tvars = model.trainable_variables
grads = tape.gradient(scaled_loss, tvars)
clipped, _ = tf.clip_by_global_norm(grads, clip_norm=1.0)
if input_meta_data["lr_layer_decay_rate"] != 1.0:
n_layer = 0
for i in range(len(clipped)):
m = re.search(r"model/transformer/layer_(\d+?)/", tvars[i].name)
if not m:
continue
n_layer = max(n_layer, int(m.group(1)) + 1)
for i in range(len(clipped)):
for l in range(n_layer):
if "model/transformer/layer_{}/".format(l) in tvars[i].name:
abs_rate = input_meta_data["lr_layer_decay_rate"]**(
n_layer - 1 - l)
clipped[i] *= abs_rate
logging.info("Apply mult {:.4f} to layer-{} grad of {}".format(
abs_rate, l, tvars[i].name))
break
optimizer.apply_gradients(zip(clipped, tvars))
if input_meta_data["mem_len"] > 0:
return mem, logits
else:
return logits
@tf.function
def train_steps(iterator, steps):
  """Performs distributed training steps in a loop.

  Wrapped in `tf.function`, so the Python loop below is traced once and the
  `tf.range` loop runs inside the graph on-host.

  Args:
    iterator: the distributed iterator of training datasets.
    steps: an tf.int32 integer tensor to specify number of steps to run
      inside host training loop. Passing a Python int would retrace the
      function on every distinct value.

  Raises:
    ValueError: Any of the arguments or tensor shapes are invalid.

  Returns:
    logits: logits computed by the last replicated step.
  """
  if not isinstance(steps, tf.Tensor):
    raise ValueError("steps should be an Tensor. Python object may cause "
                     "retracing.")

  def cache_fn():
    """Initializes memory tensor used in XLNet pretraining."""
    # One zero tensor of shape [mem_len, batch_size_per_core, d_model]
    # per transformer layer; empty list when memory is disabled.
    mems = []
    if input_meta_data["mem_len"] > 0:
      for _ in range(input_meta_data["n_layer"]):
        zeros = tf.zeros([
            input_meta_data["mem_len"],
            input_meta_data["batch_size_per_core"],
            input_meta_data["d_model"]
        ],
                         dtype=tf.float32)
        mems.append(zeros)
    return mems

  # NOTE(review): `logits_init_fn` is not defined anywhere in the visible
  # scope of this file — confirm it is defined in the enclosing function,
  # otherwise this raises NameError at trace time.
  logits = strategy.experimental_run_v2(logits_init_fn)
  if input_meta_data["mem_len"] > 0:
    # Pretraining with memory: thread the per-replica cache through steps.
    mem = strategy.experimental_run_v2(cache_fn)
    for _ in tf.range(steps):
      mem, logits = strategy.experimental_run_v2(
          _replicated_step, args=(
              next(iterator),
              mem,
          ))
  else:
    for _ in tf.range(steps):
      logits = strategy.experimental_run_v2(
          _replicated_step, args=(next(iterator),))
  return logits
logging.info("Start training...")
checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
latest_checkpoint_file = tf.train.latest_checkpoint(model_dir)
if latest_checkpoint_file:
logging.info("Checkpoint file %s found and restoring from checkpoint",
latest_checkpoint_file)
checkpoint.restore(latest_checkpoint_file)
logging.info("Loading from checkpoint file completed")
current_step = optimizer.iterations.numpy()
checkpoint_name = "xlnet_step_{step}.ckpt"
logits = None
while current_step < total_training_steps:
train_loss_metric.reset_states()
if train_metric:
train_metric.reset_states()
steps = _steps_to_run(current_step, steps_per_epoch, steps_per_loop)
logits = train_steps(train_iterator,
tf.convert_to_tensor(steps, dtype=tf.int32))
current_step += steps
train_loss = _float_metric_value(train_loss_metric)
log_stream = "Train step: %d/%d / lr = %.9f / loss = %.7f" % (
current_step, total_training_steps, learning_rate_fn(current_step),
train_loss)
if train_metric:
log_stream += " / %s = %f" % (train_metric.name,
_float_metric_value(train_metric))
logging.info(log_stream)
if train_summary_writer:
with train_summary_writer.as_default():
tf.summary.scalar(
"learning_rate",
learning_rate_fn(current_step),
step=current_step)
tf.summary.scalar(
train_loss_metric.name, train_loss, step=current_step)
if train_metric:
tf.summary.scalar(
train_metric.name,
_float_metric_value(train_metric),
step=current_step)
train_summary_writer.flush()
if model_dir:
if (save_steps is None) or (save_steps and
current_step % save_steps == 0):
_save_checkpoint(checkpoint, model_dir,
checkpoint_name.format(step=current_step))
if test_input_fn and current_step % steps_per_epoch == 0:
logging.info("Running evaluation after step: %s.", current_step)
eval_fn(model, current_step, eval_summary_writer)
if model_dir:
_save_checkpoint(checkpoint, model_dir,
checkpoint_name.format(step=current_step))
if test_input_fn:
logging.info("Running final evaluation after training is complete.")
eval_fn(model, current_step, eval_summary_writer)
return logits
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utility functions used in XLNet model."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import json
import os
import tensorflow as tf
def create_run_config(is_training, is_finetune, flags):
  """Builds a `RunConfig` from parsed command-line flags.

  Args:
    is_training: bool, whether the model is in training mode.
    is_finetune: bool, whether this run is fine-tuning. When False
      (pretraining), the memory/caching options (`mem_len`, `reuse_len`,
      `bi_data`, `same_length`) are additionally copied from the flags.
    flags: a parsed flags object carrying the hyperparameter values.

  Returns:
    A `RunConfig` instance populated from `flags`.
  """
  kwargs = dict(
      is_training=is_training,
      use_tpu=flags.use_tpu,
      use_bfloat16=flags.use_bfloat16,
      dropout=flags.dropout,
      dropout_att=flags.dropout_att,
      init_method=flags.init_method,
      init_range=flags.init_range,
      init_std=flags.init_std,
      clamp_len=flags.clamp_len)

  if not is_finetune:
    # Pretraining additionally configures the memory mechanism.
    # (`clamp_len` is already set above, so it is not repeated here.)
    kwargs.update(dict(
        mem_len=flags.mem_len,
        reuse_len=flags.reuse_len,
        bi_data=flags.bi_data,
        same_length=flags.same_length))

  return RunConfig(**kwargs)
class XLNetConfig(object):
  """Configs for XLNet model.

  XLNetConfig contains hyperparameters that are specific to a model
  checkpoint; i.e., these hyperparameters should be the same between
  pretraining and finetuning.

  The following hyperparameters are defined:
    n_layer: int, the number of layers.
    d_model: int, the hidden size.
    n_head: int, the number of attention heads.
    d_head: int, the dimension size of each attention head.
    d_inner: int, the hidden size in feed-forward layers.
    ff_activation: str, "relu" or "gelu".
    untie_r: bool, whether to untie the biases in attention.
    n_token: int, the vocab size.
  """

  def __init__(self, FLAGS=None, json_path=None, args_dict=None):
    """Constructing an XLNetConfig.

    One of FLAGS or json_path should be provided.

    Args:
      FLAGS: An FLAGS instance.
      json_path: A path to a json config file.
      args_dict: A dict for args.
    """
    assert FLAGS is not None or json_path is not None or args_dict is not None

    self.keys = ['n_layer', 'd_model', 'n_head', 'd_head', 'd_inner',
                 'ff_activation', 'untie_r', 'n_token']

    if FLAGS is not None:
      self.init_from_flags(FLAGS)

    if json_path is not None:
      self.init_from_json(json_path)

    if args_dict is not None:
      self.init_from_dict(args_dict)

  def init_from_dict(self, args_dict):
    """Constructs an `XLNetConfig` from a Python dictionary of parameters."""
    for key in self.keys:
      setattr(self, key, args_dict[key])

  def init_from_flags(self, flags):
    """Copies each config key from an already-parsed flags object."""
    for key in self.keys:
      setattr(self, key, getattr(flags, key))

  def init_from_json(self, json_path):
    """Loads config keys from a json file.

    Uses the TF2 `tf.io.gfile` API (the TF1 `tf.gfile` alias was removed in
    TensorFlow 2.x), so `json_path` may live on any filesystem TensorFlow
    supports (e.g. GCS).
    """
    with tf.io.gfile.GFile(json_path) as f:
      json_data = json.load(f)
      self.init_from_dict(json_data)

  def to_json(self, json_path):
    """Save XLNetConfig to a json file."""
    json_data = {}
    for key in self.keys:
      json_data[key] = getattr(self, key)

    json_dir = os.path.dirname(json_path)
    # Create the target directory if needed; TF2 filesystem API.
    if not tf.io.gfile.exists(json_dir):
      tf.io.gfile.makedirs(json_dir)
    with tf.io.gfile.GFile(json_path, 'w') as f:
      json.dump(json_data, f, indent=4, sort_keys=True)
class RunConfig(object):
  """Run-time hyperparameters for XLNet.

  RunConfig contains hyperparameters that could be different between
  pretraining and finetuning and may change from run to run; they are kept
  separate from `XLNetConfig` (checkpoint-tied hyperparameters) for
  flexibility.
  """

  def __init__(self,
               is_training,
               use_tpu,
               use_bfloat16,
               dropout,
               dropout_att,
               init_method='normal',
               init_range=0.1,
               init_std=0.02,
               mem_len=None,
               reuse_len=None,
               bi_data=False,
               clamp_len=-1,
               same_length=False):
    """Initializes RunConfig.

    Args:
      is_training: bool, whether in training mode.
      use_tpu: bool, whether TPUs are used.
      use_bfloat16: bool, use bfloat16 instead of float32.
      dropout: float, dropout rate.
      dropout_att: float, dropout rate on attention probabilities.
      init_method: str, the initialization scheme, either "normal" or
        "uniform".
      init_range: float, initialize the parameters with a uniform
        distribution in [-init_range, init_range]. Only effective when
        init_method="uniform".
      init_std: float, initialize the parameters with a normal distribution
        with mean 0 and stddev init_std. Only effective when
        init_method="normal".
      mem_len: int, the number of tokens to cache.
      reuse_len: int, the number of tokens in the current batch to be cached
        and reused in the future.
      bi_data: bool, whether to use bidirectional input pipeline. Usually
        set to True during pretraining and False during finetuning.
      clamp_len: int, clamp all relative distances larger than clamp_len.
        -1 means no clamping.
      same_length: bool, whether to use the same attention length for each
        token.
    """
    # Run mode and hardware options.
    self.is_training = is_training
    self.use_tpu = use_tpu
    self.use_bfloat16 = use_bfloat16
    # Regularization.
    self.dropout = dropout
    self.dropout_att = dropout_att
    # Parameter initialization scheme.
    self.init_method = init_method
    self.init_range = init_range
    self.init_std = init_std
    # Memory / relative-attention options.
    self.mem_len = mem_len
    self.reuse_len = reuse_len
    self.bi_data = bi_data
    self.clamp_len = clamp_len
    self.same_length = same_length
This diff is collapsed.
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import logging
import numpy as np
import tensorflow as tf
from official.nlp import xlnet_modeling
class PositionalEmbeddingLayerTest(tf.test.TestCase):

  def test_positional_embedding(self):
    """Checks the layer against a hand-computed low-dimensional example.

    With len(pos_seq)=2 and d_model=4:
      pos_seq            = [[1.], [0.]]
      inv_freq           = [1., 0.01]
      pos_seq x inv_freq = [[1, 0.01], [0., 0.]]
      pos_emb = [[sin(1.), sin(0.01), cos(1.), cos(0.01)],
                 [sin(0.), sin(0.), cos(0.), cos(0.)]]
              = [[0.84147096, 0.00999983, 0.54030228, 0.99994999],
                 [0., 0., 1., 1.]]
    """
    expected = np.array([[[0.84147096, 0.00999983, 0.54030228, 0.99994999]],
                         [[0., 0., 1., 1.]]])
    hidden_size = 4
    positions = tf.range(1, -1, -1.0)  # [1., 0.]
    layer = xlnet_modeling.PositionalEmbedding(hidden_size)
    actual = layer(pos_seq=positions, batch_size=None).numpy().astype(float)
    logging.info(actual)
    self.assertAllClose(actual, expected)
if __name__ == "__main__":
  # This test exercises TF2-only APIs (eager `.numpy()`, `tf.version`), so
  # fail fast when an incompatible TensorFlow is installed.
  # NOTE(review): `assert` is stripped under `python -O`; an explicit
  # version check with a raised error would be more robust.
  assert tf.version.VERSION.startswith('2.')
  tf.test.main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment