Unverified Commit 965cc3ee authored by Ayushman Kumar, committed by GitHub

Merge pull request #7 from tensorflow/master

updated
parents 1f3247f4 1f685c54
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for nlp.nhnet.multi_channel_attention."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
from official.nlp.nhnet import multi_channel_attention
class MultiChannelAttentionTest(tf.test.TestCase):
def test_doc_attention(self):
num_heads = 2
doc_attention = multi_channel_attention.DocAttention(num_heads, head_size=8)
num_docs = 3
inputs = np.zeros((2, num_docs, 10, 16), dtype=np.float32)
doc_mask = np.zeros((2, num_docs), dtype=np.float32)
outputs = doc_attention(inputs, doc_mask)
self.assertEqual(outputs.shape, (2, num_docs))
def test_multi_channel_attention(self):
num_heads = 2
num_docs = 5
attention_layer = multi_channel_attention.MultiChannelAttention(
num_heads, head_size=2)
from_data = 10 * np.random.random_sample((3, 4, 8))
to_data = 10 * np.random.random_sample((3, num_docs, 2, 8))
mask_data = np.random.randint(2, size=(3, num_docs, 4, 2))
doc_probs = np.random.randint(
2, size=(3, num_heads, 4, num_docs)).astype(float)
outputs = attention_layer([from_data, to_data, mask_data, doc_probs])
self.assertEqual(outputs.shape, (3, 4, num_heads, 2))
if __name__ == "__main__":
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Optimizer and learning rate scheduler."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import tensorflow as tf
from official.modeling.hyperparams import params_dict
class LearningRateSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
"""Learning rate schedule."""
def __init__(self, initial_learning_rate, hidden_size, warmup_steps):
"""Initialize configuration of the learning rate schedule.
Args:
initial_learning_rate: A float, the initial learning rate.
hidden_size: An integer, the model dimension in the hidden layers.
warmup_steps: An integer, the number of steps required for linear warmup.
"""
super(LearningRateSchedule, self).__init__()
self.initial_learning_rate = initial_learning_rate
self.hidden_size = hidden_size
self.warmup_steps = tf.cast(warmup_steps, tf.float32)
def __call__(self, global_step):
"""Calculate learning rate with linear warmup and rsqrt decay.
Args:
global_step: An integer, the current global step used for learning rate
calculation.
Returns:
A float, the learning rate to be used for the current global step.
"""
with tf.name_scope('learning_rate_schedule'):
global_step = tf.cast(global_step, tf.float32)
learning_rate = self.initial_learning_rate
learning_rate *= (self.hidden_size**-0.5)
# Apply linear warmup
learning_rate *= tf.minimum(1.0, global_step / self.warmup_steps)
# Apply rsqrt decay
learning_rate /= tf.sqrt(tf.maximum(global_step, self.warmup_steps))
return learning_rate
def get_config(self):
"""Get the configuration of the learning rate schedule."""
return {
'initial_learning_rate': self.initial_learning_rate,
'hidden_size': self.hidden_size,
'warmup_steps': self.warmup_steps,
}
def create_optimizer(params: params_dict.ParamsDict):
"""Creates optimizer."""
lr_schedule = LearningRateSchedule(
params.learning_rate,
params.hidden_size,
params.learning_rate_warmup_steps)
return tf.keras.optimizers.Adam(
learning_rate=lr_schedule,
beta_1=params.adam_beta1,
beta_2=params.adam_beta2,
epsilon=params.adam_epsilon)
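# A minimal usage sketch (not part of the library; the parameter values are
# illustrative assumptions, not NHNet defaults). It shows how the schedule
# combines linear warmup with rsqrt decay: the rate ramps up linearly until
# warmup_steps and then decays proportionally to 1/sqrt(step).
def _example_learning_rate_schedule():
  """Returns the scheduled learning rate at a few illustrative steps."""
  schedule = LearningRateSchedule(
      initial_learning_rate=2.0, hidden_size=768, warmup_steps=10000)
  return [schedule(step).numpy() for step in (100, 10000, 40000)]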
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Processes crawled content from news URLs by generating tfrecords."""
import os
from absl import app
from absl import flags
from official.nlp.nhnet import raw_data_processor
FLAGS = flags.FLAGS
flags.DEFINE_string("crawled_articles", "/tmp/nhnet/",
"Folder path to the crawled articles using news-please.")
flags.DEFINE_string("vocab", None, "Filepath of the BERT vocabulary.")
flags.DEFINE_bool("do_lower_case", True,
"Whether the vocabulary is uncased or not.")
flags.DEFINE_integer("len_title", 15,
"Maximum number of tokens in story headline.")
flags.DEFINE_integer("len_passage", 200,
"Maximum number of tokens in article passage.")
flags.DEFINE_integer("max_num_articles", 5,
"Maximum number of articles in a story.")
flags.DEFINE_bool("include_article_title_in_passage", False,
"Whether to include article title in article passage.")
flags.DEFINE_string("data_folder", None,
"Folder path to the downloaded data folder (output).")
flags.DEFINE_integer("num_tfrecords_shards", 20,
"Number of shards for train/valid/test.")
def transform_as_tfrecords(data_processor, filename):
"""Transforms story from json to tfrecord (sharded).
Args:
data_processor: Instance of RawDataProcessor.
filename: 'train', 'valid', or 'test'.
"""
print("Transforming json to tfrecord for %s..." % filename)
story_filepath = os.path.join(FLAGS.data_folder, filename + ".json")
output_folder = os.path.join(FLAGS.data_folder, "processed")
os.makedirs(output_folder, exist_ok=True)
output_filepaths = []
for i in range(FLAGS.num_tfrecords_shards):
output_filepaths.append(
os.path.join(
output_folder, "%s.tfrecord-%.5d-of-%.5d" %
(filename, i, FLAGS.num_tfrecords_shards)))
(total_num_examples,
generated_num_examples) = data_processor.generate_examples(
story_filepath, output_filepaths)
print("For %s, %d examples have been generated from %d stories in json." %
(filename, generated_num_examples, total_num_examples))
def main(_):
if not FLAGS.data_folder:
raise ValueError("data_folder must be set as the downloaded folder path.")
if not FLAGS.vocab:
raise ValueError("vocab must be set as the filepath of BERT vocabulary.")
data_processor = raw_data_processor.RawDataProcessor(
vocab=FLAGS.vocab,
do_lower_case=FLAGS.do_lower_case,
len_title=FLAGS.len_title,
len_passage=FLAGS.len_passage,
max_num_articles=FLAGS.max_num_articles,
include_article_title_in_passage=FLAGS.include_article_title_in_passage,
include_text_snippet_in_example=True)
print("Loading crawled articles...")
num_articles = data_processor.read_crawled_articles(FLAGS.crawled_articles)
print("Total number of articles loaded: %d" % num_articles)
print()
transform_as_tfrecords(data_processor, "train")
transform_as_tfrecords(data_processor, "valid")
transform_as_tfrecords(data_processor, "test")
if __name__ == "__main__":
app.run(main)
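# Example invocation (a sketch; the script name and all paths below are
# placeholders, not files shipped with this change):
#
#   python process_data.py \
#     --crawled_articles=/tmp/nhnet/crawled \
#     --vocab=/tmp/bert/vocab.txt \
#     --data_folder=/tmp/nhnet/data
#
# With the default --num_tfrecords_shards=20, each of train/valid/test is
# written under <data_folder>/processed/ as shards such as
# train.tfrecord-00000-of-00020.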
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Library for processing crawled content and generating tfrecords."""
import collections
import json
import multiprocessing
import os
import urllib.parse
import tensorflow as tf
from official.nlp.bert import tokenization
from official.nlp.data import classifier_data_lib
class RawDataProcessor(object):
"""Data converter for story examples."""
def __init__(self,
vocab: str,
do_lower_case: bool,
len_title: int = 15,
len_passage: int = 200,
max_num_articles: int = 5,
include_article_title_in_passage: bool = False,
include_text_snippet_in_example: bool = False):
"""Constructs a RawDataProcessor.
Args:
vocab: Filepath of the BERT vocabulary.
do_lower_case: Whether the vocabulary is uncased or not.
len_title: Maximum number of tokens in story headline.
len_passage: Maximum number of tokens in article passage.
max_num_articles: Maximum number of articles in a story.
include_article_title_in_passage: Whether to include article title in
article passage.
include_text_snippet_in_example: Whether to include text snippet
(headline and article content) in generated tensorflow Examples, for
debug usage. If include_article_title_in_passage=True, title and body
will be separated by [SEP].
"""
self.articles = dict()
self.tokenizer = tokenization.FullTokenizer(
vocab, do_lower_case=do_lower_case, split_on_punc=False)
self.len_title = len_title
self.len_passage = len_passage
self.max_num_articles = max_num_articles
self.include_article_title_in_passage = include_article_title_in_passage
self.include_text_snippet_in_example = include_text_snippet_in_example
# ex_index=5 deactivates printing inside convert_single_example.
self.ex_index = 5
# Parameters used in InputExample, not used in NHNet.
self.label = 0
self.guid = 0
self.num_generated_examples = 0
def read_crawled_articles(self, folder_path):
"""Reads crawled articles under folder_path."""
for path, _, files in os.walk(folder_path):
for name in files:
if not name.endswith(".json"):
continue
url, article = self._get_article_content_from_json(
os.path.join(path, name))
if not article.text_a:
continue
self.articles[RawDataProcessor.normalize_url(url)] = article
if len(self.articles) % 5000 == 0:
print("Number of articles loaded: %d\r" % len(self.articles), end="")
print()
return len(self.articles)
def generate_examples(self, input_file, output_files):
"""Loads story from input json file and exports examples in output_files."""
writers = []
story_partition = []
for output_file in output_files:
writers.append(tf.io.TFRecordWriter(output_file))
story_partition.append(list())
with tf.io.gfile.GFile(input_file, "r") as story_json_file:
stories = json.load(story_json_file)
writer_index = 0
for story in stories:
articles = []
for url in story["urls"]:
normalized_url = RawDataProcessor.normalize_url(url)
if normalized_url in self.articles:
articles.append(self.articles[normalized_url])
if not articles:
continue
story_partition[writer_index].append((story["label"], articles))
writer_index = (writer_index + 1) % len(writers)
lock = multiprocessing.Lock()
pool = multiprocessing.pool.ThreadPool(len(writers))
data = [(story_partition[i], writers[i], lock) for i in range(len(writers))]
pool.map(self._write_story_partition, data)
return len(stories), self.num_generated_examples
@classmethod
def normalize_url(cls, url):
"""Normalize url for better matching."""
url = urllib.parse.unquote(
urllib.parse.urlsplit(url)._replace(query=None).geturl())
output, part = [], None
for part in url.split("//"):
if part == "http:" or part == "https:":
continue
else:
output.append(part)
return "//".join(output)
def _get_article_content_from_json(self, file_path):
"""Returns (url, InputExample) keeping content extracted from file_path."""
with tf.io.gfile.GFile(file_path, "r") as article_json_file:
article = json.load(article_json_file)
if self.include_article_title_in_passage:
return article["url"], classifier_data_lib.InputExample(
guid=self.guid,
text_a=article["title"],
text_b=article["maintext"],
label=self.label)
else:
return article["url"], classifier_data_lib.InputExample(
guid=self.guid, text_a=article["maintext"], label=self.label)
def _write_story_partition(self, data):
"""Writes stories in a partition into file."""
for (story_headline, articles) in data[0]:
story_example = tf.train.Example(
features=tf.train.Features(
feature=self._get_single_story_features(story_headline,
articles)))
data[1].write(story_example.SerializeToString())
data[2].acquire()
try:
self.num_generated_examples += 1
if self.num_generated_examples % 1000 == 0:
print(
"Number of stories written: %d\r" % self.num_generated_examples,
end="")
finally:
data[2].release()
def _get_single_story_features(self, story_headline, articles):
"""Converts a list of articles to a tensorflow Example."""
def get_text_snippet(article):
if article.text_b:
return " [SEP] ".join([article.text_a, article.text_b])
else:
return article.text_a
story_features = collections.OrderedDict()
story_headline_feature = classifier_data_lib.convert_single_example(
ex_index=self.ex_index,
example=classifier_data_lib.InputExample(
guid=self.guid, text_a=story_headline, label=self.label),
label_list=[self.label],
max_seq_length=self.len_title,
tokenizer=self.tokenizer)
if self.include_text_snippet_in_example:
story_headline_feature.label_id = story_headline
self._add_feature_with_suffix(
feature=story_headline_feature,
suffix="a",
story_features=story_features)
for (article_index, article) in enumerate(articles):
if article_index == self.max_num_articles:
break
article_feature = classifier_data_lib.convert_single_example(
ex_index=self.ex_index,
example=article,
label_list=[self.label],
max_seq_length=self.len_passage,
tokenizer=self.tokenizer)
if self.include_text_snippet_in_example:
article_feature.label_id = get_text_snippet(article)
suffix = chr(ord("b") + article_index)
self._add_feature_with_suffix(
feature=article_feature, suffix=suffix, story_features=story_features)
# Adds empty features as placeholder.
for article_index in range(len(articles), self.max_num_articles):
suffix = chr(ord("b") + article_index)
empty_article = classifier_data_lib.InputExample(
guid=self.guid, text_a="", label=self.label)
empty_feature = classifier_data_lib.convert_single_example(
ex_index=self.ex_index,
example=empty_article,
label_list=[self.label],
max_seq_length=self.len_passage,
tokenizer=self.tokenizer)
if self.include_text_snippet_in_example:
empty_feature.label_id = ""
self._add_feature_with_suffix(
feature=empty_feature, suffix=suffix, story_features=story_features)
return story_features
def _add_feature_with_suffix(self, feature, suffix, story_features):
"""Appends suffix to feature names and fills in the corresponding values."""
def _create_int_feature(values):
return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
def _create_string_feature(value):
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
story_features["input_ids_%c" % suffix] = _create_int_feature(
feature.input_ids)
story_features["input_mask_%c" % suffix] = _create_int_feature(
feature.input_mask)
story_features["segment_ids_%c" % suffix] = _create_int_feature(
feature.segment_ids)
if self.include_text_snippet_in_example:
story_features["text_snippet_%c" % suffix] = _create_string_feature(
bytes(feature.label_id.encode()))
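# For a story with the default max_num_articles=5, the resulting
# tf.train.Example carries feature keys "input_ids_a" (the headline) and
# "input_ids_b" through "input_ids_f" (the articles, padded with empty
# passages if fewer than five match), plus the corresponding "input_mask_*"
# and "segment_ids_*" features, and optional "text_snippet_*" features when
# include_text_snippet_in_example is set.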
<!DOCTYPE html>
<meta charset="utf-8">
<title>Page Title 0</title>
{
"title": "title for 0",
"maintext": "text snippet for 0",
"url": "http://url_000.html"
}
<!DOCTYPE html>
<meta charset="utf-8">
<title>Page Title 1</title>
{
"title": "title for 1",
"maintext": "text snippet for 1",
"url": "url_001.html"
}
[
{
"urls": [
"http://url_000.html",
"http://url_001.html"
],
"label": "headline 0"
},
{
"urls": [
"http://url_000.html",
"http://url_001.html"
],
"label": "headline 1"
},
{
"urls": [
"http://url_002.html",
"http://url_001.html"
],
"label": "headline 2"
},
{
"urls": [
"http://url_003.html"
],
"label": "headline 3"
}
]
[UNK]
[CLS]
[SEP]
[MASK]
0
1
this
is
a
title
snippet
for
url
main
text
http
www
html
:
//
.
_
headline
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Run NHNet model training and eval."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl import app
from absl import flags
from absl import logging
from six.moves import zip
import tensorflow as tf
from official.modeling.hyperparams import params_dict
from official.nlp.nhnet import evaluation
from official.nlp.nhnet import input_pipeline
from official.nlp.nhnet import models
from official.nlp.nhnet import optimizer
from official.nlp.transformer import metrics as transformer_metrics
from official.utils.misc import distribution_utils
from official.utils.misc import keras_utils
FLAGS = flags.FLAGS
def define_flags():
"""Defines command line flags used by NHNet trainer."""
## Required parameters
flags.DEFINE_enum("mode", "train", ["train", "eval", "train_and_eval"],
"Execution mode.")
flags.DEFINE_string("train_file_pattern", "", "Train file pattern.")
flags.DEFINE_string("eval_file_pattern", "", "Eval file pattern.")
flags.DEFINE_string(
"model_dir", None,
"The output directory where the model checkpoints will be written.")
# Model training specific flags.
flags.DEFINE_enum(
"distribution_strategy", "mirrored", ["tpu", "mirrored"],
"Distribution Strategy type to use for training. `tpu` uses TPUStrategy "
"for running on TPUs, `mirrored` uses GPUs with single host.")
flags.DEFINE_string("tpu", "", "TPU address to connect to.")
flags.DEFINE_string(
"init_checkpoint", None,
"Initial checkpoint (usually from a pre-trained BERT model).")
flags.DEFINE_integer("train_steps", 100000, "Max train steps")
flags.DEFINE_integer("eval_steps", 32, "Number of eval steps per run.")
flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.")
flags.DEFINE_integer("eval_batch_size", 4, "Total batch size for evaluation.")
flags.DEFINE_integer(
"steps_per_loop", 1000,
"Number of steps per graph-mode loop. Only training step "
"happens inside the loop.")
flags.DEFINE_integer("checkpoint_interval", 2000, "Checkpointing interval.")
flags.DEFINE_integer("len_title", 15, "Title length.")
flags.DEFINE_integer("len_passage", 200, "Passage length.")
flags.DEFINE_integer("num_encoder_layers", 12,
"Number of hidden layers of encoder.")
flags.DEFINE_integer("num_decoder_layers", 12,
"Number of hidden layers of decoder.")
flags.DEFINE_string("model_type", "nhnet",
"Model type to choose a model configuration.")
flags.DEFINE_integer(
"num_nhnet_articles", 5,
"Maximum number of articles in NHNet, only used when model_type=nhnet")
flags.DEFINE_string(
"params_override",
default=None,
help=("a YAML/JSON string or a YAML file which specifies additional "
"overrides over the default parameters"))
# pylint: disable=protected-access
class Trainer(tf.keras.Model):
"""A training only model."""
def __init__(self, model, params):
super(Trainer, self).__init__()
self.model = model
self.params = params
self._num_replicas_in_sync = tf.distribute.get_strategy(
).num_replicas_in_sync
def call(self, inputs, mode="train"):
return self.model(inputs, mode)
def train_step(self, inputs):
"""The logic for one training step."""
with tf.GradientTape() as tape:
logits, _, _ = self(inputs, mode="train", training=True)
targets = models.remove_sos_from_seq(inputs["target_ids"],
self.params.pad_token_id)
loss = transformer_metrics.transformer_loss(logits, targets,
self.params.label_smoothing,
self.params.vocab_size)
# Scales the loss, which results in using the average loss across all
# of the replicas for backprop.
scaled_loss = loss / self._num_replicas_in_sync
tvars = self.trainable_variables
grads = tape.gradient(scaled_loss, tvars)
self.optimizer.apply_gradients(list(zip(grads, tvars)))
return {
"training_loss": loss,
"learning_rate": self.optimizer._decayed_lr(var_dtype=tf.float32)
}
def train(params, strategy, dataset=None):
"""Runs training."""
if not dataset:
dataset = input_pipeline.get_input_dataset(
FLAGS.train_file_pattern,
FLAGS.train_batch_size,
params,
is_training=True,
strategy=strategy)
with strategy.scope():
model = models.create_model(
FLAGS.model_type, params, init_checkpoint=FLAGS.init_checkpoint)
opt = optimizer.create_optimizer(params)
trainer = Trainer(model, params)
model.global_step = opt.iterations
trainer.compile(
optimizer=opt,
experimental_steps_per_execution=FLAGS.steps_per_loop)
summary_dir = os.path.join(FLAGS.model_dir, "summaries")
summary_callback = tf.keras.callbacks.TensorBoard(
summary_dir, update_freq=max(100, FLAGS.steps_per_loop))
checkpoint = tf.train.Checkpoint(model=model, optimizer=opt)
checkpoint_manager = tf.train.CheckpointManager(
checkpoint,
directory=FLAGS.model_dir,
max_to_keep=10,
step_counter=model.global_step,
checkpoint_interval=FLAGS.checkpoint_interval)
if checkpoint_manager.restore_or_initialize():
logging.info("Training restored from the checkpoints in: %s",
FLAGS.model_dir)
checkpoint_callback = keras_utils.SimpleCheckpoint(checkpoint_manager)
# Trains the model.
steps_per_epoch = min(FLAGS.train_steps, FLAGS.checkpoint_interval)
epochs = FLAGS.train_steps // steps_per_epoch
trainer.fit(
x=dataset,
steps_per_epoch=steps_per_epoch,
epochs=epochs,
callbacks=[summary_callback, checkpoint_callback],
verbose=2)
def run():
"""Runs NHNet using Keras APIs."""
strategy = distribution_utils.get_distribution_strategy(
distribution_strategy=FLAGS.distribution_strategy, tpu_address=FLAGS.tpu)
if strategy:
logging.info("***** Number of cores used : %d",
strategy.num_replicas_in_sync)
params = models.get_model_params(FLAGS.model_type)
params = params_dict.override_params_dict(
params, FLAGS.params_override, is_strict=True)
params.override(
{
"len_title":
FLAGS.len_title,
"len_passage":
FLAGS.len_passage,
"num_hidden_layers":
FLAGS.num_encoder_layers,
"num_decoder_layers":
FLAGS.num_decoder_layers,
"passage_list":
[chr(ord("b") + i) for i in range(FLAGS.num_nhnet_articles)],
},
is_strict=False)
stats = {}
if "train" in FLAGS.mode:
train(params, strategy)
if "eval" in FLAGS.mode:
timeout = 0 if FLAGS.mode == "train_and_eval" else 3000
# Uses padded decoding for TPU. Always uses cache.
padded_decode = isinstance(strategy, tf.distribute.experimental.TPUStrategy)
params.override({
"padded_decode": padded_decode,
}, is_strict=False)
stats = evaluation.continuous_eval(
strategy,
params,
model_type=FLAGS.model_type,
eval_file_pattern=FLAGS.eval_file_pattern,
batch_size=FLAGS.eval_batch_size,
eval_steps=FLAGS.eval_steps,
model_dir=FLAGS.model_dir,
timeout=timeout)
return stats
def main(_):
stats = run()
if stats:
logging.info("Stats:\n%s", stats)
if __name__ == "__main__":
define_flags()
app.run(main)
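# Example training invocation (a sketch; the script name, checkpoint path and
# file patterns below are placeholders):
#
#   python trainer.py \
#     --mode=train \
#     --model_type=nhnet \
#     --train_file_pattern="/tmp/nhnet/data/processed/train.tfrecord*" \
#     --model_dir=/tmp/nhnet/model \
#     --init_checkpoint=/tmp/bert/bert_model.ckpt \
#     --distribution_strategy=mirrored \
#     --train_steps=100000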
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for official.nlp.nhnet.trainer."""
import os
from absl import flags
from absl.testing import parameterized
import tensorflow as tf
# pylint: disable=g-direct-tensorflow-import
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
# pylint: enable=g-direct-tensorflow-import
from official.nlp.nhnet import trainer
from official.nlp.nhnet import utils
FLAGS = flags.FLAGS
trainer.define_flags()
def all_strategy_combinations():
return combinations.combine(
distribution=[
strategy_combinations.one_device_strategy,
strategy_combinations.one_device_strategy_gpu,
strategy_combinations.tpu_strategy,
],
mode="eager",
)
def get_trivial_data(config) -> tf.data.Dataset:
"""Gets trivial data in the ImageNet size."""
batch_size, num_docs = 2, len(config.passage_list),
len_passage = config.len_passage
len_title = config.len_title
def generate_data(_) -> tf.data.Dataset:
fake_ids = tf.zeros((num_docs, len_passage), dtype=tf.int32)
title = tf.zeros((len_title), dtype=tf.int32)
return dict(
input_ids=fake_ids,
input_mask=fake_ids,
segment_ids=fake_ids,
target_ids=title)
dataset = tf.data.Dataset.range(1)
dataset = dataset.repeat()
dataset = dataset.map(generate_data,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.prefetch(buffer_size=1).batch(batch_size)
return dataset
class TrainerTest(tf.test.TestCase, parameterized.TestCase):
def setUp(self):
super(TrainerTest, self).setUp()
self._config = utils.get_test_params()
self._config.override(
{
"vocab_size": 49911,
"max_position_embeddings": 200,
"len_title": 15,
"len_passage": 20,
"beam_size": 5,
"alpha": 0.6,
"learning_rate": 0.0,
"learning_rate_warmup_steps": 0,
"multi_channel_cross_attention": True,
"passage_list": ["a", "b"],
},
is_strict=False)
@combinations.generate(all_strategy_combinations())
def test_train(self, distribution):
FLAGS.train_steps = 10
FLAGS.checkpoint_interval = 5
FLAGS.model_dir = self.get_temp_dir()
FLAGS.model_type = "nhnet"
trainer.train(self._config, distribution, get_trivial_data(self._config))
self.assertLen(
tf.io.gfile.glob(os.path.join(FLAGS.model_dir, "ckpt*.index")), 2)
if __name__ == "__main__":
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utility helpers for Bert2Bert."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
from absl import logging
import tensorflow as tf
from typing import Optional, Text
from official.modeling.hyperparams import params_dict
from official.nlp.bert import configs
from official.nlp.nhnet import configs as nhnet_configs
def get_bert_config_from_params(
params: params_dict.ParamsDict) -> configs.BertConfig:
"""Converts a BertConfig to ParamsDict."""
return configs.BertConfig.from_dict(params.as_dict())
def get_test_params(cls=nhnet_configs.BERT2BERTConfig):
return cls.from_args(**nhnet_configs.UNITTEST_CONFIG)
# pylint: disable=protected-access
def encoder_common_layers(transformer_block):
return [
transformer_block._attention_layer,
transformer_block._attention_output_dense,
transformer_block._attention_layer_norm,
transformer_block._intermediate_dense, transformer_block._output_dense,
transformer_block._output_layer_norm
]
# pylint: enable=protected-access
def initialize_bert2bert_from_pretrained_bert(
bert_encoder: tf.keras.layers.Layer,
bert_decoder: tf.keras.layers.Layer,
init_checkpoint: Optional[Text] = None) -> None:
"""Helper function to initialze Bert2Bert from Bert pretrained checkpoint."""
ckpt = tf.train.Checkpoint(model=bert_encoder)
logging.info(
"Checkpoint file %s found and restoring from "
"initial checkpoint for core model.", init_checkpoint)
status = ckpt.restore(init_checkpoint)
# Expects the BERT model to be a subset of the checkpoint, as the pooling
# layer is not used.
status.assert_existing_objects_matched()
logging.info("Loading from checkpoint file completed.")
# Saves a checkpoint with transformer layers.
encoder_layers = []
for transformer_block in bert_encoder.transformer_layers:
encoder_layers.extend(encoder_common_layers(transformer_block))
# Restores from the checkpoint with encoder layers.
decoder_layers_to_initialize = []
for decoder_block in bert_decoder.decoder.layers:
decoder_layers_to_initialize.extend(
decoder_block.common_layers_with_encoder())
if len(decoder_layers_to_initialize) != len(encoder_layers):
raise ValueError(
"Source encoder layers with %d objects does not match destination "
"decoder layers with %d objects." %
(len(decoder_layers_to_initialize), len(encoder_layers)))
for dest_layer, source_layer in zip(decoder_layers_to_initialize,
encoder_layers):
try:
dest_layer.set_weights(source_layer.get_weights())
except ValueError as e:
logging.error(
"dest_layer: %s failed to set weights from "
"source_layer: %s as %s", dest_layer.name, source_layer.name, str(e))
@@ -20,19 +20,20 @@ from __future__ import print_function
import re
from absl import logging
import tensorflow as tf
import tensorflow_addons.optimizers as tfa_optimizers
class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
"""Applies a warmup schedule on a given learning rate decay schedule."""
def __init__(self,
initial_learning_rate,
decay_schedule_fn,
warmup_steps,
power=1.0,
name=None):
super(WarmUp, self).__init__()
self.initial_learning_rate = initial_learning_rate
self.warmup_steps = warmup_steps
@@ -50,10 +51,11 @@ class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
warmup_learning_rate = (
self.initial_learning_rate *
tf.math.pow(warmup_percent_done, self.power))
return tf.cond(
global_step_float < warmup_steps_float,
lambda: warmup_learning_rate,
lambda: self.decay_schedule_fn(step),
name=name)
def get_config(self):
return {
@@ -65,24 +67,44 @@ class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
}
def create_optimizer(init_lr,
num_train_steps,
num_warmup_steps,
end_lr=0.0,
optimizer_type='adamw'):
"""Creates an optimizer with learning rate schedule."""
# Implements linear decay of the learning rate.
lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
initial_learning_rate=init_lr,
decay_steps=num_train_steps,
end_learning_rate=end_lr)
if num_warmup_steps:
lr_schedule = WarmUp(
initial_learning_rate=init_lr,
decay_schedule_fn=lr_schedule,
warmup_steps=num_warmup_steps)
if optimizer_type == 'adamw':
logging.info('using Adamw optimizer')
optimizer = AdamWeightDecay(
learning_rate=lr_schedule,
weight_decay_rate=0.01,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-6,
exclude_from_weight_decay=['layer_norm', 'bias'])
elif optimizer_type == 'lamb':
logging.info('using Lamb optimizer')
optimizer = tfa_optimizers.LAMB(
learning_rate=lr_schedule,
weight_decay_rate=0.01,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-6,
exclude_from_weight_decay=['layer_norm', 'bias'])
else:
raise ValueError('Unsupported optimizer type: ', optimizer_type)
return optimizer
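# A minimal usage sketch of the extended factory (illustrative values only):
#
#   opt = create_optimizer(init_lr=2e-5, num_train_steps=10000,
#                          num_warmup_steps=1000, optimizer_type='lamb')
#
# The new end_lr and optimizer_type arguments default to the previous
# behaviour (decay to 0.0 and AdamWeightDecay), so existing callers are
# unchanged; 'lamb' selects the tensorflow_addons LAMB optimizer instead.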
@@ -109,8 +131,8 @@ class AdamWeightDecay(tf.keras.optimizers.Adam):
exclude_from_weight_decay=None,
name='AdamWeightDecay',
**kwargs):
super(AdamWeightDecay, self).__init__(learning_rate, beta_1, beta_2,
epsilon, amsgrad, name, **kwargs)
self.weight_decay_rate = weight_decay_rate
self._include_in_weight_decay = include_in_weight_decay
self._exclude_from_weight_decay = exclude_from_weight_decay
@@ -171,15 +193,15 @@ class AdamWeightDecay(tf.keras.optimizers.Adam):
lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
decay = self._decay_weights_op(var, lr_t, apply_state)
with tf.control_dependencies([decay]):
return super(AdamWeightDecay,
self)._resource_apply_dense(grad, var, **kwargs)
def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
decay = self._decay_weights_op(var, lr_t, apply_state)
with tf.control_dependencies([decay]):
return super(AdamWeightDecay,
self)._resource_apply_sparse(grad, var, indices, **kwargs)
def get_config(self):
config = super(AdamWeightDecay, self).get_config()
@@ -1019,7 +1019,8 @@ class Summarization(tf.keras.layers.Layer):
summary = inputs[0]
else:
raise ValueError('Invalid summary type provided: %s' % self.summary_type)
if self.use_proj:
summary = self.proj_layer(summary)
summary = self.dropout_layer(summary)
return summary
@@ -78,6 +78,7 @@ setup(
'official.r1*',
'official.pip_package*',
'official.benchmark*',
'official.colab*',
]),
exclude_package_data={
'': ['*_test.py',],
# Legacy Models Collection
![No Maintenance Intended](https://img.shields.io/badge/No%20Maintenance%20Intended-%E2%9C%95-red.svg)
![TensorFlow Requirement: 1.x](https://img.shields.io/badge/TensorFlow%20Requirement-1.x-brightgreen)
![TensorFlow 2 Not Supported](https://img.shields.io/badge/TensorFlow%202%20Not%20Supported-%E2%9C%95-red.svg)
The R1 folder contains legacy model implementations and models that will not
be updated to TensorFlow 2.x. They do not have solid performance tracking.
**Note: We will remove this r1 folder from the master branch in June, 2020.**
After removal, you will still be able to access legacy models in the previous
releases (e.g., [v2.1.0](https://github.com/tensorflow/models/releases/tag/v2.1.0)).
## Legacy model implementation
The Transformer and MNIST implementations use the pure TF 1.x TF-Estimator API.
Users should follow the corresponding TF 2.x implementations inside the
official model garden.
## Models that will not be updated to TensorFlow 2.x
| Model | Description | Reference |
| ----- | ----------- | --------- |
| [Gradient Boosted Trees](boosted_trees) | A gradient boosted trees model to classify Higgs boson processes from the HIGGS dataset | [Link](https://en.wikipedia.org/wiki/Gradient_boosting) |
| [MNIST](mnist) | A basic model to classify digits from the MNIST dataset | [Link](http://yann.lecun.com/exdb/mnist/) |
| [NCF](ncf) | NCF Estimator implementation | [arXiv:1708.05031](https://arxiv.org/abs/1708.05031) |
| [ResNet](resnet) | A deep residual network for image recognition | [arXiv:1512.03385](https://arxiv.org/abs/1512.03385) |
| [Transformer](transformer) | A transformer model to translate the WMT English to German dataset | [arXiv:1706.03762](https://arxiv.org/abs/1706.03762) |
| [Wide & Deep Learning](wide_deep) | A model that combines a wide linear model and deep neural network for recommender systems | [arXiv:1606.07792](https://arxiv.org/abs/1606.07792) |
![No Maintenance Intended](https://img.shields.io/badge/No%20Maintenance%20Intended-%E2%9C%95-red.svg)
![TensorFlow Requirement: 1.x](https://img.shields.io/badge/TensorFlow%20Requirement-1.x-brightgreen)
![TensorFlow 2 Not Supported](https://img.shields.io/badge/TensorFlow%202%20Not%20Supported-%E2%9C%95-red.svg)
# Classifying Higgs boson processes in the HIGGS Data Set
## Overview
The [HIGGS Data Set](https://archive.ics.uci.edu/ml/datasets/HIGGS) contains 11 million samples with 28 features, and is used for the classification problem of distinguishing between a signal process that produces Higgs bosons and a background process that does not.
![No Maintenance Intended](https://img.shields.io/badge/No%20Maintenance%20Intended-%E2%9C%95-red.svg)
![TensorFlow Requirement: 1.x](https://img.shields.io/badge/TensorFlow%20Requirement-1.x-brightgreen)
![TensorFlow 2 Not Supported](https://img.shields.io/badge/TensorFlow%202%20Not%20Supported-%E2%9C%95-red.svg)
# MNIST in TensorFlow
This directory builds a convolutional neural net to classify the [MNIST dataset](http://yann.lecun.com/exdb/mnist/).
![No Maintenance Intended](https://img.shields.io/badge/No%20Maintenance%20Intended-%E2%9C%95-red.svg)
![TensorFlow Requirement: 1.x](https://img.shields.io/badge/TensorFlow%20Requirement-1.x-brightgreen)
![TensorFlow 2 Not Supported](https://img.shields.io/badge/TensorFlow%202%20Not%20Supported-%E2%9C%95-red.svg)
# NCF Estimator implementation
An NCF framework to train and evaluate the NeuMF model.