Commit 36d6f41f authored by saberkun

Merge pull request #10514 from ZihanWangKi:master

PiperOrigin-RevId: 436548878
parents bc8b6332 259c4347
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Longformer attention layer. Modified From huggingface/transformers."""
import tensorflow as tf
from official.projects.longformer.longformer_attention import LongformerAttention
@tf.keras.utils.register_keras_serializable(package="Text")
class LongformerEncoderBlock(tf.keras.layers.Layer):
"""LongformerEncoderBlock.
Args:
num_attention_heads: Number of attention heads.
inner_dim: The output dimension of the first Dense layer in a two-layer
feedforward network.
inner_activation: The activation for the first Dense layer in a two-layer
feedforward network.
output_range: the sequence output range, [0, output_range) for slicing the
target sequence. `None` means the target sequence is not sliced.
kernel_initializer: Initializer for dense layer kernels.
bias_initializer: Initializer for dense layer biases.
kernel_regularizer: Regularizer for dense layer kernels.
bias_regularizer: Regularizer for dense layer biases.
activity_regularizer: Regularizer for dense layer activity.
kernel_constraint: Constraint for dense layer kernels.
bias_constraint: Constraint for dense layer kernels.
use_bias: Whether to enable use_bias in attention layer. If set False,
use_bias in attention layer is disabled.
norm_first: Whether to normalize inputs to attention and intermediate
dense layers. If set False, output of attention and intermediate dense
layers is normalized.
norm_epsilon: Epsilon value to initialize normalization layers.
output_dropout: Dropout probability for the post-attention and output
dropout.
attention_dropout: Dropout probability for within the attention layer.
inner_dropout: Dropout probability for the first Dense layer in a
two-layer feedforward network.
attention_initializer: Initializer for kernels of attention layers. If set
`None`, attention layers use kernel_initializer as initializer for
kernel.
attention_axes: axes over which the attention is applied. `None` means
attention over all axes, but batch, heads, and features.
**kwargs: keyword arguments/
"""
def __init__(
self,
global_attention_size,
num_attention_heads,
inner_dim,
inner_activation,
# Longformer
attention_window,
layer_id=0,
output_range=None,
kernel_initializer="glorot_uniform",
bias_initializer="zeros",
kernel_regularizer=None,
bias_regularizer=None,
activity_regularizer=None,
kernel_constraint=None,
bias_constraint=None,
use_bias=True,
norm_first=False,
norm_epsilon=1e-12,
output_dropout=0.0,
attention_dropout=0.0,
inner_dropout=0.0,
attention_initializer=None,
attention_axes=None,
**kwargs):
super().__init__(**kwargs)
self.global_attention_size = global_attention_size
self._num_heads = num_attention_heads
self._inner_dim = inner_dim
self._inner_activation = inner_activation
# Longformer
self._attention_window = attention_window
self._layer_id = layer_id
self._attention_dropout = attention_dropout
self._attention_dropout_rate = attention_dropout
self._output_dropout = output_dropout
self._output_dropout_rate = output_dropout
self._output_range = output_range
self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
self._bias_initializer = tf.keras.initializers.get(bias_initializer)
self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
self._activity_regularizer = tf.keras.regularizers.get(activity_regularizer)
self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
self._bias_constraint = tf.keras.constraints.get(bias_constraint)
self._use_bias = use_bias
self._norm_first = norm_first
self._norm_epsilon = norm_epsilon
self._inner_dropout = inner_dropout
if attention_initializer:
self._attention_initializer = tf.keras.initializers.get(
attention_initializer)
else:
self._attention_initializer = self._kernel_initializer
self._attention_axes = attention_axes
def build(self, input_shape):
if isinstance(input_shape, tf.TensorShape):
input_tensor_shape = input_shape
elif isinstance(input_shape, (list, tuple)):
input_tensor_shape = tf.TensorShape(input_shape[0])
else:
raise ValueError(
f"The type of input shape argument is not supported, got: "
f"{type(input_shape)}")
einsum_equation = "abc,cd->abd"
if len(input_tensor_shape.as_list()) > 3:
einsum_equation = "...bc,cd->...bd"
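    # "abc,cd->abd" projects the last (feature) axis:
    # [batch, seq, hidden] x [hidden, units] -> [batch, seq, units].
    # The "..." variant handles inputs with extra leading axes.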
hidden_size = input_tensor_shape[-1]
if hidden_size % self._num_heads != 0:
raise ValueError(
f"The input size ({hidden_size}) is not a multiple of the number of attention "
f"heads ({self._num_heads})")
self._attention_head_size = int(hidden_size // self._num_heads)
common_kwargs = dict(
bias_initializer=self._bias_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer,
activity_regularizer=self._activity_regularizer,
kernel_constraint=self._kernel_constraint,
bias_constraint=self._bias_constraint)
# TFLongformerSelfAttention + TFLongformerSelfOutput.dense
self._attention_layer = LongformerAttention(
# Longformer
layer_id=self._layer_id,
global_attention_size=self.global_attention_size,
attention_window=self._attention_window,
num_heads=self._num_heads,
key_dim=self._attention_head_size,
dropout=self._attention_dropout,
use_bias=self._use_bias,
kernel_initializer=self._attention_initializer,
attention_axes=self._attention_axes,
name="self_attention",
**common_kwargs)
# TFLongformerSelfOutput.dropout
self._attention_dropout = tf.keras.layers.Dropout(rate=self._output_dropout)
# Use float32 in layernorm for numeric stability.
# It is probably safe in mixed_float16, but we haven't validated this yet.
# TFLongformerSelfOutput.Layernorm
self._attention_layer_norm = (
tf.keras.layers.LayerNormalization(
name="self_attention_layer_norm",
axis=-1,
epsilon=self._norm_epsilon,
dtype=tf.float32))
# TFLongformerIntermediate
# TFLongformerIntermediate.dense
self._intermediate_dense = tf.keras.layers.experimental.EinsumDense(
einsum_equation,
output_shape=(None, self._inner_dim),
bias_axes="d",
kernel_initializer=self._kernel_initializer,
name="intermediate",
**common_kwargs)
policy = tf.keras.mixed_precision.global_policy()
if policy.name == "mixed_bfloat16":
# bfloat16 causes BERT with the LAMB optimizer to not converge
# as well, so we use float32.
# TODO(b/154538392): Investigate this.
policy = tf.float32
# TFLongformerIntermediate.intermediate_act_fn
self._intermediate_activation_layer = tf.keras.layers.Activation(
self._inner_activation, dtype=policy)
self._inner_dropout_layer = tf.keras.layers.Dropout(
rate=self._inner_dropout)
# TFLongformerOutput
# TFLongformerOutput.dense
self._output_dense = tf.keras.layers.experimental.EinsumDense(
einsum_equation,
output_shape=(None, hidden_size),
bias_axes="d",
name="output",
kernel_initializer=self._kernel_initializer,
**common_kwargs)
# TFLongformerOutput.dropout
self._output_dropout = tf.keras.layers.Dropout(rate=self._output_dropout)
# Use float32 in layernorm for numeric stability.
# TFLongformerOutput.layernorm
self._output_layer_norm = tf.keras.layers.LayerNormalization(
name="output_layer_norm",
axis=-1,
epsilon=self._norm_epsilon,
dtype=tf.float32)
super().build(input_shape)
def get_config(self):
    config = {
        "global_attention_size":
            self.global_attention_size,
        "attention_window":
            self._attention_window,
        "layer_id":
            self._layer_id,
        "num_attention_heads":
            self._num_heads,
"inner_dim":
self._inner_dim,
"inner_activation":
self._inner_activation,
"output_dropout":
self._output_dropout_rate,
"attention_dropout":
self._attention_dropout_rate,
"output_range":
self._output_range,
"kernel_initializer":
tf.keras.initializers.serialize(self._kernel_initializer),
"bias_initializer":
tf.keras.initializers.serialize(self._bias_initializer),
"kernel_regularizer":
tf.keras.regularizers.serialize(self._kernel_regularizer),
"bias_regularizer":
tf.keras.regularizers.serialize(self._bias_regularizer),
"activity_regularizer":
tf.keras.regularizers.serialize(self._activity_regularizer),
"kernel_constraint":
tf.keras.constraints.serialize(self._kernel_constraint),
"bias_constraint":
tf.keras.constraints.serialize(self._bias_constraint),
"use_bias":
self._use_bias,
"norm_first":
self._norm_first,
"norm_epsilon":
self._norm_epsilon,
"inner_dropout":
self._inner_dropout,
"attention_initializer":
tf.keras.initializers.serialize(self._attention_initializer),
"attention_axes":
self._attention_axes,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs):
"""Transformer self-attention encoder block call.
    Args:
      inputs: a single tensor or a list of tensors. `input tensor` as the
        single sequence of embeddings. [`input tensor`, `attention mask`,
        `is_index_masked`, `is_index_global_attn`] to provide the attention
        mask and the Longformer masked/global-attention index tensors.
        Separate query and key/value input streams are not supported.

    Returns:
      An output tensor with the same dimensions as the input tensor.
    """
if isinstance(inputs, (list, tuple)):
if len(inputs) == 4:
(
input_tensor,
attention_mask,
is_index_masked,
is_index_global_attn,
) = inputs
key_value = None
      elif len(inputs) == 5:
        raise ValueError(
            f"{self.__class__} does not support separate key/value inputs.")
      else:
        raise ValueError(
            f"Unexpected inputs to {self.__class__} with length {len(inputs)}.")
else:
input_tensor = inputs
attention_mask = None
is_index_masked = None
is_index_global_attn = None
key_value = None
if self._output_range:
if self._norm_first:
source_tensor = input_tensor[:, 0:self._output_range, :]
input_tensor = self._attention_layer_norm(input_tensor)
if key_value is not None:
key_value = self._attention_layer_norm(key_value)
target_tensor = input_tensor[:, 0:self._output_range, :]
if attention_mask is not None:
attention_mask = attention_mask[:, 0:self._output_range, :]
if is_index_masked is not None:
is_index_masked = is_index_masked[:, 0:self._output_range]
if is_index_global_attn is not None:
is_index_global_attn = is_index_global_attn[:, 0:self._output_range]
else:
if self._norm_first:
source_tensor = input_tensor
input_tensor = self._attention_layer_norm(input_tensor)
if key_value is not None:
key_value = self._attention_layer_norm(key_value)
target_tensor = input_tensor
if key_value is None:
key_value = input_tensor
attention_output = self._attention_layer(
hidden_states=target_tensor,
attention_mask=attention_mask,
is_index_masked=is_index_masked,
is_index_global_attn=is_index_global_attn,
)
# TFLongformerAttention.TFLongformerSelfOutput.* - {.dense}
attention_output = self._attention_dropout(attention_output)
if self._norm_first:
attention_output = source_tensor + attention_output
else:
attention_output = self._attention_layer_norm(target_tensor +
attention_output)
if self._norm_first:
source_attention_output = attention_output
attention_output = self._output_layer_norm(attention_output)
# TFLongformerIntermediate
inner_output = self._intermediate_dense(attention_output)
inner_output = self._intermediate_activation_layer(inner_output)
inner_output = self._inner_dropout_layer(inner_output)
# TFLongformerOutput
layer_output = self._output_dense(inner_output)
layer_output = self._output_dropout(layer_output)
if self._norm_first:
return source_attention_output + layer_output
# During mixed precision training, layer norm output is always fp32 for now.
# Casts fp32 for the subsequent add.
layer_output = tf.cast(layer_output, tf.float32)
return self._output_layer_norm(layer_output + attention_output)
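A minimal construction and config round-trip sketch (not part of the original commit) follows; it assumes the module path official.projects.longformer.longformer_encoder_block and illustrative hyperparameters that roughly mirror the unit tests. It stops short of calling the layer, since the attention-mask and global-attention tensors are produced elsewhere in the encoder.

# Sketch only: module path and hyperparameter values are assumptions.
from official.projects.longformer.longformer_encoder_block import LongformerEncoderBlock

block = LongformerEncoderBlock(
    global_attention_size=1,      # first token attends globally
    num_attention_heads=4,
    inner_dim=1024,
    inner_activation="gelu",
    attention_window=32,          # sliding-window size for this layer
    layer_id=0)

# The layer is registered as Keras-serializable, so its config round-trips.
config = block.get_config()
restored = LongformerEncoderBlock.from_config(config)
assert restored.get_config()["num_attention_heads"] == 4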
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for official.nlp.projects.longformer.longformer_encoder."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.distribute import combinations
from official.projects.longformer.longformer_encoder import LongformerEncoder
class LongformerEncoderTest(parameterized.TestCase, tf.test.TestCase):
def setUp(self):
super(LongformerEncoderTest, self).setUp()
np.random.seed(0)
tf.random.set_seed(0)
@combinations.generate(
combinations.combine(
attention_window=[32, 128], global_attention_size=[0, 1, 2]))
def test_encoder(self, attention_window, global_attention_size):
sequence_length = 128
batch_size = 2
vocab_size = 1024
hidden_size = 256
network = LongformerEncoder(
global_attention_size=global_attention_size,
vocab_size=vocab_size,
attention_window=[attention_window],
hidden_size=hidden_size,
num_layers=1,
num_attention_heads=4,
max_sequence_length=512)
word_id_data = np.random.randint(
vocab_size, size=(batch_size, sequence_length), dtype=np.int32)
mask_data = np.random.randint(
2, size=(batch_size, sequence_length), dtype=np.int32)
type_id_data = np.random.randint(
2, size=(batch_size, sequence_length), dtype=np.int32)
inputs = {
'input_word_ids': word_id_data,
'input_mask': mask_data,
'input_type_ids': type_id_data,
}
outputs = network(inputs)
self.assertEqual(outputs['sequence_output'].shape,
(batch_size, sequence_length, hidden_size))
@combinations.generate(
combinations.combine(
norm_first=[True, False], global_attention_size=[0, 1, 2]))
def test_norm_first(self, norm_first, global_attention_size):
sequence_length = 128
batch_size = 2
vocab_size = 1024
hidden_size = 256
network = LongformerEncoder(
global_attention_size=global_attention_size,
vocab_size=vocab_size,
attention_window=[32],
hidden_size=hidden_size,
num_layers=1,
num_attention_heads=4,
max_sequence_length=512,
norm_first=norm_first)
word_id_data = np.random.randint(
vocab_size, size=(batch_size, sequence_length), dtype=np.int32)
mask_data = np.random.randint(
2, size=(batch_size, sequence_length), dtype=np.int32)
type_id_data = np.random.randint(
2, size=(batch_size, sequence_length), dtype=np.int32)
inputs = {
'input_word_ids': word_id_data,
'input_mask': mask_data,
'input_type_ids': type_id_data,
}
outputs = network(inputs)
self.assertEqual(outputs['sequence_output'].shape,
(batch_size, sequence_length, hidden_size))
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Longformer experiments."""
# pylint: disable=g-doc-return-or-yield,line-too-long
import dataclasses
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import optimization
from official.nlp.configs import bert
from official.nlp.configs import encoders
from official.nlp.data import pretrain_dataloader
from official.nlp.data import sentence_prediction_dataloader
from official.nlp.tasks import masked_lm
from official.nlp.tasks import sentence_prediction
from official.projects.longformer.longformer import LongformerEncoderConfig
AdamWeightDecay = optimization.AdamWeightDecayConfig
PolynomialLr = optimization.PolynomialLrConfig
PolynomialWarmupConfig = optimization.PolynomialWarmupConfig
@dataclasses.dataclass
class LongformerOptimizationConfig(optimization.OptimizationConfig):
"""Longformer optimization configuration."""
optimizer: optimization.OptimizerConfig = optimization.OptimizerConfig(
type='adamw',
adamw=AdamWeightDecay(
weight_decay_rate=0.01,
exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'],
epsilon=1e-6))
learning_rate: optimization.LrConfig = optimization.LrConfig(
type='polynomial',
polynomial=PolynomialLr(
initial_learning_rate=1e-4,
decay_steps=1000000,
end_learning_rate=0.0))
warmup: optimization.WarmupConfig = optimization.WarmupConfig(
type='polynomial', polynomial=PolynomialWarmupConfig(warmup_steps=10000))
@exp_factory.register_config_factory('longformer/pretraining')
def longformer_pretraining() -> cfg.ExperimentConfig:
"""Longformer pretraining experiment."""
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(enable_xla=True),
task=masked_lm.MaskedLMConfig(
model=bert.PretrainerConfig(
encoder=encoders.EncoderConfig(
type='any', any=LongformerEncoderConfig()),
cls_heads=[
bert.ClsHeadConfig(
inner_dim=768,
num_classes=2,
dropout_rate=0.1,
name='next_sentence')
]),
train_data=pretrain_dataloader.BertPretrainDataConfig(
use_v2_feature_names=True),
validation_data=pretrain_dataloader.BertPretrainDataConfig(
use_v2_feature_names=True, is_training=False)),
trainer=cfg.TrainerConfig(
optimizer_config=LongformerOptimizationConfig(), train_steps=1000000),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
@exp_factory.register_config_factory('longformer/glue')
def longformer_glue() -> cfg.ExperimentConfig:
"""Longformer glue fine-tuning."""
config = cfg.ExperimentConfig(
task=sentence_prediction.SentencePredictionConfig(
model=sentence_prediction.ModelConfig(
encoder=encoders.EncoderConfig(
type='any', any=LongformerEncoderConfig())),
train_data=sentence_prediction_dataloader
.SentencePredictionDataConfig(),
validation_data=sentence_prediction_dataloader
.SentencePredictionDataConfig(
is_training=False, drop_remainder=False)),
trainer=cfg.TrainerConfig(
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'adamw',
'adamw': {
'weight_decay_rate':
0.01,
'exclude_from_weight_decay':
['LayerNorm', 'layer_norm', 'bias'],
}
},
'learning_rate': {
'type': 'polynomial',
'polynomial': {
'initial_learning_rate': 3e-5,
'end_learning_rate': 0.0,
}
},
'warmup': {
'type': 'polynomial'
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
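The two experiments registered above can be pulled back out of the experiment factory; the sketch below (not from the commit) shows a typical override, with a placeholder input path.

# Sketch: fetch the registered configs and override a couple of data fields.
from official.core import exp_factory

pretrain_cfg = exp_factory.get_exp_config('longformer/pretraining')
print(pretrain_cfg.trainer.train_steps)  # 1000000, as configured above

glue_cfg = exp_factory.get_exp_config('longformer/glue')
glue_cfg.task.train_data.input_path = '/tmp/longformer_glue_train.tf_record'  # placeholder
glue_cfg.task.train_data.global_batch_size = 32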
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A customized training library for the specific task."""
from absl import app
from absl import flags
import gin
from official.common import distribute_utils
from official.common import flags as tfm_flags
from official.core import task_factory
from official.core import train_lib
from official.core import train_utils
from official.modeling import performance
from official.projects.longformer import longformer_experiments # pylint: disable=unused-import
FLAGS = flags.FLAGS
def main(_):
gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
params = train_utils.parse_configuration(FLAGS)
model_dir = FLAGS.model_dir
if 'train' in FLAGS.mode:
# Pure eval modes do not output yaml files. Otherwise continuous eval job
# may race against the train job for writing the same file.
train_utils.serialize_config(params, model_dir)
  # Sets the mixed-precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
  # can speed up models significantly by using float16 on GPUs and bfloat16 on
  # TPUs. loss_scale takes effect only when the dtype is float16.
if params.runtime.mixed_precision_dtype:
performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype)
distribution_strategy = distribute_utils.get_distribution_strategy(
distribution_strategy=params.runtime.distribution_strategy,
all_reduce_alg=params.runtime.all_reduce_alg,
num_gpus=params.runtime.num_gpus,
tpu_address=params.runtime.tpu,
**params.runtime.model_parallelism())
with distribution_strategy.scope():
task = task_factory.get_task(params.task, logging_dir=model_dir)
train_lib.run_experiment(
distribution_strategy=distribution_strategy,
task=task,
mode=FLAGS.mode,
params=params,
model_dir=model_dir)
train_utils.save_gin_config(FLAGS.mode, model_dir)
if __name__ == '__main__':
tfm_flags.define_flags()
app.run(main)
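For reference, a typical invocation of this driver might look like the sketch below; the module path and file paths are placeholders, while the flag names come from official.common.flags.

# Sketch of a command line (module location and paths are assumptions):
#
#   python3 -m official.projects.longformer.train \
#     --experiment=longformer/glue \
#     --mode=train_and_eval \
#     --model_dir=/tmp/longformer_glue \
#     --params_override=task.train_data.input_path=/tmp/longformer_glue_train.tf_record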
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Converts pre-trained pytorch checkpoint into a tf encoder checkpoint."""
import os
from absl import app
import numpy as np
import tensorflow as tf
import transformers
from official.modeling import tf_utils
from official.projects.longformer.longformer import LongformerEncoderConfig
from official.projects.longformer.longformer_encoder import LongformerEncoder
def _get_pytorch_longformer_model():
pretrained_lm = "allenai/longformer-base-4096"
model = transformers.AutoModel.from_pretrained(pretrained_lm)
return {n: p.data.numpy() for n, p in model.named_parameters()}
def _create_longformer_model():
"""Creates a Longformer model."""
  encoder_cfg = LongformerEncoderConfig()
encoder_cfg.vocab_size = 50265
encoder_cfg.max_position_embeddings = 4098
encoder_cfg.attention_window = [2] * encoder_cfg.num_layers
encoder_cfg.global_attention_size = 1
encoder = LongformerEncoder(
attention_window=encoder_cfg.attention_window,
global_attention_size=encoder_cfg.global_attention_size,
vocab_size=encoder_cfg.vocab_size,
hidden_size=encoder_cfg.hidden_size,
num_layers=encoder_cfg.num_layers,
num_attention_heads=encoder_cfg.num_attention_heads,
inner_dim=encoder_cfg.intermediate_size,
inner_activation=tf_utils.get_activation(encoder_cfg.hidden_activation),
output_dropout=encoder_cfg.dropout_rate,
attention_dropout=encoder_cfg.attention_dropout_rate,
max_sequence_length=encoder_cfg.max_position_embeddings,
type_vocab_size=encoder_cfg.type_vocab_size,
initializer=tf.keras.initializers.TruncatedNormal(
stddev=encoder_cfg.initializer_range),
output_range=encoder_cfg.output_range,
embedding_width=encoder_cfg.embedding_size,
norm_first=encoder_cfg.norm_first)
return encoder
# pylint: disable=protected-access
def convert(encoder, allenai_model):
"""Convert AllenAI Longformer to the one in the codebase."""
num_layers = encoder._config["num_layers"]
num_attention_heads = encoder._config["num_attention_heads"]
hidden_size = encoder._config["hidden_size"]
head_size = hidden_size // num_attention_heads
assert head_size * num_attention_heads == hidden_size
encoder._embedding_layer.set_weights(
[allenai_model["embeddings.word_embeddings.weight"]])
encoder._embedding_norm_layer.set_weights([
allenai_model["embeddings.LayerNorm.weight"],
allenai_model["embeddings.LayerNorm.bias"]
])
encoder._type_embedding_layer.set_weights([
np.repeat(
allenai_model["embeddings.token_type_embeddings.weight"], 2, axis=0)
])
encoder._position_embedding_layer.set_weights(
[allenai_model["embeddings.position_embeddings.weight"]])
encoder._pooler_layer.set_weights([
allenai_model["pooler.dense.weight"], allenai_model["pooler.dense.bias"]
])
for layer_num in range(num_layers):
encoder._transformer_layers[
layer_num]._attention_layer._global_key_dense.set_weights([
allenai_model[
f"encoder.layer.{layer_num}.attention.self.key_global.weight"].T
.reshape(
(hidden_size, num_attention_heads, head_size)), allenai_model[
f"encoder.layer.{layer_num}.attention.self.key_global.bias"]
.reshape((num_attention_heads, head_size))
])
encoder._transformer_layers[
layer_num]._attention_layer._global_query_dense.set_weights([
allenai_model[
f"encoder.layer.{layer_num}.attention.self.query_global.weight"]
.T.reshape((hidden_size, num_attention_heads, head_size)),
allenai_model[
f"encoder.layer.{layer_num}.attention.self.query_global.bias"]
.reshape((num_attention_heads, head_size))
])
encoder._transformer_layers[
layer_num]._attention_layer._global_value_dense.set_weights([
allenai_model[
f"encoder.layer.{layer_num}.attention.self.value_global.weight"]
.T.reshape((hidden_size, num_attention_heads, head_size)),
allenai_model[
f"encoder.layer.{layer_num}.attention.self.value_global.bias"]
.reshape((num_attention_heads, head_size))
])
    encoder._transformer_layers[
        layer_num]._attention_layer._key_dense.set_weights([
            allenai_model[
                f"encoder.layer.{layer_num}.attention.self.key.weight"].T
            .reshape((hidden_size, num_attention_heads, head_size)),
            allenai_model[
                f"encoder.layer.{layer_num}.attention.self.key.bias"].reshape(
                    (num_attention_heads, head_size))
        ])
encoder._transformer_layers[
layer_num]._attention_layer._query_dense.set_weights([
allenai_model[
f"encoder.layer.{layer_num}.attention.self.query.weight"].T
.reshape((hidden_size, num_attention_heads, head_size)),
allenai_model[
f"encoder.layer.{layer_num}.attention.self.query.bias"].reshape(
(num_attention_heads, head_size))
])
encoder._transformer_layers[
layer_num]._attention_layer._value_dense.set_weights([
allenai_model[
f"encoder.layer.{layer_num}.attention.self.value.weight"].T
.reshape((hidden_size, num_attention_heads, head_size)),
allenai_model[
f"encoder.layer.{layer_num}.attention.self.value.bias"].reshape(
(num_attention_heads, head_size))
])
encoder._transformer_layers[
layer_num]._attention_layer._output_dense.set_weights([
allenai_model[
f"encoder.layer.{layer_num}.attention.output.dense.weight"].T,
allenai_model[
f"encoder.layer.{layer_num}.attention.output.dense.bias"]
])
encoder._transformer_layers[layer_num]._attention_layer_norm.set_weights([
allenai_model[
f"encoder.layer.{layer_num}.attention.output.LayerNorm.weight"],
allenai_model[
f"encoder.layer.{layer_num}.attention.output.LayerNorm.bias"]
])
encoder._transformer_layers[layer_num]._intermediate_dense.set_weights([
allenai_model[f"encoder.layer.{layer_num}.intermediate.dense.weight"].T,
allenai_model[f"encoder.layer.{layer_num}.intermediate.dense.bias"]
])
encoder._transformer_layers[layer_num]._output_dense.set_weights([
allenai_model[f"encoder.layer.{layer_num}.output.dense.weight"].T,
allenai_model[f"encoder.layer.{layer_num}.output.dense.bias"]
])
encoder._transformer_layers[layer_num]._output_layer_norm.set_weights([
allenai_model[f"encoder.layer.{layer_num}.output.LayerNorm.weight"],
allenai_model[f"encoder.layer.{layer_num}.output.LayerNorm.bias"]
])
def convert_checkpoint(output_path):
"""Converts and save the checkpoint."""
output_dir, _ = os.path.split(output_path)
tf.io.gfile.makedirs(output_dir)
encoder = _create_longformer_model()
allenai_model = _get_pytorch_longformer_model()
sequence_length = 128
batch_size = 2
word_id_data = np.random.randint(
10, size=(batch_size, sequence_length), dtype=np.int32)
mask_data = np.random.randint(
2, size=(batch_size, sequence_length), dtype=np.int32)
type_id_data = np.random.randint(
2, size=(batch_size, sequence_length), dtype=np.int32)
inputs = {
"input_word_ids": word_id_data,
"input_mask": mask_data,
"input_type_ids": type_id_data,
}
encoder(inputs)
convert(encoder, allenai_model)
tf.train.Checkpoint(encoder=encoder).write(output_path)
def main(_):
convert_checkpoint("longformer-4096/longformer")
if __name__ == "__main__":
app.run(main)
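The .T.reshape(...) calls in convert() map PyTorch [out_features, in_features] kernels onto the [hidden_size, num_heads, head_size] layout used by the per-head dense layers; the NumPy sketch below (illustrative sizes, not from the commit) shows the same transform in isolation.

# Illustrative reshape mirroring convert(); sizes are arbitrary.
import numpy as np

hidden_size, num_attention_heads = 8, 2
head_size = hidden_size // num_attention_heads

torch_kernel = np.arange(hidden_size * hidden_size).reshape(
    hidden_size, hidden_size)  # PyTorch layout: [out_features, in_features]
tf_kernel = torch_kernel.T.reshape(hidden_size, num_attention_heads, head_size)
torch_bias = np.arange(hidden_size)
tf_bias = torch_bias.reshape(num_attention_heads, head_size)

assert tf_kernel.shape == (hidden_size, num_attention_heads, head_size)
assert tf_bias.shape == (num_attention_heads, head_size)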
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert Longformer training examples to Tfrecord."""
import collections
import os
import datasets
import tensorflow as tf
import transformers
pretrained_lm = "allenai/longformer-base-4096"
task_name = "mnli"
save_path = "./"
raw_datasets = datasets.load_dataset("glue", task_name, cache_dir=None)
label_list = raw_datasets["train"].features["label"].names
num_labels = len(label_list)
tokenizer = transformers.AutoTokenizer.from_pretrained(
pretrained_lm,
use_fast=True,
)
task_to_keys = {
"cola": ("sentence", None),
"mnli": ("premise", "hypothesis"),
"mrpc": ("sentence1", "sentence2"),
"qnli": ("question", "sentence"),
"qqp": ("question1", "question2"),
"rte": ("sentence1", "sentence2"),
"sst2": ("sentence", None),
"stsb": ("sentence1", "sentence2"),
"wnli": ("sentence1", "sentence2"),
}
sentence1_key, sentence2_key = task_to_keys[task_name]
padding = "max_length"
# Make sure this matches the model's input sequence length.
max_seq_length = 512
def preprocess_function(examples):
# Tokenize the texts
args = ((examples[sentence1_key],) if sentence2_key is None else
(examples[sentence1_key], examples[sentence2_key]))
result = tokenizer(
*args, padding=padding, max_length=max_seq_length, truncation=True)
return result
raw_datasets = raw_datasets.map(
preprocess_function,
batched=True,
desc="Running tokenizer on dataset",
)
train_dataset = raw_datasets["train"]
eval_dataset = raw_datasets["validation_matched" if task_name ==
"mnli" else "validation"]
print("train_dataset", train_dataset[0])
print("eval_dataset", eval_dataset[0])
def file_based_convert_examples_to_features(examples, output_file):
"""Convert a set of `InputExample`s to a TFRecord file."""
tf.io.gfile.makedirs(os.path.dirname(output_file))
writer = tf.io.TFRecordWriter(output_file)
for ex_index, example in enumerate(examples):
if ex_index % 10000 == 0:
print(f"Writing example {ex_index} of {len(examples)}")
def create_int_feature(values):
f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
return f
features = collections.OrderedDict()
features["input_ids"] = create_int_feature(example["input_ids"])
features["input_mask"] = create_int_feature(example["attention_mask"])
features["segment_ids"] = create_int_feature([0] *
len(example["attention_mask"]))
features["label_ids"] = create_int_feature([example["label"]])
features["is_real_example"] = create_int_feature([1])
features["example_id"] = create_int_feature([example["idx"]])
tf_example = tf.train.Example(features=tf.train.Features(feature=features))
writer.write(tf_example.SerializeToString())
writer.close()
file_based_convert_examples_to_features(
train_dataset,
os.path.join(save_path,
f"{pretrained_lm.replace('/', '_')}_train.tf_record"))
file_based_convert_examples_to_features(
eval_dataset,
os.path.join(save_path,
f"{pretrained_lm.replace('/', '_')}_eval.tf_record"))
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 r"""Exports models to tf.saved_model.
 Export example:
...
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 r"""Training driver.
 To train:
...
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 """Tests for train.py."""
 import json
...
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 """Configs package definition."""
 from official.projects.pruning.configs import image_classification
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 """Image classification configuration definition."""
 import dataclasses
...
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 """Tests for image_classification."""
 # pylint: disable=unused-import
 from absl.testing import parameterized
...
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 """Modeling package definition."""
 from official.projects.pruning.tasks import image_classification
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 """Image classification task definition."""
 from absl import logging
 import tensorflow as tf
...
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 """Tests for image classification task."""
 # pylint: disable=unused-import
...
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 """Modeling package definition."""
 from official.projects.qat.vision.modeling import layers
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 """Layers package definition."""
 from official.projects.qat.vision.modeling.layers.nn_blocks import BottleneckBlockQuantized
...
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 """Tests for nn_blocks."""
 from typing import Any, Iterable, Tuple
...
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 """Configs package definition."""
 from official.projects.qat.vision.n_bit import configs
...
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 """Tests for nn_blocks."""
 from typing import Any, Iterable, Tuple
...