Commit cf80ed4e authored by anivegesana

Merge branch 'purdue-yolo' of https://github.com/tensorflow/models into detection_generator_pr_2

parents 394cefcc 461b3587
......@@ -37,7 +37,6 @@ class DualEncoderTest(keras_parameterized.TestCase):
vocab_size=vocab_size,
num_layers=2,
hidden_size=hidden_size,
sequence_length=sequence_length,
dict_outputs=True)
# Create a dual encoder model with the created network.
......@@ -72,11 +71,9 @@ class DualEncoderTest(keras_parameterized.TestCase):
@parameterized.parameters((192, 'logits'), (768, 'predictions'))
def test_dual_encoder_tensor_call(self, hidden_size, output):
"""Validate that the Keras object can be invoked."""
# Build a transformer network to use within the dual encoder model. (Here,
# we use a short sequence_length for convenience.)
# Build a transformer network to use within the dual encoder model.
sequence_length = 2
test_network = networks.BertEncoder(
vocab_size=100, num_layers=2, sequence_length=sequence_length)
test_network = networks.BertEncoder(vocab_size=100, num_layers=2)
# Create a dual encoder model with the created network.
dual_encoder_model = dual_encoder.DualEncoder(
......@@ -98,18 +95,16 @@ class DualEncoderTest(keras_parameterized.TestCase):
def test_serialize_deserialize(self):
"""Validate that the dual encoder model can be serialized / deserialized."""
# Build a transformer network to use within the dual encoder model. (Here,
# we use a short sequence_length for convenience.)
# Build a transformer network to use within the dual encoder model.
sequence_length = 32
test_network = networks.BertEncoder(
vocab_size=100, num_layers=2, sequence_length=sequence_length)
test_network = networks.BertEncoder(vocab_size=100, num_layers=2)
# Create a dual encoder model with the created network. (Note that all the
# args are different, so we can catch any serialization mismatches.)
dual_encoder_model = dual_encoder.DualEncoder(
test_network, max_seq_length=sequence_length, output='predictions')
# Create another dual encoder model via serialization and deserialization.
config = dual_encoder_model.get_config()
new_dual_encoder = dual_encoder.DualEncoder.from_config(config)
......
......@@ -100,7 +100,6 @@ class ElectraPretrainerTest(keras_parameterized.TestCase):
discriminator_network=test_discriminator_network,
vocab_size=100,
num_classes=2,
sequence_length=3,
num_token_predictions=2)
# Create a set of 2-dimensional data tensors to feed into the model.
......@@ -138,7 +137,6 @@ class ElectraPretrainerTest(keras_parameterized.TestCase):
discriminator_network=test_discriminator_network,
vocab_size=100,
num_classes=2,
sequence_length=3,
num_token_predictions=2)
# Create another BERT trainer via serialization and deserialization.
......
......@@ -171,6 +171,7 @@ class XLNetClassifier(tf.keras.Model):
Defaults to a RandomNormal initializer.
summary_type: Method used to summarize a sequence into a compact vector.
dropout_rate: The dropout probability of the cls head.
head_name: Name of the classification head. Defaults to 'sentence_prediction'.
"""
def __init__(
......@@ -180,6 +181,7 @@ class XLNetClassifier(tf.keras.Model):
initializer: tf.keras.initializers.Initializer = 'random_normal',
summary_type: str = 'last',
dropout_rate: float = 0.1,
head_name: str = 'sentence_prediction',
**kwargs):
super().__init__(**kwargs)
self._network = network
......@@ -192,6 +194,7 @@ class XLNetClassifier(tf.keras.Model):
'num_classes': num_classes,
'summary_type': summary_type,
'dropout_rate': dropout_rate,
'head_name': head_name,
}
if summary_type == 'last':
......@@ -207,7 +210,7 @@ class XLNetClassifier(tf.keras.Model):
initializer=initializer,
dropout_rate=dropout_rate,
cls_token_idx=cls_token_idx,
name='sentence_prediction')
name=head_name)
def call(self, inputs: Mapping[str, Any]):
input_ids = inputs['input_word_ids']
......
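The `head_name` argument replaces the previously hard-coded `'sentence_prediction'` layer name, which matters when a model carries several heads or when checkpoint weights are matched by layer name. A minimal, hypothetical sketch of the updated construction (`xlnet_base` stands in for a configured XLNet backbone, and the exact keyword set is an assumption):

# Hypothetical sketch; `xlnet_base` is a placeholder for a built XLNet network.
classifier = models.XLNetClassifier(
    network=xlnet_base,
    num_classes=2,
    summary_type='last',
    dropout_rate=0.1,
    head_name='nli_head')  # previously fixed to 'sentence_prediction'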
......@@ -15,6 +15,8 @@
"""Transformer-based text encoder network."""
# pylint: disable=g-classes-have-attributes
import collections
from absl import logging
import tensorflow as tf
from official.modeling import activations
......@@ -47,8 +49,6 @@ class BertEncoder(keras_nlp.encoders.BertEncoder):
num_layers: The number of transformer layers.
num_attention_heads: The number of attention heads for each transformer. The
hidden size must be divisible by the number of attention heads.
sequence_length: [Deprecated]. TODO(hongkuny): remove this argument once no
user is using it.
max_sequence_length: The maximum sequence length that this encoder can
consume. If None, max_sequence_length uses the value from sequence length.
This determines the variable shape for positional embeddings.
......@@ -87,7 +87,6 @@ class BertEncoder(keras_nlp.encoders.BertEncoder):
hidden_size=768,
num_layers=12,
num_attention_heads=12,
sequence_length=None,
max_sequence_length=512,
type_vocab_size=16,
intermediate_size=3072,
......@@ -126,6 +125,11 @@ class BertEncoder(keras_nlp.encoders.BertEncoder):
embedding_width=embedding_width,
embedding_layer=embedding_layer,
norm_first=norm_first)
if 'sequence_length' in kwargs:
  kwargs.pop('sequence_length')
  logging.warning('`sequence_length` is a deprecated argument to '
                  '`BertEncoder` and has had no effect for some time. '
                  'Please remove it from your calls.')
self._embedding_layer_instance = embedding_layer
......
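Since the keyword is now only popped (with a warning) for backward compatibility, call sites simply drop it; the sequence length comes from the inputs at call time, bounded by `max_sequence_length`. A minimal sketch of the updated usage (vocab size, shapes, and the `[word_ids, mask, type_ids]` input ordering are illustrative assumptions):

import tensorflow as tf
from official.nlp.modeling import networks

# No `sequence_length` argument; positional embeddings are sized by
# `max_sequence_length` (default 512) and inputs may use any length up to it.
encoder = networks.BertEncoder(vocab_size=100, num_layers=2)

batch, seq_len = 2, 16
word_ids = tf.zeros((batch, seq_len), dtype=tf.int32)
mask = tf.ones((batch, seq_len), dtype=tf.int32)
type_ids = tf.zeros((batch, seq_len), dtype=tf.int32)
outputs = encoder([word_ids, mask, type_ids])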
......@@ -458,7 +458,6 @@ def get_nhnet_layers(params: configs.NHNetConfig):
activation=tf_utils.get_activation(bert_config.hidden_act),
dropout_rate=bert_config.hidden_dropout_prob,
attention_dropout_rate=bert_config.attention_probs_dropout_prob,
sequence_length=None,
max_sequence_length=bert_config.max_position_embeddings,
type_vocab_size=bert_config.type_vocab_size,
initializer=tf.keras.initializers.TruncatedNormal(
......
......@@ -29,17 +29,16 @@ import timeit
import traceback
import typing
from absl import logging
import numpy as np
import six
from six.moves import queue
import tensorflow as tf
from absl import logging
from tensorflow.python.tpu.datasets import StreamingFilesDataset
from official.recommendation import constants as rconst
from official.recommendation import movielens
from official.recommendation import popen_helper
from official.recommendation import stat_utils
from tensorflow.python.tpu.datasets import StreamingFilesDataset
SUMMARY_TEMPLATE = """General:
{spacer}Num users: {num_users}
......@@ -119,6 +118,7 @@ class DatasetManager(object):
"""Convert NumPy arrays into a TFRecords entry."""
def create_int_feature(values):
values = np.squeeze(values)
return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
feature_dict = {
......
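For context, the added `np.squeeze` lets callers hand `create_int_feature` a `(1, n)` batch slice as well as a flat array. A self-contained sketch of the same serialization pattern (the feature name and values are illustrative):

import numpy as np
import tensorflow as tf

def create_int_feature(values):
  # Squeeze so a (1, n) batch slice and a flat array serialize identically.
  values = np.squeeze(values)
  return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

example = tf.train.Example(features=tf.train.Features(feature={
    'item_ids': create_int_feature(np.array([[5, 17, 23]])),
}))
record_bytes = example.SerializeToString()  # ready to write to a TFRecord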
......@@ -23,21 +23,19 @@ import os
import pickle
import time
import timeit
# pylint: disable=wrong-import-order
import typing
from typing import Dict, Text, Tuple
from absl import logging
import numpy as np
import pandas as pd
import tensorflow as tf
import typing
from typing import Dict, Text, Tuple
# pylint: enable=wrong-import-order
from official.recommendation import constants as rconst
from official.recommendation import data_pipeline
from official.recommendation import movielens
_EXPECTED_CACHE_KEYS = (rconst.TRAIN_USER_KEY, rconst.TRAIN_ITEM_KEY,
rconst.EVAL_USER_KEY, rconst.EVAL_ITEM_KEY,
rconst.USER_MAP, rconst.ITEM_MAP)
......@@ -196,7 +194,7 @@ def _filter_index_sort(raw_rating_path: Text,
logging.info("Writing raw data cache.")
with tf.io.gfile.GFile(cache_path, "wb") as f:
pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
pickle.dump(data, f, protocol=4)
# TODO(robieta): MLPerf cache clear.
return data, valid_cache
......
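Pinning `protocol=4` rather than `pickle.HIGHEST_PROTOCOL` is presumably a portability fix: on Python 3.8+ the highest protocol is 5, which older interpreters cannot read, while protocol 4 is understood from Python 3.4 onward. A minimal sketch of the behavior (the cache path and payload are illustrative):

import pickle

data = {'num_users': 6040, 'num_items': 3706}

# Protocol 4 keeps the on-disk cache readable by any Python >= 3.4;
# HIGHEST_PROTOCOL would silently emit protocol 5 on Python 3.8+.
with open('/tmp/raw_data_cache.pickle', 'wb') as f:
  pickle.dump(data, f, protocol=4)

with open('/tmp/raw_data_cache.pickle', 'rb') as f:
  restored = pickle.load(f)
assert restored == data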
# TF Vision Model Garden
# TF-Vision Model Garden
## Introduction
TF Vision model garden provides a large collection of baselines and checkpoints for image classification, object detection, and instance segmentation.
TF-Vision modeling library for computer vision provides a collection of
baselines and checkpoints for image classification, object detection, and
segmentation.
## Image Classification
### ImageNet Baselines
#### ResNet models trained with vanilla settings:
* Models are trained from scratch with batch size 4096 and 1.6 initial learning rate.
#### ResNet models trained with vanilla settings
* Models are trained from scratch with batch size 4096 and 1.6 initial learning
rate.
* Linear warmup is applied for the first 5 epochs.
* Models are trained with l2 weight regularization and ReLU activation.
| model | resolution | epochs | Top-1 | Top-5 | download |
| ------------ |:-------------:|--------:|--------:|---------:|---------:|
| Model | Resolution | Epochs | Top-1 | Top-5 | Download |
| ------------ |:-------------:|--------:|--------:|--------:|---------:|
| ResNet-50 | 224x224 | 90 | 76.1 | 92.9 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnet50_tpu.yaml) |
| ResNet-50 | 224x224 | 200 | 77.1 | 93.5 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnet50_tpu.yaml) |
| ResNet-101 | 224x224 | 200 | 78.3 | 94.2 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnet101_tpu.yaml) |
| ResNet-152 | 224x224 | 200 | 78.7 | 94.3 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnet152_tpu.yaml) |
#### ResNet-RS models trained with settings including:
We support state-of-the-art [ResNet-RS](https://arxiv.org/abs/2103.07579) image classification models with features:
* ResNet-RS architectural changes and Swish activation. (Note that ResNet-RS
adopts ReLU activation in the paper.)
* Regularization methods including Random Augment, 4e-5 weight decay, stochastic depth, label smoothing and dropout.
* New training methods including a 350-epoch schedule, cosine learning rate and
EMA.
* Configs are in this [directory](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification).
model | resolution | params (M) | Top-1 | Top-5 | download
--------- | :--------: | -----: | ----: | ----: | -------:
ResNet-RS-50 | 160x160 | 35.7 | 79.1 | 94.5 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs50_i160.yaml) |
ResNet-RS-101 | 160x160 | 63.7 | 80.2 | 94.9 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs101_i160.yaml) |
ResNet-RS-101 | 192x192 | 63.7 | 81.3 | 95.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs101_i192.yaml) |
ResNet-RS-152 | 192x192 | 86.8 | 81.9 | 95.8 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i192.yaml) |
ResNet-RS-152 | 224x224 | 86.8 | 82.5 | 96.1 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i224.yaml) |
ResNet-RS-152 | 256x256 | 86.8 | 83.1 | 96.3 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i256.yaml) |
ResNet-RS-200 | 256x256 | 93.4 | 83.5 | 96.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs200_i256.yaml) |
ResNet-RS-270 | 256x256 | 130.1 | 83.6 | 96.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs270_i256.yaml) |
ResNet-RS-350 | 256x256 | 164.3 | 83.7 | 96.7 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs350_i256.yaml) |
ResNet-RS-350 | 320x320 | 164.3 | 84.2 | 96.9 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs420_i256.yaml) |
#### ResNet-RS models trained with various settings
We support state-of-the-art [ResNet-RS](https://arxiv.org/abs/2103.07579) image
classification models with features:
* ResNet-RS architectural changes and Swish activation. (Note that ResNet-RS
adopts ReLU activation in the paper.)
* Regularization methods including Random Augment, 4e-5 weight decay, stochastic
depth, label smoothing and dropout.
* New training methods including a 350-epoch schedule, cosine learning rate and
EMA.
* Configs are in this [directory](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification).
| Model | Resolution | Params (M) | Top-1 | Top-5 | Download |
| --------- | :--------: | ---------: | ----: | ----: | --------:|
| ResNet-RS-50 | 160x160 | 35.7 | 79.1 | 94.5 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs50_i160.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-50-i160.tar.gz) |
| ResNet-RS-101 | 160x160 | 63.7 | 80.2 | 94.9 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs101_i160.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-101-i160.tar.gz) |
| ResNet-RS-101 | 192x192 | 63.7 | 81.3 | 95.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs101_i192.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-101-i192.tar.gz) |
| ResNet-RS-152 | 192x192 | 86.8 | 81.9 | 95.8 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i192.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-152-i192.tar.gz) |
| ResNet-RS-152 | 224x224 | 86.8 | 82.5 | 96.1 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i224.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-152-i224.tar.gz) |
| ResNet-RS-152 | 256x256 | 86.8 | 83.1 | 96.3 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-152-i256.tar.gz) |
| ResNet-RS-200 | 256x256 | 93.4 | 83.5 | 96.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs200_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-200-i256.tar.gz) |
| ResNet-RS-270 | 256x256 | 130.1 | 83.6 | 96.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs270_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-270-i256.tar.gz) |
| ResNet-RS-350 | 256x256 | 164.3 | 83.7 | 96.7 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs350_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-350-i256.tar.gz) |
| ResNet-RS-350 | 320x320 | 164.3 | 84.2 | 96.9 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs420_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-350-i320.tar.gz) |
## Object Detection and Instance Segmentation
### Common Settings and Notes
* We provide models based on two detection frameworks, [RetinaNet](https://arxiv.org/abs/1708.02002) or [Mask R-CNN](https://arxiv.org/abs/1703.06870), and two backbones, [ResNet-FPN](https://arxiv.org/abs/1612.03144) or [SpineNet](https://arxiv.org/abs/1912.05027).
* We provide models based on two detection frameworks, [RetinaNet](https://arxiv.org/abs/1708.02002)
or [Mask R-CNN](https://arxiv.org/abs/1703.06870), and two backbones, [ResNet-FPN](https://arxiv.org/abs/1612.03144)
or [SpineNet](https://arxiv.org/abs/1912.05027).
* Models are all trained on COCO train2017 and evaluated on COCO val2017.
* Training details:
* Models finetuned from ImageNet pretrained checkpoints adopt the 12 or 36 epochs schedule. Models trained from scratch adopt the 350 epochs schedule.
* The default training data augmentation implements horizontal flipping and scale jittering with a random scale between [0.5, 2.0].
* Unless noted, all models are trained with l2 weight regularization and ReLU activation.
* We use batch size 256 and stepwise learning rate that decays at the last 30 and 10 epoch.
* We use square image as input by resizing the long side of an image to the target size then padding the short side with zeros.
* Models finetuned from ImageNet pretrained checkpoints adopt the 12 or 36
epoch schedule. Models trained from scratch adopt the 350 epoch schedule.
* The default training data augmentation implements horizontal flipping and
scale jittering with a random scale between [0.5, 2.0].
* Unless noted, all models are trained with l2 weight regularization and ReLU
activation.
* We use a batch size of 256 and a stepwise learning rate that decays at the
last 30 and 10 epochs.
* We use square images as input, resizing the long side of an image to the
target size and then padding the short side with zeros.
### COCO Object Detection Baselines
#### RetinaNet (ImageNet pretrained)
| backbone | resolution | epochs | FLOPs (B) | params (M) | box AP | download |
| ------------ |:-------------:| ---------:|-----------:|--------:|--------:|-----------:|
| R50-FPN | 640x640 | 12 | 97.0 | 34.0 | 34.3 | config|
| R50-FPN | 640x640 | 36 | 97.0 | 34.0 | 37.3 | config|
| Backbone | Resolution | Epochs | FLOPs (B) | Params (M) | Box AP | Download |
| ------------ |:-------------:| -------:|--------------:|-----------:|-------:|---------:|
| R50-FPN | 640x640 | 12 | 97.0 | 34.0 | 34.3 | config|
| R50-FPN | 640x640 | 72 | 97.0 | 34.0 | 36.8 | config \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/retinanet/retinanet-resnet50fpn.tar.gz) |
#### RetinaNet (Trained from scratch) with training features including:
* Stochastic depth with drop rate 0.2.
* Swish activation.
| backbone | resolution | epochs | FLOPs (B) | params (M) | box AP | download |
| ------------ |:-------------:| ---------:|-----------:|--------:|---------:|-----------:|
| Backbone | Resolution | Epochs | FLOPs (B) | Params (M) | Box AP | Download |
| ------------ |:-------------:| -------:|--------------:|-----------:|--------:|---------:|
| SpineNet-49 | 640x640 | 500 | 85.4| 28.5 | 44.2 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/retinanet/coco_spinenet49_tpu.yaml) \| [TB.dev](https://tensorboard.dev/experiment/n2UN83TkTdyKZn3slCWulg/#scalars&_smoothingWeight=0)|
| SpineNet-96 | 1024x1024 | 500 | 265.4 | 43.0 | 48.5 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/retinanet/coco_spinenet96_tpu.yaml) \| [TB.dev](https://tensorboard.dev/experiment/n2UN83TkTdyKZn3slCWulg/#scalars&_smoothingWeight=0)|
| SpineNet-143 | 1280x1280 | 500 | 524.0 | 67.0 | 50.0 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/retinanet/coco_spinenet143_tpu.yaml) \| [TB.dev](https://tensorboard.dev/experiment/n2UN83TkTdyKZn3slCWulg/#scalars&_smoothingWeight=0)|
#### Mobile-size RetinaNet (Trained from scratch):
backbone | resolution | epochs | FLOPs (B) | params (M) | box AP | download
------------ | :--------: | -----: | --------: | ---------: | -----: | -------:
Mobile SpineNet-49 | 384x384 | 600 | 1.0 | 2.32 | 28.1 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/retinanet/coco_spinenet49_mobile_tpu.yaml) |
| Backbone | Resolution | Epochs | FLOPs (B) | Params (M) | Box AP | Download |
| ----------- | :--------: | -----: | --------: | ---------: | -----: | --------:|
| MobileNetv2 | 256x256 | 600 | - | 2.27 | 23.5 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/retinanet/coco_mobilenetv2_tpu.yaml) |
| Mobile SpineNet-49 | 384x384 | 600 | 1.0 | 2.32 | 28.1 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/retinanet/coco_spinenet49_mobile_tpu.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/retinanet/spinenet49mobile.tar.gz) |
### Instance Segmentation Baselines
#### Mask R-CNN (ImageNet pretrained)
#### Mask R-CNN (Trained from scratch)
| backbone | resolution | epochs | FLOPs (B) | params (M) | box AP | mask AP | download |
| ------------ |:-------------:| ---------:|-----------:|--------:|--------:|-----------:|-----------:|
| SpineNet-49 | 640x640 | 350 | 215.7 | 40.8 | 42.6 | 37.9 | config |
| Backbone | Resolution | Epochs | FLOPs (B) | Params (M) | Box AP | Mask AP | Download |
| ------------ |:-------------:| -------:|-----------:|-----------:|-------:|--------:|---------:|
| SpineNet-49 | 640x640 | 350 | 215.7 | 40.8 | 42.6 | 37.9 | config |
## Semantic Segmentation
* We support [DeepLabV3](https://arxiv.org/pdf/1706.05587.pdf) and
[DeepLabV3+](https://arxiv.org/pdf/1802.02611.pdf) architectures, with
Dilated ResNet backbones.
* Backbones are pre-trained on ImageNet.
### PASCAL-VOC
| Model | Backbone | Resolution | Steps | mIoU | Download |
| ---------- | :----------------: | :--------: | ----: | ---: | --------:|
| DeepLabV3 | Dilated Resnet-101 | 512x512 | 30k | 78.7 | |
| DeepLabV3+ | Dilated Resnet-101 | 512x512 | 30k | 79.2 | |
### CITYSCAPES
| Model | Backbone | Resolution | Steps | mIoU | Download |
| ---------- | :----------------: | :--------: | ----: | ----: | --------:|
| DeepLabV3+ | Dilated Resnet-101 | 1024x2048 | 90k | 78.79 | |
## Video Classification
### Common Settings and Notes
* We provide models for video classification with two backbones: [SlowOnly](https://arxiv.org/abs/1812.03982) and 3D-ResNet (R3D) used in [Spatiotemporal Contrastive Video Representation Learning](https://arxiv.org/abs/2008.03800).
* We provide models for video classification with two backbones:
[SlowOnly](https://arxiv.org/abs/1812.03982) and 3D-ResNet (R3D) used in
[Spatiotemporal Contrastive Video Representation Learning](https://arxiv.org/abs/2008.03800).
* Training and evaluation details:
* All models are trained from scratch with vision modality (RGB) for 200 epochs.
* We use batch size of 1024 and cosine learning rate decay with linear warmup in first 5 epochs.
* We follow [SlowFast](https://arxiv.org/abs/1812.03982) to perform 30-view evaluation.
* All models are trained from scratch with the vision modality (RGB) for 200
epochs.
* We use a batch size of 1024 and cosine learning rate decay with linear
warmup in the first 5 epochs.
* We follow [SlowFast](https://arxiv.org/abs/1812.03982) to perform 30-view
evaluation.
### Kinetics-400 Action Recognition Baselines
| model | input (frame x stride) | Top-1 | Top-5 | download |
| Model | Input (frame x stride) | Top-1 | Top-5 | Download |
| -------- |:----------------------:|--------:|--------:|---------:|
| SlowOnly | 8 x 8 | 74.1 | 91.4 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/video_classification/k400_slowonly8x8_tpu.yaml) |
| SlowOnly | 16 x 4 | 75.6 | 92.1 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/video_classification/k400_slowonly16x4_tpu.yaml) |
| R3D-50 | 32 x 2 | 77.0 | 93.0 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/video_classification/k400_3d-resnet50_tpu.yaml) |
### Kinetics-600 Action Recognition Baselines
| model | input (frame x stride) | Top-1 | Top-5 | download |
| Model | Input (frame x stride) | Top-1 | Top-5 | Download |
| -------- |:----------------------:|--------:|--------:|---------:|
| SlowOnly | 8 x 8 | 77.3 | 93.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/video_classification/k600_slowonly8x8_tpu.yaml) |
| R3D-50 | 32 x 2 | 79.5 | 94.8 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/video_classification/k600_3d-resnet50_tpu.yaml) |
# --experiment_type=retinanet_mobile_coco
# COCO AP 23.5%
runtime:
distribution_strategy: 'tpu'
mixed_precision_dtype: 'bfloat16'
task:
losses:
l2_weight_decay: 3.0e-05
model:
anchor:
anchor_size: 3
aspect_ratios: [0.5, 1.0, 2.0]
num_scales: 3
backbone:
mobilenet:
model_id: 'MobileNetV2'
filter_size_scale: 1.0
type: 'mobilenet'
decoder:
type: 'fpn'
fpn:
num_filters: 128
use_separable_conv: true
head:
num_convs: 4
num_filters: 128
use_separable_conv: true
input_size: [256, 256, 3]
max_level: 7
min_level: 3
norm_activation:
activation: 'relu6'
norm_epsilon: 0.001
norm_momentum: 0.99
use_sync_bn: true
train_data:
dtype: 'bfloat16'
global_batch_size: 256
is_training: true
parser:
aug_rand_hflip: true
aug_scale_max: 2.0
aug_scale_min: 0.5
validation_data:
dtype: 'bfloat16'
global_batch_size: 8
is_training: false
trainer:
optimizer_config:
learning_rate:
stepwise:
boundaries: [263340, 272580]
values: [0.32, 0.032, 0.0032]
type: 'stepwise'
warmup:
linear:
warmup_learning_rate: 0.0067
warmup_steps: 2000
steps_per_loop: 462
train_steps: 277200
validation_interval: 462
validation_steps: 625
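The schedule constants in this file follow from the README's "decays at the last 30 and 10 epochs" rule: COCO train2017 has 118,287 examples, so at global batch size 256 one epoch is 462 steps, and the 600-epoch mobile RetinaNet run yields the step counts below (this also explains the corrected `boundaries` in the following configs). A quick arithmetic check:

# Sanity-check of the stepwise schedule above.
coco_train_examples = 118287   # COCO train2017
global_batch_size = 256
epochs = 600                   # mobile RetinaNet schedule from the README

steps_per_epoch = coco_train_examples // global_batch_size  # 462
train_steps = epochs * steps_per_epoch                      # 277200
boundaries = [(epochs - 30) * steps_per_epoch,              # 263340
              (epochs - 10) * steps_per_epoch]              # 272580
assert train_steps == 277200 and boundaries == [263340, 272580]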
# --experiment_type=retinanet_spinenet_mobile_coco
# --experiment_type=retinanet_mobile_coco
runtime:
distribution_strategy: 'tpu'
mixed_precision_dtype: 'bfloat16'
......@@ -47,7 +47,7 @@ trainer:
optimizer_config:
learning_rate:
stepwise:
boundaries: [265650, 272580]
boundaries: [263340, 272580]
values: [0.32, 0.032, 0.0032]
type: 'stepwise'
warmup:
......
# --experiment_type=retinanet_spinenet_mobile_coco
# --experiment_type=retinanet_mobile_coco
runtime:
distribution_strategy: 'tpu'
mixed_precision_dtype: 'bfloat16'
......@@ -47,7 +47,7 @@ trainer:
optimizer_config:
learning_rate:
stepwise:
boundaries: [265650, 272580]
boundaries: [263340, 272580]
values: [0.32, 0.032, 0.0032]
type: 'stepwise'
warmup:
......
# --experiment_type=retinanet_spinenet_mobile_coco
# --experiment_type=retinanet_mobile_coco
runtime:
distribution_strategy: 'tpu'
mixed_precision_dtype: 'bfloat16'
......@@ -47,7 +47,7 @@ trainer:
optimizer_config:
learning_rate:
stepwise:
boundaries: [265650, 272580]
boundaries: [263340, 272580]
values: [0.32, 0.032, 0.0032]
type: 'stepwise'
warmup:
......
......@@ -15,11 +15,10 @@
# Lint as: python3
"""Mask R-CNN configuration definition."""
import dataclasses
import os
from typing import List, Optional
import dataclasses
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import hyperparams
......@@ -79,6 +78,8 @@ class DataConfig(cfg.DataConfig):
shuffle_buffer_size: int = 10000
file_type: str = 'tfrecord'
drop_remainder: bool = True
# Number of examples in the dataset; used to create the annotation file.
num_examples: int = -1
@dataclasses.dataclass
......
......@@ -326,9 +326,9 @@ def retinanet_spinenet_coco() -> cfg.ExperimentConfig:
return config
@exp_factory.register_config_factory('retinanet_spinenet_mobile_coco')
@exp_factory.register_config_factory('retinanet_mobile_coco')
def retinanet_spinenet_mobile_coco() -> cfg.ExperimentConfig:
"""COCO object detection with RetinaNet using Mobile SpineNet backbone."""
"""COCO object detection with mobile RetinaNet."""
train_batch_size = 256
eval_batch_size = 8
steps_per_epoch = COCO_TRAIN_EXAMPLES // train_batch_size
......@@ -407,8 +407,6 @@ def retinanet_spinenet_mobile_coco() -> cfg.ExperimentConfig:
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None',
'task.model.min_level == task.model.backbone.spinenet_mobile.min_level',
'task.model.max_level == task.model.backbone.spinenet_mobile.max_level',
])
return config
......@@ -28,7 +28,7 @@ class RetinaNetConfigTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(
('retinanet_resnetfpn_coco',),
('retinanet_spinenet_coco',),
('retinanet_spinenet_mobile_coco',),
('retinanet_mobile_coco',),
)
def test_retinanet_configs(self, config_name):
config = exp_factory.get_exp_config(config_name)
......
......@@ -18,6 +18,7 @@ import copy
import json
# Import libraries
from absl import logging
import numpy as np
from PIL import Image
......@@ -26,6 +27,7 @@ from pycocotools import mask as mask_api
import six
import tensorflow as tf
from official.common import dataset_fn
from official.vision.beta.dataloaders import tf_example_decoder
from official.vision.beta.ops import box_ops
from official.vision.beta.ops import mask_ops
......@@ -240,10 +242,20 @@ def convert_groundtruths_to_coco_dataset(groundtruths, label_map=None):
(boxes[j, k, 3] - boxes[j, k, 1]) *
(boxes[j, k, 2] - boxes[j, k, 0]))
if 'masks' in groundtruths:
mask = Image.open(six.BytesIO(groundtruths['masks'][i][j, k]))
width, height = mask.size
np_mask = (
np.array(mask.getdata()).reshape(height, width).astype(np.uint8))
mask_bytes = groundtruths['masks'][i][j, k]
if isinstance(mask_bytes, tf.Tensor):
  # An eager tensor wraps the encoded mask bytes; unwrap before decoding.
  mask_bytes = mask_bytes.numpy()
mask = Image.open(six.BytesIO(mask_bytes))
width, height = mask.size
np_mask = (
    np.array(mask.getdata()).reshape(height, width).astype(np.uint8))
np_mask[np_mask > 0] = 255
encoded_mask = mask_api.encode(np.asfortranarray(np_mask))
ann['segmentation'] = encoded_mask
......@@ -271,11 +283,11 @@ def convert_groundtruths_to_coco_dataset(groundtruths, label_map=None):
class COCOGroundtruthGenerator:
"""Generates the groundtruth annotations from a single example."""
def __init__(self, file_pattern, num_examples, include_mask):
def __init__(self, file_pattern, file_type, num_examples, include_mask):
self._file_pattern = file_pattern
self._num_examples = num_examples
self._include_mask = include_mask
self._dataset_fn = tf.data.TFRecordDataset
self._dataset_fn = dataset_fn.pick_dataset_fn(file_type)
def _parse_single_example(self, example):
"""Parses a single serialized tf.Example proto.
......@@ -308,7 +320,7 @@ class COCOGroundtruthGenerator:
boxes = box_ops.denormalize_boxes(
decoded_tensors['groundtruth_boxes'], image_size)
groundtruths = {
'source_id': tf.string_to_number(
'source_id': tf.strings.to_number(
decoded_tensors['source_id'], out_type=tf.int64),
'height': decoded_tensors['height'],
'width': decoded_tensors['width'],
......@@ -344,12 +356,13 @@ class COCOGroundtruthGenerator:
def scan_and_generator_annotation_file(file_pattern: str,
file_type: str,
num_samples: int,
include_mask: bool,
annotation_file: str):
"""Scans and generate the COCO-style annotation JSON file given a dataset."""
groundtruth_generator = COCOGroundtruthGenerator(
file_pattern, num_samples, include_mask)
file_pattern, file_type, num_samples, include_mask)
generate_annotation_file(groundtruth_generator, annotation_file)
......
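With `file_type` now plumbed through to `dataset_fn.pick_dataset_fn`, the annotation generator is no longer hard-wired to `tf.data.TFRecordDataset`. A hedged usage sketch (the importing module path and file paths are assumptions):

# Hypothetical usage of the updated signature.
from official.vision.beta.evaluation import coco_utils

coco_utils.scan_and_generator_annotation_file(
    file_pattern='/data/coco/val*.tfrecord',
    file_type='tfrecord',  # new argument, resolved via pick_dataset_fn
    num_samples=5000,
    include_mask=True,
    annotation_file='/tmp/instances_val.json')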
......@@ -592,8 +592,9 @@ class MobileNet(tf.keras.Model):
x, endpoints, next_endpoint_level = self._mobilenet_base(inputs=inputs)
endpoints[str(next_endpoint_level)] = x
self._output_specs = {l: endpoints[l].get_shape() for l in endpoints}
# Don't include the final layer in `self._output_specs` to support decoders.
endpoints[str(next_endpoint_level)] = x
super(MobileNet, self).__init__(
inputs=inputs, outputs=endpoints, **kwargs)
......
......@@ -130,7 +130,7 @@ class SpineNet(tf.keras.Model):
def __init__(
self,
input_specs: tf.keras.layers.InputSpec = tf.keras.layers.InputSpec(
shape=[None, 640, 640, 3]),
shape=[None, None, None, 3]),
min_level: int = 3,
max_level: int = 7,
block_specs: List[BlockSpec] = build_block_specs(),
......@@ -214,8 +214,11 @@ class SpineNet(tf.keras.Model):
inputs = tf.keras.Input(shape=input_specs.shape[1:])
net = self._build_stem(inputs=inputs)
net = self._build_scale_permuted_network(
net=net, input_width=input_specs.shape[2])
input_width = input_specs.shape[2]
if input_width is None:
max_stride = max(map(lambda b: b.level, block_specs))
input_width = 2 ** max_stride
net = self._build_scale_permuted_network(net=net, input_width=input_width)
endpoints = self._build_endpoints(net=net)
self._output_specs = {l: endpoints[l].get_shape() for l in endpoints}
......
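The default input spec is now shape-agnostic, and when the input width is unknown at build time the scale-permuted network uses `2 ** max_stride` (the deepest block level) as its reference width. A minimal sketch of building the backbone without committing to an image size (module path assumed; the defaults build the standard SpineNet-49 block specs):

import tensorflow as tf
from official.vision.beta.modeling.backbones import spinenet

backbone = spinenet.SpineNet(
    input_specs=tf.keras.layers.InputSpec(shape=[None, None, None, 3]))
# The same graph now serves multiple resolutions.
endpoints = backbone(tf.ones([1, 640, 640, 3]))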
......@@ -135,7 +135,7 @@ class SpineNetMobile(tf.keras.Model):
def __init__(
self,
input_specs: tf.keras.layers.InputSpec = tf.keras.layers.InputSpec(
shape=[None, 512, 512, 3]),
shape=[None, None, None, 3]),
min_level: int = 3,
max_level: int = 7,
block_specs: List[BlockSpec] = build_block_specs(),
......@@ -219,8 +219,11 @@ class SpineNetMobile(tf.keras.Model):
inputs = tf.keras.Input(shape=input_specs.shape[1:])
net = self._build_stem(inputs=inputs)
net = self._build_scale_permuted_network(
net=net, input_width=input_specs.shape[2])
input_width = input_specs.shape[2]
if input_width is None:
max_stride = max(map(lambda b: b.level, block_specs))
input_width = 2 ** max_stride
net = self._build_scale_permuted_network(net=net, input_width=input_width)
endpoints = self._build_endpoints(net=net)
self._output_specs = {l: endpoints[l].get_shape() for l in endpoints}
......
......@@ -13,12 +13,15 @@
# limitations under the License.
"""Contains definitions of Atrous Spatial Pyramid Pooling (ASPP) decoder."""
from typing import Any, List, Optional, Mapping
from typing import Any, List, Mapping, Optional
# Import libraries
import tensorflow as tf
from official.modeling import hyperparams
from official.vision import keras_cv
from official.vision.beta.modeling.decoders import factory
@tf.keras.utils.register_keras_serializable(package='Vision')
......@@ -128,3 +131,46 @@ class ASPP(tf.keras.layers.Layer):
@classmethod
def from_config(cls, config, custom_objects=None):
return cls(**config)
@factory.register_decoder_builder('aspp')
def build_aspp_decoder(
input_specs: Mapping[str, tf.TensorShape],
model_config: hyperparams.Config,
l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None
) -> tf.keras.Model:
"""Builds ASPP decoder from a config.
Args:
input_specs: A `dict` of input specifications. A dictionary consists of
{level: TensorShape} from a backbone. Note this is here for interface
consistency and is not used by the ASPP decoder.
model_config: A OneOfConfig. Model config.
l2_regularizer: A `tf.keras.regularizers.Regularizer` instance. Default to
None.
Returns:
A `tf.keras.Model` instance of the ASPP decoder.
Raises:
ValueError: If the model_config.decoder.type is not `aspp`.
"""
del input_specs # input_specs is not used by ASPP decoder.
decoder_type = model_config.decoder.type
decoder_cfg = model_config.decoder.get()
if decoder_type != 'aspp':
raise ValueError(f'Inconsistent decoder type {decoder_type}. '
'Need to be `aspp`.')
norm_activation_config = model_config.norm_activation
return ASPP(
level=decoder_cfg.level,
dilation_rates=decoder_cfg.dilation_rates,
num_filters=decoder_cfg.num_filters,
pool_kernel_size=decoder_cfg.pool_kernel_size,
dropout_rate=decoder_cfg.dropout_rate,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon,
activation=norm_activation_config.activation,
kernel_regularizer=l2_regularizer)
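Registering the builder lets decoder construction go through the shared factory instead of instantiating `ASPP` directly. A hedged sketch of the round trip (`factory.build_decoder` and the config objects are assumptions inferred from the registration decorator):

# Hypothetical: `model_config` is a segmentation config whose decoder
# oneof is set to 'aspp'; `backbone` is an already-built backbone model.
decoder = factory.build_decoder(
    input_specs=backbone.output_specs,
    model_config=model_config,
    l2_regularizer=tf.keras.regularizers.l2(1e-4))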