"tutorials/models/vscode:/vscode.git/clone" did not exist on "870857cfba32a4a81111d16bd0212c076b883a37"
Commit cf80ed4e authored by anivegesana

Merge branch 'purdue-yolo' of https://github.com/tensorflow/models into detection_generator_pr_2

parents 394cefcc 461b3587
@@ -37,7 +37,6 @@ class DualEncoderTest(keras_parameterized.TestCase):
         vocab_size=vocab_size,
         num_layers=2,
         hidden_size=hidden_size,
-        sequence_length=sequence_length,
         dict_outputs=True)
     # Create a dual encoder model with the created network.
@@ -72,11 +71,9 @@ class DualEncoderTest(keras_parameterized.TestCase):
   @parameterized.parameters((192, 'logits'), (768, 'predictions'))
   def test_dual_encoder_tensor_call(self, hidden_size, output):
     """Validate that the Keras object can be invoked."""
-    # Build a transformer network to use within the dual encoder model. (Here,
-    # we use a short sequence_length for convenience.)
+    # Build a transformer network to use within the dual encoder model.
     sequence_length = 2
-    test_network = networks.BertEncoder(
-        vocab_size=100, num_layers=2, sequence_length=sequence_length)
+    test_network = networks.BertEncoder(vocab_size=100, num_layers=2)
     # Create a dual encoder model with the created network.
     dual_encoder_model = dual_encoder.DualEncoder(
@@ -98,18 +95,16 @@ class DualEncoderTest(keras_parameterized.TestCase):
   def test_serialize_deserialize(self):
     """Validate that the dual encoder model can be serialized / deserialized."""
-    # Build a transformer network to use within the dual encoder model. (Here,
-    # we use a short sequence_length for convenience.)
+    # Build a transformer network to use within the dual encoder model.
     sequence_length = 32
-    test_network = networks.BertEncoder(
-        vocab_size=100, num_layers=2, sequence_length=sequence_length)
+    test_network = networks.BertEncoder(vocab_size=100, num_layers=2)
     # Create a dual encoder model with the created network. (Note that all the
     # args are different, so we can catch any serialization mismatches.)
     dual_encoder_model = dual_encoder.DualEncoder(
         test_network, max_seq_length=sequence_length, output='predictions')
     # Create another dual encoder model via serialization and deserialization.
     config = dual_encoder_model.get_config()
     new_dual_encoder = dual_encoder.DualEncoder.from_config(config)
...
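For context, a minimal sketch of the call pattern these tests now exercise (module paths assumed from the `official.nlp.modeling` package): `BertEncoder` no longer accepts `sequence_length`, and the maximum length is supplied to `DualEncoder` via `max_seq_length`.

```python
from official.nlp.modeling import networks
from official.nlp.modeling.models import dual_encoder

# The encoder is built without a sequence_length argument ...
test_network = networks.BertEncoder(vocab_size=100, num_layers=2)

# ... and the maximum sequence length is passed to the dual encoder instead.
model = dual_encoder.DualEncoder(
    test_network, max_seq_length=32, output='predictions')
```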
@@ -100,7 +100,6 @@ class ElectraPretrainerTest(keras_parameterized.TestCase):
         discriminator_network=test_discriminator_network,
         vocab_size=100,
         num_classes=2,
-        sequence_length=3,
         num_token_predictions=2)
     # Create a set of 2-dimensional data tensors to feed into the model.
@@ -138,7 +137,6 @@ class ElectraPretrainerTest(keras_parameterized.TestCase):
         discriminator_network=test_discriminator_network,
         vocab_size=100,
         num_classes=2,
-        sequence_length=3,
         num_token_predictions=2)
     # Create another BERT trainer via serialization and deserialization.
...
@@ -171,6 +171,7 @@ class XLNetClassifier(tf.keras.Model):
       Defaults to a RandomNormal initializer.
     summary_type: Method used to summarize a sequence into a compact vector.
     dropout_rate: The dropout probability of the cls head.
+    head_name: Name of the classification head.
   """

   def __init__(
@@ -180,6 +181,7 @@ class XLNetClassifier(tf.keras.Model):
       initializer: tf.keras.initializers.Initializer = 'random_normal',
       summary_type: str = 'last',
       dropout_rate: float = 0.1,
+      head_name: str = 'sentence_prediction',
       **kwargs):
     super().__init__(**kwargs)
     self._network = network
@@ -192,6 +194,7 @@ class XLNetClassifier(tf.keras.Model):
         'num_classes': num_classes,
         'summary_type': summary_type,
         'dropout_rate': dropout_rate,
+        'head_name': head_name,
     }

     if summary_type == 'last':
@@ -207,7 +210,7 @@ class XLNetClassifier(tf.keras.Model):
         initializer=initializer,
         dropout_rate=dropout_rate,
         cls_token_idx=cls_token_idx,
-        name='sentence_prediction')
+        name=head_name)

   def call(self, inputs: Mapping[str, Any]):
     input_ids = inputs['input_word_ids']
...
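A hedged usage sketch of the new argument; the encoder object is hypothetical and only parameters visible in the diff above are exercised, so treat the exact keyword names as assumptions rather than the definitive signature.

```python
# `xlnet_network` is a hypothetical, separately built XLNet encoder network.
classifier = XLNetClassifier(
    network=xlnet_network,
    num_classes=2,
    initializer=tf.keras.initializers.RandomNormal(stddev=0.02),
    summary_type='last',
    dropout_rate=0.1,
    head_name='sentiment_prediction')  # previously hard-coded to 'sentence_prediction'
```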
@@ -15,6 +15,8 @@
 """Transformer-based text encoder network."""
 # pylint: disable=g-classes-have-attributes
 import collections
+
+from absl import logging
 import tensorflow as tf

 from official.modeling import activations
@@ -47,8 +49,6 @@ class BertEncoder(keras_nlp.encoders.BertEncoder):
     num_layers: The number of transformer layers.
     num_attention_heads: The number of attention heads for each transformer. The
       hidden size must be divisible by the number of attention heads.
-    sequence_length: [Deprecated]. TODO(hongkuny): remove this argument once no
-      user is using it.
     max_sequence_length: The maximum sequence length that this encoder can
       consume. If None, max_sequence_length uses the value from sequence length.
       This determines the variable shape for positional embeddings.
@@ -87,7 +87,6 @@ class BertEncoder(keras_nlp.encoders.BertEncoder):
               hidden_size=768,
               num_layers=12,
               num_attention_heads=12,
-              sequence_length=None,
               max_sequence_length=512,
               type_vocab_size=16,
               intermediate_size=3072,
@@ -126,6 +125,11 @@ class BertEncoder(keras_nlp.encoders.BertEncoder):
         embedding_width=embedding_width,
         embedding_layer=embedding_layer,
         norm_first=norm_first)
+    if 'sequence_length' in kwargs:
+      kwargs.pop('sequence_length')
+      logging.warning('`sequence_length` is a deprecated argument to '
+                      '`BertEncoder`, which has no effect for a while. Please '
+                      'remove `sequence_length` argument.')
     self._embedding_layer_instance = embedding_layer
...
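The net effect for callers, as a sketch (assuming the `BertEncoder` patched above is the one exposed as `networks.BertEncoder`): the deprecated keyword is swallowed with a warning rather than raising an error, so old call sites keep working while they migrate.

```python
# Deprecated keyword: accepted, popped from **kwargs, and a warning is logged.
encoder = networks.BertEncoder(
    vocab_size=30522, num_layers=12, sequence_length=128)

# Equivalent, preferred call after this change.
encoder = networks.BertEncoder(vocab_size=30522, num_layers=12)
```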
@@ -458,7 +458,6 @@ def get_nhnet_layers(params: configs.NHNetConfig):
       activation=tf_utils.get_activation(bert_config.hidden_act),
       dropout_rate=bert_config.hidden_dropout_prob,
       attention_dropout_rate=bert_config.attention_probs_dropout_prob,
-      sequence_length=None,
       max_sequence_length=bert_config.max_position_embeddings,
       type_vocab_size=bert_config.type_vocab_size,
       initializer=tf.keras.initializers.TruncatedNormal(
...
@@ -29,17 +29,16 @@
 import timeit
 import traceback
 import typing

+from absl import logging
 import numpy as np
-import six
 from six.moves import queue
 import tensorflow as tf

-from absl import logging
-from tensorflow.python.tpu.datasets import StreamingFilesDataset
-
 from official.recommendation import constants as rconst
 from official.recommendation import movielens
 from official.recommendation import popen_helper
 from official.recommendation import stat_utils
+from tensorflow.python.tpu.datasets import StreamingFilesDataset

 SUMMARY_TEMPLATE = """General:
 {spacer}Num users: {num_users}
@@ -119,6 +118,7 @@ class DatasetManager(object):
     """Convert NumPy arrays into a TFRecords entry."""

     def create_int_feature(values):
+      values = np.squeeze(values)
       return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

     feature_dict = {
...
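The new `np.squeeze` call makes the helper tolerant of arrays that arrive with extra singleton dimensions. A small, self-contained sketch of the same pattern:

```python
import numpy as np
import tensorflow as tf

def create_int_feature(values):
  # Drop singleton dimensions so, e.g., shape (1, 3) behaves like shape (3,).
  values = np.squeeze(values)
  return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

feature = create_int_feature(np.array([[1, 2, 3]]))  # int64_list value: [1, 2, 3]
```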
@@ -23,21 +23,19 @@
 import os
 import pickle
 import time
 import timeit
+import typing
+from typing import Dict, Text, Tuple

-# pylint: disable=wrong-import-order
 from absl import logging
 import numpy as np
 import pandas as pd
 import tensorflow as tf
-import typing
-from typing import Dict, Text, Tuple
-# pylint: enable=wrong-import-order

 from official.recommendation import constants as rconst
 from official.recommendation import data_pipeline
 from official.recommendation import movielens

 _EXPECTED_CACHE_KEYS = (rconst.TRAIN_USER_KEY, rconst.TRAIN_ITEM_KEY,
                         rconst.EVAL_USER_KEY, rconst.EVAL_ITEM_KEY,
                         rconst.USER_MAP, rconst.ITEM_MAP)
@@ -196,7 +194,7 @@ def _filter_index_sort(raw_rating_path: Text,
     logging.info("Writing raw data cache.")
     with tf.io.gfile.GFile(cache_path, "wb") as f:
-      pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
+      pickle.dump(data, f, protocol=4)

   # TODO(robieta): MLPerf cache clear.
   return data, valid_cache
...
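A note on the pickle change: `pickle.HIGHEST_PROTOCOL` depends on the interpreter writing the cache (protocol 5 on Python 3.8+), so a cache written by a newer Python may not load on an older one. Pinning protocol 4, available since Python 3.4, keeps the cache portable. An illustrative sketch (path and payload are made up):

```python
import pickle

data = {"train_users": [0, 1, 2], "train_items": [10, 11, 12]}
with open("/tmp/raw_data_cache.pickle", "wb") as f:
  pickle.dump(data, f, protocol=4)  # readable by any Python 3.4+ interpreter
```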
# TF-Vision Model Garden

## Introduction

The TF-Vision modeling library provides a collection of baselines and checkpoints for image classification, object detection, and segmentation.

## Image Classification

### ImageNet Baselines

#### ResNet models trained with vanilla settings

* Models are trained from scratch with batch size 4096 and 1.6 initial learning rate.
* Linear warmup is applied for the first 5 epochs.
* Models are trained with l2 weight regularization and ReLU activation.

| Model | Resolution | Epochs | Top-1 | Top-5 | Download |
| ------------ |:-------------:|--------:|--------:|--------:|---------:|
| ResNet-50 | 224x224 | 90 | 76.1 | 92.9 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnet50_tpu.yaml) |
| ResNet-50 | 224x224 | 200 | 77.1 | 93.5 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnet50_tpu.yaml) |
| ResNet-101 | 224x224 | 200 | 78.3 | 94.2 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnet101_tpu.yaml) |
| ResNet-152 | 224x224 | 200 | 78.7 | 94.3 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnet152_tpu.yaml) |
#### ResNet-RS models trained with various settings

We support state-of-the-art [ResNet-RS](https://arxiv.org/abs/2103.07579) image classification models with features:

* ResNet-RS architectural changes and Swish activation. (Note that ResNet-RS adopts ReLU activation in the paper.)
* Regularization methods including Random Augment, 4e-5 weight decay, stochastic depth, label smoothing and dropout.
* New training methods including a 350-epoch schedule, cosine learning rate and EMA.
* Configs are in this [directory](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification).

| Model | Resolution | Params (M) | Top-1 | Top-5 | Download |
| --------- | :--------: | ---------: | ----: | ----: | --------:|
| ResNet-RS-50 | 160x160 | 35.7 | 79.1 | 94.5 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs50_i160.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-50-i160.tar.gz) |
| ResNet-RS-101 | 160x160 | 63.7 | 80.2 | 94.9 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs101_i160.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-101-i160.tar.gz) |
| ResNet-RS-101 | 192x192 | 63.7 | 81.3 | 95.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs101_i192.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-101-i192.tar.gz) |
| ResNet-RS-152 | 192x192 | 86.8 | 81.9 | 95.8 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i192.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-152-i192.tar.gz) |
| ResNet-RS-152 | 224x224 | 86.8 | 82.5 | 96.1 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i224.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-152-i224.tar.gz) |
| ResNet-RS-152 | 256x256 | 86.8 | 83.1 | 96.3 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-152-i256.tar.gz) |
| ResNet-RS-200 | 256x256 | 93.4 | 83.5 | 96.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs200_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-200-i256.tar.gz) |
| ResNet-RS-270 | 256x256 | 130.1 | 83.6 | 96.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs270_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-270-i256.tar.gz) |
| ResNet-RS-350 | 256x256 | 164.3 | 83.7 | 96.7 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs350_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-350-i256.tar.gz) |
| ResNet-RS-350 | 320x320 | 164.3 | 84.2 | 96.9 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs420_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-350-i320.tar.gz) |
## Object Detection and Instance Segmentation

### Common Settings and Notes

* We provide models based on two detection frameworks, [RetinaNet](https://arxiv.org/abs/1708.02002) or [Mask R-CNN](https://arxiv.org/abs/1703.06870), and two backbones, [ResNet-FPN](https://arxiv.org/abs/1612.03144) or [SpineNet](https://arxiv.org/abs/1912.05027).
* Models are all trained on COCO train2017 and evaluated on COCO val2017.
* Training details:
  * Models finetuned from ImageNet pretrained checkpoints adopt the 12 or 36 epochs schedule. Models trained from scratch adopt the 350 epochs schedule.
  * The default training data augmentation implements horizontal flipping and scale jittering with a random scale between [0.5, 2.0].
  * Unless noted, all models are trained with l2 weight regularization and ReLU activation.
  * We use batch size 256 and a stepwise learning rate that decays at the last 30 and 10 epochs.
  * We use square images as input by resizing the long side of an image to the target size and then padding the short side with zeros (see the sketch below).
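The resize-and-pad preprocessing in the last bullet can be reproduced with stock TensorFlow ops; this is an illustrative sketch, not the exact parser pipeline used in this repository.

```python
import tensorflow as tf

# Scale the longer side to the target size (preserving aspect ratio) and
# zero-pad the shorter side so the model always sees a square input.
image = tf.random.uniform([480, 640, 3])             # H x W x C, dummy image
square = tf.image.resize_with_pad(image, 640, 640)   # -> shape [640, 640, 3]
```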
### COCO Object Detection Baselines

#### RetinaNet (ImageNet pretrained)

| Backbone | Resolution | Epochs | FLOPs (B) | Params (M) | Box AP | Download |
| ------------ |:-------------:| -------:|--------------:|-----------:|-------:|---------:|
| R50-FPN | 640x640 | 12 | 97.0 | 34.0 | 34.3 | config |
| R50-FPN | 640x640 | 72 | 97.0 | 34.0 | 36.8 | config \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/retinanet/retinanet-resnet50fpn.tar.gz) |

#### RetinaNet (Trained from scratch) with training features including:

* Stochastic depth with drop rate 0.2.
* Swish activation.

| Backbone | Resolution | Epochs | FLOPs (B) | Params (M) | Box AP | Download |
| ------------ |:-------------:| -------:|--------------:|-----------:|--------:|---------:|
| SpineNet-49 | 640x640 | 500 | 85.4 | 28.5 | 44.2 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/retinanet/coco_spinenet49_tpu.yaml) \| [TB.dev](https://tensorboard.dev/experiment/n2UN83TkTdyKZn3slCWulg/#scalars&_smoothingWeight=0) |
| SpineNet-96 | 1024x1024 | 500 | 265.4 | 43.0 | 48.5 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/retinanet/coco_spinenet96_tpu.yaml) \| [TB.dev](https://tensorboard.dev/experiment/n2UN83TkTdyKZn3slCWulg/#scalars&_smoothingWeight=0) |
| SpineNet-143 | 1280x1280 | 500 | 524.0 | 67.0 | 50.0 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/retinanet/coco_spinenet143_tpu.yaml) \| [TB.dev](https://tensorboard.dev/experiment/n2UN83TkTdyKZn3slCWulg/#scalars&_smoothingWeight=0) |

#### Mobile-size RetinaNet (Trained from scratch):

| Backbone | Resolution | Epochs | FLOPs (B) | Params (M) | Box AP | Download |
| ----------- | :--------: | -----: | --------: | ---------: | -----: | --------:|
| MobileNetv2 | 256x256 | 600 | - | 2.27 | 23.5 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/retinanet/coco_mobilenetv2_tpu.yaml) |
| Mobile SpineNet-49 | 384x384 | 600 | 1.0 | 2.32 | 28.1 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/retinanet/coco_spinenet49_mobile_tpu.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/retinanet/spinenet49mobile.tar.gz) |

### Instance Segmentation Baselines

#### Mask R-CNN (ImageNet pretrained)

#### Mask R-CNN (Trained from scratch)

| Backbone | Resolution | Epochs | FLOPs (B) | Params (M) | Box AP | Mask AP | Download |
| ------------ |:-------------:| -------:|-----------:|-----------:|-------:|--------:|---------:|
| SpineNet-49 | 640x640 | 350 | 215.7 | 40.8 | 42.6 | 37.9 | config |
## Semantic Segmentation
* We support [DeepLabV3](https://arxiv.org/pdf/1706.05587.pdf) and [DeepLabV3+](https://arxiv.org/pdf/1802.02611.pdf) architectures, with Dilated ResNet backbones.
* Backbones are pre-trained on ImageNet.
### PASCAL-VOC
| Model | Backbone | Resolution | Steps | mIoU | Download |
| ---------- | :----------------: | :--------: | ----: | ---: | --------:|
| DeepLabV3 | Dilated Resnet-101 | 512x512 | 30k | 78.7 | |
| DeepLabV3+ | Dilated Resnet-101 | 512x512 | 30k | 79.2 | |
### CITYSCAPES
| Model | Backbone | Resolution | Steps | mIoU | Download |
| ---------- | :----------------: | :--------: | ----: | ----: | --------:|
| DeepLabV3+ | Dilated Resnet-101 | 1024x2048 | 90k | 78.79 | |
## Video Classification

### Common Settings and Notes

* We provide models for video classification with two backbones: [SlowOnly](https://arxiv.org/abs/1812.03982) and 3D-ResNet (R3D) used in [Spatiotemporal Contrastive Video Representation Learning](https://arxiv.org/abs/2008.03800).
* Training and evaluation details:
  * All models are trained from scratch with vision modality (RGB) for 200 epochs.
  * We use batch size of 1024 and cosine learning rate decay with linear warmup in the first 5 epochs.
  * We follow [SlowFast](https://arxiv.org/abs/1812.03982) to perform 30-view evaluation.

### Kinetics-400 Action Recognition Baselines

| Model | Input (frame x stride) | Top-1 | Top-5 | Download |
| -------- |:----------------------:|--------:|--------:|---------:|
| SlowOnly | 8 x 8 | 74.1 | 91.4 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/video_classification/k400_slowonly8x8_tpu.yaml) |
| SlowOnly | 16 x 4 | 75.6 | 92.1 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/video_classification/k400_slowonly16x4_tpu.yaml) |
| R3D-50 | 32 x 2 | 77.0 | 93.0 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/video_classification/k400_3d-resnet50_tpu.yaml) |

### Kinetics-600 Action Recognition Baselines

| Model | Input (frame x stride) | Top-1 | Top-5 | Download |
| -------- |:----------------------:|--------:|--------:|---------:|
| SlowOnly | 8 x 8 | 77.3 | 93.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/video_classification/k600_slowonly8x8_tpu.yaml) |
| R3D-50 | 32 x 2 | 79.5 | 94.8 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/video_classification/k600_3d-resnet50_tpu.yaml) |
# --experiment_type=retinanet_mobile_coco
# COCO AP 23.5%
runtime:
  distribution_strategy: 'tpu'
  mixed_precision_dtype: 'bfloat16'
task:
  losses:
    l2_weight_decay: 3.0e-05
  model:
    anchor:
      anchor_size: 3
      aspect_ratios: [0.5, 1.0, 2.0]
      num_scales: 3
    backbone:
      mobilenet:
        model_id: 'MobileNetV2'
        filter_size_scale: 1.0
      type: 'mobilenet'
    decoder:
      type: 'fpn'
      fpn:
        num_filters: 128
        use_separable_conv: true
    head:
      num_convs: 4
      num_filters: 128
      use_separable_conv: true
    input_size: [256, 256, 3]
    max_level: 7
    min_level: 3
    norm_activation:
      activation: 'relu6'
      norm_epsilon: 0.001
      norm_momentum: 0.99
      use_sync_bn: true
  train_data:
    dtype: 'bfloat16'
    global_batch_size: 256
    is_training: true
    parser:
      aug_rand_hflip: true
      aug_scale_max: 2.0
      aug_scale_min: 0.5
  validation_data:
    dtype: 'bfloat16'
    global_batch_size: 8
    is_training: false
trainer:
  optimizer_config:
    learning_rate:
      stepwise:
        boundaries: [263340, 272580]
        values: [0.32, 0.032, 0.0032]
      type: 'stepwise'
    warmup:
      linear:
        warmup_learning_rate: 0.0067
        warmup_steps: 2000
  steps_per_loop: 462
  train_steps: 277200
  validation_interval: 462
  validation_steps: 625
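As a cross-check on the schedule above (a back-of-the-envelope sketch, assuming COCO train2017's 118,287 images and the 600-epoch, batch-256 recipe from the README), the stepwise boundaries fall exactly 30 and 10 epochs before the end of training; this also appears to be the rationale for the 265650 -> 263340 boundary fix in the config hunks below.

```python
steps_per_epoch = 118287 // 256                     # ~462 steps per epoch on COCO train2017
train_steps = 600 * steps_per_epoch                 # 277200
first_decay = train_steps - 30 * steps_per_epoch    # 263340: decay 30 epochs before the end
second_decay = train_steps - 10 * steps_per_epoch   # 272580: decay 10 epochs before the end
```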
-# --experiment_type=retinanet_spinenet_mobile_coco
+# --experiment_type=retinanet_mobile_coco
 runtime:
   distribution_strategy: 'tpu'
   mixed_precision_dtype: 'bfloat16'
@@ -47,7 +47,7 @@ trainer:
   optimizer_config:
     learning_rate:
       stepwise:
-        boundaries: [265650, 272580]
+        boundaries: [263340, 272580]
         values: [0.32, 0.032, 0.0032]
       type: 'stepwise'
     warmup:
...
-# --experiment_type=retinanet_spinenet_mobile_coco
+# --experiment_type=retinanet_mobile_coco
 runtime:
   distribution_strategy: 'tpu'
   mixed_precision_dtype: 'bfloat16'
@@ -47,7 +47,7 @@ trainer:
   optimizer_config:
     learning_rate:
       stepwise:
-        boundaries: [265650, 272580]
+        boundaries: [263340, 272580]
         values: [0.32, 0.032, 0.0032]
       type: 'stepwise'
     warmup:
...
-# --experiment_type=retinanet_spinenet_mobile_coco
+# --experiment_type=retinanet_mobile_coco
 runtime:
   distribution_strategy: 'tpu'
   mixed_precision_dtype: 'bfloat16'
@@ -47,7 +47,7 @@ trainer:
   optimizer_config:
     learning_rate:
       stepwise:
-        boundaries: [265650, 272580]
+        boundaries: [263340, 272580]
         values: [0.32, 0.032, 0.0032]
       type: 'stepwise'
     warmup:
...
@@ -15,11 +15,10 @@
 # Lint as: python3
 """Mask R-CNN configuration definition."""

+import dataclasses
 import os
 from typing import List, Optional

-import dataclasses
 from official.core import config_definitions as cfg
 from official.core import exp_factory
 from official.modeling import hyperparams
@@ -79,6 +78,8 @@ class DataConfig(cfg.DataConfig):
   shuffle_buffer_size: int = 10000
   file_type: str = 'tfrecord'
   drop_remainder: bool = True
+  # Number of examples in the data set, it's used to create the annotation file.
+  num_examples: int = -1


 @dataclasses.dataclass
...
@@ -326,9 +326,9 @@ def retinanet_spinenet_coco() -> cfg.ExperimentConfig:
   return config


-@exp_factory.register_config_factory('retinanet_spinenet_mobile_coco')
+@exp_factory.register_config_factory('retinanet_mobile_coco')
 def retinanet_spinenet_mobile_coco() -> cfg.ExperimentConfig:
-  """COCO object detection with RetinaNet using Mobile SpineNet backbone."""
+  """COCO object detection with mobile RetinaNet."""
   train_batch_size = 256
   eval_batch_size = 8
   steps_per_epoch = COCO_TRAIN_EXAMPLES // train_batch_size
@@ -407,8 +407,6 @@ def retinanet_spinenet_mobile_coco() -> cfg.ExperimentConfig:
       restrictions=[
           'task.train_data.is_training != None',
           'task.validation_data.is_training != None',
-          'task.model.min_level == task.model.backbone.spinenet_mobile.min_level',
-          'task.model.max_level == task.model.backbone.spinenet_mobile.max_level',
       ])
   return config
@@ -28,7 +28,7 @@ class RetinaNetConfigTest(tf.test.TestCase, parameterized.TestCase):
   @parameterized.parameters(
       ('retinanet_resnetfpn_coco',),
       ('retinanet_spinenet_coco',),
-      ('retinanet_spinenet_mobile_coco',),
+      ('retinanet_mobile_coco',),
   )
   def test_retinanet_configs(self, config_name):
     config = exp_factory.get_exp_config(config_name)
...
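From a caller's perspective the rename looks like this; a sketch only, assuming the RetinaNet config module has been imported so its factory functions are registered.

```python
from official.core import exp_factory
from official.vision.beta.configs import retinanet  # registers the config factories

# New experiment name; 'retinanet_spinenet_mobile_coco' no longer resolves.
config = exp_factory.get_exp_config('retinanet_mobile_coco')
```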
@@ -18,6 +18,7 @@
 import copy
 import json

 # Import libraries
 from absl import logging
 import numpy as np
 from PIL import Image
@@ -26,6 +27,7 @@
 from pycocotools import mask as mask_api
 import six
 import tensorflow as tf

+from official.common import dataset_fn
 from official.vision.beta.dataloaders import tf_example_decoder
 from official.vision.beta.ops import box_ops
 from official.vision.beta.ops import mask_ops
@@ -240,10 +242,20 @@ def convert_groundtruths_to_coco_dataset(groundtruths, label_map=None):
             (boxes[j, k, 3] - boxes[j, k, 1]) *
             (boxes[j, k, 2] - boxes[j, k, 0]))
         if 'masks' in groundtruths:
-          mask = Image.open(six.BytesIO(groundtruths['masks'][i][j, k]))
-          width, height = mask.size
-          np_mask = (
-              np.array(mask.getdata()).reshape(height, width).astype(np.uint8))
+          if isinstance(groundtruths['masks'][i][j, k], tf.Tensor):
+            mask = Image.open(
+                six.BytesIO(groundtruths['masks'][i][j, k].numpy()))
+            width, height = mask.size
+            np_mask = (
+                np.array(mask.getdata()).reshape(height,
+                                                 width).astype(np.uint8))
+          else:
+            mask = Image.open(
+                six.BytesIO(groundtruths['masks'][i][j, k]))
+            width, height = mask.size
+            np_mask = (
+                np.array(mask.getdata()).reshape(height,
+                                                 width).astype(np.uint8))
           np_mask[np_mask > 0] = 255
           encoded_mask = mask_api.encode(np.asfortranarray(np_mask))
           ann['segmentation'] = encoded_mask
@@ -271,11 +283,11 @@ def convert_groundtruths_to_coco_dataset(groundtruths, label_map=None):
 class COCOGroundtruthGenerator:
   """Generates the groundtruth annotations from a single example."""

-  def __init__(self, file_pattern, num_examples, include_mask):
+  def __init__(self, file_pattern, file_type, num_examples, include_mask):
     self._file_pattern = file_pattern
     self._num_examples = num_examples
     self._include_mask = include_mask
-    self._dataset_fn = tf.data.TFRecordDataset
+    self._dataset_fn = dataset_fn.pick_dataset_fn(file_type)

   def _parse_single_example(self, example):
     """Parses a single serialized tf.Example proto.
@@ -308,7 +320,7 @@ class COCOGroundtruthGenerator:
     boxes = box_ops.denormalize_boxes(
         decoded_tensors['groundtruth_boxes'], image_size)
     groundtruths = {
-        'source_id': tf.string_to_number(
+        'source_id': tf.strings.to_number(
             decoded_tensors['source_id'], out_type=tf.int64),
         'height': decoded_tensors['height'],
         'width': decoded_tensors['width'],
@@ -344,12 +356,13 @@ class COCOGroundtruthGenerator:

 def scan_and_generator_annotation_file(file_pattern: str,
+                                       file_type: str,
                                        num_samples: int,
                                        include_mask: bool,
                                        annotation_file: str):
   """Scans and generate the COCO-style annotation JSON file given a dataset."""
   groundtruth_generator = COCOGroundtruthGenerator(
-      file_pattern, num_samples, include_mask)
+      file_pattern, file_type, num_samples, include_mask)
   generate_annotation_file(groundtruth_generator, annotation_file)
...
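The `isinstance` branch above exists because the encoded mask may arrive either as an eager `tf.Tensor` of bytes or as raw Python bytes. A compact, illustrative helper (not the repository's code) capturing the same idea:

```python
import io
import numpy as np
import tensorflow as tf
from PIL import Image

def decode_png_mask(encoded_mask):
  """Decodes a PNG-encoded mask given either tf.Tensor bytes or raw bytes."""
  if isinstance(encoded_mask, tf.Tensor):
    encoded_mask = encoded_mask.numpy()
  mask = Image.open(io.BytesIO(encoded_mask))
  width, height = mask.size
  return np.array(mask.getdata()).reshape(height, width).astype(np.uint8)
```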
@@ -592,8 +592,9 @@ class MobileNet(tf.keras.Model):
     x, endpoints, next_endpoint_level = self._mobilenet_base(inputs=inputs)

-    endpoints[str(next_endpoint_level)] = x
     self._output_specs = {l: endpoints[l].get_shape() for l in endpoints}

+    # Don't include the final layer in `self._output_specs` to support decoders.
+    endpoints[str(next_endpoint_level)] = x
+
     super(MobileNet, self).__init__(
         inputs=inputs, outputs=endpoints, **kwargs)
...
@@ -130,7 +130,7 @@ class SpineNet(tf.keras.Model):
   def __init__(
       self,
       input_specs: tf.keras.layers.InputSpec = tf.keras.layers.InputSpec(
-          shape=[None, 640, 640, 3]),
+          shape=[None, None, None, 3]),
       min_level: int = 3,
       max_level: int = 7,
       block_specs: List[BlockSpec] = build_block_specs(),
@@ -214,8 +214,11 @@ class SpineNet(tf.keras.Model):
     inputs = tf.keras.Input(shape=input_specs.shape[1:])

     net = self._build_stem(inputs=inputs)
-    net = self._build_scale_permuted_network(
-        net=net, input_width=input_specs.shape[2])
+    input_width = input_specs.shape[2]
+    if input_width is None:
+      max_stride = max(map(lambda b: b.level, block_specs))
+      input_width = 2 ** max_stride
+    net = self._build_scale_permuted_network(net=net, input_width=input_width)
     endpoints = self._build_endpoints(net=net)

     self._output_specs = {l: endpoints[l].get_shape() for l in endpoints}
...
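A tiny worked example of the fallback above (illustrative numbers): when the input spec has no static width, the scale-permuted network is built against a nominal width of `2 ** max_stride`, where `max_stride` is the highest level among the block specs. The mobile variant below applies the same fallback.

```python
max_stride = 7                 # e.g. the deepest block spec sits at level 7
input_width = 2 ** max_stride  # 128: nominal width used for graph construction
```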
@@ -135,7 +135,7 @@ class SpineNetMobile(tf.keras.Model):
   def __init__(
       self,
       input_specs: tf.keras.layers.InputSpec = tf.keras.layers.InputSpec(
-          shape=[None, 512, 512, 3]),
+          shape=[None, None, None, 3]),
       min_level: int = 3,
       max_level: int = 7,
       block_specs: List[BlockSpec] = build_block_specs(),
@@ -219,8 +219,11 @@ class SpineNetMobile(tf.keras.Model):
     inputs = tf.keras.Input(shape=input_specs.shape[1:])

     net = self._build_stem(inputs=inputs)
-    net = self._build_scale_permuted_network(
-        net=net, input_width=input_specs.shape[2])
+    input_width = input_specs.shape[2]
+    if input_width is None:
+      max_stride = max(map(lambda b: b.level, block_specs))
+      input_width = 2 ** max_stride
+    net = self._build_scale_permuted_network(net=net, input_width=input_width)
     endpoints = self._build_endpoints(net=net)

     self._output_specs = {l: endpoints[l].get_shape() for l in endpoints}
...
@@ -13,12 +13,15 @@
 # limitations under the License.

 """Contains definitions of Atrous Spatial Pyramid Pooling (ASPP) decoder."""
-from typing import Any, List, Optional, Mapping
+from typing import Any, List, Mapping, Optional

 # Import libraries
 import tensorflow as tf

+from official.modeling import hyperparams
 from official.vision import keras_cv
+from official.vision.beta.modeling.decoders import factory


 @tf.keras.utils.register_keras_serializable(package='Vision')
@@ -128,3 +131,46 @@ class ASPP(tf.keras.layers.Layer):
   @classmethod
   def from_config(cls, config, custom_objects=None):
     return cls(**config)
+
+
+@factory.register_decoder_builder('aspp')
+def build_aspp_decoder(
+    input_specs: Mapping[str, tf.TensorShape],
+    model_config: hyperparams.Config,
+    l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None
+) -> tf.keras.Model:
+  """Builds ASPP decoder from a config.
+
+  Args:
+    input_specs: A `dict` of input specifications. A dictionary consists of
+      {level: TensorShape} from a backbone. Note this is for consistent
+      interface, and is not used by ASPP decoder.
+    model_config: A OneOfConfig. Model config.
+    l2_regularizer: A `tf.keras.regularizers.Regularizer` instance. Default to
+      None.
+
+  Returns:
+    A `tf.keras.Model` instance of the ASPP decoder.
+
+  Raises:
+    ValueError: If the model_config.decoder.type is not `aspp`.
+  """
+  del input_specs  # input_specs is not used by ASPP decoder.
+  decoder_type = model_config.decoder.type
+  decoder_cfg = model_config.decoder.get()
+  if decoder_type != 'aspp':
+    raise ValueError(f'Inconsistent decoder type {decoder_type}. '
+                     'Need to be `aspp`.')
+
+  norm_activation_config = model_config.norm_activation
+  return ASPP(
+      level=decoder_cfg.level,
+      dilation_rates=decoder_cfg.dilation_rates,
+      num_filters=decoder_cfg.num_filters,
+      pool_kernel_size=decoder_cfg.pool_kernel_size,
+      dropout_rate=decoder_cfg.dropout_rate,
+      use_sync_bn=norm_activation_config.use_sync_bn,
+      norm_momentum=norm_activation_config.norm_momentum,
+      norm_epsilon=norm_activation_config.norm_epsilon,
+      activation=norm_activation_config.activation,
+      kernel_regularizer=l2_regularizer)
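A hedged usage sketch of the new builder: `seg_model_config` stands for any `hyperparams.Config` carrying `decoder` and `norm_activation` fields (such as a semantic segmentation model config in this repository), and `backbone_output_specs` is an illustrative placeholder; neither name comes from the diff itself.

```python
decoder = build_aspp_decoder(
    input_specs=backbone_output_specs,   # accepted but ignored by the ASPP builder
    model_config=seg_model_config,       # seg_model_config.decoder.type == 'aspp'
    l2_regularizer=tf.keras.regularizers.L2(1e-4))
```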