Commit cf80ed4e authored by anivegesana

Merge branch 'purdue-yolo' of https://github.com/tensorflow/models into detection_generator_pr_2

parents 394cefcc 461b3587
......@@ -37,7 +37,6 @@ class DualEncoderTest(keras_parameterized.TestCase):
vocab_size=vocab_size,
num_layers=2,
hidden_size=hidden_size,
sequence_length=sequence_length,
dict_outputs=True)
# Create a dual encoder model with the created network.
......@@ -72,11 +71,9 @@ class DualEncoderTest(keras_parameterized.TestCase):
@parameterized.parameters((192, 'logits'), (768, 'predictions'))
def test_dual_encoder_tensor_call(self, hidden_size, output):
"""Validate that the Keras object can be invoked."""
# Build a transformer network to use within the dual encoder model. (Here,
# we use a short sequence_length for convenience.)
# Build a transformer network to use within the dual encoder model.
sequence_length = 2
test_network = networks.BertEncoder(
vocab_size=100, num_layers=2, sequence_length=sequence_length)
test_network = networks.BertEncoder(vocab_size=100, num_layers=2)
# Create a dual encoder model with the created network.
dual_encoder_model = dual_encoder.DualEncoder(
......@@ -98,18 +95,16 @@ class DualEncoderTest(keras_parameterized.TestCase):
def test_serialize_deserialize(self):
"""Validate that the dual encoder model can be serialized / deserialized."""
# Build a transformer network to use within the dual encoder model. (Here,
# we use a short sequence_length for convenience.)
# Build a transformer network to use within the dual encoder model.
sequence_length = 32
test_network = networks.BertEncoder(
vocab_size=100, num_layers=2, sequence_length=sequence_length)
test_network = networks.BertEncoder(vocab_size=100, num_layers=2)
# Create a dual encoder model with the created network. (Note that all the
# args are different, so we can catch any serialization mismatches.)
dual_encoder_model = dual_encoder.DualEncoder(
test_network, max_seq_length=sequence_length, output='predictions')
# Create another dual encoder model via serialization and deserialization.
config = dual_encoder_model.get_config()
new_dual_encoder = dual_encoder.DualEncoder.from_config(config)
......
......@@ -100,7 +100,6 @@ class ElectraPretrainerTest(keras_parameterized.TestCase):
discriminator_network=test_discriminator_network,
vocab_size=100,
num_classes=2,
sequence_length=3,
num_token_predictions=2)
# Create a set of 2-dimensional data tensors to feed into the model.
......@@ -138,7 +137,6 @@ class ElectraPretrainerTest(keras_parameterized.TestCase):
discriminator_network=test_discriminator_network,
vocab_size=100,
num_classes=2,
sequence_length=3,
num_token_predictions=2)
# Create another BERT trainer via serialization and deserialization.
......
......@@ -171,6 +171,7 @@ class XLNetClassifier(tf.keras.Model):
Defaults to a RandomNormal initializer.
summary_type: Method used to summarize a sequence into a compact vector.
dropout_rate: The dropout probability of the cls head.
head_name: Name of the classification head. Defaults to 'sentence_prediction'.
"""
def __init__(
......@@ -180,6 +181,7 @@ class XLNetClassifier(tf.keras.Model):
initializer: tf.keras.initializers.Initializer = 'random_normal',
summary_type: str = 'last',
dropout_rate: float = 0.1,
head_name: str = 'sentence_prediction',
**kwargs):
super().__init__(**kwargs)
self._network = network
......@@ -192,6 +194,7 @@ class XLNetClassifier(tf.keras.Model):
'num_classes': num_classes,
'summary_type': summary_type,
'dropout_rate': dropout_rate,
'head_name': head_name,
}
if summary_type == 'last':
......@@ -207,7 +210,7 @@ class XLNetClassifier(tf.keras.Model):
initializer=initializer,
dropout_rate=dropout_rate,
cls_token_idx=cls_token_idx,
name='sentence_prediction')
name=head_name)
def call(self, inputs: Mapping[str, Any]):
input_ids = inputs['input_word_ids']
......
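The `head_name` argument replaces the previously hard-coded `'sentence_prediction'` layer name, which matters when a model carries several heads or when checkpoint weights are matched by layer name. A minimal, hypothetical sketch of the updated construction (`xlnet_base` stands in for a configured XLNet backbone, and the exact keyword set is an assumption):

# Hypothetical sketch; `xlnet_base` is a placeholder for a built XLNet network.
classifier = models.XLNetClassifier(
    network=xlnet_base,
    num_classes=2,
    summary_type='last',
    dropout_rate=0.1,
    head_name='nli_head')  # previously fixed to 'sentence_prediction'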
......@@ -15,6 +15,8 @@
"""Transformer-based text encoder network."""
# pylint: disable=g-classes-have-attributes
import collections
from absl import logging
import tensorflow as tf
from official.modeling import activations
......@@ -47,8 +49,6 @@ class BertEncoder(keras_nlp.encoders.BertEncoder):
num_layers: The number of transformer layers.
num_attention_heads: The number of attention heads for each transformer. The
hidden size must be divisible by the number of attention heads.
sequence_length: [Deprecated]. TODO(hongkuny): remove this argument once no
user is using it.
max_sequence_length: The maximum sequence length that this encoder can
consume. If None, max_sequence_length uses the value from sequence length.
This determines the variable shape for positional embeddings.
......@@ -87,7 +87,6 @@ class BertEncoder(keras_nlp.encoders.BertEncoder):
hidden_size=768,
num_layers=12,
num_attention_heads=12,
sequence_length=None,
max_sequence_length=512,
type_vocab_size=16,
intermediate_size=3072,
......@@ -126,6 +125,11 @@ class BertEncoder(keras_nlp.encoders.BertEncoder):
embedding_width=embedding_width,
embedding_layer=embedding_layer,
norm_first=norm_first)
if 'sequence_length' in kwargs:
  kwargs.pop('sequence_length')
  logging.warning('`sequence_length` is a deprecated argument to '
                  '`BertEncoder` and has had no effect for some time. '
                  'Please remove it from your calls.')
self._embedding_layer_instance = embedding_layer
......
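Since the keyword is now only popped (with a warning) for backward compatibility, call sites simply drop it; the sequence length comes from the inputs at call time, bounded by `max_sequence_length`. A minimal sketch of the updated usage (vocab size, shapes, and the `[word_ids, mask, type_ids]` input ordering are illustrative assumptions):

import tensorflow as tf
from official.nlp.modeling import networks

# No `sequence_length` argument; positional embeddings are sized by
# `max_sequence_length` (default 512) and inputs may use any length up to it.
encoder = networks.BertEncoder(vocab_size=100, num_layers=2)

batch, seq_len = 2, 16
word_ids = tf.zeros((batch, seq_len), dtype=tf.int32)
mask = tf.ones((batch, seq_len), dtype=tf.int32)
type_ids = tf.zeros((batch, seq_len), dtype=tf.int32)
outputs = encoder([word_ids, mask, type_ids])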
......@@ -458,7 +458,6 @@ def get_nhnet_layers(params: configs.NHNetConfig):
activation=tf_utils.get_activation(bert_config.hidden_act),
dropout_rate=bert_config.hidden_dropout_prob,
attention_dropout_rate=bert_config.attention_probs_dropout_prob,
sequence_length=None,
max_sequence_length=bert_config.max_position_embeddings,
type_vocab_size=bert_config.type_vocab_size,
initializer=tf.keras.initializers.TruncatedNormal(
......
......@@ -29,17 +29,16 @@ import timeit
import traceback
import typing
from absl import logging
import numpy as np
import six
from six.moves import queue
import tensorflow as tf
from absl import logging
from tensorflow.python.tpu.datasets import StreamingFilesDataset
from official.recommendation import constants as rconst
from official.recommendation import movielens
from official.recommendation import popen_helper
from official.recommendation import stat_utils
from tensorflow.python.tpu.datasets import StreamingFilesDataset
SUMMARY_TEMPLATE = """General:
{spacer}Num users: {num_users}
......@@ -119,6 +118,7 @@ class DatasetManager(object):
"""Convert NumPy arrays into a TFRecords entry."""
def create_int_feature(values):
values = np.squeeze(values)
return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
feature_dict = {
......
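For context, the added `np.squeeze` lets callers hand `create_int_feature` a `(1, n)` batch slice as well as a flat array. A self-contained sketch of the same serialization pattern (the feature name and values are illustrative):

import numpy as np
import tensorflow as tf

def create_int_feature(values):
  # Squeeze so a (1, n) batch slice and a flat array serialize identically.
  values = np.squeeze(values)
  return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

example = tf.train.Example(features=tf.train.Features(feature={
    'item_ids': create_int_feature(np.array([[5, 17, 23]])),
}))
record_bytes = example.SerializeToString()  # ready to write to a TFRecord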
......@@ -23,21 +23,19 @@ import os
import pickle
import time
import timeit
# pylint: disable=wrong-import-order
import typing
from typing import Dict, Text, Tuple
from absl import logging
import numpy as np
import pandas as pd
import tensorflow as tf
import typing
from typing import Dict, Text, Tuple
# pylint: enable=wrong-import-order
from official.recommendation import constants as rconst
from official.recommendation import data_pipeline
from official.recommendation import movielens
_EXPECTED_CACHE_KEYS = (rconst.TRAIN_USER_KEY, rconst.TRAIN_ITEM_KEY,
rconst.EVAL_USER_KEY, rconst.EVAL_ITEM_KEY,
rconst.USER_MAP, rconst.ITEM_MAP)
......@@ -196,7 +194,7 @@ def _filter_index_sort(raw_rating_path: Text,
logging.info("Writing raw data cache.")
with tf.io.gfile.GFile(cache_path, "wb") as f:
pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
pickle.dump(data, f, protocol=4)
# TODO(robieta): MLPerf cache clear.
return data, valid_cache
......
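Pinning `protocol=4` rather than `pickle.HIGHEST_PROTOCOL` is presumably a portability fix: on Python 3.8+ the highest protocol is 5, which older interpreters cannot read, while protocol 4 is understood from Python 3.4 onward. A minimal sketch of the behavior (the cache path and payload are illustrative):

import pickle

data = {'num_users': 6040, 'num_items': 3706}

# Protocol 4 keeps the on-disk cache readable by any Python >= 3.4;
# HIGHEST_PROTOCOL would silently emit protocol 5 on Python 3.8+.
with open('/tmp/raw_data_cache.pickle', 'wb') as f:
  pickle.dump(data, f, protocol=4)

with open('/tmp/raw_data_cache.pickle', 'rb') as f:
  restored = pickle.load(f)
assert restored == data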
# TF Vision Model Garden
# TF-Vision Model Garden
## Introduction
TF Vision model garden provides a large collection of baselines and checkpoints for image classification, object detection, and instance segmentation.
TF-Vision modeling library for computer vision provides a collection of
baselines and checkpoints for image classification, object detection, and
segmentation.
## Image Classification
### ImageNet Baselines
#### ResNet models trained with vanilla settings:
* Models are trained from scratch with batch size 4096 and 1.6 initial learning rate.
#### ResNet models trained with vanilla settings
* Models are trained from scratch with batch size 4096 and 1.6 initial learning
rate.
* Linear warmup is applied for the first 5 epochs.
* Models are trained with l2 weight regularization and ReLU activation.
| model | resolution | epochs | Top-1 | Top-5 | download |
| ------------ |:-------------:|--------:|--------:|---------:|---------:|
| Model | Resolution | Epochs | Top-1 | Top-5 | Download |
| ------------ |:-------------:|--------:|--------:|--------:|---------:|
| ResNet-50 | 224x224 | 90 | 76.1 | 92.9 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnet50_tpu.yaml) |
| ResNet-50 | 224x224 | 200 | 77.1 | 93.5 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnet50_tpu.yaml) |
| ResNet-101 | 224x224 | 200 | 78.3 | 94.2 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnet101_tpu.yaml) |
| ResNet-152 | 224x224 | 200 | 78.7 | 94.3 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnet152_tpu.yaml) |
#### ResNet-RS models trained with settings including:
We support state-of-the-art [ResNet-RS](https://arxiv.org/abs/2103.07579) image classification models with features:
* ResNet-RS architectural changes and Swish activation. (Note that ResNet-RS
adopts ReLU activation in the paper.)
* Regularization methods including Random Augment, 4e-5 weight decay, stochastic depth, label smoothing and dropout.
* New training methods including a 350-epoch schedule, cosine learning rate and
EMA.
* Configs are in this [directory](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification).
model | resolution | params (M) | Top-1 | Top-5 | download
--------- | :--------: | -----: | ----: | ----: | -------:
ResNet-RS-50 | 160x160 | 35.7 | 79.1 | 94.5 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs50_i160.yaml) |
ResNet-RS-101 | 160x160 | 63.7 | 80.2 | 94.9 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs101_i160.yaml) |
ResNet-RS-101 | 192x192 | 63.7 | 81.3 | 95.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs101_i192.yaml) |
ResNet-RS-152 | 192x192 | 86.8 | 81.9 | 95.8 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i192.yaml) |
ResNet-RS-152 | 224x224 | 86.8 | 82.5 | 96.1 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i224.yaml) |
ResNet-RS-152 | 256x256 | 86.8 | 83.1 | 96.3 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i256.yaml) |
ResNet-RS-200 | 256x256 | 93.4 | 83.5 | 96.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs200_i256.yaml) |
ResNet-RS-270 | 256x256 | 130.1 | 83.6 | 96.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs270_i256.yaml) |
ResNet-RS-350 | 256x256 | 164.3 | 83.7 | 96.7 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs350_i256.yaml) |
ResNet-RS-350 | 320x320 | 164.3 | 84.2 | 96.9 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs420_i256.yaml) |
#### ResNet-RS models trained with various settings
We support state-of-the-art [ResNet-RS](https://arxiv.org/abs/2103.07579) image
classification models with features:
* ResNet-RS architectural changes and Swish activation. (Note that ResNet-RS
adopts ReLU activation in the paper.)
* Regularization methods including Random Augment, 4e-5 weight decay, stochastic
depth, label smoothing and dropout.
* New training methods including a 350-epoch schedule, cosine learning rate and
EMA.
* Configs are in this [directory](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification).
| Model | Resolution | Params (M) | Top-1 | Top-5 | Download |
| --------- | :--------: | ---------: | ----: | ----: | --------:|
| ResNet-RS-50 | 160x160 | 35.7 | 79.1 | 94.5 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs50_i160.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-50-i160.tar.gz) |
| ResNet-RS-101 | 160x160 | 63.7 | 80.2 | 94.9 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs101_i160.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-101-i160.tar.gz) |
| ResNet-RS-101 | 192x192 | 63.7 | 81.3 | 95.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs101_i192.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-101-i192.tar.gz) |
| ResNet-RS-152 | 192x192 | 86.8 | 81.9 | 95.8 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i192.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-152-i192.tar.gz) |
| ResNet-RS-152 | 224x224 | 86.8 | 82.5 | 96.1 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i224.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-152-i224.tar.gz) |
| ResNet-RS-152 | 256x256 | 86.8 | 83.1 | 96.3 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-152-i256.tar.gz) |
| ResNet-RS-200 | 256x256 | 93.4 | 83.5 | 96.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs200_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-200-i256.tar.gz) |
| ResNet-RS-270 | 256x256 | 130.1 | 83.6 | 96.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs270_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-270-i256.tar.gz) |
| ResNet-RS-350 | 256x256 | 164.3 | 83.7 | 96.7 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs350_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-350-i256.tar.gz) |
| ResNet-RS-350 | 320x320 | 164.3 | 84.2 | 96.9 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs420_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-350-i320.tar.gz) |
## Object Detection and Instance Segmentation
### Common Settings and Notes
* We provide models based on two detection frameworks, [RetinaNet](https://arxiv.org/abs/1708.02002) or [Mask R-CNN](https://arxiv.org/abs/1703.06870), and two backbones, [ResNet-FPN](https://arxiv.org/abs/1612.03144) or [SpineNet](https://arxiv.org/abs/1912.05027).
* We provide models based on two detection frameworks, [RetinaNet](https://arxiv.org/abs/1708.02002)
or [Mask R-CNN](https://arxiv.org/abs/1703.06870), and two backbones, [ResNet-FPN](https://arxiv.org/abs/1612.03144)
or [SpineNet](https://arxiv.org/abs/1912.05027).
* Models are all trained on COCO train2017 and evaluated on COCO val2017.
* Training details:
* Models finetuned from ImageNet pretrained checkpoints adopt the 12 or 36 epochs schedule. Models trained from scratch adopt the 350 epochs schedule.
* The default training data augmentation implements horizontal flipping and scale jittering with a random scale between [0.5, 2.0].
* Unless noted, all models are trained with l2 weight regularization and ReLU activation.
* We use batch size 256 and stepwise learning rate that decays at the last 30 and 10 epoch.
* We use square image as input by resizing the long side of an image to the target size then padding the short side with zeros.
* Models finetuned from ImageNet pretrained checkpoints adopt the 12 or 36
epoch schedule. Models trained from scratch adopt the 350 epoch schedule.
* The default training data augmentation implements horizontal flipping and
scale jittering with a random scale between [0.5, 2.0].
* Unless noted, all models are trained with l2 weight regularization and ReLU
activation.
* We use a batch size of 256 and a stepwise learning rate that decays at the
last 30 and 10 epochs.
* We use square images as input, resizing the long side of an image to the
target size and then padding the short side with zeros.
### COCO Object Detection Baselines
#### RetinaNet (ImageNet pretrained)
| backbone | resolution | epochs | FLOPs (B) | params (M) | box AP | download |
| ------------ |:-------------:| ---------:|-----------:|--------:|--------:|-----------:|
| R50-FPN | 640x640 | 12 | 97.0 | 34.0 | 34.3 | config|
| R50-FPN | 640x640 | 36 | 97.0 | 34.0 | 37.3 | config|
| Backbone | Resolution | Epochs | FLOPs (B) | Params (M) | Box AP | Download |
| ------------ |:-------------:| -------:|--------------:|-----------:|-------:|---------:|
| R50-FPN | 640x640 | 12 | 97.0 | 34.0 | 34.3 | config|
| R50-FPN | 640x640 | 72 | 97.0 | 34.0 | 36.8 | config \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/retinanet/retinanet-resnet50fpn.tar.gz) |
#### RetinaNet (Trained from scratch) with training features including:
* Stochastic depth with drop rate 0.2.
* Swish activation.
| backbone | resolution | epochs | FLOPs (B) | params (M) | box AP | download |
| ------------ |:-------------:| ---------:|-----------:|--------:|---------:|-----------:|
| Backbone | Resolution | Epochs | FLOPs (B) | Params (M) | Box AP | Download |
| ------------ |:-------------:| -------:|--------------:|-----------:|--------:|---------:|
| SpineNet-49 | 640x640 | 500 | 85.4| 28.5 | 44.2 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/retinanet/coco_spinenet49_tpu.yaml) \| [TB.dev](https://tensorboard.dev/experiment/n2UN83TkTdyKZn3slCWulg/#scalars&_smoothingWeight=0)|
| SpineNet-96 | 1024x1024 | 500 | 265.4 | 43.0 | 48.5 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/retinanet/coco_spinenet96_tpu.yaml) \| [TB.dev](https://tensorboard.dev/experiment/n2UN83TkTdyKZn3slCWulg/#scalars&_smoothingWeight=0)|
| SpineNet-143 | 1280x1280 | 500 | 524.0 | 67.0 | 50.0 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/retinanet/coco_spinenet143_tpu.yaml) \| [TB.dev](https://tensorboard.dev/experiment/n2UN83TkTdyKZn3slCWulg/#scalars&_smoothingWeight=0)|
#### Mobile-size RetinaNet (Trained from scratch):
backbone | resolution | epochs | FLOPs (B) | params (M) | box AP | download
------------ | :--------: | -----: | --------: | ---------: | -----: | -------:
Mobile SpineNet-49 | 384x384 | 600 | 1.0 | 2.32 | 28.1 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/retinanet/coco_spinenet49_mobile_tpu.yaml) |
| Backbone | Resolution | Epochs | FLOPs (B) | Params (M) | Box AP | Download |
| ----------- | :--------: | -----: | --------: | ---------: | -----: | --------:|
| MobileNetv2 | 256x256 | 600 | - | 2.27 | 23.5 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/retinanet/coco_mobilenetv2_tpu.yaml) |
| Mobile SpineNet-49 | 384x384 | 600 | 1.0 | 2.32 | 28.1 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/retinanet/coco_spinenet49_mobile_tpu.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/retinanet/spinenet49mobile.tar.gz) |
### Instance Segmentation Baselines
#### Mask R-CNN (ImageNet pretrained)
#### Mask R-CNN (Trained from scratch)
| backbone | resolution | epochs | FLOPs (B) | params (M) | box AP | mask AP | download |
| ------------ |:-------------:| ---------:|-----------:|--------:|--------:|-----------:|-----------:|
| SpineNet-49 | 640x640 | 350 | 215.7 | 40.8 | 42.6 | 37.9 | config |
| Backbone | Resolution | Epochs | FLOPs (B) | Params (M) | Box AP | Mask AP | Download |
| ------------ |:-------------:| -------:|-----------:|-----------:|-------:|--------:|---------:|
| SpineNet-49 | 640x640 | 350 | 215.7 | 40.8 | 42.6 | 37.9 | config |
## Semantic Segmentation
* We support [DeepLabV3](https://arxiv.org/pdf/1706.05587.pdf) and
[DeepLabV3+](https://arxiv.org/pdf/1802.02611.pdf) architectures, with
Dilated ResNet backbones.
* Backbones are pre-trained on ImageNet.
### PASCAL-VOC
| Model | Backbone | Resolution | Steps | mIoU | Download |
| ---------- | :----------------: | :--------: | ----: | ---: | --------:|
| DeepLabV3 | Dilated Resnet-101 | 512x512 | 30k | 78.7 | |
| DeepLabV3+ | Dilated Resnet-101 | 512x512 | 30k | 79.2 | |
### CITYSCAPES
| Model | Backbone | Resolution | Steps | mIoU | Download |
| ---------- | :----------------: | :--------: | ----: | ----: | --------:|
| DeepLabV3+ | Dilated Resnet-101 | 1024x2048 | 90k | 78.79 | |
## Video Classification
### Common Settings and Notes
* We provide models for video classification with two backbones: [SlowOnly](https://arxiv.org/abs/1812.03982) and 3D-ResNet (R3D) used in [Spatiotemporal Contrastive Video Representation Learning](https://arxiv.org/abs/2008.03800).
* We provide models for video classification with two backbones:
[SlowOnly](https://arxiv.org/abs/1812.03982) and 3D-ResNet (R3D) used in
[Spatiotemporal Contrastive Video Representation Learning](https://arxiv.org/abs/2008.03800).
* Training and evaluation details:
* All models are trained from scratch with vision modality (RGB) for 200 epochs.
* We use batch size of 1024 and cosine learning rate decay with linear warmup in first 5 epochs.
* We follow [SlowFast](https://arxiv.org/abs/1812.03982) to perform 30-view evaluation.
* All models are trained from scratch with the vision modality (RGB) for 200
epochs.
* We use a batch size of 1024 and cosine learning rate decay with linear
warmup in the first 5 epochs.
* We follow [SlowFast](https://arxiv.org/abs/1812.03982) to perform 30-view
evaluation.
### Kinetics-400 Action Recognition Baselines
| model | input (frame x stride) | Top-1 | Top-5 | download |
| Model | Input (frame x stride) | Top-1 | Top-5 | Download |
| -------- |:----------------------:|--------:|--------:|---------:|
| SlowOnly | 8 x 8 | 74.1 | 91.4 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/video_classification/k400_slowonly8x8_tpu.yaml) |
| SlowOnly | 16 x 4 | 75.6 | 92.1 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/video_classification/k400_slowonly16x4_tpu.yaml) |
| R3D-50 | 32 x 2 | 77.0 | 93.0 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/video_classification/k400_3d-resnet50_tpu.yaml) |
### Kinetics-600 Action Recognition Baselines
| model | input (frame x stride) | Top-1 | Top-5 | download |
| Model | Input (frame x stride) | Top-1 | Top-5 | Download |
| -------- |:----------------------:|--------:|--------:|---------:|
| SlowOnly | 8 x 8 | 77.3 | 93.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/video_classification/k600_slowonly8x8_tpu.yaml) |
| R3D-50 | 32 x 2 | 79.5 | 94.8 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/video_classification/k600_3d-resnet50_tpu.yaml) |
# --experiment_type=retinanet_mobile_coco
# COCO AP 23.5%
runtime:
distribution_strategy: 'tpu'
mixed_precision_dtype: 'bfloat16'
task:
losses:
l2_weight_decay: 3.0e-05
model:
anchor:
anchor_size: 3
aspect_ratios: [0.5, 1.0, 2.0]
num_scales: 3
backbone:
mobilenet:
model_id: 'MobileNetV2'
filter_size_scale: 1.0
type: 'mobilenet'
decoder:
type: 'fpn'
fpn:
num_filters: 128
use_separable_conv: true
head:
num_convs: 4
num_filters: 128
use_separable_conv: true
input_size: [256, 256, 3]
max_level: 7
min_level: 3
norm_activation:
activation: 'relu6'
norm_epsilon: 0.001
norm_momentum: 0.99
use_sync_bn: true
train_data:
dtype: 'bfloat16'
global_batch_size: 256
is_training: true
parser:
aug_rand_hflip: true
aug_scale_max: 2.0
aug_scale_min: 0.5
validation_data:
dtype: 'bfloat16'
global_batch_size: 8
is_training: false
trainer:
optimizer_config:
learning_rate:
stepwise:
boundaries: [263340, 272580]
values: [0.32, 0.032, 0.0032]
type: 'stepwise'
warmup:
linear:
warmup_learning_rate: 0.0067
warmup_steps: 2000
steps_per_loop: 462
train_steps: 277200
validation_interval: 462
validation_steps: 625
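The schedule constants in this file follow from the README's "decays at the last 30 and 10 epochs" rule: COCO train2017 has 118,287 examples, so at global batch size 256 one epoch is 462 steps, and the 600-epoch mobile RetinaNet run yields the step counts below (this also explains the corrected `boundaries` in the following configs). A quick arithmetic check:

# Sanity-check of the stepwise schedule above.
coco_train_examples = 118287   # COCO train2017
global_batch_size = 256
epochs = 600                   # mobile RetinaNet schedule from the README

steps_per_epoch = coco_train_examples // global_batch_size  # 462
train_steps = epochs * steps_per_epoch                      # 277200
boundaries = [(epochs - 30) * steps_per_epoch,              # 263340
              (epochs - 10) * steps_per_epoch]              # 272580
assert train_steps == 277200 and boundaries == [263340, 272580]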
# --experiment_type=retinanet_spinenet_mobile_coco
# --experiment_type=retinanet_mobile_coco
runtime:
distribution_strategy: 'tpu'
mixed_precision_dtype: 'bfloat16'
......@@ -47,7 +47,7 @@ trainer:
optimizer_config:
learning_rate:
stepwise:
boundaries: [265650, 272580]
boundaries: [263340, 272580]
values: [0.32, 0.032, 0.0032]
type: 'stepwise'
warmup:
......
# --experiment_type=retinanet_spinenet_mobile_coco
# --experiment_type=retinanet_mobile_coco
runtime:
distribution_strategy: 'tpu'
mixed_precision_dtype: 'bfloat16'
......@@ -47,7 +47,7 @@ trainer:
optimizer_config:
learning_rate:
stepwise:
boundaries: [265650, 272580]
boundaries: [263340, 272580]
values: [0.32, 0.032, 0.0032]
type: 'stepwise'
warmup:
......
# --experiment_type=retinanet_spinenet_mobile_coco
# --experiment_type=retinanet_mobile_coco
runtime:
distribution_strategy: 'tpu'
mixed_precision_dtype: 'bfloat16'
......@@ -47,7 +47,7 @@ trainer:
optimizer_config:
learning_rate:
stepwise:
boundaries: [265650, 272580]
boundaries: [263340, 272580]
values: [0.32, 0.032, 0.0032]
type: 'stepwise'
warmup:
......
......@@ -15,11 +15,10 @@
# Lint as: python3
"""Mask R-CNN configuration definition."""
import dataclasses
import os
from typing import List, Optional
import dataclasses
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import hyperparams
......@@ -79,6 +78,8 @@ class DataConfig(cfg.DataConfig):
shuffle_buffer_size: int = 10000
file_type: str = 'tfrecord'
drop_remainder: bool = True
# Number of examples in the dataset; used to create the annotation file.
num_examples: int = -1
@dataclasses.dataclass
......
......@@ -326,9 +326,9 @@ def retinanet_spinenet_coco() -> cfg.ExperimentConfig:
return config
@exp_factory.register_config_factory('retinanet_spinenet_mobile_coco')
@exp_factory.register_config_factory('retinanet_mobile_coco')
def retinanet_spinenet_mobile_coco() -> cfg.ExperimentConfig:
"""COCO object detection with RetinaNet using Mobile SpineNet backbone."""
"""COCO object detection with mobile RetinaNet."""
train_batch_size = 256
eval_batch_size = 8
steps_per_epoch = COCO_TRAIN_EXAMPLES // train_batch_size
......@@ -407,8 +407,6 @@ def retinanet_spinenet_mobile_coco() -> cfg.ExperimentConfig:
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None',
'task.model.min_level == task.model.backbone.spinenet_mobile.min_level',
'task.model.max_level == task.model.backbone.spinenet_mobile.max_level',
])
return config
......@@ -28,7 +28,7 @@ class RetinaNetConfigTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(
('retinanet_resnetfpn_coco',),
('retinanet_spinenet_coco',),
('retinanet_spinenet_mobile_coco',),
('retinanet_mobile_coco',),
)
def test_retinanet_configs(self, config_name):
config = exp_factory.get_exp_config(config_name)
......
......@@ -18,6 +18,7 @@ import copy
import json
# Import libraries
from absl import logging
import numpy as np
from PIL import Image
......@@ -26,6 +27,7 @@ from pycocotools import mask as mask_api
import six
import tensorflow as tf
from official.common import dataset_fn
from official.vision.beta.dataloaders import tf_example_decoder
from official.vision.beta.ops import box_ops
from official.vision.beta.ops import mask_ops
......@@ -240,10 +242,20 @@ def convert_groundtruths_to_coco_dataset(groundtruths, label_map=None):
(boxes[j, k, 3] - boxes[j, k, 1]) *
(boxes[j, k, 2] - boxes[j, k, 0]))
if 'masks' in groundtruths:
mask = Image.open(six.BytesIO(groundtruths['masks'][i][j, k]))
width, height = mask.size
np_mask = (
np.array(mask.getdata()).reshape(height, width).astype(np.uint8))
mask_bytes = groundtruths['masks'][i][j, k]
if isinstance(mask_bytes, tf.Tensor):
  # An eager tensor wraps the encoded mask bytes; unwrap before decoding.
  mask_bytes = mask_bytes.numpy()
mask = Image.open(six.BytesIO(mask_bytes))
width, height = mask.size
np_mask = (
    np.array(mask.getdata()).reshape(height, width).astype(np.uint8))
np_mask[np_mask > 0] = 255
encoded_mask = mask_api.encode(np.asfortranarray(np_mask))
ann['segmentation'] = encoded_mask
......@@ -271,11 +283,11 @@ def convert_groundtruths_to_coco_dataset(groundtruths, label_map=None):
class COCOGroundtruthGenerator:
"""Generates the groundtruth annotations from a single example."""
def __init__(self, file_pattern, num_examples, include_mask):
def __init__(self, file_pattern, file_type, num_examples, include_mask):
self._file_pattern = file_pattern
self._num_examples = num_examples
self._include_mask = include_mask
self._dataset_fn = tf.data.TFRecordDataset
self._dataset_fn = dataset_fn.pick_dataset_fn(file_type)
def _parse_single_example(self, example):
"""Parses a single serialized tf.Example proto.
......@@ -308,7 +320,7 @@ class COCOGroundtruthGenerator:
boxes = box_ops.denormalize_boxes(
decoded_tensors['groundtruth_boxes'], image_size)
groundtruths = {
'source_id': tf.string_to_number(
'source_id': tf.strings.to_number(
decoded_tensors['source_id'], out_type=tf.int64),
'height': decoded_tensors['height'],
'width': decoded_tensors['width'],
......@@ -344,12 +356,13 @@ class COCOGroundtruthGenerator:
def scan_and_generator_annotation_file(file_pattern: str,
file_type: str,
num_samples: int,
include_mask: bool,
annotation_file: str):
"""Scans and generate the COCO-style annotation JSON file given a dataset."""
groundtruth_generator = COCOGroundtruthGenerator(
file_pattern, num_samples, include_mask)
file_pattern, file_type, num_samples, include_mask)
generate_annotation_file(groundtruth_generator, annotation_file)
......
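With `file_type` now plumbed through to `dataset_fn.pick_dataset_fn`, the annotation generator is no longer hard-wired to `tf.data.TFRecordDataset`. A hedged usage sketch (the importing module path and file paths are assumptions):

# Hypothetical usage of the updated signature.
from official.vision.beta.evaluation import coco_utils

coco_utils.scan_and_generator_annotation_file(
    file_pattern='/data/coco/val*.tfrecord',
    file_type='tfrecord',  # new argument, resolved via pick_dataset_fn
    num_samples=5000,
    include_mask=True,
    annotation_file='/tmp/instances_val.json')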
......@@ -592,8 +592,9 @@ class MobileNet(tf.keras.Model):
x, endpoints, next_endpoint_level = self._mobilenet_base(inputs=inputs)
endpoints[str(next_endpoint_level)] = x
self._output_specs = {l: endpoints[l].get_shape() for l in endpoints}
# Don't include the final layer in `self._output_specs` to support decoders.
endpoints[str(next_endpoint_level)] = x
super(MobileNet, self).__init__(
inputs=inputs, outputs=endpoints, **kwargs)
......
......@@ -130,7 +130,7 @@ class SpineNet(tf.keras.Model):
def __init__(
self,
input_specs: tf.keras.layers.InputSpec = tf.keras.layers.InputSpec(
shape=[None, 640, 640, 3]),
shape=[None, None, None, 3]),
min_level: int = 3,
max_level: int = 7,
block_specs: List[BlockSpec] = build_block_specs(),
......@@ -214,8 +214,11 @@ class SpineNet(tf.keras.Model):
inputs = tf.keras.Input(shape=input_specs.shape[1:])
net = self._build_stem(inputs=inputs)
net = self._build_scale_permuted_network(
net=net, input_width=input_specs.shape[2])
input_width = input_specs.shape[2]
if input_width is None:
max_stride = max(map(lambda b: b.level, block_specs))
input_width = 2 ** max_stride
net = self._build_scale_permuted_network(net=net, input_width=input_width)
endpoints = self._build_endpoints(net=net)
self._output_specs = {l: endpoints[l].get_shape() for l in endpoints}
......
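The default input spec is now shape-agnostic, and when the input width is unknown at build time the scale-permuted network uses `2 ** max_stride` (the deepest block level) as its reference width. A minimal sketch of building the backbone without committing to an image size (module path assumed; the defaults build the standard SpineNet-49 block specs):

import tensorflow as tf
from official.vision.beta.modeling.backbones import spinenet

backbone = spinenet.SpineNet(
    input_specs=tf.keras.layers.InputSpec(shape=[None, None, None, 3]))
# The same graph now serves multiple resolutions.
endpoints = backbone(tf.ones([1, 640, 640, 3]))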
......@@ -135,7 +135,7 @@ class SpineNetMobile(tf.keras.Model):
def __init__(
self,
input_specs: tf.keras.layers.InputSpec = tf.keras.layers.InputSpec(
shape=[None, 512, 512, 3]),
shape=[None, None, None, 3]),
min_level: int = 3,
max_level: int = 7,
block_specs: List[BlockSpec] = build_block_specs(),
......@@ -219,8 +219,11 @@ class SpineNetMobile(tf.keras.Model):
inputs = tf.keras.Input(shape=input_specs.shape[1:])
net = self._build_stem(inputs=inputs)
net = self._build_scale_permuted_network(
net=net, input_width=input_specs.shape[2])
input_width = input_specs.shape[2]
if input_width is None:
max_stride = max(map(lambda b: b.level, block_specs))
input_width = 2 ** max_stride
net = self._build_scale_permuted_network(net=net, input_width=input_width)
endpoints = self._build_endpoints(net=net)
self._output_specs = {l: endpoints[l].get_shape() for l in endpoints}
......
......@@ -13,12 +13,15 @@
# limitations under the License.
"""Contains definitions of Atrous Spatial Pyramid Pooling (ASPP) decoder."""
from typing import Any, List, Optional, Mapping
from typing import Any, List, Mapping, Optional
# Import libraries
import tensorflow as tf
from official.modeling import hyperparams
from official.vision import keras_cv
from official.vision.beta.modeling.decoders import factory
@tf.keras.utils.register_keras_serializable(package='Vision')
......@@ -128,3 +131,46 @@ class ASPP(tf.keras.layers.Layer):
@classmethod
def from_config(cls, config, custom_objects=None):
return cls(**config)
@factory.register_decoder_builder('aspp')
def build_aspp_decoder(
input_specs: Mapping[str, tf.TensorShape],
model_config: hyperparams.Config,
l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None
) -> tf.keras.Model:
"""Builds ASPP decoder from a config.
Args:
input_specs: A `dict` of input specifications. A dictionary consists of
{level: TensorShape} from a backbone. Note this is here for interface
consistency and is not used by the ASPP decoder.
model_config: A OneOfConfig. Model config.
l2_regularizer: A `tf.keras.regularizers.Regularizer` instance. Default to
None.
Returns:
A `tf.keras.Model` instance of the ASPP decoder.
Raises:
ValueError: If the model_config.decoder.type is not `aspp`.
"""
del input_specs # input_specs is not used by ASPP decoder.
decoder_type = model_config.decoder.type
decoder_cfg = model_config.decoder.get()
if decoder_type != 'aspp':
raise ValueError(f'Inconsistent decoder type {decoder_type}. '
'Need to be `aspp`.')
norm_activation_config = model_config.norm_activation
return ASPP(
level=decoder_cfg.level,
dilation_rates=decoder_cfg.dilation_rates,
num_filters=decoder_cfg.num_filters,
pool_kernel_size=decoder_cfg.pool_kernel_size,
dropout_rate=decoder_cfg.dropout_rate,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon,
activation=norm_activation_config.activation,
kernel_regularizer=l2_regularizer)
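Registering the builder lets decoder construction go through the shared factory instead of instantiating `ASPP` directly. A hedged sketch of the round trip (`factory.build_decoder` and the config objects are assumptions inferred from the registration decorator):

# Hypothetical: `model_config` is a segmentation config whose decoder
# oneof is set to 'aspp'; `backbone` is an already-built backbone model.
decoder = factory.build_decoder(
    input_specs=backbone.output_specs,
    model_config=model_config,
    l2_regularizer=tf.keras.regularizers.l2(1e-4))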